author    Dimitry Andric <dim@FreeBSD.org>  2018-07-28 10:51:19 +0000
committer Dimitry Andric <dim@FreeBSD.org>  2018-07-28 10:51:19 +0000
commit    eb11fae6d08f479c0799db45860a98af528fa6e7 (patch)
tree      44d492a50c8c1a7eb8e2d17ea3360ec4d066f042 /lib/Target/AMDGPU
parent    b8a2042aa938069e862750553db0e4d82d25822c (diff)
Diffstat (limited to 'lib/Target/AMDGPU')
-rw-r--r--  lib/Target/AMDGPU/AMDGPU.h | 27
-rw-r--r--  lib/Target/AMDGPU/AMDGPU.td | 275
-rw-r--r--  lib/Target/AMDGPU/AMDGPUAliasAnalysis.cpp | 9
-rw-r--r--  lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp | 108
-rw-r--r--  lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp | 2
-rw-r--r--  lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp | 3
-rw-r--r--  lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h | 5
-rw-r--r--  lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp | 580
-rw-r--r--  lib/Target/AMDGPU/AMDGPUAsmPrinter.h | 100
-rw-r--r--  lib/Target/AMDGPU/AMDGPUCallLowering.cpp | 122
-rw-r--r--  lib/Target/AMDGPU/AMDGPUCallLowering.h | 5
-rw-r--r--  lib/Target/AMDGPU/AMDGPUCallingConv.td | 40
-rw-r--r--  lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp | 404
-rw-r--r--  lib/Target/AMDGPU/AMDGPUFeatures.td | 60
-rw-r--r--  lib/Target/AMDGPU/AMDGPUFrameLowering.h | 4
-rw-r--r--  lib/Target/AMDGPU/AMDGPUGISel.td | 138
-rw-r--r--  lib/Target/AMDGPU/AMDGPUGenRegisterBankInfo.def | 76
-rw-r--r--  lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp (renamed from lib/Target/AMDGPU/MCTargetDesc/AMDGPUHSAMetadataStreamer.cpp) | 163
-rw-r--r--  lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h (renamed from lib/Target/AMDGPU/MCTargetDesc/AMDGPUHSAMetadataStreamer.h) | 17
-rw-r--r--  lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp | 236
-rw-r--r--  lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 549
-rw-r--r--  lib/Target/AMDGPU/AMDGPUISelLowering.h | 50
-rw-r--r--  lib/Target/AMDGPU/AMDGPUInline.cpp | 6
-rw-r--r--  lib/Target/AMDGPU/AMDGPUInstrInfo.cpp | 97
-rw-r--r--  lib/Target/AMDGPU/AMDGPUInstrInfo.h | 48
-rw-r--r--  lib/Target/AMDGPU/AMDGPUInstrInfo.td | 19
-rw-r--r--  lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp | 315
-rw-r--r--  lib/Target/AMDGPU/AMDGPUInstructionSelector.h | 52
-rw-r--r--  lib/Target/AMDGPU/AMDGPUInstructions.td | 174
-rw-r--r--  lib/Target/AMDGPU/AMDGPUIntrinsicInfo.cpp | 8
-rw-r--r--  lib/Target/AMDGPU/AMDGPUIntrinsicInfo.h | 4
-rw-r--r--  lib/Target/AMDGPU/AMDGPUIntrinsics.td | 3
-rw-r--r--  lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 164
-rw-r--r--  lib/Target/AMDGPU/AMDGPULegalizerInfo.h | 5
-rw-r--r--  lib/Target/AMDGPU/AMDGPULibCalls.cpp | 83
-rw-r--r--  lib/Target/AMDGPU/AMDGPULibFunc.h | 2
-rw-r--r--  lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp | 3
-rw-r--r--  lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp | 264
-rw-r--r--  lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp | 270
-rw-r--r--  lib/Target/AMDGPU/AMDGPUMCInstLower.cpp | 107
-rw-r--r--  lib/Target/AMDGPU/AMDGPUMCInstLower.h | 46
-rw-r--r--  lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp | 395
-rw-r--r--  lib/Target/AMDGPU/AMDGPUMachineFunction.cpp | 24
-rw-r--r--  lib/Target/AMDGPU/AMDGPUMachineFunction.h | 50
-rw-r--r--  lib/Target/AMDGPU/AMDGPUMachineModuleInfo.cpp | 2
-rw-r--r--  lib/Target/AMDGPU/AMDGPUMachineModuleInfo.h | 12
-rw-r--r--  lib/Target/AMDGPU/AMDGPUMacroFusion.cpp | 3
-rw-r--r--  lib/Target/AMDGPU/AMDGPUOpenCLEnqueuedBlockLowering.cpp | 81
-rw-r--r--  lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp | 397
-rw-r--r--  lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.h | 55
-rw-r--r--  lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp | 102
-rw-r--r--  lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp | 352
-rw-r--r--  lib/Target/AMDGPU/AMDGPURegisterBankInfo.h | 19
-rw-r--r--  lib/Target/AMDGPU/AMDGPURegisterBanks.td | 2
-rw-r--r--  lib/Target/AMDGPU/AMDGPURegisterInfo.cpp | 16
-rw-r--r--  lib/Target/AMDGPU/AMDGPURegisterInfo.h | 10
-rw-r--r--  lib/Target/AMDGPU/AMDGPURegisterInfo.td | 1
-rw-r--r--  lib/Target/AMDGPU/AMDGPURewriteOutArguments.cpp | 11
-rw-r--r--  lib/Target/AMDGPU/AMDGPUSearchableTables.td | 77
-rw-r--r--  lib/Target/AMDGPU/AMDGPUSubtarget.cpp | 220
-rw-r--r--  lib/Target/AMDGPU/AMDGPUSubtarget.h | 794
-rw-r--r--  lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 102
-rw-r--r--  lib/Target/AMDGPU/AMDGPUTargetMachine.h | 26
-rw-r--r--  lib/Target/AMDGPU/AMDGPUTargetObjectFile.h | 2
-rw-r--r--  lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp | 291
-rw-r--r--  lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h | 82
-rw-r--r--  lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp | 44
-rw-r--r--  lib/Target/AMDGPU/AMDGPUUnifyMetadata.cpp | 8
-rw-r--r--  lib/Target/AMDGPU/AMDILCFGStructurizer.cpp | 338
-rw-r--r--  lib/Target/AMDGPU/AMDKernelCodeT.h | 12
-rw-r--r--  lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp | 846
-rw-r--r--  lib/Target/AMDGPU/BUFInstructions.td | 528
-rw-r--r--  lib/Target/AMDGPU/CMakeLists.txt | 46
-rw-r--r--  lib/Target/AMDGPU/DSInstructions.td | 48
-rw-r--r--  lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp | 107
-rw-r--r--  lib/Target/AMDGPU/EvergreenInstructions.td | 3
-rw-r--r--  lib/Target/AMDGPU/FLATInstructions.td | 28
-rw-r--r--  lib/Target/AMDGPU/GCNHazardRecognizer.cpp | 11
-rw-r--r--  lib/Target/AMDGPU/GCNHazardRecognizer.h | 4
-rw-r--r--  lib/Target/AMDGPU/GCNILPSched.cpp | 27
-rw-r--r--  lib/Target/AMDGPU/GCNIterativeScheduler.cpp | 114
-rw-r--r--  lib/Target/AMDGPU/GCNMinRegStrategy.cpp | 51
-rw-r--r--  lib/Target/AMDGPU/GCNProcessors.td | 20
-rw-r--r--  lib/Target/AMDGPU/GCNRegPressure.cpp | 32
-rw-r--r--  lib/Target/AMDGPU/GCNRegPressure.h | 11
-rw-r--r--  lib/Target/AMDGPU/GCNSchedStrategy.cpp | 122
-rw-r--r--  lib/Target/AMDGPU/GCNSchedStrategy.h | 6
-rw-r--r--  lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp | 155
-rw-r--r--  lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h | 15
-rw-r--r--  lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp | 32
-rw-r--r--  lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp | 12
-rw-r--r--  lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.cpp | 37
-rw-r--r--  lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.h | 9
-rw-r--r--  lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp | 2
-rw-r--r--  lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h | 2
-rw-r--r--  lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp | 36
-rw-r--r--  lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h | 29
-rw-r--r--  lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp | 288
-rw-r--r--  lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h | 34
-rw-r--r--  lib/Target/AMDGPU/MCTargetDesc/CMakeLists.txt | 2
-rw-r--r--  lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp | 43
-rw-r--r--  lib/Target/AMDGPU/MCTargetDesc/R600MCTargetDesc.cpp | 27
-rw-r--r--  lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp | 34
-rw-r--r--  lib/Target/AMDGPU/MIMGInstructions.td | 1028
-rw-r--r--  lib/Target/AMDGPU/Processors.td | 12
-rw-r--r--  lib/Target/AMDGPU/R600.td | 54
-rw-r--r--  lib/Target/AMDGPU/R600AsmPrinter.cpp | 133
-rw-r--r--  lib/Target/AMDGPU/R600AsmPrinter.h | 46
-rw-r--r--  lib/Target/AMDGPU/R600ClauseMergePass.cpp | 33
-rw-r--r--  lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp | 134
-rw-r--r--  lib/Target/AMDGPU/R600Defines.h | 4
-rw-r--r--  lib/Target/AMDGPU/R600EmitClauseMarkers.cpp | 49
-rw-r--r--  lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp | 64
-rw-r--r--  lib/Target/AMDGPU/R600ISelLowering.cpp | 377
-rw-r--r--  lib/Target/AMDGPU/R600ISelLowering.h | 5
-rw-r--r--  lib/Target/AMDGPU/R600InstrFormats.td | 6
-rw-r--r--  lib/Target/AMDGPU/R600InstrInfo.cpp | 443
-rw-r--r--  lib/Target/AMDGPU/R600InstrInfo.h | 34
-rw-r--r--  lib/Target/AMDGPU/R600Instructions.td | 108
-rw-r--r--  lib/Target/AMDGPU/R600Intrinsics.td | 67
-rw-r--r--  lib/Target/AMDGPU/R600MachineScheduler.cpp | 106
-rw-r--r--  lib/Target/AMDGPU/R600MachineScheduler.h | 2
-rw-r--r--  lib/Target/AMDGPU/R600OpenCLImageTypeLoweringPass.cpp (renamed from lib/Target/AMDGPU/AMDGPUOpenCLImageTypeLoweringPass.cpp) | 14
-rw-r--r--  lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp | 31
-rw-r--r--  lib/Target/AMDGPU/R600Packetizer.cpp | 52
-rw-r--r--  lib/Target/AMDGPU/R600Processors.td | 56
-rw-r--r--  lib/Target/AMDGPU/R600RegisterInfo.cpp | 65
-rw-r--r--  lib/Target/AMDGPU/R600RegisterInfo.h | 15
-rw-r--r--  lib/Target/AMDGPU/R600RegisterInfo.td | 2
-rw-r--r--  lib/Target/AMDGPU/SIAnnotateControlFlow.cpp | 38
-rw-r--r--  lib/Target/AMDGPU/SIDebuggerInsertNops.cpp | 9
-rw-r--r--  lib/Target/AMDGPU/SIDefines.h | 54
-rw-r--r--  lib/Target/AMDGPU/SIFixSGPRCopies.cpp | 114
-rw-r--r--  lib/Target/AMDGPU/SIFixVGPRCopies.cpp | 7
-rw-r--r--  lib/Target/AMDGPU/SIFixWWMLiveness.cpp | 5
-rw-r--r--  lib/Target/AMDGPU/SIFoldOperands.cpp | 72
-rw-r--r--  lib/Target/AMDGPU/SIFormMemoryClauses.cpp | 398
-rw-r--r--  lib/Target/AMDGPU/SIFrameLowering.cpp | 142
-rw-r--r--  lib/Target/AMDGPU/SIFrameLowering.h | 12
-rw-r--r--  lib/Target/AMDGPU/SIISelLowering.cpp | 2200
-rw-r--r--  lib/Target/AMDGPU/SIISelLowering.h | 38
-rw-r--r--  lib/Target/AMDGPU/SIInsertSkips.cpp | 59
-rw-r--r--  lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 499
-rw-r--r--  lib/Target/AMDGPU/SIInsertWaits.cpp | 703
-rw-r--r--  lib/Target/AMDGPU/SIInstrFormats.td | 29
-rw-r--r--  lib/Target/AMDGPU/SIInstrInfo.cpp | 370
-rw-r--r--  lib/Target/AMDGPU/SIInstrInfo.h | 80
-rw-r--r--  lib/Target/AMDGPU/SIInstrInfo.td | 333
-rw-r--r--  lib/Target/AMDGPU/SIInstructions.td | 169
-rw-r--r--  lib/Target/AMDGPU/SILoadStoreOptimizer.cpp | 95
-rw-r--r--  lib/Target/AMDGPU/SILowerControlFlow.cpp | 51
-rw-r--r--  lib/Target/AMDGPU/SILowerI1Copies.cpp | 7
-rw-r--r--  lib/Target/AMDGPU/SIMachineFunctionInfo.cpp | 90
-rw-r--r--  lib/Target/AMDGPU/SIMachineFunctionInfo.h | 144
-rw-r--r--  lib/Target/AMDGPU/SIMachineScheduler.cpp | 162
-rw-r--r--  lib/Target/AMDGPU/SIMachineScheduler.h | 2
-rw-r--r--  lib/Target/AMDGPU/SIMemoryLegalizer.cpp | 1025
-rw-r--r--  lib/Target/AMDGPU/SIOptimizeExecMasking.cpp | 28
-rw-r--r--  lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp | 14
-rw-r--r--  lib/Target/AMDGPU/SIPeepholeSDWA.cpp | 164
-rw-r--r--  lib/Target/AMDGPU/SIProgramInfo.h | 77
-rw-r--r--  lib/Target/AMDGPU/SIRegisterInfo.cpp | 125
-rw-r--r--  lib/Target/AMDGPU/SIRegisterInfo.h | 19
-rw-r--r--  lib/Target/AMDGPU/SIRegisterInfo.td | 28
-rw-r--r--  lib/Target/AMDGPU/SISchedule.td | 2
-rw-r--r--  lib/Target/AMDGPU/SIShrinkInstructions.cpp | 37
-rw-r--r--  lib/Target/AMDGPU/SIWholeQuadMode.cpp | 19
-rw-r--r--  lib/Target/AMDGPU/SMInstructions.td | 271
-rw-r--r--  lib/Target/AMDGPU/SOPInstructions.td | 99
-rw-r--r--  lib/Target/AMDGPU/TargetInfo/AMDGPUTargetInfo.cpp | 6
-rw-r--r--  lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp | 10
-rw-r--r--  lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp | 249
-rw-r--r--  lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h | 111
-rw-r--r--  lib/Target/AMDGPU/Utils/AMDGPULaneDominator.cpp | 75
-rw-r--r--  lib/Target/AMDGPU/Utils/AMDGPULaneDominator.h | 24
-rw-r--r--  lib/Target/AMDGPU/Utils/AMDKernelCodeTInfo.h | 1
-rw-r--r--  lib/Target/AMDGPU/Utils/CMakeLists.txt | 1
-rw-r--r--  lib/Target/AMDGPU/VOP1Instructions.td | 74
-rw-r--r--  lib/Target/AMDGPU/VOP2Instructions.td | 80
-rw-r--r--  lib/Target/AMDGPU/VOP3Instructions.td | 57
-rw-r--r--  lib/Target/AMDGPU/VOP3PInstructions.td | 173
-rw-r--r--  lib/Target/AMDGPU/VOPCInstructions.td | 5
-rw-r--r--  lib/Target/AMDGPU/VOPInstructions.td | 64
183 files changed, 15348 insertions, 7244 deletions
diff --git a/lib/Target/AMDGPU/AMDGPU.h b/lib/Target/AMDGPU/AMDGPU.h
index 0ddc43ad5033..796766d94622 100644
--- a/lib/Target/AMDGPU/AMDGPU.h
+++ b/lib/Target/AMDGPU/AMDGPU.h
@@ -11,7 +11,6 @@
#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPU_H
#define LLVM_LIB_TARGET_AMDGPU_AMDGPU_H
-#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/Target/TargetMachine.h"
namespace llvm {
@@ -50,9 +49,9 @@ FunctionPass *createSIOptimizeExecMaskingPreRAPass();
FunctionPass *createSIFixSGPRCopiesPass();
FunctionPass *createSIMemoryLegalizerPass();
FunctionPass *createSIDebuggerInsertNopsPass();
-FunctionPass *createSIInsertWaitsPass();
FunctionPass *createSIInsertWaitcntsPass();
FunctionPass *createSIFixWWMLivenessPass();
+FunctionPass *createSIFormMemoryClausesPass();
FunctionPass *createAMDGPUSimplifyLibCallsPass(const TargetOptions &);
FunctionPass *createAMDGPUUseNativeCallsPass();
FunctionPass *createAMDGPUCodeGenPreparePass();
@@ -74,6 +73,14 @@ ModulePass *createAMDGPULowerIntrinsicsPass();
void initializeAMDGPULowerIntrinsicsPass(PassRegistry &);
extern char &AMDGPULowerIntrinsicsID;
+FunctionPass *createAMDGPULowerKernelArgumentsPass();
+void initializeAMDGPULowerKernelArgumentsPass(PassRegistry &);
+extern char &AMDGPULowerKernelArgumentsID;
+
+ModulePass *createAMDGPULowerKernelAttributesPass();
+void initializeAMDGPULowerKernelAttributesPass(PassRegistry &);
+extern char &AMDGPULowerKernelAttributesID;
+
void initializeAMDGPURewriteOutArgumentsPass(PassRegistry &);
extern char &AMDGPURewriteOutArgumentsID;
@@ -134,6 +141,9 @@ extern char &AMDGPUSimplifyLibCallsID;
void initializeAMDGPUUseNativeCallsPass(PassRegistry &);
extern char &AMDGPUUseNativeCallsID;
+void initializeAMDGPUPerfHintAnalysisPass(PassRegistry &);
+extern char &AMDGPUPerfHintAnalysisID;
+
// Passes common to R600 and SI
FunctionPass *createAMDGPUPromoteAlloca();
void initializeAMDGPUPromoteAllocaPass(PassRegistry&);
@@ -144,7 +154,7 @@ FunctionPass *createAMDGPUISelDag(
TargetMachine *TM = nullptr,
CodeGenOpt::Level OptLevel = CodeGenOpt::Default);
ModulePass *createAMDGPUAlwaysInlinePass(bool GlobalOpt = true);
-ModulePass *createAMDGPUOpenCLImageTypeLoweringPass();
+ModulePass *createR600OpenCLImageTypeLoweringPass();
FunctionPass *createAMDGPUAnnotateUniformValues();
ModulePass* createAMDGPUUnifyMetadataPass();
@@ -169,12 +179,12 @@ extern char &SIMemoryLegalizerID;
void initializeSIDebuggerInsertNopsPass(PassRegistry&);
extern char &SIDebuggerInsertNopsID;
-void initializeSIInsertWaitsPass(PassRegistry&);
-extern char &SIInsertWaitsID;
-
void initializeSIInsertWaitcntsPass(PassRegistry&);
extern char &SIInsertWaitcntsID;
+void initializeSIFormMemoryClausesPass(PassRegistry&);
+extern char &SIFormMemoryClausesID;
+
void initializeAMDGPUUnifyDivergentExitNodesPass(PassRegistry&);
extern char &AMDGPUUnifyDivergentExitNodesID;
@@ -222,8 +232,11 @@ struct AMDGPUAS {
MAX_COMMON_ADDRESS = 5,
GLOBAL_ADDRESS = 1, ///< Address space for global memory (RAT0, VTX0).
- CONSTANT_ADDRESS = 2, ///< Address space for constant memory (VTX2)
+ CONSTANT_ADDRESS = 4, ///< Address space for constant memory (VTX2)
LOCAL_ADDRESS = 3, ///< Address space for local memory.
+
+ CONSTANT_ADDRESS_32BIT = 6, ///< Address space for 32-bit constant memory
+
/// Address space for direct addressible parameter memory (CONST0)
PARAM_D_ADDRESS = 6,
/// Address space for indirect addressible parameter memory (VTX1)
diff --git a/lib/Target/AMDGPU/AMDGPU.td b/lib/Target/AMDGPU/AMDGPU.td
index c02d0a131041..16c2a366db28 100644
--- a/lib/Target/AMDGPU/AMDGPU.td
+++ b/lib/Target/AMDGPU/AMDGPU.td
@@ -7,58 +7,30 @@
//
//===------------------------------------------------------------===//
+include "llvm/TableGen/SearchableTable.td"
include "llvm/Target/Target.td"
+include "AMDGPUFeatures.td"
//===------------------------------------------------------------===//
// Subtarget Features (device properties)
//===------------------------------------------------------------===//
-def FeatureFP64 : SubtargetFeature<"fp64",
- "FP64",
- "true",
- "Enable double precision operations"
->;
-
-def FeatureFMA : SubtargetFeature<"fmaf",
- "FMA",
- "true",
- "Enable single precision FMA (not as fast as mul+add, but fused)"
->;
-
def FeatureFastFMAF32 : SubtargetFeature<"fast-fmaf",
"FastFMAF32",
"true",
"Assuming f32 fma is at least as fast as mul + add"
>;
-def HalfRate64Ops : SubtargetFeature<"half-rate-64-ops",
- "HalfRate64Ops",
- "true",
- "Most fp64 instructions are half rate instead of quarter"
->;
-
-def FeatureR600ALUInst : SubtargetFeature<"R600ALUInst",
- "R600ALUInst",
- "false",
- "Older version of ALU instructions encoding"
->;
-
-def FeatureVertexCache : SubtargetFeature<"HasVertexCache",
- "HasVertexCache",
+def FeatureMIMG_R128 : SubtargetFeature<"mimg-r128",
+ "MIMG_R128",
"true",
- "Specify use of dedicated vertex cache"
+ "Support 128-bit texture resources"
>;
-def FeatureCaymanISA : SubtargetFeature<"caymanISA",
- "CaymanISA",
- "true",
- "Use Cayman ISA"
->;
-
-def FeatureCFALUBug : SubtargetFeature<"cfalubug",
- "CFALUBug",
+def HalfRate64Ops : SubtargetFeature<"half-rate-64-ops",
+ "HalfRate64Ops",
"true",
- "GPU has CF_ALU bug"
+ "Most fp64 instructions are half rate instead of quarter"
>;
def FeatureFlatAddressSpace : SubtargetFeature<"flat-address-space",
@@ -121,6 +93,12 @@ def FeatureMadMixInsts : SubtargetFeature<"mad-mix-insts",
"Has v_mad_mix_f32, v_mad_mixlo_f16, v_mad_mixhi_f16 instructions"
>;
+def FeatureFmaMixInsts : SubtargetFeature<"fma-mix-insts",
+ "HasFmaMixInsts",
+ "true",
+ "Has v_fma_mix_f32, v_fma_mixlo_f16, v_fma_mixhi_f16 instructions"
+>;
+
// XNACK is disabled if SH_MEM_CONFIG.ADDRESS_MODE = GPUVM on chips that support
// XNACK. The current default kernel driver setting is:
// - graphics ring: XNACK disabled
@@ -140,27 +118,6 @@ def FeatureSGPRInitBug : SubtargetFeature<"sgpr-init-bug",
"VI SGPR initialization bug requiring a fixed SGPR allocation size"
>;
-class SubtargetFeatureFetchLimit <string Value> :
- SubtargetFeature <"fetch"#Value,
- "TexVTXClauseSize",
- Value,
- "Limit the maximum number of fetches in a clause to "#Value
->;
-
-def FeatureFetchLimit8 : SubtargetFeatureFetchLimit <"8">;
-def FeatureFetchLimit16 : SubtargetFeatureFetchLimit <"16">;
-
-class SubtargetFeatureWavefrontSize <int Value> : SubtargetFeature<
- "wavefrontsize"#Value,
- "WavefrontSize",
- !cast<string>(Value),
- "The number of threads per wavefront"
->;
-
-def FeatureWavefrontSize16 : SubtargetFeatureWavefrontSize<16>;
-def FeatureWavefrontSize32 : SubtargetFeatureWavefrontSize<32>;
-def FeatureWavefrontSize64 : SubtargetFeatureWavefrontSize<64>;
-
class SubtargetFeatureLDSBankCount <int Value> : SubtargetFeature <
"ldsbankcount"#Value,
"LDSBankCount",
@@ -171,19 +128,6 @@ class SubtargetFeatureLDSBankCount <int Value> : SubtargetFeature <
def FeatureLDSBankCount16 : SubtargetFeatureLDSBankCount<16>;
def FeatureLDSBankCount32 : SubtargetFeatureLDSBankCount<32>;
-class SubtargetFeatureLocalMemorySize <int Value> : SubtargetFeature<
- "localmemorysize"#Value,
- "LocalMemorySize",
- !cast<string>(Value),
- "The size of local memory in bytes"
->;
-
-def FeatureGCN : SubtargetFeature<"gcn",
- "IsGCN",
- "true",
- "GCN or newer GPU"
->;
-
def FeatureGCN3Encoding : SubtargetFeature<"gcn3-encoding",
"GCN3Encoding",
"true",
@@ -244,6 +188,12 @@ def FeatureScalarStores : SubtargetFeature<"scalar-stores",
"Has store scalar memory instructions"
>;
+def FeatureScalarAtomics : SubtargetFeature<"scalar-atomics",
+ "HasScalarAtomics",
+ "true",
+ "Has atomic scalar memory instructions"
+>;
+
def FeatureSDWA : SubtargetFeature<"sdwa",
"HasSDWA",
"true",
@@ -292,6 +242,27 @@ def FeatureIntClamp : SubtargetFeature<"int-clamp-insts",
"Support clamp for integer destination"
>;
+def FeatureUnpackedD16VMem : SubtargetFeature<"unpacked-d16-vmem",
+ "HasUnpackedD16VMem",
+ "true",
+ "Has unpacked d16 vmem instructions"
+>;
+
+def FeatureDLInsts : SubtargetFeature<"dl-insts",
+ "HasDLInsts",
+ "true",
+ "Has deep learning instructions"
+>;
+
+def FeatureD16PreservesUnusedBits : SubtargetFeature<
+ "d16-preserves-unused-bits",
+ "D16PreservesUnusedBits",
+ "true",
+ "If present, then instructions defined by HasD16LoadStore predicate preserve "
+ "unused bits. Otherwise instructions defined by HasD16LoadStore predicate "
+ "zero unused bits."
+>;
+
//===------------------------------------------------------------===//
// Subtarget Features (options and debugging)
//===------------------------------------------------------------===//
@@ -329,12 +300,6 @@ def FeatureFP16Denormals : SubtargetFeature<"fp16-denormals",
[FeatureFP64FP16Denormals]
>;
-def FeatureDX10Clamp : SubtargetFeature<"dx10-clamp",
- "DX10Clamp",
- "true",
- "clamp modifier clamps NaNs to 0.0"
->;
-
def FeatureFPExceptions : SubtargetFeature<"fp-exceptions",
"FPExceptions",
"true",
@@ -377,12 +342,6 @@ def FeatureDumpCodeLower : SubtargetFeature <"dumpcode",
"Dump MachineInstrs in the CodeEmitter"
>;
-def FeaturePromoteAlloca : SubtargetFeature <"promote-alloca",
- "EnablePromoteAlloca",
- "true",
- "Enable promote alloca pass"
->;
-
// XXX - This should probably be removed once enabled by default
def FeatureEnableLoadStoreOpt : SubtargetFeature <"load-store-opt",
"EnableLoadStoreOpt",
@@ -408,6 +367,12 @@ def FeatureEnableSIScheduler : SubtargetFeature<"si-scheduler",
"Enable SI Machine Scheduler"
>;
+def FeatureEnableDS128 : SubtargetFeature<"enable-ds128",
+ "EnableDS128",
+ "true",
+ "Use ds_{read|write}_b128"
+>;
+
// Unless +-flat-for-global is specified, turn on FlatForGlobal for
// all OS-es on VI and newer hardware to avoid assertion failures due
// to missing ADDR64 variants of MUBUF instructions.
@@ -440,46 +405,30 @@ def FeatureDisable : SubtargetFeature<"",
"Dummy feature to disable assembler instructions"
>;
-class SubtargetFeatureGeneration <string Value,
- list<SubtargetFeature> Implies> :
- SubtargetFeature <Value, "Gen", "AMDGPUSubtarget::"#Value,
- Value#" GPU generation", Implies>;
-
-def FeatureLocalMemorySize0 : SubtargetFeatureLocalMemorySize<0>;
-def FeatureLocalMemorySize32768 : SubtargetFeatureLocalMemorySize<32768>;
-def FeatureLocalMemorySize65536 : SubtargetFeatureLocalMemorySize<65536>;
-
-def FeatureR600 : SubtargetFeatureGeneration<"R600",
- [FeatureR600ALUInst, FeatureFetchLimit8, FeatureLocalMemorySize0]
->;
-
-def FeatureR700 : SubtargetFeatureGeneration<"R700",
- [FeatureFetchLimit16, FeatureLocalMemorySize0]
->;
-
-def FeatureEvergreen : SubtargetFeatureGeneration<"EVERGREEN",
- [FeatureFetchLimit16, FeatureLocalMemorySize32768]
+def FeatureGCN : SubtargetFeature<"gcn",
+ "IsGCN",
+ "true",
+ "GCN or newer GPU"
>;
-def FeatureNorthernIslands : SubtargetFeatureGeneration<"NORTHERN_ISLANDS",
- [FeatureFetchLimit16, FeatureWavefrontSize64,
- FeatureLocalMemorySize32768]
->;
+class GCNSubtargetFeatureGeneration <string Value,
+ list<SubtargetFeature> Implies> :
+ SubtargetFeatureGeneration <Value, "GCNSubtarget", Implies>;
-def FeatureSouthernIslands : SubtargetFeatureGeneration<"SOUTHERN_ISLANDS",
- [FeatureFP64, FeatureLocalMemorySize32768,
+def FeatureSouthernIslands : GCNSubtargetFeatureGeneration<"SOUTHERN_ISLANDS",
+ [FeatureFP64, FeatureLocalMemorySize32768, FeatureMIMG_R128,
FeatureWavefrontSize64, FeatureGCN,
FeatureLDSBankCount32, FeatureMovrel]
>;
-def FeatureSeaIslands : SubtargetFeatureGeneration<"SEA_ISLANDS",
- [FeatureFP64, FeatureLocalMemorySize65536,
+def FeatureSeaIslands : GCNSubtargetFeatureGeneration<"SEA_ISLANDS",
+ [FeatureFP64, FeatureLocalMemorySize65536, FeatureMIMG_R128,
FeatureWavefrontSize64, FeatureGCN, FeatureFlatAddressSpace,
FeatureCIInsts, FeatureMovrel]
>;
-def FeatureVolcanicIslands : SubtargetFeatureGeneration<"VOLCANIC_ISLANDS",
- [FeatureFP64, FeatureLocalMemorySize65536,
+def FeatureVolcanicIslands : GCNSubtargetFeatureGeneration<"VOLCANIC_ISLANDS",
+ [FeatureFP64, FeatureLocalMemorySize65536, FeatureMIMG_R128,
FeatureWavefrontSize64, FeatureFlatAddressSpace, FeatureGCN,
FeatureGCN3Encoding, FeatureCIInsts, Feature16BitInsts,
FeatureSMemRealTime, FeatureVGPRIndexMode, FeatureMovrel,
@@ -489,7 +438,7 @@ def FeatureVolcanicIslands : SubtargetFeatureGeneration<"VOLCANIC_ISLANDS",
]
>;
-def FeatureGFX9 : SubtargetFeatureGeneration<"GFX9",
+def FeatureGFX9 : GCNSubtargetFeatureGeneration<"GFX9",
[FeatureFP64, FeatureLocalMemorySize65536,
FeatureWavefrontSize64, FeatureFlatAddressSpace, FeatureGCN,
FeatureGCN3Encoding, FeatureCIInsts, Feature16BitInsts,
@@ -498,7 +447,7 @@ def FeatureGFX9 : SubtargetFeatureGeneration<"GFX9",
FeatureFastFMAF32, FeatureDPP, FeatureIntClamp,
FeatureSDWA, FeatureSDWAOmod, FeatureSDWAScalar, FeatureSDWASdst,
FeatureFlatInstOffsets, FeatureFlatGlobalInsts, FeatureFlatScratchInsts,
- FeatureAddNoCarryInsts
+ FeatureAddNoCarryInsts, FeatureScalarAtomics
]
>;
@@ -534,7 +483,8 @@ def FeatureISAVersion7_0_1 : SubtargetFeatureISAVersion <7,0,1,
def FeatureISAVersion7_0_2 : SubtargetFeatureISAVersion <7,0,2,
[FeatureSeaIslands,
- FeatureLDSBankCount16]>;
+ FeatureLDSBankCount16,
+ FeatureFastFMAF32]>;
def FeatureISAVersion7_0_3 : SubtargetFeatureISAVersion <7,0,3,
[FeatureSeaIslands,
@@ -544,26 +494,24 @@ def FeatureISAVersion7_0_4 : SubtargetFeatureISAVersion <7,0,4,
[FeatureSeaIslands,
FeatureLDSBankCount32]>;
-def FeatureISAVersion8_0_0 : SubtargetFeatureISAVersion <8,0,0,
- [FeatureVolcanicIslands,
- FeatureLDSBankCount32,
- FeatureSGPRInitBug]>;
-
def FeatureISAVersion8_0_1 : SubtargetFeatureISAVersion <8,0,1,
[FeatureVolcanicIslands,
FeatureFastFMAF32,
HalfRate64Ops,
FeatureLDSBankCount32,
- FeatureXNACK]>;
+ FeatureXNACK,
+ FeatureUnpackedD16VMem]>;
def FeatureISAVersion8_0_2 : SubtargetFeatureISAVersion <8,0,2,
[FeatureVolcanicIslands,
FeatureLDSBankCount32,
- FeatureSGPRInitBug]>;
+ FeatureSGPRInitBug,
+ FeatureUnpackedD16VMem]>;
def FeatureISAVersion8_0_3 : SubtargetFeatureISAVersion <8,0,3,
[FeatureVolcanicIslands,
- FeatureLDSBankCount32]>;
+ FeatureLDSBankCount32,
+ FeatureUnpackedD16VMem]>;
def FeatureISAVersion8_1_0 : SubtargetFeatureISAVersion <8,1,0,
[FeatureVolcanicIslands,
@@ -573,14 +521,28 @@ def FeatureISAVersion8_1_0 : SubtargetFeatureISAVersion <8,1,0,
def FeatureISAVersion9_0_0 : SubtargetFeatureISAVersion <9,0,0,
[FeatureGFX9,
FeatureMadMixInsts,
- FeatureLDSBankCount32
- ]>;
+ FeatureLDSBankCount32,
+ FeatureD16PreservesUnusedBits]>;
def FeatureISAVersion9_0_2 : SubtargetFeatureISAVersion <9,0,2,
[FeatureGFX9,
FeatureMadMixInsts,
- FeatureLDSBankCount32
- ]>;
+ FeatureLDSBankCount32,
+ FeatureXNACK,
+ FeatureD16PreservesUnusedBits]>;
+
+def FeatureISAVersion9_0_4 : SubtargetFeatureISAVersion <9,0,4,
+ [FeatureGFX9,
+ FeatureLDSBankCount32,
+ FeatureFmaMixInsts,
+ FeatureD16PreservesUnusedBits]>;
+
+def FeatureISAVersion9_0_6 : SubtargetFeatureISAVersion <9,0,6,
+ [FeatureGFX9,
+ HalfRate64Ops,
+ FeatureFmaMixInsts,
+ FeatureLDSBankCount32,
+ FeatureDLInsts]>;
//===----------------------------------------------------------------------===//
// Debugger related subtarget features.
@@ -593,13 +555,6 @@ def FeatureDebuggerInsertNops : SubtargetFeature<
"Insert one nop instruction for each high level source statement"
>;
-def FeatureDebuggerReserveRegs : SubtargetFeature<
- "amdgpu-debugger-reserve-regs",
- "DebuggerReserveRegs",
- "true",
- "Reserve registers for debugger usage"
->;
-
def FeatureDebuggerEmitPrologue : SubtargetFeature<
"amdgpu-debugger-emit-prologue",
"DebuggerEmitPrologue",
@@ -675,6 +630,7 @@ def AMDGPU : Target {
SDWA9AsmParserVariant,
DPPAsmParserVariant];
let AssemblyWriters = [AMDGPUAsmWriter];
+ let AllowRegisterRenaming = 1;
}
// Dummy Instruction itineraries for pseudo instructions
@@ -685,8 +641,6 @@ def NullALU : InstrItinClass;
// Predicate helper class
//===----------------------------------------------------------------------===//
-def TruePredicate : Predicate<"true">;
-
def isSICI : Predicate<
"Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS ||"
"Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS"
@@ -715,6 +669,13 @@ def HasFlatScratchInsts : Predicate<"Subtarget->hasFlatScratchInsts()">,
def HasD16LoadStore : Predicate<"Subtarget->hasD16LoadStore()">,
AssemblerPredicate<"FeatureGFX9Insts">;
+def HasUnpackedD16VMem : Predicate<"Subtarget->hasUnpackedD16VMem()">,
+ AssemblerPredicate<"FeatureUnpackedD16VMem">;
+def HasPackedD16VMem : Predicate<"!Subtarget->hasUnpackedD16VMem()">,
+ AssemblerPredicate<"!FeatureUnpackedD16VMem">;
+
+def D16PreservesUnusedBits : Predicate<"Subtarget->d16PreservesUnusedBits()">,
+ AssemblerPredicate<"FeatureD16PreservesUnusedBits">;
def LDSRequiresM0Init : Predicate<"Subtarget->ldsRequiresM0Init()">;
def NotLDSRequiresM0Init : Predicate<"!Subtarget->ldsRequiresM0Init()">;
@@ -733,6 +694,9 @@ def Has16BitInsts : Predicate<"Subtarget->has16BitInsts()">,
def HasVOP3PInsts : Predicate<"Subtarget->hasVOP3PInsts()">,
AssemblerPredicate<"FeatureVOP3P">;
+def NotHasVOP3PInsts : Predicate<"!Subtarget->hasVOP3PInsts()">,
+ AssemblerPredicate<"!FeatureVOP3P">;
+
def HasSDWA : Predicate<"Subtarget->hasSDWA()">,
AssemblerPredicate<"FeatureSDWA,FeatureVolcanicIslands">;
@@ -748,38 +712,35 @@ def HasIntClamp : Predicate<"Subtarget->hasIntClamp()">,
def HasMadMixInsts : Predicate<"Subtarget->hasMadMixInsts()">,
AssemblerPredicate<"FeatureMadMixInsts">;
-def EnableLateCFGStructurize : Predicate<
- "EnableLateStructurizeCFG">;
+def HasScalarAtomics : Predicate<"Subtarget->hasScalarAtomics()">,
+ AssemblerPredicate<"FeatureScalarAtomics">;
-// Exists to help track down where SubtargetPredicate isn't set rather
-// than letting tablegen crash with an unhelpful error.
-def InvalidPred : Predicate<"predicate not set on instruction or pattern">;
-
-class PredicateControl {
- Predicate SubtargetPredicate = InvalidPred;
- Predicate SIAssemblerPredicate = isSICI;
- Predicate VIAssemblerPredicate = isVI;
- list<Predicate> AssemblerPredicates = [];
- Predicate AssemblerPredicate = TruePredicate;
- list<Predicate> OtherPredicates = [];
- list<Predicate> Predicates = !listconcat([SubtargetPredicate,
- AssemblerPredicate],
- AssemblerPredicates,
- OtherPredicates);
-}
+def has16BankLDS : Predicate<"Subtarget->getLDSBankCount() == 16">;
+def has32BankLDS : Predicate<"Subtarget->getLDSBankCount() == 32">;
+def HasVGPRIndexMode : Predicate<"Subtarget->hasVGPRIndexMode()">,
+ AssemblerPredicate<"FeatureVGPRIndexMode">;
+def HasMovrel : Predicate<"Subtarget->hasMovrel()">,
+ AssemblerPredicate<"FeatureMovrel">;
+
+def HasFmaMixInsts : Predicate<"Subtarget->hasFmaMixInsts()">,
+ AssemblerPredicate<"FeatureFmaMixInsts">;
-class AMDGPUPat<dag pattern, dag result> : Pat<pattern, result>,
- PredicateControl;
+def HasDLInsts : Predicate<"Subtarget->hasDLInsts()">,
+ AssemblerPredicate<"FeatureDLInsts">;
+def EnableLateCFGStructurize : Predicate<
+ "EnableLateStructurizeCFG">;
+
// Include AMDGPU TD files
-include "R600Schedule.td"
-include "R600Processors.td"
include "SISchedule.td"
include "GCNProcessors.td"
include "AMDGPUInstrInfo.td"
include "AMDGPUIntrinsics.td"
+include "SIIntrinsics.td"
include "AMDGPURegisterInfo.td"
include "AMDGPURegisterBanks.td"
include "AMDGPUInstructions.td"
+include "SIInstrInfo.td"
include "AMDGPUCallingConv.td"
+include "AMDGPUSearchableTables.td"
diff --git a/lib/Target/AMDGPU/AMDGPUAliasAnalysis.cpp b/lib/Target/AMDGPU/AMDGPUAliasAnalysis.cpp
index 392b011e387c..ef4b69d09d9f 100644
--- a/lib/Target/AMDGPU/AMDGPUAliasAnalysis.cpp
+++ b/lib/Target/AMDGPU/AMDGPUAliasAnalysis.cpp
@@ -61,7 +61,7 @@ AMDGPUAAResult::ASAliasRulesTy::ASAliasRulesTy(AMDGPUAS AS_, Triple::ArchType Ar
/* Region */ {NoAlias , NoAlias , NoAlias , NoAlias , MayAlias, MayAlias}
};
static const AliasResult ASAliasRulesGenIsZero[6][6] = {
- /* Flat Global Constant Group Region Private */
+ /* Flat Global Region Group Constant Private */
/* Flat */ {MayAlias, MayAlias, MayAlias, MayAlias, MayAlias, MayAlias},
/* Global */ {MayAlias, MayAlias, NoAlias , NoAlias , NoAlias , NoAlias},
/* Constant */ {MayAlias, NoAlias , MayAlias, NoAlias , NoAlias, NoAlias},
@@ -72,9 +72,9 @@ AMDGPUAAResult::ASAliasRulesTy::ASAliasRulesTy(AMDGPUAS AS_, Triple::ArchType Ar
assert(AS.MAX_COMMON_ADDRESS <= 5);
if (AS.FLAT_ADDRESS == 0) {
assert(AS.GLOBAL_ADDRESS == 1 &&
- AS.REGION_ADDRESS == 4 &&
+ AS.REGION_ADDRESS == 2 &&
AS.LOCAL_ADDRESS == 3 &&
- AS.CONSTANT_ADDRESS == 2 &&
+ AS.CONSTANT_ADDRESS == 4 &&
AS.PRIVATE_ADDRESS == 5);
ASAliasRules = &ASAliasRulesGenIsZero;
} else {
@@ -115,7 +115,8 @@ bool AMDGPUAAResult::pointsToConstantMemory(const MemoryLocation &Loc,
bool OrLocal) {
const Value *Base = GetUnderlyingObject(Loc.Ptr, DL);
- if (Base->getType()->getPointerAddressSpace() == AS.CONSTANT_ADDRESS) {
+ if (Base->getType()->getPointerAddressSpace() == AS.CONSTANT_ADDRESS ||
+ Base->getType()->getPointerAddressSpace() == AS.CONSTANT_ADDRESS_32BIT) {
return true;
}
diff --git a/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp b/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp
index c27425443abc..d4bbb2c1eb8d 100644
--- a/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp
+++ b/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp
@@ -14,6 +14,9 @@
//===----------------------------------------------------------------------===//
#include "AMDGPU.h"
+#include "AMDGPUTargetMachine.h"
+#include "Utils/AMDGPUBaseInfo.h"
+#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/IR/Module.h"
#include "llvm/Transforms/Utils/Cloning.h"
@@ -30,13 +33,18 @@ static cl::opt<bool> StressCalls(
class AMDGPUAlwaysInline : public ModulePass {
bool GlobalOpt;
+ void recursivelyVisitUsers(GlobalValue &GV,
+ SmallPtrSetImpl<Function *> &FuncsToAlwaysInline);
public:
static char ID;
AMDGPUAlwaysInline(bool GlobalOpt = false) :
ModulePass(ID), GlobalOpt(GlobalOpt) { }
bool runOnModule(Module &M) override;
- StringRef getPassName() const override { return "AMDGPU Always Inline Pass"; }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesAll();
+ }
};
} // End anonymous namespace
@@ -46,15 +54,53 @@ INITIALIZE_PASS(AMDGPUAlwaysInline, "amdgpu-always-inline",
char AMDGPUAlwaysInline::ID = 0;
+void AMDGPUAlwaysInline::recursivelyVisitUsers(
+ GlobalValue &GV,
+ SmallPtrSetImpl<Function *> &FuncsToAlwaysInline) {
+ SmallVector<User *, 16> Stack;
+
+ SmallPtrSet<const Value *, 8> Visited;
+
+ for (User *U : GV.users())
+ Stack.push_back(U);
+
+ while (!Stack.empty()) {
+ User *U = Stack.pop_back_val();
+ if (!Visited.insert(U).second)
+ continue;
+
+ if (Instruction *I = dyn_cast<Instruction>(U)) {
+ Function *F = I->getParent()->getParent();
+ if (!AMDGPU::isEntryFunctionCC(F->getCallingConv())) {
+ FuncsToAlwaysInline.insert(F);
+ Stack.push_back(F);
+ }
+
+ // No need to look at further users, but we do need to inline any callers.
+ continue;
+ }
+
+ for (User *UU : U->users())
+ Stack.push_back(UU);
+ }
+}
+
bool AMDGPUAlwaysInline::runOnModule(Module &M) {
+ AMDGPUAS AMDGPUAS = AMDGPU::getAMDGPUAS(M);
+
std::vector<GlobalAlias*> AliasesToRemove;
- std::vector<Function *> FuncsToClone;
+
+ SmallPtrSet<Function *, 8> FuncsToAlwaysInline;
+ SmallPtrSet<Function *, 8> FuncsToNoInline;
for (GlobalAlias &A : M.aliases()) {
if (Function* F = dyn_cast<Function>(A.getAliasee())) {
A.replaceAllUsesWith(F);
AliasesToRemove.push_back(&A);
}
+
+ // FIXME: If the aliasee isn't a function, it's some kind of constant expr
+ // cast that won't be inlined through.
}
if (GlobalOpt) {
@@ -63,31 +109,51 @@ bool AMDGPUAlwaysInline::runOnModule(Module &M) {
}
}
- auto NewAttr = StressCalls ? Attribute::NoInline : Attribute::AlwaysInline;
- auto IncompatAttr
- = StressCalls ? Attribute::AlwaysInline : Attribute::NoInline;
-
- for (Function &F : M) {
- if (!F.hasLocalLinkage() && !F.isDeclaration() && !F.use_empty() &&
- !F.hasFnAttribute(IncompatAttr))
- FuncsToClone.push_back(&F);
- }
-
- for (Function *F : FuncsToClone) {
- ValueToValueMapTy VMap;
- Function *NewFunc = CloneFunction(F, VMap);
- NewFunc->setLinkage(GlobalValue::InternalLinkage);
- F->replaceAllUsesWith(NewFunc);
+ // Always force inlining of any function that uses an LDS global address. This
+ // is something of a workaround because we don't have a way of supporting LDS
+ // objects defined in functions. LDS is always allocated by a kernel, and it
+ // is difficult to manage LDS usage if a function may be used by multiple
+ // kernels.
+ //
+ // OpenCL doesn't allow declaring LDS in non-kernels, so in practice this
+ // should only appear when IPO passes manages to move LDs defined in a kernel
+ // into a single user function.
+
+ for (GlobalVariable &GV : M.globals()) {
+ // TODO: Region address
+ unsigned AS = GV.getType()->getAddressSpace();
+ if (AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS.REGION_ADDRESS)
+ continue;
+
+ recursivelyVisitUsers(GV, FuncsToAlwaysInline);
}
- for (Function &F : M) {
- if (F.hasLocalLinkage() && !F.hasFnAttribute(IncompatAttr)) {
- F.addFnAttr(NewAttr);
+ if (!AMDGPUTargetMachine::EnableFunctionCalls || StressCalls) {
+ auto IncompatAttr
+ = StressCalls ? Attribute::AlwaysInline : Attribute::NoInline;
+
+ for (Function &F : M) {
+ if (!F.isDeclaration() && !F.use_empty() &&
+ !F.hasFnAttribute(IncompatAttr)) {
+ if (StressCalls) {
+ if (!FuncsToAlwaysInline.count(&F))
+ FuncsToNoInline.insert(&F);
+ } else
+ FuncsToAlwaysInline.insert(&F);
+ }
}
}
- return false;
+
+ for (Function *F : FuncsToAlwaysInline)
+ F->addFnAttr(Attribute::AlwaysInline);
+
+ for (Function *F : FuncsToNoInline)
+ F->addFnAttr(Attribute::NoInline);
+
+ return !FuncsToAlwaysInline.empty() || !FuncsToNoInline.empty();
}
ModulePass *llvm::createAMDGPUAlwaysInlinePass(bool GlobalOpt) {
return new AMDGPUAlwaysInline(GlobalOpt);
}
+
diff --git a/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp b/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp
index ce17202f3414..1a70833a4472 100644
--- a/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp
+++ b/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp
@@ -219,7 +219,7 @@ static void copyFeaturesToFunction(Function &Parent, const Function &Callee,
}
bool AMDGPUAnnotateKernelFeatures::addFeatureAttributes(Function &F) {
- const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>(F);
+ const GCNSubtarget &ST = TM->getSubtarget<GCNSubtarget>(F);
bool HasFlat = ST.hasFlatAddressSpace();
bool HasApertureRegs = ST.hasApertureRegs();
SmallPtrSet<const Constant *, 8> ConstantExprVisited;
diff --git a/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp b/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp
index dcca3a2fab96..7465cf22b5a4 100644
--- a/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp
+++ b/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp
@@ -55,9 +55,6 @@ void AMDGPUArgumentUsageInfo::print(raw_ostream &OS, const Module *M) const {
<< " DispatchID: " << FI.second.DispatchID
<< " FlatScratchInit: " << FI.second.FlatScratchInit
<< " PrivateSegmentSize: " << FI.second.PrivateSegmentSize
- << " GridWorkgroupCountX: " << FI.second.GridWorkGroupCountX
- << " GridWorkgroupCountY: " << FI.second.GridWorkGroupCountY
- << " GridWorkgroupCountZ: " << FI.second.GridWorkGroupCountZ
<< " WorkGroupIDX: " << FI.second.WorkGroupIDX
<< " WorkGroupIDY: " << FI.second.WorkGroupIDY
<< " WorkGroupIDZ: " << FI.second.WorkGroupIDZ
diff --git a/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h b/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h
index bf9635549a8c..f0e6d1b83f15 100644
--- a/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h
+++ b/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h
@@ -18,7 +18,7 @@ namespace llvm {
class Function;
class raw_ostream;
-class SISubtarget;
+class GCNSubtarget;
class TargetMachine;
class TargetRegisterClass;
class TargetRegisterInfo;
@@ -111,9 +111,6 @@ struct AMDGPUFunctionArgInfo {
ArgDescriptor DispatchID;
ArgDescriptor FlatScratchInit;
ArgDescriptor PrivateSegmentSize;
- ArgDescriptor GridWorkGroupCountX;
- ArgDescriptor GridWorkGroupCountY;
- ArgDescriptor GridWorkGroupCountZ;
// System SGPRs in kernels.
ArgDescriptor WorkGroupIDX;
diff --git a/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
index fda6252f46e3..e62e5d52ad74 100644
--- a/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -1,4 +1,4 @@
-//===-- AMDGPUAsmPrinter.cpp - AMDGPU Assebly printer --------------------===//
+//===-- AMDGPUAsmPrinter.cpp - AMDGPU assembly printer -------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -21,7 +21,9 @@
#include "AMDGPUSubtarget.h"
#include "AMDGPUTargetMachine.h"
#include "InstPrinter/AMDGPUInstPrinter.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "MCTargetDesc/AMDGPUTargetStreamer.h"
+#include "R600AsmPrinter.h"
#include "R600Defines.h"
#include "R600MachineFunctionInfo.h"
#include "R600RegisterInfo.h"
@@ -32,7 +34,6 @@
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/BinaryFormat/ELF.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
-#include "llvm/CodeGen/TargetLoweringObjectFile.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCSectionELF.h"
@@ -40,6 +41,7 @@
#include "llvm/Support/AMDGPUMetadata.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
using namespace llvm;
using namespace llvm::AMDGPU;
@@ -65,7 +67,7 @@ using namespace llvm::AMDGPU;
// instructions to run at the double precision rate for the device so it's
// probably best to just report no single precision denormals.
static uint32_t getFPMode(const MachineFunction &F) {
- const SISubtarget& ST = F.getSubtarget<SISubtarget>();
+ const GCNSubtarget& ST = F.getSubtarget<GCNSubtarget>();
// TODO: Is there any real use for the flush in only / flush out only modes?
uint32_t FP32Denormals =
@@ -88,7 +90,7 @@ createAMDGPUAsmPrinterPass(TargetMachine &tm,
extern "C" void LLVMInitializeAMDGPUAsmPrinter() {
TargetRegistry::RegisterAsmPrinter(getTheAMDGPUTarget(),
- createAMDGPUAsmPrinterPass);
+ llvm::createR600AsmPrinterPass);
TargetRegistry::RegisterAsmPrinter(getTheGCNTarget(),
createAMDGPUAsmPrinterPass);
}
@@ -114,7 +116,8 @@ AMDGPUTargetStreamer* AMDGPUAsmPrinter::getTargetStreamer() const {
}
void AMDGPUAsmPrinter::EmitStartOfAsmFile(Module &M) {
- if (TM.getTargetTriple().getArch() != Triple::amdgcn)
+ if (IsaInfo::hasCodeObjectV3(getSTI()) &&
+ TM.getTargetTriple().getOS() == Triple::AMDHSA)
return;
if (TM.getTargetTriple().getOS() != Triple::AMDHSA &&
@@ -127,10 +130,6 @@ void AMDGPUAsmPrinter::EmitStartOfAsmFile(Module &M) {
if (TM.getTargetTriple().getOS() == Triple::AMDPAL)
readPALMetadata(M);
- // Deprecated notes are not emitted for code object v3.
- if (IsaInfo::hasCodeObjectV3(getSTI()->getFeatureBits()))
- return;
-
// HSA emits NT_AMDGPU_HSA_CODE_OBJECT_VERSION for code objects v2.
if (TM.getTargetTriple().getOS() == Triple::AMDHSA)
getTargetStreamer()->EmitDirectiveHSACodeObjectVersion(2, 1);
@@ -142,7 +141,9 @@ void AMDGPUAsmPrinter::EmitStartOfAsmFile(Module &M) {
}
void AMDGPUAsmPrinter::EmitEndOfAsmFile(Module &M) {
- if (TM.getTargetTriple().getArch() != Triple::amdgcn)
+ // TODO: Add metadata to code object v3.
+ if (IsaInfo::hasCodeObjectV3(getSTI()) &&
+ TM.getTargetTriple().getOS() == Triple::AMDHSA)
return;
// Following code requires TargetStreamer to be present.
@@ -189,37 +190,82 @@ bool AMDGPUAsmPrinter::isBlockOnlyReachableByFallthrough(
}
void AMDGPUAsmPrinter::EmitFunctionBodyStart() {
- const AMDGPUMachineFunction *MFI = MF->getInfo<AMDGPUMachineFunction>();
- if (!MFI->isEntryFunction())
+ const SIMachineFunctionInfo &MFI = *MF->getInfo<SIMachineFunctionInfo>();
+ if (!MFI.isEntryFunction())
+ return;
+ if (IsaInfo::hasCodeObjectV3(getSTI()) &&
+ TM.getTargetTriple().getOS() == Triple::AMDHSA)
return;
- const AMDGPUSubtarget &STM = MF->getSubtarget<AMDGPUSubtarget>();
- amd_kernel_code_t KernelCode;
- if (STM.isAmdCodeObjectV2(*MF)) {
+ const GCNSubtarget &STM = MF->getSubtarget<GCNSubtarget>();
+ const Function &F = MF->getFunction();
+ if (STM.isAmdCodeObjectV2(F) &&
+ (F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
+ F.getCallingConv() == CallingConv::SPIR_KERNEL)) {
+ amd_kernel_code_t KernelCode;
getAmdKernelCode(KernelCode, CurrentProgramInfo, *MF);
-
- OutStreamer->SwitchSection(getObjFileLowering().getTextSection());
getTargetStreamer()->EmitAMDKernelCodeT(KernelCode);
}
if (TM.getTargetTriple().getOS() != Triple::AMDHSA)
return;
- HSAMetadataStream.emitKernel(MF->getFunction(),
- getHSACodeProps(*MF, CurrentProgramInfo),
- getHSADebugProps(*MF, CurrentProgramInfo));
+ HSAMetadataStream.emitKernel(*MF, CurrentProgramInfo);
+}
+
+void AMDGPUAsmPrinter::EmitFunctionBodyEnd() {
+ const SIMachineFunctionInfo &MFI = *MF->getInfo<SIMachineFunctionInfo>();
+ if (!MFI.isEntryFunction())
+ return;
+ if (!IsaInfo::hasCodeObjectV3(getSTI()) ||
+ TM.getTargetTriple().getOS() != Triple::AMDHSA)
+ return;
+
+ auto &Streamer = getTargetStreamer()->getStreamer();
+ auto &Context = Streamer.getContext();
+ auto &ObjectFileInfo = *Context.getObjectFileInfo();
+ auto &ReadOnlySection = *ObjectFileInfo.getReadOnlySection();
+
+ Streamer.PushSection();
+ Streamer.SwitchSection(&ReadOnlySection);
+
+ // CP microcode requires the kernel descriptor to be allocated on 64 byte
+ // alignment.
+ Streamer.EmitValueToAlignment(64, 0, 1, 0);
+ if (ReadOnlySection.getAlignment() < 64)
+ ReadOnlySection.setAlignment(64);
+
+ SmallString<128> KernelName;
+ getNameWithPrefix(KernelName, &MF->getFunction());
+ getTargetStreamer()->EmitAmdhsaKernelDescriptor(
+ *getSTI(), KernelName, getAmdhsaKernelDescriptor(*MF, CurrentProgramInfo),
+ CurrentProgramInfo.NumVGPRsForWavesPerEU,
+ CurrentProgramInfo.NumSGPRsForWavesPerEU -
+ IsaInfo::getNumExtraSGPRs(getSTI()->getFeatureBits(),
+ CurrentProgramInfo.VCCUsed,
+ CurrentProgramInfo.FlatUsed),
+ CurrentProgramInfo.VCCUsed, CurrentProgramInfo.FlatUsed,
+ hasXNACK(*getSTI()));
+
+ Streamer.PopSection();
}
void AMDGPUAsmPrinter::EmitFunctionEntryLabel() {
+ if (IsaInfo::hasCodeObjectV3(getSTI()) &&
+ TM.getTargetTriple().getOS() == Triple::AMDHSA) {
+ AsmPrinter::EmitFunctionEntryLabel();
+ return;
+ }
+
const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
- const AMDGPUSubtarget &STM = MF->getSubtarget<AMDGPUSubtarget>();
- if (MFI->isEntryFunction() && STM.isAmdCodeObjectV2(*MF)) {
+ const GCNSubtarget &STM = MF->getSubtarget<GCNSubtarget>();
+ if (MFI->isEntryFunction() && STM.isAmdCodeObjectV2(MF->getFunction())) {
SmallString<128> SymbolName;
getNameWithPrefix(SymbolName, &MF->getFunction()),
getTargetStreamer()->EmitAMDGPUSymbolType(
SymbolName, ELF::STT_AMDGPU_HSA_KERNEL);
}
- const AMDGPUSubtarget &STI = MF->getSubtarget<AMDGPUSubtarget>();
+ const GCNSubtarget &STI = MF->getSubtarget<GCNSubtarget>();
if (STI.dumpCode()) {
// Disassemble function name label to text.
DisasmLines.push_back(MF->getName().str() + ":");
@@ -231,7 +277,7 @@ void AMDGPUAsmPrinter::EmitFunctionEntryLabel() {
}
void AMDGPUAsmPrinter::EmitBasicBlockStart(const MachineBasicBlock &MBB) const {
- const AMDGPUSubtarget &STI = MBB.getParent()->getSubtarget<AMDGPUSubtarget>();
+ const GCNSubtarget &STI = MBB.getParent()->getSubtarget<GCNSubtarget>();
if (STI.dumpCode() && !isBlockOnlyReachableByFallthrough(&MBB)) {
// Write a line for the basic block label if it is not only fallthrough.
DisasmLines.push_back(
@@ -283,11 +329,66 @@ void AMDGPUAsmPrinter::emitCommonFunctionComments(
uint32_t NumVGPR,
uint32_t NumSGPR,
uint64_t ScratchSize,
- uint64_t CodeSize) {
+ uint64_t CodeSize,
+ const AMDGPUMachineFunction *MFI) {
OutStreamer->emitRawComment(" codeLenInByte = " + Twine(CodeSize), false);
OutStreamer->emitRawComment(" NumSgprs: " + Twine(NumSGPR), false);
OutStreamer->emitRawComment(" NumVgprs: " + Twine(NumVGPR), false);
OutStreamer->emitRawComment(" ScratchSize: " + Twine(ScratchSize), false);
+ OutStreamer->emitRawComment(" MemoryBound: " + Twine(MFI->isMemoryBound()),
+ false);
+}
+
+uint16_t AMDGPUAsmPrinter::getAmdhsaKernelCodeProperties(
+ const MachineFunction &MF) const {
+ const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
+ uint16_t KernelCodeProperties = 0;
+
+ if (MFI.hasPrivateSegmentBuffer()) {
+ KernelCodeProperties |=
+ amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER;
+ }
+ if (MFI.hasDispatchPtr()) {
+ KernelCodeProperties |=
+ amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR;
+ }
+ if (MFI.hasQueuePtr()) {
+ KernelCodeProperties |=
+ amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR;
+ }
+ if (MFI.hasKernargSegmentPtr()) {
+ KernelCodeProperties |=
+ amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR;
+ }
+ if (MFI.hasDispatchID()) {
+ KernelCodeProperties |=
+ amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID;
+ }
+ if (MFI.hasFlatScratchInit()) {
+ KernelCodeProperties |=
+ amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT;
+ }
+
+ return KernelCodeProperties;
+}
+
+amdhsa::kernel_descriptor_t AMDGPUAsmPrinter::getAmdhsaKernelDescriptor(
+ const MachineFunction &MF,
+ const SIProgramInfo &PI) const {
+ amdhsa::kernel_descriptor_t KernelDescriptor;
+ memset(&KernelDescriptor, 0x0, sizeof(KernelDescriptor));
+
+ assert(isUInt<32>(PI.ScratchSize));
+ assert(isUInt<32>(PI.ComputePGMRSrc1));
+ assert(isUInt<32>(PI.ComputePGMRSrc2));
+
+ KernelDescriptor.group_segment_fixed_size = PI.LDSSize;
+ KernelDescriptor.private_segment_fixed_size = PI.ScratchSize;
+ KernelDescriptor.compute_pgm_rsrc1 = PI.ComputePGMRSrc1;
+ KernelDescriptor.compute_pgm_rsrc2 = PI.ComputePGMRSrc2;
+ KernelDescriptor.kernel_code_properties = getAmdhsaKernelCodeProperties(MF);
+
+ return KernelDescriptor;
}
bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
@@ -301,32 +402,29 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
SetupMachineFunction(MF);
- const AMDGPUSubtarget &STM = MF.getSubtarget<AMDGPUSubtarget>();
+ const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
MCContext &Context = getObjFileLowering().getContext();
- if (!STM.isAmdHsaOS()) {
+ // FIXME: This should be an explicit check for Mesa.
+ if (!STM.isAmdHsaOS() && !STM.isAmdPalOS()) {
MCSectionELF *ConfigSection =
Context.getELFSection(".AMDGPU.config", ELF::SHT_PROGBITS, 0);
OutStreamer->SwitchSection(ConfigSection);
}
- if (STM.getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {
- if (MFI->isEntryFunction()) {
- getSIProgramInfo(CurrentProgramInfo, MF);
- } else {
- auto I = CallGraphResourceInfo.insert(
- std::make_pair(&MF.getFunction(), SIFunctionResourceInfo()));
- SIFunctionResourceInfo &Info = I.first->second;
- assert(I.second && "should only be called once per function");
- Info = analyzeResourceUsage(MF);
- }
-
- if (STM.isAmdPalOS())
- EmitPALMetadata(MF, CurrentProgramInfo);
- if (!STM.isAmdHsaOS()) {
- EmitProgramInfoSI(MF, CurrentProgramInfo);
- }
+ if (MFI->isEntryFunction()) {
+ getSIProgramInfo(CurrentProgramInfo, MF);
} else {
- EmitProgramInfoR600(MF);
+ auto I = CallGraphResourceInfo.insert(
+ std::make_pair(&MF.getFunction(), SIFunctionResourceInfo()));
+ SIFunctionResourceInfo &Info = I.first->second;
+ assert(I.second && "should only be called once per function");
+ Info = analyzeResourceUsage(MF);
+ }
+
+ if (STM.isAmdPalOS())
+ EmitPALMetadata(MF, CurrentProgramInfo);
+ else if (!STM.isAmdHsaOS()) {
+ EmitProgramInfoSI(MF, CurrentProgramInfo);
}
DisasmLines.clear();
@@ -340,84 +438,74 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
Context.getELFSection(".AMDGPU.csdata", ELF::SHT_PROGBITS, 0);
OutStreamer->SwitchSection(CommentSection);
- if (STM.getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {
- if (!MFI->isEntryFunction()) {
- OutStreamer->emitRawComment(" Function info:", false);
- SIFunctionResourceInfo &Info = CallGraphResourceInfo[&MF.getFunction()];
- emitCommonFunctionComments(
- Info.NumVGPR,
- Info.getTotalNumSGPRs(MF.getSubtarget<SISubtarget>()),
- Info.PrivateSegmentSize,
- getFunctionCodeSize(MF));
- return false;
- }
-
- OutStreamer->emitRawComment(" Kernel info:", false);
- emitCommonFunctionComments(CurrentProgramInfo.NumVGPR,
- CurrentProgramInfo.NumSGPR,
- CurrentProgramInfo.ScratchSize,
- getFunctionCodeSize(MF));
-
- OutStreamer->emitRawComment(
- " FloatMode: " + Twine(CurrentProgramInfo.FloatMode), false);
- OutStreamer->emitRawComment(
- " IeeeMode: " + Twine(CurrentProgramInfo.IEEEMode), false);
- OutStreamer->emitRawComment(
- " LDSByteSize: " + Twine(CurrentProgramInfo.LDSSize) +
- " bytes/workgroup (compile time only)", false);
-
- OutStreamer->emitRawComment(
- " SGPRBlocks: " + Twine(CurrentProgramInfo.SGPRBlocks), false);
- OutStreamer->emitRawComment(
- " VGPRBlocks: " + Twine(CurrentProgramInfo.VGPRBlocks), false);
-
- OutStreamer->emitRawComment(
- " NumSGPRsForWavesPerEU: " +
- Twine(CurrentProgramInfo.NumSGPRsForWavesPerEU), false);
- OutStreamer->emitRawComment(
- " NumVGPRsForWavesPerEU: " +
- Twine(CurrentProgramInfo.NumVGPRsForWavesPerEU), false);
-
- OutStreamer->emitRawComment(
- " ReservedVGPRFirst: " + Twine(CurrentProgramInfo.ReservedVGPRFirst),
- false);
- OutStreamer->emitRawComment(
- " ReservedVGPRCount: " + Twine(CurrentProgramInfo.ReservedVGPRCount),
- false);
-
- if (MF.getSubtarget<SISubtarget>().debuggerEmitPrologue()) {
- OutStreamer->emitRawComment(
- " DebuggerWavefrontPrivateSegmentOffsetSGPR: s" +
- Twine(CurrentProgramInfo.DebuggerWavefrontPrivateSegmentOffsetSGPR), false);
- OutStreamer->emitRawComment(
- " DebuggerPrivateSegmentBufferSGPR: s" +
- Twine(CurrentProgramInfo.DebuggerPrivateSegmentBufferSGPR), false);
- }
+ if (!MFI->isEntryFunction()) {
+ OutStreamer->emitRawComment(" Function info:", false);
+ SIFunctionResourceInfo &Info = CallGraphResourceInfo[&MF.getFunction()];
+ emitCommonFunctionComments(
+ Info.NumVGPR,
+ Info.getTotalNumSGPRs(MF.getSubtarget<GCNSubtarget>()),
+ Info.PrivateSegmentSize,
+ getFunctionCodeSize(MF), MFI);
+ return false;
+ }
+ OutStreamer->emitRawComment(" Kernel info:", false);
+ emitCommonFunctionComments(CurrentProgramInfo.NumVGPR,
+ CurrentProgramInfo.NumSGPR,
+ CurrentProgramInfo.ScratchSize,
+ getFunctionCodeSize(MF), MFI);
+
+ OutStreamer->emitRawComment(
+ " FloatMode: " + Twine(CurrentProgramInfo.FloatMode), false);
+ OutStreamer->emitRawComment(
+ " IeeeMode: " + Twine(CurrentProgramInfo.IEEEMode), false);
+ OutStreamer->emitRawComment(
+ " LDSByteSize: " + Twine(CurrentProgramInfo.LDSSize) +
+ " bytes/workgroup (compile time only)", false);
+
+ OutStreamer->emitRawComment(
+ " SGPRBlocks: " + Twine(CurrentProgramInfo.SGPRBlocks), false);
+ OutStreamer->emitRawComment(
+ " VGPRBlocks: " + Twine(CurrentProgramInfo.VGPRBlocks), false);
+
+ OutStreamer->emitRawComment(
+ " NumSGPRsForWavesPerEU: " +
+ Twine(CurrentProgramInfo.NumSGPRsForWavesPerEU), false);
+ OutStreamer->emitRawComment(
+ " NumVGPRsForWavesPerEU: " +
+ Twine(CurrentProgramInfo.NumVGPRsForWavesPerEU), false);
+
+ OutStreamer->emitRawComment(
+ " WaveLimiterHint : " + Twine(MFI->needsWaveLimiter()), false);
+
+ if (MF.getSubtarget<GCNSubtarget>().debuggerEmitPrologue()) {
OutStreamer->emitRawComment(
- " COMPUTE_PGM_RSRC2:USER_SGPR: " +
- Twine(G_00B84C_USER_SGPR(CurrentProgramInfo.ComputePGMRSrc2)), false);
- OutStreamer->emitRawComment(
- " COMPUTE_PGM_RSRC2:TRAP_HANDLER: " +
- Twine(G_00B84C_TRAP_HANDLER(CurrentProgramInfo.ComputePGMRSrc2)), false);
+ " DebuggerWavefrontPrivateSegmentOffsetSGPR: s" +
+ Twine(CurrentProgramInfo.DebuggerWavefrontPrivateSegmentOffsetSGPR), false);
OutStreamer->emitRawComment(
- " COMPUTE_PGM_RSRC2:TGID_X_EN: " +
- Twine(G_00B84C_TGID_X_EN(CurrentProgramInfo.ComputePGMRSrc2)), false);
- OutStreamer->emitRawComment(
- " COMPUTE_PGM_RSRC2:TGID_Y_EN: " +
- Twine(G_00B84C_TGID_Y_EN(CurrentProgramInfo.ComputePGMRSrc2)), false);
- OutStreamer->emitRawComment(
- " COMPUTE_PGM_RSRC2:TGID_Z_EN: " +
- Twine(G_00B84C_TGID_Z_EN(CurrentProgramInfo.ComputePGMRSrc2)), false);
- OutStreamer->emitRawComment(
- " COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: " +
- Twine(G_00B84C_TIDIG_COMP_CNT(CurrentProgramInfo.ComputePGMRSrc2)),
- false);
- } else {
- R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
- OutStreamer->emitRawComment(
- Twine("SQ_PGM_RESOURCES:STACK_SIZE = " + Twine(MFI->CFStackSize)));
+ " DebuggerPrivateSegmentBufferSGPR: s" +
+ Twine(CurrentProgramInfo.DebuggerPrivateSegmentBufferSGPR), false);
}
+
+ OutStreamer->emitRawComment(
+ " COMPUTE_PGM_RSRC2:USER_SGPR: " +
+ Twine(G_00B84C_USER_SGPR(CurrentProgramInfo.ComputePGMRSrc2)), false);
+ OutStreamer->emitRawComment(
+ " COMPUTE_PGM_RSRC2:TRAP_HANDLER: " +
+ Twine(G_00B84C_TRAP_HANDLER(CurrentProgramInfo.ComputePGMRSrc2)), false);
+ OutStreamer->emitRawComment(
+ " COMPUTE_PGM_RSRC2:TGID_X_EN: " +
+ Twine(G_00B84C_TGID_X_EN(CurrentProgramInfo.ComputePGMRSrc2)), false);
+ OutStreamer->emitRawComment(
+ " COMPUTE_PGM_RSRC2:TGID_Y_EN: " +
+ Twine(G_00B84C_TGID_Y_EN(CurrentProgramInfo.ComputePGMRSrc2)), false);
+ OutStreamer->emitRawComment(
+ " COMPUTE_PGM_RSRC2:TGID_Z_EN: " +
+ Twine(G_00B84C_TGID_Z_EN(CurrentProgramInfo.ComputePGMRSrc2)), false);
+ OutStreamer->emitRawComment(
+ " COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: " +
+ Twine(G_00B84C_TIDIG_COMP_CNT(CurrentProgramInfo.ComputePGMRSrc2)),
+ false);
}
if (STM.dumpCode()) {
@@ -440,67 +528,8 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
return false;
}
-void AMDGPUAsmPrinter::EmitProgramInfoR600(const MachineFunction &MF) {
- unsigned MaxGPR = 0;
- bool killPixel = false;
- const R600Subtarget &STM = MF.getSubtarget<R600Subtarget>();
- const R600RegisterInfo *RI = STM.getRegisterInfo();
- const R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
-
- for (const MachineBasicBlock &MBB : MF) {
- for (const MachineInstr &MI : MBB) {
- if (MI.getOpcode() == AMDGPU::KILLGT)
- killPixel = true;
- unsigned numOperands = MI.getNumOperands();
- for (unsigned op_idx = 0; op_idx < numOperands; op_idx++) {
- const MachineOperand &MO = MI.getOperand(op_idx);
- if (!MO.isReg())
- continue;
- unsigned HWReg = RI->getHWRegIndex(MO.getReg());
-
- // Register with value > 127 aren't GPR
- if (HWReg > 127)
- continue;
- MaxGPR = std::max(MaxGPR, HWReg);
- }
- }
- }
-
- unsigned RsrcReg;
- if (STM.getGeneration() >= R600Subtarget::EVERGREEN) {
- // Evergreen / Northern Islands
- switch (MF.getFunction().getCallingConv()) {
- default: LLVM_FALLTHROUGH;
- case CallingConv::AMDGPU_CS: RsrcReg = R_0288D4_SQ_PGM_RESOURCES_LS; break;
- case CallingConv::AMDGPU_GS: RsrcReg = R_028878_SQ_PGM_RESOURCES_GS; break;
- case CallingConv::AMDGPU_PS: RsrcReg = R_028844_SQ_PGM_RESOURCES_PS; break;
- case CallingConv::AMDGPU_VS: RsrcReg = R_028860_SQ_PGM_RESOURCES_VS; break;
- }
- } else {
- // R600 / R700
- switch (MF.getFunction().getCallingConv()) {
- default: LLVM_FALLTHROUGH;
- case CallingConv::AMDGPU_GS: LLVM_FALLTHROUGH;
- case CallingConv::AMDGPU_CS: LLVM_FALLTHROUGH;
- case CallingConv::AMDGPU_VS: RsrcReg = R_028868_SQ_PGM_RESOURCES_VS; break;
- case CallingConv::AMDGPU_PS: RsrcReg = R_028850_SQ_PGM_RESOURCES_PS; break;
- }
- }
-
- OutStreamer->EmitIntValue(RsrcReg, 4);
- OutStreamer->EmitIntValue(S_NUM_GPRS(MaxGPR + 1) |
- S_STACK_SIZE(MFI->CFStackSize), 4);
- OutStreamer->EmitIntValue(R_02880C_DB_SHADER_CONTROL, 4);
- OutStreamer->EmitIntValue(S_02880C_KILL_ENABLE(killPixel), 4);
-
- if (AMDGPU::isCompute(MF.getFunction().getCallingConv())) {
- OutStreamer->EmitIntValue(R_0288E8_SQ_LDS_ALLOC, 4);
- OutStreamer->EmitIntValue(alignTo(MFI->getLDSSize(), 4) >> 2, 4);
- }
-}
-
uint64_t AMDGPUAsmPrinter::getFunctionCodeSize(const MachineFunction &MF) const {
- const SISubtarget &STM = MF.getSubtarget<SISubtarget>();
+ const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
const SIInstrInfo *TII = STM.getInstrInfo();
uint64_t CodeSize = 0;
@@ -510,7 +539,7 @@ uint64_t AMDGPUAsmPrinter::getFunctionCodeSize(const MachineFunction &MF) const
// TODO: CodeSize should account for multiple functions.
// TODO: Should we count size of debug info?
- if (MI.isDebugValue())
+ if (MI.isDebugInstr())
continue;
CodeSize += TII->getInstSizeInBytes(MI);
@@ -531,30 +560,10 @@ static bool hasAnyNonFlatUseOfReg(const MachineRegisterInfo &MRI,
return false;
}
-static unsigned getNumExtraSGPRs(const SISubtarget &ST,
- bool VCCUsed,
- bool FlatScrUsed) {
- unsigned ExtraSGPRs = 0;
- if (VCCUsed)
- ExtraSGPRs = 2;
-
- if (ST.getGeneration() < SISubtarget::VOLCANIC_ISLANDS) {
- if (FlatScrUsed)
- ExtraSGPRs = 4;
- } else {
- if (ST.isXNACKEnabled())
- ExtraSGPRs = 4;
-
- if (FlatScrUsed)
- ExtraSGPRs = 6;
- }
-
- return ExtraSGPRs;
-}
-
int32_t AMDGPUAsmPrinter::SIFunctionResourceInfo::getTotalNumSGPRs(
- const SISubtarget &ST) const {
- return NumExplicitSGPR + getNumExtraSGPRs(ST, UsesVCC, UsesFlatScratch);
+ const GCNSubtarget &ST) const {
+ return NumExplicitSGPR + IsaInfo::getNumExtraSGPRs(ST.getFeatureBits(),
+ UsesVCC, UsesFlatScratch);
}
AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage(
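The extra-SGPR accounting that the deleted static helper spelled out is now delegated to IsaInfo::getNumExtraSGPRs, driven by the subtarget feature bits. A minimal reference sketch of that accounting, mirroring the removed code (parameter names are illustrative):

    // Reference sketch only; mirrors the deleted getNumExtraSGPRs helper.
    // VCC costs 2 SGPRs. Flat scratch costs 4 SGPRs before Volcanic Islands and
    // 6 afterwards; on VI+ an enabled XNACK mask alone already costs 4.
    static unsigned numExtraSGPRs(bool VCCUsed, bool FlatScrUsed,
                                  bool IsVIPlus, bool XNACKEnabled) {
      unsigned ExtraSGPRs = VCCUsed ? 2 : 0;
      if (!IsVIPlus) {
        if (FlatScrUsed)
          ExtraSGPRs = 4;
      } else {
        if (XNACKEnabled)
          ExtraSGPRs = 4;
        if (FlatScrUsed)
          ExtraSGPRs = 6;
      }
      return ExtraSGPRs;
    }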
@@ -562,7 +571,7 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage(
SIFunctionResourceInfo Info;
const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
- const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
const MachineFrameInfo &FrameInfo = MF.getFrameInfo();
const MachineRegisterInfo &MRI = MF.getRegInfo();
const SIInstrInfo *TII = ST.getInstrInfo();
@@ -586,6 +595,8 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage(
Info.HasDynamicallySizedStack = FrameInfo.hasVarSizedObjects();
Info.PrivateSegmentSize = FrameInfo.getStackSize();
+ if (MFI->isStackRealigned())
+ Info.PrivateSegmentSize += FrameInfo.getMaxAlignment();
Info.UsesVCC = MRI.isPhysRegUsed(AMDGPU::VCC_LO) ||
@@ -649,7 +660,7 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage(
continue;
case AMDGPU::NoRegister:
- assert(MI.isDebugValue());
+ assert(MI.isDebugInstr());
continue;
case AMDGPU::VCC:
@@ -663,6 +674,11 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage(
case AMDGPU::FLAT_SCR_HI:
continue;
+ case AMDGPU::XNACK_MASK:
+ case AMDGPU::XNACK_MASK_LO:
+ case AMDGPU::XNACK_MASK_HI:
+ llvm_unreachable("xnack_mask registers should not be used");
+
case AMDGPU::TBA:
case AMDGPU::TBA_LO:
case AMDGPU::TBA_HI:
@@ -742,8 +758,9 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage(
// conservative guesses.
// 48 SGPRs - vcc, - flat_scr, -xnack
- int MaxSGPRGuess = 47 - getNumExtraSGPRs(ST, true,
- ST.hasFlatAddressSpace());
+ int MaxSGPRGuess =
+ 47 - IsaInfo::getNumExtraSGPRs(ST.getFeatureBits(), true,
+ ST.hasFlatAddressSpace());
MaxSGPR = std::max(MaxSGPR, MaxSGPRGuess);
MaxVGPR = std::max(MaxVGPR, 23);
@@ -798,15 +815,16 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
MF.getFunction().getContext().diagnose(DiagStackSize);
}
- const SISubtarget &STM = MF.getSubtarget<SISubtarget>();
+ const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
const SIInstrInfo *TII = STM.getInstrInfo();
const SIRegisterInfo *RI = &TII->getRegisterInfo();
- unsigned ExtraSGPRs = getNumExtraSGPRs(STM,
- ProgInfo.VCCUsed,
- ProgInfo.FlatUsed);
- unsigned ExtraVGPRs = STM.getReservedNumVGPRs(MF);
+ // TODO(scott.linder): The calculations related to SGPR/VGPR blocks are
+ // duplicated in part in AMDGPUAsmParser::calculateGPRBlocks, and could be
+ // unified.
+ unsigned ExtraSGPRs = IsaInfo::getNumExtraSGPRs(
+ STM.getFeatureBits(), ProgInfo.VCCUsed, ProgInfo.FlatUsed);
// Check the addressable register limit before we add ExtraSGPRs.
if (STM.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
@@ -827,7 +845,19 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
// Account for extra SGPRs and VGPRs reserved for debugger use.
ProgInfo.NumSGPR += ExtraSGPRs;
- ProgInfo.NumVGPR += ExtraVGPRs;
+
+ // Ensure there are enough SGPRs and VGPRs for wave dispatch, where wave
+ // dispatch registers are function args.
+ unsigned WaveDispatchNumSGPR = 0, WaveDispatchNumVGPR = 0;
+ for (auto &Arg : MF.getFunction().args()) {
+ unsigned NumRegs = (Arg.getType()->getPrimitiveSizeInBits() + 31) / 32;
+ if (Arg.hasAttribute(Attribute::InReg))
+ WaveDispatchNumSGPR += NumRegs;
+ else
+ WaveDispatchNumVGPR += NumRegs;
+ }
+ ProgInfo.NumSGPR = std::max(ProgInfo.NumSGPR, WaveDispatchNumSGPR);
+ ProgInfo.NumVGPR = std::max(ProgInfo.NumVGPR, WaveDispatchNumVGPR);
// Adjust number of registers used to meet default/requested minimum/maximum
// number of waves per execution unit request.
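The wave-dispatch clamp added above counts the registers the incoming arguments themselves occupy: every argument takes ceil(bits/32) registers, landing in SGPRs when marked inreg and in VGPRs otherwise, and NumSGPR/NumVGPR are raised to at least those totals. A standalone model of that count (ArgDesc is an illustrative stand-in for the IR argument):

    #include <utility>
    #include <vector>

    struct ArgDesc { unsigned SizeInBits; bool InReg; }; // illustrative stand-in

    // Mirrors the accumulation above: ceil(bits/32) registers per argument,
    // SGPRs for inreg arguments, VGPRs for the rest.
    std::pair<unsigned, unsigned>
    countWaveDispatchRegs(const std::vector<ArgDesc> &Args) {
      unsigned SGPRs = 0, VGPRs = 0;
      for (const ArgDesc &A : Args) {
        unsigned NumRegs = (A.SizeInBits + 31) / 32;
        if (A.InReg)
          SGPRs += NumRegs;
        else
          VGPRs += NumRegs;
      }
      return {SGPRs, VGPRs};
    }
    // e.g. {{64, true}, {32, false}, {16, false}} yields 2 SGPRs and 2 VGPRs.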
@@ -875,19 +905,10 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
Ctx.diagnose(Diag);
}
- // SGPRBlocks is actual number of SGPR blocks minus 1.
- ProgInfo.SGPRBlocks = alignTo(ProgInfo.NumSGPRsForWavesPerEU,
- STM.getSGPREncodingGranule());
- ProgInfo.SGPRBlocks = ProgInfo.SGPRBlocks / STM.getSGPREncodingGranule() - 1;
-
- // VGPRBlocks is actual number of VGPR blocks minus 1.
- ProgInfo.VGPRBlocks = alignTo(ProgInfo.NumVGPRsForWavesPerEU,
- STM.getVGPREncodingGranule());
- ProgInfo.VGPRBlocks = ProgInfo.VGPRBlocks / STM.getVGPREncodingGranule() - 1;
-
- // Record first reserved VGPR and number of reserved VGPRs.
- ProgInfo.ReservedVGPRFirst = STM.debuggerReserveRegs() ? ProgInfo.NumVGPR : 0;
- ProgInfo.ReservedVGPRCount = STM.getReservedNumVGPRs(MF);
+ ProgInfo.SGPRBlocks = IsaInfo::getNumSGPRBlocks(
+ STM.getFeatureBits(), ProgInfo.NumSGPRsForWavesPerEU);
+ ProgInfo.VGPRBlocks = IsaInfo::getNumVGPRBlocks(
+ STM.getFeatureBits(), ProgInfo.NumVGPRsForWavesPerEU);
// Update DebuggerWavefrontPrivateSegmentOffsetSGPR and
// DebuggerPrivateSegmentBufferSGPR fields if "amdgpu-debugger-emit-prologue"
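The block counts the deleted lines computed by hand, and which IsaInfo::getNumSGPRBlocks / getNumVGPRBlocks are expected to reproduce, use a granule-aligned-count-minus-one encoding. A reference model of that arithmetic, with the granule left as a parameter since it differs per register file and subtarget:

    // Reference model of the removed computation: align the register count up to
    // the encoding granule, then store the block count biased by -1.
    // Assumes NumRegs >= 1.
    static unsigned numRegBlocksMinusOne(unsigned NumRegs, unsigned Granule) {
      unsigned Aligned = (NumRegs + Granule - 1) / Granule * Granule; // alignTo
      return Aligned / Granule - 1;
    }
    // e.g. 35 registers with a granule of 8: alignTo -> 40, and 40 / 8 - 1 = 4.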
@@ -909,7 +930,7 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
ProgInfo.DX10Clamp = STM.enableDX10Clamp();
unsigned LDSAlignShift;
- if (STM.getGeneration() < SISubtarget::SEA_ISLANDS) {
+ if (STM.getGeneration() < AMDGPUSubtarget::SEA_ISLANDS) {
// LDS is allocated in 64 dword blocks.
LDSAlignShift = 8;
} else {
@@ -954,7 +975,8 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
ProgInfo.ComputePGMRSrc2 =
S_00B84C_SCRATCH_EN(ProgInfo.ScratchBlocks > 0) |
S_00B84C_USER_SGPR(MFI->getNumUserSGPRs()) |
- S_00B84C_TRAP_HANDLER(STM.isTrapHandlerEnabled()) |
+ // For AMDHSA, TRAP_HANDLER must be zero, as it is populated by the CP.
+ S_00B84C_TRAP_HANDLER(STM.isAmdHsaOS() ? 0 : STM.isTrapHandlerEnabled()) |
S_00B84C_TGID_X_EN(MFI->hasWorkGroupIDX()) |
S_00B84C_TGID_Y_EN(MFI->hasWorkGroupIDY()) |
S_00B84C_TGID_Z_EN(MFI->hasWorkGroupIDZ()) |
@@ -981,7 +1003,7 @@ static unsigned getRsrcReg(CallingConv::ID CallConv) {
void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF,
const SIProgramInfo &CurrentProgramInfo) {
- const SISubtarget &STM = MF.getSubtarget<SISubtarget>();
+ const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
unsigned RsrcReg = getRsrcReg(MF.getFunction().getCallingConv());
@@ -1002,26 +1024,21 @@ void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF,
OutStreamer->EmitIntValue(RsrcReg, 4);
OutStreamer->EmitIntValue(S_00B028_VGPRS(CurrentProgramInfo.VGPRBlocks) |
S_00B028_SGPRS(CurrentProgramInfo.SGPRBlocks), 4);
- unsigned Rsrc2Val = 0;
if (STM.isVGPRSpillingEnabled(MF.getFunction())) {
OutStreamer->EmitIntValue(R_0286E8_SPI_TMPRING_SIZE, 4);
OutStreamer->EmitIntValue(S_0286E8_WAVESIZE(CurrentProgramInfo.ScratchBlocks), 4);
- if (TM.getTargetTriple().getOS() == Triple::AMDPAL)
- Rsrc2Val = S_00B84C_SCRATCH_EN(CurrentProgramInfo.ScratchBlocks > 0);
- }
- if (MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS) {
- OutStreamer->EmitIntValue(R_0286CC_SPI_PS_INPUT_ENA, 4);
- OutStreamer->EmitIntValue(MFI->getPSInputEnable(), 4);
- OutStreamer->EmitIntValue(R_0286D0_SPI_PS_INPUT_ADDR, 4);
- OutStreamer->EmitIntValue(MFI->getPSInputAddr(), 4);
- Rsrc2Val |= S_00B02C_EXTRA_LDS_SIZE(CurrentProgramInfo.LDSBlocks);
- }
- if (Rsrc2Val) {
- OutStreamer->EmitIntValue(RsrcReg + 4 /*rsrc2*/, 4);
- OutStreamer->EmitIntValue(Rsrc2Val, 4);
}
}
+ if (MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS) {
+ OutStreamer->EmitIntValue(R_00B02C_SPI_SHADER_PGM_RSRC2_PS, 4);
+ OutStreamer->EmitIntValue(S_00B02C_EXTRA_LDS_SIZE(CurrentProgramInfo.LDSBlocks), 4);
+ OutStreamer->EmitIntValue(R_0286CC_SPI_PS_INPUT_ENA, 4);
+ OutStreamer->EmitIntValue(MFI->getPSInputEnable(), 4);
+ OutStreamer->EmitIntValue(R_0286D0_SPI_PS_INPUT_ADDR, 4);
+ OutStreamer->EmitIntValue(MFI->getPSInputAddr(), 4);
+ }
+
OutStreamer->EmitIntValue(R_SPILLED_SGPRS, 4);
OutStreamer->EmitIntValue(MFI->getNumSpilledSGPRs(), 4);
OutStreamer->EmitIntValue(R_SPILLED_VGPRS, 4);
@@ -1114,8 +1131,12 @@ static amd_element_byte_size_t getElementByteSizeValue(unsigned Size) {
void AMDGPUAsmPrinter::getAmdKernelCode(amd_kernel_code_t &Out,
const SIProgramInfo &CurrentProgramInfo,
const MachineFunction &MF) const {
+ const Function &F = MF.getFunction();
+ assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
+ F.getCallingConv() == CallingConv::SPIR_KERNEL);
+
const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
- const SISubtarget &STM = MF.getSubtarget<SISubtarget>();
+ const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
AMDGPU::initDefaultAMDKernelCodeT(Out, STM.getFeatureBits());
@@ -1151,21 +1172,6 @@ void AMDGPUAsmPrinter::getAmdKernelCode(amd_kernel_code_t &Out,
if (MFI->hasFlatScratchInit())
Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT;
- if (MFI->hasGridWorkgroupCountX()) {
- Out.code_properties |=
- AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X;
- }
-
- if (MFI->hasGridWorkgroupCountY()) {
- Out.code_properties |=
- AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y;
- }
-
- if (MFI->hasGridWorkgroupCountZ()) {
- Out.code_properties |=
- AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z;
- }
-
if (MFI->hasDispatchPtr())
Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR;
@@ -1175,20 +1181,17 @@ void AMDGPUAsmPrinter::getAmdKernelCode(amd_kernel_code_t &Out,
if (STM.isXNACKEnabled())
Out.code_properties |= AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED;
- // FIXME: Should use getKernArgSize
- Out.kernarg_segment_byte_size =
- STM.getKernArgSegmentSize(MF, MFI->getABIArgOffset());
+ unsigned MaxKernArgAlign;
+ Out.kernarg_segment_byte_size = STM.getKernArgSegmentSize(F, MaxKernArgAlign);
Out.wavefront_sgpr_count = CurrentProgramInfo.NumSGPR;
Out.workitem_vgpr_count = CurrentProgramInfo.NumVGPR;
Out.workitem_private_segment_byte_size = CurrentProgramInfo.ScratchSize;
Out.workgroup_group_segment_byte_size = CurrentProgramInfo.LDSSize;
- Out.reserved_vgpr_first = CurrentProgramInfo.ReservedVGPRFirst;
- Out.reserved_vgpr_count = CurrentProgramInfo.ReservedVGPRCount;
// These alignment values are specified in powers of two, so alignment =
// 2^n. The minimum alignment is 2^4 = 16.
Out.kernarg_segment_alignment = std::max((size_t)4,
- countTrailingZeros(MFI->getMaxKernArgAlign()));
+ countTrailingZeros(MaxKernArgAlign));
if (STM.debuggerEmitPrologue()) {
Out.debug_wavefront_private_segment_offset_sgpr =
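As the comment above notes, kernarg_segment_alignment stores log2 of the byte alignment with a floor of 4 (16 bytes). A tiny self-contained sketch of that encoding, assuming the alignment is a nonzero power of two:

    #include <algorithm>
    #include <cassert>

    // Sketch only: alignment 2^n is stored as n, never less than 4.
    unsigned encodeKernargAlign(unsigned AlignBytes) {
      assert(AlignBytes != 0 && (AlignBytes & (AlignBytes - 1)) == 0);
      unsigned Log2 = 0;
      while ((1u << Log2) < AlignBytes)
        ++Log2;
      return std::max(4u, Log2); // e.g. 8 -> 4, 64 -> 6, 256 -> 8
    }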
@@ -1198,55 +1201,6 @@ void AMDGPUAsmPrinter::getAmdKernelCode(amd_kernel_code_t &Out,
}
}
-AMDGPU::HSAMD::Kernel::CodeProps::Metadata AMDGPUAsmPrinter::getHSACodeProps(
- const MachineFunction &MF,
- const SIProgramInfo &ProgramInfo) const {
- const SISubtarget &STM = MF.getSubtarget<SISubtarget>();
- const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
- HSAMD::Kernel::CodeProps::Metadata HSACodeProps;
-
- HSACodeProps.mKernargSegmentSize =
- STM.getKernArgSegmentSize(MF, MFI.getABIArgOffset());
- HSACodeProps.mGroupSegmentFixedSize = ProgramInfo.LDSSize;
- HSACodeProps.mPrivateSegmentFixedSize = ProgramInfo.ScratchSize;
- HSACodeProps.mKernargSegmentAlign =
- std::max(uint32_t(4), MFI.getMaxKernArgAlign());
- HSACodeProps.mWavefrontSize = STM.getWavefrontSize();
- HSACodeProps.mNumSGPRs = CurrentProgramInfo.NumSGPR;
- HSACodeProps.mNumVGPRs = CurrentProgramInfo.NumVGPR;
- HSACodeProps.mMaxFlatWorkGroupSize = MFI.getMaxFlatWorkGroupSize();
- HSACodeProps.mIsDynamicCallStack = ProgramInfo.DynamicCallStack;
- HSACodeProps.mIsXNACKEnabled = STM.isXNACKEnabled();
- HSACodeProps.mNumSpilledSGPRs = MFI.getNumSpilledSGPRs();
- HSACodeProps.mNumSpilledVGPRs = MFI.getNumSpilledVGPRs();
-
- return HSACodeProps;
-}
-
-AMDGPU::HSAMD::Kernel::DebugProps::Metadata AMDGPUAsmPrinter::getHSADebugProps(
- const MachineFunction &MF,
- const SIProgramInfo &ProgramInfo) const {
- const SISubtarget &STM = MF.getSubtarget<SISubtarget>();
- HSAMD::Kernel::DebugProps::Metadata HSADebugProps;
-
- if (!STM.debuggerSupported())
- return HSADebugProps;
-
- HSADebugProps.mDebuggerABIVersion.push_back(1);
- HSADebugProps.mDebuggerABIVersion.push_back(0);
- HSADebugProps.mReservedNumVGPRs = ProgramInfo.ReservedVGPRCount;
- HSADebugProps.mReservedFirstVGPR = ProgramInfo.ReservedVGPRFirst;
-
- if (STM.debuggerEmitPrologue()) {
- HSADebugProps.mPrivateSegmentBufferSGPR =
- ProgramInfo.DebuggerPrivateSegmentBufferSGPR;
- HSADebugProps.mWavefrontPrivateSegmentOffsetSGPR =
- ProgramInfo.DebuggerWavefrontPrivateSegmentOffsetSGPR;
- }
-
- return HSADebugProps;
-}
-
bool AMDGPUAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
unsigned AsmVariant,
const char *ExtraCode, raw_ostream &O) {
diff --git a/lib/Target/AMDGPU/AMDGPUAsmPrinter.h b/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
index 51d48a0c7320..22982d912c70 100644
--- a/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
+++ b/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
//
/// \file
-/// \brief AMDGPU Assembly printer class.
+/// AMDGPU Assembly printer class.
//
//===----------------------------------------------------------------------===//
@@ -17,9 +17,11 @@
#include "AMDGPU.h"
#include "AMDKernelCodeT.h"
-#include "MCTargetDesc/AMDGPUHSAMetadataStreamer.h"
+#include "AMDGPUHSAMetadataStreamer.h"
+#include "SIProgramInfo.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/Support/AMDHSAKernelDescriptor.h"
#include <cstddef>
#include <cstdint>
#include <limits>
@@ -29,9 +31,10 @@
namespace llvm {
+class AMDGPUMachineFunction;
class AMDGPUTargetStreamer;
class MCOperand;
-class SISubtarget;
+class GCNSubtarget;
class AMDGPUAsmPrinter final : public AsmPrinter {
private:
@@ -47,68 +50,7 @@ private:
bool HasDynamicallySizedStack = false;
bool HasRecursion = false;
- int32_t getTotalNumSGPRs(const SISubtarget &ST) const;
- };
-
- // Track resource usage for kernels / entry functions.
- struct SIProgramInfo {
- // Fields set in PGM_RSRC1 pm4 packet.
- uint32_t VGPRBlocks = 0;
- uint32_t SGPRBlocks = 0;
- uint32_t Priority = 0;
- uint32_t FloatMode = 0;
- uint32_t Priv = 0;
- uint32_t DX10Clamp = 0;
- uint32_t DebugMode = 0;
- uint32_t IEEEMode = 0;
- uint64_t ScratchSize = 0;
-
- uint64_t ComputePGMRSrc1 = 0;
-
- // Fields set in PGM_RSRC2 pm4 packet.
- uint32_t LDSBlocks = 0;
- uint32_t ScratchBlocks = 0;
-
- uint64_t ComputePGMRSrc2 = 0;
-
- uint32_t NumVGPR = 0;
- uint32_t NumSGPR = 0;
- uint32_t LDSSize = 0;
- bool FlatUsed = false;
-
- // Number of SGPRs that meets number of waves per execution unit request.
- uint32_t NumSGPRsForWavesPerEU = 0;
-
- // Number of VGPRs that meets number of waves per execution unit request.
- uint32_t NumVGPRsForWavesPerEU = 0;
-
- // If ReservedVGPRCount is 0 then must be 0. Otherwise, this is the first
- // fixed VGPR number reserved.
- uint16_t ReservedVGPRFirst = 0;
-
- // The number of consecutive VGPRs reserved.
- uint16_t ReservedVGPRCount = 0;
-
- // Fixed SGPR number used to hold wave scratch offset for entire kernel
- // execution, or std::numeric_limits<uint16_t>::max() if the register is not
- // used or not known.
- uint16_t DebuggerWavefrontPrivateSegmentOffsetSGPR =
- std::numeric_limits<uint16_t>::max();
-
- // Fixed SGPR number of the first 4 SGPRs used to hold scratch V# for entire
- // kernel execution, or std::numeric_limits<uint16_t>::max() if the register
- // is not used or not known.
- uint16_t DebuggerPrivateSegmentBufferSGPR =
- std::numeric_limits<uint16_t>::max();
-
- // Whether there is recursion, dynamic allocas, indirect calls or some other
- // reason there may be statically unknown stack usage.
- bool DynamicCallStack = false;
-
- // Bonus information for debugging.
- bool VCCUsed = false;
-
- SIProgramInfo() = default;
+ int32_t getTotalNumSGPRs(const GCNSubtarget &ST) const;
};
SIProgramInfo CurrentProgramInfo;
@@ -128,16 +70,8 @@ private:
unsigned &NumSGPR,
unsigned &NumVGPR) const;
- AMDGPU::HSAMD::Kernel::CodeProps::Metadata getHSACodeProps(
- const MachineFunction &MF,
- const SIProgramInfo &ProgramInfo) const;
- AMDGPU::HSAMD::Kernel::DebugProps::Metadata getHSADebugProps(
- const MachineFunction &MF,
- const SIProgramInfo &ProgramInfo) const;
-
- /// \brief Emit register usage information so that the GPU driver
+ /// Emit register usage information so that the GPU driver
/// can correctly setup the GPU state.
- void EmitProgramInfoR600(const MachineFunction &MF);
void EmitProgramInfoSI(const MachineFunction &MF,
const SIProgramInfo &KernelInfo);
void EmitPALMetadata(const MachineFunction &MF,
@@ -145,7 +79,15 @@ private:
void emitCommonFunctionComments(uint32_t NumVGPR,
uint32_t NumSGPR,
uint64_t ScratchSize,
- uint64_t CodeSize);
+ uint64_t CodeSize,
+ const AMDGPUMachineFunction* MFI);
+
+ uint16_t getAmdhsaKernelCodeProperties(
+ const MachineFunction &MF) const;
+
+ amdhsa::kernel_descriptor_t getAmdhsaKernelDescriptor(
+ const MachineFunction &MF,
+ const SIProgramInfo &PI) const;
public:
explicit AMDGPUAsmPrinter(TargetMachine &TM,
@@ -160,16 +102,16 @@ public:
bool doFinalization(Module &M) override;
bool runOnMachineFunction(MachineFunction &MF) override;
- /// \brief Wrapper for MCInstLowering.lowerOperand() for the tblgen'erated
+ /// Wrapper for MCInstLowering.lowerOperand() for the tblgen'erated
/// pseudo lowering.
bool lowerOperand(const MachineOperand &MO, MCOperand &MCOp) const;
- /// \brief Lower the specified LLVM Constant to an MCExpr.
+ /// Lower the specified LLVM Constant to an MCExpr.
/// The AsmPrinter::lowerConstantof does not know how to lower
/// addrspacecast, therefore they should be lowered by this function.
const MCExpr *lowerConstant(const Constant *CV) override;
- /// \brief tblgen'erated driver function for lowering simple MI->MC pseudo
+ /// tblgen'erated driver function for lowering simple MI->MC pseudo
/// instructions.
bool emitPseudoExpansionLowering(MCStreamer &OutStreamer,
const MachineInstr *MI);
@@ -179,6 +121,8 @@ public:
void EmitFunctionBodyStart() override;
+ void EmitFunctionBodyEnd() override;
+
void EmitFunctionEntryLabel() override;
void EmitBasicBlockStart(const MachineBasicBlock &MBB) const override;
diff --git a/lib/Target/AMDGPU/AMDGPUCallLowering.cpp b/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
index 5a9138731934..18c7df0d94f2 100644
--- a/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
+++ b/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
@@ -20,6 +20,7 @@
#include "SIISelLowering.h"
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
@@ -32,13 +33,17 @@ AMDGPUCallLowering::AMDGPUCallLowering(const AMDGPUTargetLowering &TLI)
bool AMDGPUCallLowering::lowerReturn(MachineIRBuilder &MIRBuilder,
const Value *Val, unsigned VReg) const {
+ // FIXME: Add support for non-void returns.
+ if (Val)
+ return false;
+
MIRBuilder.buildInstr(AMDGPU::S_ENDPGM);
return true;
}
unsigned AMDGPUCallLowering::lowerParameterPtr(MachineIRBuilder &MIRBuilder,
Type *ParamTy,
- unsigned Offset) const {
+ uint64_t Offset) const {
MachineFunction &MF = MIRBuilder.getMF();
const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
@@ -61,7 +66,8 @@ unsigned AMDGPUCallLowering::lowerParameterPtr(MachineIRBuilder &MIRBuilder,
}
void AMDGPUCallLowering::lowerParameter(MachineIRBuilder &MIRBuilder,
- Type *ParamTy, unsigned Offset,
+ Type *ParamTy, uint64_t Offset,
+ unsigned Align,
unsigned DstReg) const {
MachineFunction &MF = MIRBuilder.getMF();
const Function &F = MF.getFunction();
@@ -69,7 +75,6 @@ void AMDGPUCallLowering::lowerParameter(MachineIRBuilder &MIRBuilder,
PointerType *PtrTy = PointerType::get(ParamTy, AMDGPUASI.CONSTANT_ADDRESS);
MachinePointerInfo PtrInfo(UndefValue::get(PtrTy));
unsigned TypeSize = DL.getTypeStoreSize(ParamTy);
- unsigned Align = DL.getABITypeAlignment(ParamTy);
unsigned PtrReg = lowerParameterPtr(MIRBuilder, ParamTy, Offset);
MachineMemOperand *MMO =
@@ -84,12 +89,16 @@ void AMDGPUCallLowering::lowerParameter(MachineIRBuilder &MIRBuilder,
bool AMDGPUCallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder,
const Function &F,
ArrayRef<unsigned> VRegs) const {
+  // AMDGPU_GS and AMDGPU_HS are not supported yet.
+ if (F.getCallingConv() == CallingConv::AMDGPU_GS ||
+ F.getCallingConv() == CallingConv::AMDGPU_HS)
+ return false;
MachineFunction &MF = MIRBuilder.getMF();
- const SISubtarget *Subtarget = static_cast<const SISubtarget *>(&MF.getSubtarget());
+ const GCNSubtarget *Subtarget = &MF.getSubtarget<GCNSubtarget>();
MachineRegisterInfo &MRI = MF.getRegInfo();
SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
- const SIRegisterInfo *TRI = MF.getSubtarget<SISubtarget>().getRegisterInfo();
+ const SIRegisterInfo *TRI = MF.getSubtarget<GCNSubtarget>().getRegisterInfo();
const DataLayout &DL = F.getParent()->getDataLayout();
SmallVector<CCValAssign, 16> ArgLocs;
@@ -116,7 +125,7 @@ bool AMDGPUCallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder,
if (Info->hasKernargSegmentPtr()) {
unsigned InputPtrReg = Info->addKernargSegmentPtr(*TRI);
- const LLT P2 = LLT::pointer(2, 64);
+ const LLT P2 = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
unsigned VReg = MRI.createGenericVirtualRegister(P2);
MRI.addLiveIn(InputPtrReg, VReg);
MIRBuilder.getMBB().addLiveIn(InputPtrReg);
@@ -136,49 +145,106 @@ bool AMDGPUCallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder,
CCInfo.AllocateReg(FlatScratchInitReg);
}
+ // The infrastructure for normal calling convention lowering is essentially
+ // useless for kernels. We want to avoid any kind of legalization or argument
+ // splitting.
+ if (F.getCallingConv() == CallingConv::AMDGPU_KERNEL) {
+ unsigned i = 0;
+ const unsigned KernArgBaseAlign = 16;
+ const unsigned BaseOffset = Subtarget->getExplicitKernelArgOffset(F);
+ uint64_t ExplicitArgOffset = 0;
+
+ // TODO: Align down to dword alignment and extract bits for extending loads.
+ for (auto &Arg : F.args()) {
+ Type *ArgTy = Arg.getType();
+ unsigned AllocSize = DL.getTypeAllocSize(ArgTy);
+ if (AllocSize == 0)
+ continue;
+
+ unsigned ABIAlign = DL.getABITypeAlignment(ArgTy);
+
+ uint64_t ArgOffset = alignTo(ExplicitArgOffset, ABIAlign) + BaseOffset;
+ ExplicitArgOffset = alignTo(ExplicitArgOffset, ABIAlign) + AllocSize;
+
+ unsigned Align = MinAlign(KernArgBaseAlign, ArgOffset);
+ ArgOffset = alignTo(ArgOffset, DL.getABITypeAlignment(ArgTy));
+ lowerParameter(MIRBuilder, ArgTy, ArgOffset, Align, VRegs[i]);
+ ++i;
+ }
+
+ return true;
+ }
+
unsigned NumArgs = F.arg_size();
Function::const_arg_iterator CurOrigArg = F.arg_begin();
const AMDGPUTargetLowering &TLI = *getTLI<AMDGPUTargetLowering>();
+ unsigned PSInputNum = 0;
+ BitVector Skipped(NumArgs);
for (unsigned i = 0; i != NumArgs; ++i, ++CurOrigArg) {
EVT ValEVT = TLI.getValueType(DL, CurOrigArg->getType());
    // We can only handle simple value types at the moment.
- if (!ValEVT.isSimple())
- return false;
- MVT ValVT = ValEVT.getSimpleVT();
ISD::ArgFlagsTy Flags;
ArgInfo OrigArg{VRegs[i], CurOrigArg->getType()};
setArgFlags(OrigArg, i + 1, DL, F);
Flags.setOrigAlign(DL.getABITypeAlignment(CurOrigArg->getType()));
+
+ if (F.getCallingConv() == CallingConv::AMDGPU_PS &&
+ !OrigArg.Flags.isInReg() && !OrigArg.Flags.isByVal() &&
+ PSInputNum <= 15) {
+ if (CurOrigArg->use_empty() && !Info->isPSInputAllocated(PSInputNum)) {
+ Skipped.set(i);
+ ++PSInputNum;
+ continue;
+ }
+
+ Info->markPSInputAllocated(PSInputNum);
+ if (!CurOrigArg->use_empty())
+ Info->markPSInputEnabled(PSInputNum);
+
+ ++PSInputNum;
+ }
+
CCAssignFn *AssignFn = CCAssignFnForCall(F.getCallingConv(),
/*IsVarArg=*/false);
- bool Res =
- AssignFn(i, ValVT, ValVT, CCValAssign::Full, OrigArg.Flags, CCInfo);
- // Fail if we don't know how to handle this type.
- if (Res)
- return false;
+ if (ValEVT.isVector()) {
+ EVT ElemVT = ValEVT.getVectorElementType();
+ if (!ValEVT.isSimple())
+ return false;
+ MVT ValVT = ElemVT.getSimpleVT();
+ bool Res = AssignFn(i, ValVT, ValVT, CCValAssign::Full,
+ OrigArg.Flags, CCInfo);
+ if (!Res)
+ return false;
+ } else {
+ MVT ValVT = ValEVT.getSimpleVT();
+ if (!ValEVT.isSimple())
+ return false;
+ bool Res =
+ AssignFn(i, ValVT, ValVT, CCValAssign::Full, OrigArg.Flags, CCInfo);
+
+ // Fail if we don't know how to handle this type.
+ if (Res)
+ return false;
+ }
}
Function::const_arg_iterator Arg = F.arg_begin();
- if (F.getCallingConv() == CallingConv::AMDGPU_VS) {
- for (unsigned i = 0; i != NumArgs; ++i, ++Arg) {
- CCValAssign &VA = ArgLocs[i];
- MRI.addLiveIn(VA.getLocReg(), VRegs[i]);
+ if (F.getCallingConv() == CallingConv::AMDGPU_VS ||
+ F.getCallingConv() == CallingConv::AMDGPU_PS) {
+ for (unsigned i = 0, OrigArgIdx = 0;
+ OrigArgIdx != NumArgs && i != ArgLocs.size(); ++Arg, ++OrigArgIdx) {
+ if (Skipped.test(OrigArgIdx))
+ continue;
+ CCValAssign &VA = ArgLocs[i++];
+ MRI.addLiveIn(VA.getLocReg(), VRegs[OrigArgIdx]);
MIRBuilder.getMBB().addLiveIn(VA.getLocReg());
- MIRBuilder.buildCopy(VRegs[i], VA.getLocReg());
+ MIRBuilder.buildCopy(VRegs[OrigArgIdx], VA.getLocReg());
}
return true;
}
- for (unsigned i = 0; i != NumArgs; ++i, ++Arg) {
- // FIXME: We should be getting DebugInfo from the arguments some how.
- CCValAssign &VA = ArgLocs[i];
- lowerParameter(MIRBuilder, Arg->getType(),
- VA.getLocMemOffset() +
- Subtarget->getExplicitKernelArgOffset(MF), VRegs[i]);
- }
-
- return true;
+ return false;
}
diff --git a/lib/Target/AMDGPU/AMDGPUCallLowering.h b/lib/Target/AMDGPU/AMDGPUCallLowering.h
index 251cb7a2c440..f51cb6abbf65 100644
--- a/lib/Target/AMDGPU/AMDGPUCallLowering.h
+++ b/lib/Target/AMDGPU/AMDGPUCallLowering.h
@@ -26,10 +26,11 @@ class AMDGPUCallLowering: public CallLowering {
AMDGPUAS AMDGPUASI;
unsigned lowerParameterPtr(MachineIRBuilder &MIRBuilder, Type *ParamTy,
- unsigned Offset) const;
+ uint64_t Offset) const;
void lowerParameter(MachineIRBuilder &MIRBuilder, Type *ParamTy,
- unsigned Offset, unsigned DstReg) const;
+ uint64_t Offset, unsigned Align,
+ unsigned DstReg) const;
public:
AMDGPUCallLowering(const AMDGPUTargetLowering &TLI);
diff --git a/lib/Target/AMDGPU/AMDGPUCallingConv.td b/lib/Target/AMDGPU/AMDGPUCallingConv.td
index c1c066fd1404..68bc7fdd9961 100644
--- a/lib/Target/AMDGPU/AMDGPUCallingConv.td
+++ b/lib/Target/AMDGPU/AMDGPUCallingConv.td
@@ -85,22 +85,6 @@ def RetCC_SI_Shader : CallingConv<[
]>>
]>;
-// Calling convention for R600
-def CC_R600 : CallingConv<[
- CCIfInReg<CCIfType<[v4f32, v4i32] , CCAssignToReg<[
- T0_XYZW, T1_XYZW, T2_XYZW, T3_XYZW, T4_XYZW, T5_XYZW, T6_XYZW, T7_XYZW,
- T8_XYZW, T9_XYZW, T10_XYZW, T11_XYZW, T12_XYZW, T13_XYZW, T14_XYZW, T15_XYZW,
- T16_XYZW, T17_XYZW, T18_XYZW, T19_XYZW, T20_XYZW, T21_XYZW, T22_XYZW,
- T23_XYZW, T24_XYZW, T25_XYZW, T26_XYZW, T27_XYZW, T28_XYZW, T29_XYZW,
- T30_XYZW, T31_XYZW, T32_XYZW
- ]>>>
-]>;
-
-// Calling convention for compute kernels
-def CC_AMDGPU_Kernel : CallingConv<[
- CCCustom<"allocateKernArg">
-]>;
-
def CSR_AMDGPU_VGPRs_24_255 : CalleeSavedRegs<
(sequence "VGPR%u", 24, 255)
>;
@@ -127,7 +111,7 @@ def CC_AMDGPU_Func : CallingConv<[
VGPR8, VGPR9, VGPR10, VGPR11, VGPR12, VGPR13, VGPR14, VGPR15,
VGPR16, VGPR17, VGPR18, VGPR19, VGPR20, VGPR21, VGPR22, VGPR23,
VGPR24, VGPR25, VGPR26, VGPR27, VGPR28, VGPR29, VGPR30, VGPR31]>>,
- CCIfType<[i64, f64, v2i32, v2f32, v4i32, v4f32, v8i32, v8f32, v16i32, v16f32, v2i64, v2f64], CCCustom<"allocateVGPRTuple">>,
+ CCIfType<[i64, f64, v2i32, v2f32, v4i32, v4f32, v8i32, v8f32, v16i32, v16f32, v2i64, v2f64, v4i16, v4f16], CCCustom<"allocateVGPRTuple">>,
CCIfType<[i32, f32, v2i16, v2f16, i16, f16, i1], CCAssignToStack<4, 4>>,
CCIfType<[i64, f64, v2i32, v2f32], CCAssignToStack<8, 4>>,
CCIfType<[v4i32, v4f32, v2i64, v2f64], CCAssignToStack<16, 4>>,
@@ -144,30 +128,16 @@ def RetCC_AMDGPU_Func : CallingConv<[
VGPR8, VGPR9, VGPR10, VGPR11, VGPR12, VGPR13, VGPR14, VGPR15,
VGPR16, VGPR17, VGPR18, VGPR19, VGPR20, VGPR21, VGPR22, VGPR23,
VGPR24, VGPR25, VGPR26, VGPR27, VGPR28, VGPR29, VGPR30, VGPR31]>>,
- CCIfType<[i64, f64, v2i32, v2f32, v4i32, v4f32, v8i32, v8f32, v16i32, v16f32, v2i64, v2f64], CCCustom<"allocateVGPRTuple">>
+ CCIfType<[i64, f64, v2i32, v2f32, v4i32, v4f32, v8i32, v8f32, v16i32, v16f32, v2i64, v2f64, v4i16, v4f16], CCCustom<"allocateVGPRTuple">>
]>;
def CC_AMDGPU : CallingConv<[
- CCIf<"static_cast<const AMDGPUSubtarget&>"
- "(State.getMachineFunction().getSubtarget()).getGeneration() >="
- "AMDGPUSubtarget::SOUTHERN_ISLANDS && "
- "!AMDGPU::isShader(State.getCallingConv())",
- CCDelegateTo<CC_AMDGPU_Kernel>>,
- CCIf<"static_cast<const AMDGPUSubtarget&>"
- "(State.getMachineFunction().getSubtarget()).getGeneration() < "
- "AMDGPUSubtarget::SOUTHERN_ISLANDS && "
- "!AMDGPU::isShader(State.getCallingConv())",
- CCDelegateTo<CC_AMDGPU_Kernel>>,
- CCIf<"static_cast<const AMDGPUSubtarget&>"
+ CCIf<"static_cast<const GCNSubtarget&>"
"(State.getMachineFunction().getSubtarget()).getGeneration() >= "
"AMDGPUSubtarget::SOUTHERN_ISLANDS",
CCDelegateTo<CC_SI>>,
- CCIf<"static_cast<const AMDGPUSubtarget&>"
+ CCIf<"static_cast<const GCNSubtarget&>"
"(State.getMachineFunction().getSubtarget()).getGeneration() >= "
"AMDGPUSubtarget::SOUTHERN_ISLANDS && State.getCallingConv() == CallingConv::C",
- CCDelegateTo<CC_AMDGPU_Func>>,
- CCIf<"static_cast<const AMDGPUSubtarget&>"
- "(State.getMachineFunction().getSubtarget()).getGeneration() < "
- "AMDGPUSubtarget::SOUTHERN_ISLANDS",
- CCDelegateTo<CC_R600>>
+ CCDelegateTo<CC_AMDGPU_Func>>
]>;
diff --git a/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
index b17b67167666..5713b7b7f9a8 100644
--- a/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
+++ b/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
@@ -17,8 +17,10 @@
#include "AMDGPUSubtarget.h"
#include "AMDGPUTargetMachine.h"
#include "llvm/ADT/StringRef.h"
+#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/DivergenceAnalysis.h"
#include "llvm/Analysis/Loads.h"
+#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/Attributes.h"
@@ -48,15 +50,22 @@ using namespace llvm;
namespace {
+static cl::opt<bool> WidenLoads(
+ "amdgpu-codegenprepare-widen-constant-loads",
+ cl::desc("Widen sub-dword constant address space loads in AMDGPUCodeGenPrepare"),
+ cl::ReallyHidden,
+ cl::init(true));
+
class AMDGPUCodeGenPrepare : public FunctionPass,
public InstVisitor<AMDGPUCodeGenPrepare, bool> {
- const SISubtarget *ST = nullptr;
+ const GCNSubtarget *ST = nullptr;
+ AssumptionCache *AC = nullptr;
DivergenceAnalysis *DA = nullptr;
Module *Mod = nullptr;
bool HasUnsafeFPMath = false;
AMDGPUAS AMDGPUASI;
- /// \brief Copies exact/nsw/nuw flags (if any) from binary operation \p I to
+ /// Copies exact/nsw/nuw flags (if any) from binary operation \p I to
/// binary operation \p V.
///
/// \returns Binary operation \p V.
@@ -80,7 +89,7 @@ class AMDGPUCodeGenPrepare : public FunctionPass,
/// false otherwise.
bool needsPromotionToI32(const Type *T) const;
- /// \brief Promotes uniform binary operation \p I to equivalent 32 bit binary
+ /// Promotes uniform binary operation \p I to equivalent 32 bit binary
/// operation.
///
/// \details \p I's base element bit width must be greater than 1 and less
@@ -93,7 +102,7 @@ class AMDGPUCodeGenPrepare : public FunctionPass,
/// false otherwise.
bool promoteUniformOpToI32(BinaryOperator &I) const;
- /// \brief Promotes uniform 'icmp' operation \p I to 32 bit 'icmp' operation.
+ /// Promotes uniform 'icmp' operation \p I to 32 bit 'icmp' operation.
///
/// \details \p I's base element bit width must be greater than 1 and less
/// than or equal 16. Promotion is done by sign or zero extending operands to
@@ -102,7 +111,7 @@ class AMDGPUCodeGenPrepare : public FunctionPass,
/// \returns True.
bool promoteUniformOpToI32(ICmpInst &I) const;
- /// \brief Promotes uniform 'select' operation \p I to 32 bit 'select'
+ /// Promotes uniform 'select' operation \p I to 32 bit 'select'
/// operation.
///
/// \details \p I's base element bit width must be greater than 1 and less
@@ -113,7 +122,7 @@ class AMDGPUCodeGenPrepare : public FunctionPass,
/// \returns True.
bool promoteUniformOpToI32(SelectInst &I) const;
- /// \brief Promotes uniform 'bitreverse' intrinsic \p I to 32 bit 'bitreverse'
+ /// Promotes uniform 'bitreverse' intrinsic \p I to 32 bit 'bitreverse'
/// intrinsic.
///
/// \details \p I's base element bit width must be greater than 1 and less
@@ -125,7 +134,17 @@ class AMDGPUCodeGenPrepare : public FunctionPass,
///
/// \returns True.
bool promoteUniformBitreverseToI32(IntrinsicInst &I) const;
- /// \brief Widen a scalar load.
+
+ /// Expands 24 bit div or rem.
+ Value* expandDivRem24(IRBuilder<> &Builder, BinaryOperator &I,
+ Value *Num, Value *Den,
+ bool IsDiv, bool IsSigned) const;
+
+ /// Expands 32 bit div or rem.
+ Value* expandDivRem32(IRBuilder<> &Builder, BinaryOperator &I,
+ Value *Num, Value *Den) const;
+
+ /// Widen a scalar load.
///
/// \details \p Widen scalar load for uniform, small type loads from constant
  // memory to a full 32 bits and then truncate the input to allow a scalar
@@ -157,6 +176,7 @@ public:
StringRef getPassName() const override { return "AMDGPU IR optimizations"; }
void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<AssumptionCacheTracker>();
AU.addRequired<DivergenceAnalysis>();
AU.setPreservesAll();
}
@@ -250,7 +270,9 @@ bool AMDGPUCodeGenPrepare::promoteUniformOpToI32(BinaryOperator &I) const {
"I does not need promotion to i32");
if (I.getOpcode() == Instruction::SDiv ||
- I.getOpcode() == Instruction::UDiv)
+ I.getOpcode() == Instruction::UDiv ||
+ I.getOpcode() == Instruction::SRem ||
+ I.getOpcode() == Instruction::URem)
return false;
IRBuilder<> Builder(&I);
@@ -372,13 +394,18 @@ bool AMDGPUCodeGenPrepare::promoteUniformBitreverseToI32(
return true;
}
-static bool shouldKeepFDivF32(Value *Num, bool UnsafeDiv) {
+static bool shouldKeepFDivF32(Value *Num, bool UnsafeDiv, bool HasDenormals) {
const ConstantFP *CNum = dyn_cast<ConstantFP>(Num);
if (!CNum)
- return false;
+ return HasDenormals;
+
+ if (UnsafeDiv)
+ return true;
+
+ bool IsOne = CNum->isExactlyValue(+1.0) || CNum->isExactlyValue(-1.0);
// Reciprocal f32 is handled separately without denormals.
- return UnsafeDiv || CNum->isExactlyValue(+1.0);
+ return HasDenormals ^ IsOne;
}
// Insert an intrinsic for fast fdiv for safe math situations where we can
@@ -404,7 +431,7 @@ bool AMDGPUCodeGenPrepare::visitFDiv(BinaryOperator &FDiv) {
FMF.allowReciprocal();
// With UnsafeDiv node will be optimized to just rcp and mul.
- if (ST->hasFP32Denormals() || UnsafeDiv)
+ if (UnsafeDiv)
return false;
IRBuilder<> Builder(FDiv.getParent(), std::next(FDiv.getIterator()), FPMath);
@@ -418,6 +445,7 @@ bool AMDGPUCodeGenPrepare::visitFDiv(BinaryOperator &FDiv) {
Value *NewFDiv = nullptr;
+ bool HasDenormals = ST->hasFP32Denormals();
if (VectorType *VT = dyn_cast<VectorType>(Ty)) {
NewFDiv = UndefValue::get(VT);
@@ -428,7 +456,7 @@ bool AMDGPUCodeGenPrepare::visitFDiv(BinaryOperator &FDiv) {
Value *DenEltI = Builder.CreateExtractElement(Den, I);
Value *NewElt;
- if (shouldKeepFDivF32(NumEltI, UnsafeDiv)) {
+ if (shouldKeepFDivF32(NumEltI, UnsafeDiv, HasDenormals)) {
NewElt = Builder.CreateFDiv(NumEltI, DenEltI);
} else {
NewElt = Builder.CreateCall(Decl, { NumEltI, DenEltI });
@@ -437,7 +465,7 @@ bool AMDGPUCodeGenPrepare::visitFDiv(BinaryOperator &FDiv) {
NewFDiv = Builder.CreateInsertElement(NewFDiv, NewElt, I);
}
} else {
- if (!shouldKeepFDivF32(Num, UnsafeDiv))
+ if (!shouldKeepFDivF32(Num, UnsafeDiv, HasDenormals))
NewFDiv = Builder.CreateCall(Decl, { Num, Den });
}
@@ -447,7 +475,7 @@ bool AMDGPUCodeGenPrepare::visitFDiv(BinaryOperator &FDiv) {
FDiv.eraseFromParent();
}
- return true;
+ return !!NewFDiv;
}
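The rewritten shouldKeepFDivF32 now boils down to a small decision table, where true means the plain fdiv is kept instead of being replaced by llvm.amdgcn.fdiv.fast. A mechanical restatement, for readability only:

    // Mirrors the logic above; matches the `HasDenormals ^ IsOne` expression.
    static bool keepFDivF32(bool NumIsConstFP, bool NumIsPlusOrMinusOne,
                            bool UnsafeDiv, bool HasFP32Denormals) {
      if (!NumIsConstFP)
        return HasFP32Denormals;
      if (UnsafeDiv)
        return true;
      return HasFP32Denormals != NumIsPlusOrMinusOne;
    }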
static bool hasUnsafeFPMath(const Function &F) {
@@ -455,18 +483,324 @@ static bool hasUnsafeFPMath(const Function &F) {
return Attr.getValueAsString() == "true";
}
+static std::pair<Value*, Value*> getMul64(IRBuilder<> &Builder,
+ Value *LHS, Value *RHS) {
+ Type *I32Ty = Builder.getInt32Ty();
+ Type *I64Ty = Builder.getInt64Ty();
+
+ Value *LHS_EXT64 = Builder.CreateZExt(LHS, I64Ty);
+ Value *RHS_EXT64 = Builder.CreateZExt(RHS, I64Ty);
+ Value *MUL64 = Builder.CreateMul(LHS_EXT64, RHS_EXT64);
+ Value *Lo = Builder.CreateTrunc(MUL64, I32Ty);
+ Value *Hi = Builder.CreateLShr(MUL64, Builder.getInt64(32));
+ Hi = Builder.CreateTrunc(Hi, I32Ty);
+ return std::make_pair(Lo, Hi);
+}
+
+static Value* getMulHu(IRBuilder<> &Builder, Value *LHS, Value *RHS) {
+ return getMul64(Builder, LHS, RHS).second;
+}
+
+// The fractional part of a float is enough to accurately represent up to
+// a 24-bit signed integer.
+Value* AMDGPUCodeGenPrepare::expandDivRem24(IRBuilder<> &Builder,
+ BinaryOperator &I,
+ Value *Num, Value *Den,
+ bool IsDiv, bool IsSigned) const {
+ assert(Num->getType()->isIntegerTy(32));
+
+ const DataLayout &DL = Mod->getDataLayout();
+ unsigned LHSSignBits = ComputeNumSignBits(Num, DL, 0, AC, &I);
+ if (LHSSignBits < 9)
+ return nullptr;
+
+ unsigned RHSSignBits = ComputeNumSignBits(Den, DL, 0, AC, &I);
+ if (RHSSignBits < 9)
+ return nullptr;
+
+
+ unsigned SignBits = std::min(LHSSignBits, RHSSignBits);
+ unsigned DivBits = 32 - SignBits;
+ if (IsSigned)
+ ++DivBits;
+
+ Type *Ty = Num->getType();
+ Type *I32Ty = Builder.getInt32Ty();
+ Type *F32Ty = Builder.getFloatTy();
+ ConstantInt *One = Builder.getInt32(1);
+ Value *JQ = One;
+
+ if (IsSigned) {
+ // char|short jq = ia ^ ib;
+ JQ = Builder.CreateXor(Num, Den);
+
+ // jq = jq >> (bitsize - 2)
+ JQ = Builder.CreateAShr(JQ, Builder.getInt32(30));
+
+ // jq = jq | 0x1
+ JQ = Builder.CreateOr(JQ, One);
+ }
+
+ // int ia = (int)LHS;
+ Value *IA = Num;
+
+  // int ib = (int)RHS;
+ Value *IB = Den;
+
+ // float fa = (float)ia;
+ Value *FA = IsSigned ? Builder.CreateSIToFP(IA, F32Ty)
+ : Builder.CreateUIToFP(IA, F32Ty);
+
+ // float fb = (float)ib;
+ Value *FB = IsSigned ? Builder.CreateSIToFP(IB,F32Ty)
+ : Builder.CreateUIToFP(IB,F32Ty);
+
+ Value *RCP = Builder.CreateFDiv(ConstantFP::get(F32Ty, 1.0), FB);
+ Value *FQM = Builder.CreateFMul(FA, RCP);
+
+ // fq = trunc(fqm);
+ CallInst* FQ = Builder.CreateIntrinsic(Intrinsic::trunc, { FQM });
+ FQ->copyFastMathFlags(Builder.getFastMathFlags());
+
+ // float fqneg = -fq;
+ Value *FQNeg = Builder.CreateFNeg(FQ);
+
+ // float fr = mad(fqneg, fb, fa);
+ Value *FR = Builder.CreateIntrinsic(Intrinsic::amdgcn_fmad_ftz,
+ { FQNeg, FB, FA }, FQ);
+
+ // int iq = (int)fq;
+ Value *IQ = IsSigned ? Builder.CreateFPToSI(FQ, I32Ty)
+ : Builder.CreateFPToUI(FQ, I32Ty);
+
+ // fr = fabs(fr);
+ FR = Builder.CreateIntrinsic(Intrinsic::fabs, { FR }, FQ);
+
+ // fb = fabs(fb);
+ FB = Builder.CreateIntrinsic(Intrinsic::fabs, { FB }, FQ);
+
+ // int cv = fr >= fb;
+ Value *CV = Builder.CreateFCmpOGE(FR, FB);
+
+ // jq = (cv ? jq : 0);
+ JQ = Builder.CreateSelect(CV, JQ, Builder.getInt32(0));
+
+ // dst = iq + jq;
+ Value *Div = Builder.CreateAdd(IQ, JQ);
+
+ Value *Res = Div;
+ if (!IsDiv) {
+    // Rem needs compensation; it's easier to recompute it
+ Value *Rem = Builder.CreateMul(Div, Den);
+ Res = Builder.CreateSub(Num, Rem);
+ }
+
+ // Truncate to number of bits this divide really is.
+ if (IsSigned) {
+ Res = Builder.CreateTrunc(Res, Builder.getIntNTy(DivBits));
+ Res = Builder.CreateSExt(Res, Ty);
+ } else {
+ ConstantInt *TruncMask = Builder.getInt32((UINT64_C(1) << DivBits) - 1);
+ Res = Builder.CreateAnd(Res, TruncMask);
+ }
+
+ return Res;
+}
+
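To see the 24-bit path in action, take an unsigned 100 / 7: both operands report well over 9 sign bits as 32-bit values, so they fit in 24 bits and the float significand represents them exactly. A host-side emulation of the sequence, where plain libm calls and a float division stand in for llvm.trunc, llvm.fabs, llvm.amdgcn.fmad.ftz and the rcp the backend will form:

    #include <cmath>
    #include <cstdio>

    // Host emulation of the unsigned 24-bit division path for 100 / 7.
    int main() {
      float FA = 100.0f, FB = 7.0f;
      float FQ = truncf(FA * (1.0f / FB));   // fq = trunc(fa * rcp(fb)) = 14.0
      float FR = fabsf(fmaf(-FQ, FB, FA));   // fr = |mad(-fq, fb, fa)| = 2.0
      unsigned IQ = (unsigned)FQ;            // iq = 14
      if (FR >= fabsf(FB))                   // did rounding leave fq one too low?
        IQ += 1;                             // not taken here: 2 < 7
      unsigned Rem = 100u - IQ * 7u;         // remainder path: 100 - 14 * 7 = 2
      std::printf("q=%u r=%u\n", IQ, Rem);   // prints q=14 r=2
      return 0;
    }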
+Value* AMDGPUCodeGenPrepare::expandDivRem32(IRBuilder<> &Builder,
+ BinaryOperator &I,
+ Value *Num, Value *Den) const {
+ Instruction::BinaryOps Opc = I.getOpcode();
+ assert(Opc == Instruction::URem || Opc == Instruction::UDiv ||
+ Opc == Instruction::SRem || Opc == Instruction::SDiv);
+
+ FastMathFlags FMF;
+ FMF.setFast();
+ Builder.setFastMathFlags(FMF);
+
+ if (isa<Constant>(Den))
+ return nullptr; // Keep it for optimization
+
+ bool IsDiv = Opc == Instruction::UDiv || Opc == Instruction::SDiv;
+ bool IsSigned = Opc == Instruction::SRem || Opc == Instruction::SDiv;
+
+ Type *Ty = Num->getType();
+ Type *I32Ty = Builder.getInt32Ty();
+ Type *F32Ty = Builder.getFloatTy();
+
+ if (Ty->getScalarSizeInBits() < 32) {
+ if (IsSigned) {
+ Num = Builder.CreateSExt(Num, I32Ty);
+ Den = Builder.CreateSExt(Den, I32Ty);
+ } else {
+ Num = Builder.CreateZExt(Num, I32Ty);
+ Den = Builder.CreateZExt(Den, I32Ty);
+ }
+ }
+
+ if (Value *Res = expandDivRem24(Builder, I, Num, Den, IsDiv, IsSigned)) {
+ Res = Builder.CreateTrunc(Res, Ty);
+ return Res;
+ }
+
+ ConstantInt *Zero = Builder.getInt32(0);
+ ConstantInt *One = Builder.getInt32(1);
+ ConstantInt *MinusOne = Builder.getInt32(~0);
+
+ Value *Sign = nullptr;
+ if (IsSigned) {
+ ConstantInt *K31 = Builder.getInt32(31);
+ Value *LHSign = Builder.CreateAShr(Num, K31);
+ Value *RHSign = Builder.CreateAShr(Den, K31);
+ // Remainder sign is the same as LHS
+ Sign = IsDiv ? Builder.CreateXor(LHSign, RHSign) : LHSign;
+
+ Num = Builder.CreateAdd(Num, LHSign);
+ Den = Builder.CreateAdd(Den, RHSign);
+
+ Num = Builder.CreateXor(Num, LHSign);
+ Den = Builder.CreateXor(Den, RHSign);
+ }
+
+ // RCP = URECIP(Den) = 2^32 / Den + e
+ // e is rounding error.
+ Value *DEN_F32 = Builder.CreateUIToFP(Den, F32Ty);
+ Value *RCP_F32 = Builder.CreateFDiv(ConstantFP::get(F32Ty, 1.0), DEN_F32);
+ Constant *UINT_MAX_PLUS_1 = ConstantFP::get(F32Ty, BitsToFloat(0x4f800000));
+ Value *RCP_SCALE = Builder.CreateFMul(RCP_F32, UINT_MAX_PLUS_1);
+ Value *RCP = Builder.CreateFPToUI(RCP_SCALE, I32Ty);
+
+  // RCP_LO, RCP_HI = mul(RCP, Den)
+ Value *RCP_LO, *RCP_HI;
+ std::tie(RCP_LO, RCP_HI) = getMul64(Builder, RCP, Den);
+
+ // NEG_RCP_LO = -RCP_LO
+ Value *NEG_RCP_LO = Builder.CreateNeg(RCP_LO);
+
+ // ABS_RCP_LO = (RCP_HI == 0 ? NEG_RCP_LO : RCP_LO)
+ Value *RCP_HI_0_CC = Builder.CreateICmpEQ(RCP_HI, Zero);
+ Value *ABS_RCP_LO = Builder.CreateSelect(RCP_HI_0_CC, NEG_RCP_LO, RCP_LO);
+
+ // Calculate the rounding error from the URECIP instruction
+ // E = mulhu(ABS_RCP_LO, RCP)
+ Value *E = getMulHu(Builder, ABS_RCP_LO, RCP);
+
+ // RCP_A_E = RCP + E
+ Value *RCP_A_E = Builder.CreateAdd(RCP, E);
+
+ // RCP_S_E = RCP - E
+ Value *RCP_S_E = Builder.CreateSub(RCP, E);
+
+ // Tmp0 = (RCP_HI == 0 ? RCP_A_E : RCP_SUB_E)
+ Value *Tmp0 = Builder.CreateSelect(RCP_HI_0_CC, RCP_A_E, RCP_S_E);
+
+ // Quotient = mulhu(Tmp0, Num)
+ Value *Quotient = getMulHu(Builder, Tmp0, Num);
+
+ // Num_S_Remainder = Quotient * Den
+ Value *Num_S_Remainder = Builder.CreateMul(Quotient, Den);
+
+ // Remainder = Num - Num_S_Remainder
+ Value *Remainder = Builder.CreateSub(Num, Num_S_Remainder);
+
+ // Remainder_GE_Den = (Remainder >= Den ? -1 : 0)
+ Value *Rem_GE_Den_CC = Builder.CreateICmpUGE(Remainder, Den);
+ Value *Remainder_GE_Den = Builder.CreateSelect(Rem_GE_Den_CC, MinusOne, Zero);
+
+ // Remainder_GE_Zero = (Num >= Num_S_Remainder ? -1 : 0)
+ Value *Num_GE_Num_S_Rem_CC = Builder.CreateICmpUGE(Num, Num_S_Remainder);
+ Value *Remainder_GE_Zero = Builder.CreateSelect(Num_GE_Num_S_Rem_CC,
+ MinusOne, Zero);
+
+ // Tmp1 = Remainder_GE_Den & Remainder_GE_Zero
+ Value *Tmp1 = Builder.CreateAnd(Remainder_GE_Den, Remainder_GE_Zero);
+ Value *Tmp1_0_CC = Builder.CreateICmpEQ(Tmp1, Zero);
+
+ Value *Res;
+ if (IsDiv) {
+ // Quotient_A_One = Quotient + 1
+ Value *Quotient_A_One = Builder.CreateAdd(Quotient, One);
+
+ // Quotient_S_One = Quotient - 1
+ Value *Quotient_S_One = Builder.CreateSub(Quotient, One);
+
+ // Div = (Tmp1 == 0 ? Quotient : Quotient_A_One)
+ Value *Div = Builder.CreateSelect(Tmp1_0_CC, Quotient, Quotient_A_One);
+
+ // Div = (Remainder_GE_Zero == 0 ? Quotient_S_One : Div)
+ Res = Builder.CreateSelect(Num_GE_Num_S_Rem_CC, Div, Quotient_S_One);
+ } else {
+ // Remainder_S_Den = Remainder - Den
+ Value *Remainder_S_Den = Builder.CreateSub(Remainder, Den);
+
+ // Remainder_A_Den = Remainder + Den
+ Value *Remainder_A_Den = Builder.CreateAdd(Remainder, Den);
+
+ // Rem = (Tmp1 == 0 ? Remainder : Remainder_S_Den)
+ Value *Rem = Builder.CreateSelect(Tmp1_0_CC, Remainder, Remainder_S_Den);
+
+ // Rem = (Remainder_GE_Zero == 0 ? Remainder_A_Den : Rem)
+ Res = Builder.CreateSelect(Num_GE_Num_S_Rem_CC, Rem, Remainder_A_Den);
+ }
+
+ if (IsSigned) {
+ Res = Builder.CreateXor(Res, Sign);
+ Res = Builder.CreateSub(Res, Sign);
+ }
+
+ Res = Builder.CreateTrunc(Res, Ty);
+
+ return Res;
+}
+
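The 32-bit path is the classic reciprocal-estimate division: RCP approximates 2^32 / Den, mulhu against Num gives a quotient estimate, and the remainder comparisons pick between Quotient - 1, Quotient and Quotient + 1. A compact reference model of the same contract, using exact 64-bit integer arithmetic instead of the float reciprocal and its error compensation (not the emitted sequence):

    #include <cassert>
    #include <cstdint>

    // Reference model: quotient estimate from a floor(2^32 / Den) reciprocal,
    // then at most one +1 fix-up; the estimate is never too large.
    static uint32_t refUDiv32(uint32_t Num, uint32_t Den) {
      assert(Den != 0);
      uint64_t R = (UINT64_C(1) << 32) / Den;              // ~URECIP(Den)
      uint32_t Q = (uint32_t)(((uint64_t)Num * R) >> 32);  // mulhu-style estimate
      if (((uint64_t)Q + 1) * Den <= Num)                  // off by one? fix it up
        ++Q;
      return Q;                                            // rem = Num - Q * Den
    }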
bool AMDGPUCodeGenPrepare::visitBinaryOperator(BinaryOperator &I) {
+ if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) &&
+ DA->isUniform(&I) && promoteUniformOpToI32(I))
+ return true;
+
bool Changed = false;
+ Instruction::BinaryOps Opc = I.getOpcode();
+ Type *Ty = I.getType();
+ Value *NewDiv = nullptr;
+ if ((Opc == Instruction::URem || Opc == Instruction::UDiv ||
+ Opc == Instruction::SRem || Opc == Instruction::SDiv) &&
+ Ty->getScalarSizeInBits() <= 32) {
+ Value *Num = I.getOperand(0);
+ Value *Den = I.getOperand(1);
+ IRBuilder<> Builder(&I);
+ Builder.SetCurrentDebugLocation(I.getDebugLoc());
- if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) &&
- DA->isUniform(&I))
- Changed |= promoteUniformOpToI32(I);
+ if (VectorType *VT = dyn_cast<VectorType>(Ty)) {
+ NewDiv = UndefValue::get(VT);
+
+ for (unsigned N = 0, E = VT->getNumElements(); N != E; ++N) {
+ Value *NumEltN = Builder.CreateExtractElement(Num, N);
+ Value *DenEltN = Builder.CreateExtractElement(Den, N);
+ Value *NewElt = expandDivRem32(Builder, I, NumEltN, DenEltN);
+ if (!NewElt)
+ NewElt = Builder.CreateBinOp(Opc, NumEltN, DenEltN);
+ NewDiv = Builder.CreateInsertElement(NewDiv, NewElt, N);
+ }
+ } else {
+ NewDiv = expandDivRem32(Builder, I, Num, Den);
+ }
+
+ if (NewDiv) {
+ I.replaceAllUsesWith(NewDiv);
+ I.eraseFromParent();
+ Changed = true;
+ }
+ }
return Changed;
}
-bool AMDGPUCodeGenPrepare::visitLoadInst(LoadInst &I) {
- if (I.getPointerAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS &&
+bool AMDGPUCodeGenPrepare::visitLoadInst(LoadInst &I) {
+ if (!WidenLoads)
+ return false;
+
+ if ((I.getPointerAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS ||
+ I.getPointerAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS_32BIT) &&
canWidenScalarExtLoad(I)) {
IRBuilder<> Builder(&I);
Builder.SetCurrentDebugLocation(I.getDebugLoc());
@@ -474,7 +808,28 @@ bool AMDGPUCodeGenPrepare::visitLoadInst(LoadInst &I) {
Type *I32Ty = Builder.getInt32Ty();
Type *PT = PointerType::get(I32Ty, I.getPointerAddressSpace());
Value *BitCast= Builder.CreateBitCast(I.getPointerOperand(), PT);
- Value *WidenLoad = Builder.CreateLoad(BitCast);
+ LoadInst *WidenLoad = Builder.CreateLoad(BitCast);
+ WidenLoad->copyMetadata(I);
+
+ // If we have range metadata, we need to convert the type, and not make
+ // assumptions about the high bits.
+ if (auto *Range = WidenLoad->getMetadata(LLVMContext::MD_range)) {
+ ConstantInt *Lower =
+ mdconst::extract<ConstantInt>(Range->getOperand(0));
+
+ if (Lower->getValue().isNullValue()) {
+ WidenLoad->setMetadata(LLVMContext::MD_range, nullptr);
+ } else {
+ Metadata *LowAndHigh[] = {
+ ConstantAsMetadata::get(ConstantInt::get(I32Ty, Lower->getValue().zext(32))),
+ // Don't make assumptions about the high bits.
+ ConstantAsMetadata::get(ConstantInt::get(I32Ty, 0))
+ };
+
+ WidenLoad->setMetadata(LLVMContext::MD_range,
+ MDNode::get(Mod->getContext(), LowAndHigh));
+ }
+ }
int TySize = Mod->getDataLayout().getTypeSizeInBits(I.getType());
Type *IntNTy = Builder.getIntNTy(TySize);
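The !range handling above can be read as keeping only what the original annotation still guarantees about the low bits after widening: an i16 load annotated !range !{i16 2, i16 100} becomes an i32 load annotated !range !{i32 2, i32 0}, a wrapped range that only promises the loaded 32-bit value is not below 2 and says nothing about the high half; a lower bound of 0 carries no information once the load is widened, so in that case the metadata is simply dropped.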
@@ -540,10 +895,12 @@ bool AMDGPUCodeGenPrepare::runOnFunction(Function &F) {
if (!TPC)
return false;
- const TargetMachine &TM = TPC->getTM<TargetMachine>();
- ST = &TM.getSubtarget<SISubtarget>(F);
+ const AMDGPUTargetMachine &TM = TPC->getTM<AMDGPUTargetMachine>();
+ ST = &TM.getSubtarget<GCNSubtarget>(F);
+ AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
DA = &getAnalysis<DivergenceAnalysis>();
HasUnsafeFPMath = hasUnsafeFPMath(F);
+ AMDGPUASI = TM.getAMDGPUAS();
bool MadeChange = false;
@@ -560,6 +917,7 @@ bool AMDGPUCodeGenPrepare::runOnFunction(Function &F) {
INITIALIZE_PASS_BEGIN(AMDGPUCodeGenPrepare, DEBUG_TYPE,
"AMDGPU IR optimizations", false, false)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(DivergenceAnalysis)
INITIALIZE_PASS_END(AMDGPUCodeGenPrepare, DEBUG_TYPE, "AMDGPU IR optimizations",
false, false)
diff --git a/lib/Target/AMDGPU/AMDGPUFeatures.td b/lib/Target/AMDGPU/AMDGPUFeatures.td
new file mode 100644
index 000000000000..b375cae9018e
--- /dev/null
+++ b/lib/Target/AMDGPU/AMDGPUFeatures.td
@@ -0,0 +1,60 @@
+//===-- AMDGPUFeatures.td - AMDGPU Feature Definitions -----*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+def FeatureFP64 : SubtargetFeature<"fp64",
+ "FP64",
+ "true",
+ "Enable double precision operations"
+>;
+
+def FeatureFMA : SubtargetFeature<"fmaf",
+ "FMA",
+ "true",
+ "Enable single precision FMA (not as fast as mul+add, but fused)"
+>;
+
+class SubtargetFeatureLocalMemorySize <int Value> : SubtargetFeature<
+ "localmemorysize"#Value,
+ "LocalMemorySize",
+ !cast<string>(Value),
+ "The size of local memory in bytes"
+>;
+
+def FeatureLocalMemorySize0 : SubtargetFeatureLocalMemorySize<0>;
+def FeatureLocalMemorySize32768 : SubtargetFeatureLocalMemorySize<32768>;
+def FeatureLocalMemorySize65536 : SubtargetFeatureLocalMemorySize<65536>;
+
+class SubtargetFeatureWavefrontSize <int Value> : SubtargetFeature<
+ "wavefrontsize"#Value,
+ "WavefrontSize",
+ !cast<string>(Value),
+ "The number of threads per wavefront"
+>;
+
+def FeatureWavefrontSize16 : SubtargetFeatureWavefrontSize<16>;
+def FeatureWavefrontSize32 : SubtargetFeatureWavefrontSize<32>;
+def FeatureWavefrontSize64 : SubtargetFeatureWavefrontSize<64>;
+
+class SubtargetFeatureGeneration <string Value, string Subtarget,
+ list<SubtargetFeature> Implies> :
+ SubtargetFeature <Value, "Gen", Subtarget#"::"#Value,
+ Value#" GPU generation", Implies>;
+
+def FeatureDX10Clamp : SubtargetFeature<"dx10-clamp",
+ "DX10Clamp",
+ "true",
+ "clamp modifier clamps NaNs to 0.0"
+>;
+
+def FeaturePromoteAlloca : SubtargetFeature <"promote-alloca",
+ "EnablePromoteAlloca",
+ "true",
+ "Enable promote alloca pass"
+>;
+
diff --git a/lib/Target/AMDGPU/AMDGPUFrameLowering.h b/lib/Target/AMDGPU/AMDGPUFrameLowering.h
index 91fe921bfeec..ee836bf8a631 100644
--- a/lib/Target/AMDGPU/AMDGPUFrameLowering.h
+++ b/lib/Target/AMDGPU/AMDGPUFrameLowering.h
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
//
/// \file
-/// \brief Interface to describe a layout of a stack frame on an AMDGPU target.
+/// Interface to describe a layout of a stack frame on an AMDGPU target.
//
//===----------------------------------------------------------------------===//
@@ -19,7 +19,7 @@
namespace llvm {
-/// \brief Information about the stack frame layout on the AMDGPU targets.
+/// Information about the stack frame layout on the AMDGPU targets.
///
/// It holds the direction of the stack growth, the known stack alignment on
/// entry to each function, and the offset to the locals area.
diff --git a/lib/Target/AMDGPU/AMDGPUGISel.td b/lib/Target/AMDGPU/AMDGPUGISel.td
new file mode 100644
index 000000000000..ba735390f679
--- /dev/null
+++ b/lib/Target/AMDGPU/AMDGPUGISel.td
@@ -0,0 +1,138 @@
+//===-- AMDGPUGIsel.td - AMDGPU GlobalISel Patterns---------*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// This file contains patterns that should only be used by GlobalISel. For
+// example patterns for V_* instructions that have S_* equivalents.
+// SelectionDAG does not support selecting V_* instructions.
+//===----------------------------------------------------------------------===//
+
+include "AMDGPU.td"
+
+def sd_vsrc0 : ComplexPattern<i32, 1, "">;
+def gi_vsrc0 :
+ GIComplexOperandMatcher<s32, "selectVSRC0">,
+ GIComplexPatternEquiv<sd_vsrc0>;
+
+def sd_vcsrc : ComplexPattern<i32, 1, "">;
+def gi_vcsrc :
+ GIComplexOperandMatcher<s32, "selectVCSRC">,
+ GIComplexPatternEquiv<sd_vcsrc>;
+
+def gi_vop3mods0 :
+ GIComplexOperandMatcher<s32, "selectVOP3Mods0">,
+ GIComplexPatternEquiv<VOP3Mods0>;
+
+def gi_vop3mods :
+ GIComplexOperandMatcher<s32, "selectVOP3Mods">,
+ GIComplexPatternEquiv<VOP3Mods>;
+
+def gi_vop3omods :
+ GIComplexOperandMatcher<s32, "selectVOP3OMods">,
+ GIComplexPatternEquiv<VOP3OMods>;
+
+class GISelSop2Pat <
+ SDPatternOperator node,
+ Instruction inst,
+ ValueType dst_vt,
+ ValueType src0_vt = dst_vt, ValueType src1_vt = src0_vt> : GCNPat <
+
+ (dst_vt (node (src0_vt SReg_32:$src0), (src1_vt SReg_32:$src1))),
+ (inst src0_vt:$src0, src1_vt:$src1)
+>;
+
+class GISelVop2Pat <
+ SDPatternOperator node,
+ Instruction inst,
+ ValueType dst_vt,
+ ValueType src0_vt = dst_vt, ValueType src1_vt = src0_vt> : GCNPat <
+
+ (dst_vt (node (src0_vt (sd_vsrc0 src0_vt:$src0)), (src1_vt VGPR_32:$src1))),
+ (inst src0_vt:$src0, src1_vt:$src1)
+>;
+
+class GISelVop2CommutePat <
+ SDPatternOperator node,
+ Instruction inst,
+ ValueType dst_vt,
+ ValueType src0_vt = dst_vt, ValueType src1_vt = src0_vt> : GCNPat <
+
+ (dst_vt (node (src1_vt VGPR_32:$src1), (src0_vt (sd_vsrc0 src0_vt:$src0)))),
+ (inst src0_vt:$src0, src1_vt:$src1)
+>;
+
+class GISelVop3Pat2 <
+ SDPatternOperator node,
+ Instruction inst,
+ ValueType dst_vt,
+ ValueType src0_vt = dst_vt, ValueType src1_vt = src0_vt> : GCNPat <
+
+ (dst_vt (node (src0_vt (sd_vcsrc src0_vt:$src0)), (src1_vt (sd_vcsrc src1_vt:$src1)))),
+ (inst src0_vt:$src0, src1_vt:$src1)
+>;
+
+class GISelVop3Pat2CommutePat <
+ SDPatternOperator node,
+ Instruction inst,
+ ValueType dst_vt,
+ ValueType src0_vt = dst_vt, ValueType src1_vt = src0_vt> : GCNPat <
+
+ (dst_vt (node (src0_vt (sd_vcsrc src0_vt:$src0)), (src1_vt (sd_vcsrc src1_vt:$src1)))),
+ (inst src0_vt:$src1, src1_vt:$src0)
+>;
+
+class GISelVop3Pat2ModsPat <
+ SDPatternOperator node,
+ Instruction inst,
+ ValueType dst_vt,
+ ValueType src0_vt = dst_vt, ValueType src1_vt = src0_vt> : GCNPat <
+
+ (dst_vt (node (src0_vt (VOP3Mods0 src0_vt:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omods)),
+ (src1_vt (VOP3Mods src1_vt:$src1, i32:$src1_modifiers)))),
+ (inst i32:$src0_modifiers, src0_vt:$src0,
+ i32:$src1_modifiers, src1_vt:$src1, $clamp, $omods)
+>;
+
+multiclass GISelVop2IntrPat <
+ SDPatternOperator node, Instruction inst,
+ ValueType dst_vt, ValueType src_vt = dst_vt> {
+
+ def : GISelVop2Pat <node, inst, dst_vt, src_vt>;
+
+  // FIXME: Intrinsics aren't marked as commutable, so we need to add an explicit
+  // pattern to handle commuting. This is another reason why legalizing to a
+  // generic machine instruction may be better than matching the intrinsic
+ // directly.
+ def : GISelVop2CommutePat <node, inst, dst_vt, src_vt>;
+}
+
+def : GISelSop2Pat <or, S_OR_B32, i32>;
+def : GISelVop2Pat <or, V_OR_B32_e32, i32>;
+
+def : GISelSop2Pat <sra, S_ASHR_I32, i32>;
+let AddedComplexity = 100 in {
+let SubtargetPredicate = isSICI in {
+def : GISelVop2Pat <sra, V_ASHR_I32_e32, i32>;
+}
+def : GISelVop2CommutePat <sra, V_ASHRREV_I32_e32, i32>;
+}
+def : GISelVop3Pat2CommutePat <sra, V_ASHRREV_I32_e64, i32>;
+
+// FIXME: Select directly to _e32 so we don't need to deal with modifiers.
+// FIXME: We can't re-use SelectionDAG patterns here because they match
+// against a custom SDNode and we would need to create a generic machine
+// instruction that is equivalent to the custom SDNode. This would also require
+// us to custom legalize the intrinsic to the new generic machine instruction,
+// but I can't get custom legalizing of intrinsics to work and I'm not sure if
+// this is even supported yet.
+defm : GISelVop2IntrPat <
+ int_amdgcn_cvt_pkrtz, V_CVT_PKRTZ_F16_F32_e32, v2f16, f32>;
+
+defm : GISelVop2IntrPat <int_maxnum, V_MAX_F32_e32, f32>;
+def : GISelVop3Pat2ModsPat <int_maxnum, V_MAX_F64, f64>;
+defm : GISelVop2IntrPat <int_minnum, V_MIN_F32_e32, f32>;
+def : GISelVop3Pat2ModsPat <int_minnum, V_MIN_F64, f64>;
diff --git a/lib/Target/AMDGPU/AMDGPUGenRegisterBankInfo.def b/lib/Target/AMDGPU/AMDGPUGenRegisterBankInfo.def
index bf7deb500d1a..3a58c6c6a29f 100644
--- a/lib/Target/AMDGPU/AMDGPUGenRegisterBankInfo.def
+++ b/lib/Target/AMDGPU/AMDGPUGenRegisterBankInfo.def
@@ -16,41 +16,89 @@ namespace AMDGPU {
enum PartialMappingIdx {
None = - 1,
- PM_SGPR32 = 0,
- PM_SGPR64 = 1,
- PM_VGPR32 = 2,
- PM_VGPR64 = 3
+ PM_SGPR1 = 0,
+ PM_SGPR16 = 4,
+ PM_SGPR32 = 5,
+ PM_SGPR64 = 6,
+ PM_SGPR128 = 7,
+ PM_SGPR256 = 8,
+ PM_SGPR512 = 9,
+ PM_VGPR1 = 10,
+ PM_VGPR16 = 14,
+ PM_VGPR32 = 15,
+ PM_VGPR64 = 16,
+ PM_VGPR128 = 17,
+ PM_VGPR256 = 18,
+ PM_VGPR512 = 19,
+ PM_SGPR96 = 20,
+ PM_VGPR96 = 21
};
const RegisterBankInfo::PartialMapping PartMappings[] {
// StartIdx, Length, RegBank
+ {0, 1, SCCRegBank},
+ {0, 16, SGPRRegBank},
{0, 32, SGPRRegBank},
{0, 64, SGPRRegBank},
+ {0, 128, SGPRRegBank},
+ {0, 256, SGPRRegBank},
+ {0, 512, SGPRRegBank},
+ {0, 1, SGPRRegBank},
+ {0, 16, VGPRRegBank},
{0, 32, VGPRRegBank},
- {0, 64, VGPRRegBank}
+ {0, 64, VGPRRegBank},
+ {0, 128, VGPRRegBank},
+ {0, 256, VGPRRegBank},
+ {0, 512, VGPRRegBank},
+ {0, 96, SGPRRegBank},
+ {0, 96, VGPRRegBank},
};
const RegisterBankInfo::ValueMapping ValMappings[] {
- // SGPR 32-bit
{&PartMappings[0], 1},
- // SGPR 64-bit
+ {nullptr, 0},
+ {nullptr, 0},
+ {nullptr, 0},
{&PartMappings[1], 1},
- // VGPR 32-bit
{&PartMappings[2], 1},
- // VGPR 64-bit
- {&PartMappings[3], 1}
+ {&PartMappings[3], 1},
+ {&PartMappings[4], 1},
+ {&PartMappings[5], 1},
+ {&PartMappings[6], 1},
+ {&PartMappings[7], 1},
+ {nullptr, 0},
+ {nullptr, 0},
+ {nullptr, 0},
+ {&PartMappings[8], 1},
+ {&PartMappings[9], 1},
+ {&PartMappings[10], 1},
+ {&PartMappings[11], 1},
+ {&PartMappings[12], 1},
+ {&PartMappings[13], 1},
+ {&PartMappings[14], 1},
+ {&PartMappings[15], 1}
};
enum ValueMappingIdx {
SGPRStartIdx = 0,
- VGPRStartIdx = 2
+ VGPRStartIdx = 10
};
const RegisterBankInfo::ValueMapping *getValueMapping(unsigned BankID,
unsigned Size) {
- assert(Size % 32 == 0);
- unsigned Idx = BankID == AMDGPU::SGPRRegBankID ? SGPRStartIdx : VGPRStartIdx;
- Idx += (Size / 32) - 1;
+ unsigned Idx;
+ switch (Size) {
+ case 1:
+ Idx = BankID == AMDGPU::SCCRegBankID ? PM_SGPR1 : PM_VGPR1;
+ break;
+ case 96:
+ Idx = BankID == AMDGPU::SGPRRegBankID ? PM_SGPR96 : PM_VGPR96;
+ break;
+ default:
+ Idx = BankID == AMDGPU::VGPRRegBankID ? VGPRStartIdx : SGPRStartIdx;
+ Idx += Log2_32_Ceil(Size);
+ break;
+ }
return &ValMappings[Idx];
}
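
The widened getValueMapping above folds the size-to-index lookup into Log2_32_Ceil arithmetic over the new PartialMappingIdx layout: sizes 16 through 512 land on consecutive table slots starting at the bank's base index, with 1-bit and 96-bit values special-cased. Below is a minimal standalone sketch of that index math, not part of the patch; log2Ceil and mappingIndex are illustrative stand-ins for llvm::Log2_32_Ceil and the real lookup.

    // Standalone sketch of the table-index computation used by getValueMapping().
    #include <cassert>
    #include <cstdio>

    static unsigned log2Ceil(unsigned V) {
      unsigned L = 0;
      while ((1u << L) < V)
        ++L;
      return L;
    }

    enum { SGPRStartIdx = 0, VGPRStartIdx = 10 }; // values from the .def above

    static unsigned mappingIndex(bool IsVGPR, unsigned SizeInBits) {
      // 1-bit and 96-bit values are special-cased in the real code; this sketch
      // only covers the power-of-two sizes handled by the default case.
      assert(SizeInBits >= 16 && SizeInBits <= 512);
      return (IsVGPR ? VGPRStartIdx : SGPRStartIdx) + log2Ceil(SizeInBits);
    }

    int main() {
      std::printf("SGPR32 -> %u\n", mappingIndex(false, 32)); // 5  == PM_SGPR32
      std::printf("VGPR64 -> %u\n", mappingIndex(true, 64));  // 16 == PM_VGPR64
      return 0;
    }

Keeping the table dense this way avoids a per-size switch for every power-of-two width.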
diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUHSAMetadataStreamer.cpp b/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
index 463e700f13b7..01ef346f74ee 100644
--- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUHSAMetadataStreamer.cpp
+++ b/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
@@ -8,13 +8,17 @@
//===----------------------------------------------------------------------===//
//
/// \file
-/// \brief AMDGPU HSA Metadata Streamer.
+/// AMDGPU HSA Metadata Streamer.
///
//
//===----------------------------------------------------------------------===//
#include "AMDGPUHSAMetadataStreamer.h"
#include "AMDGPU.h"
+#include "AMDGPUSubtarget.h"
+#include "SIMachineFunctionInfo.h"
+#include "SIProgramInfo.h"
+#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Module.h"
@@ -196,6 +200,57 @@ std::vector<uint32_t> MetadataStreamer::getWorkGroupDimensions(
return Dims;
}
+Kernel::CodeProps::Metadata MetadataStreamer::getHSACodeProps(
+ const MachineFunction &MF,
+ const SIProgramInfo &ProgramInfo) const {
+ const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
+ const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
+ HSAMD::Kernel::CodeProps::Metadata HSACodeProps;
+ const Function &F = MF.getFunction();
+
+ assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
+ F.getCallingConv() == CallingConv::SPIR_KERNEL);
+
+ unsigned MaxKernArgAlign;
+ HSACodeProps.mKernargSegmentSize = STM.getKernArgSegmentSize(F,
+ MaxKernArgAlign);
+ HSACodeProps.mGroupSegmentFixedSize = ProgramInfo.LDSSize;
+ HSACodeProps.mPrivateSegmentFixedSize = ProgramInfo.ScratchSize;
+ HSACodeProps.mKernargSegmentAlign = std::max(MaxKernArgAlign, 4u);
+ HSACodeProps.mWavefrontSize = STM.getWavefrontSize();
+ HSACodeProps.mNumSGPRs = ProgramInfo.NumSGPR;
+ HSACodeProps.mNumVGPRs = ProgramInfo.NumVGPR;
+ HSACodeProps.mMaxFlatWorkGroupSize = MFI.getMaxFlatWorkGroupSize();
+ HSACodeProps.mIsDynamicCallStack = ProgramInfo.DynamicCallStack;
+ HSACodeProps.mIsXNACKEnabled = STM.isXNACKEnabled();
+ HSACodeProps.mNumSpilledSGPRs = MFI.getNumSpilledSGPRs();
+ HSACodeProps.mNumSpilledVGPRs = MFI.getNumSpilledVGPRs();
+
+ return HSACodeProps;
+}
+
+Kernel::DebugProps::Metadata MetadataStreamer::getHSADebugProps(
+ const MachineFunction &MF,
+ const SIProgramInfo &ProgramInfo) const {
+ const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
+ HSAMD::Kernel::DebugProps::Metadata HSADebugProps;
+
+ if (!STM.debuggerSupported())
+ return HSADebugProps;
+
+ HSADebugProps.mDebuggerABIVersion.push_back(1);
+ HSADebugProps.mDebuggerABIVersion.push_back(0);
+
+ if (STM.debuggerEmitPrologue()) {
+ HSADebugProps.mPrivateSegmentBufferSGPR =
+ ProgramInfo.DebuggerPrivateSegmentBufferSGPR;
+ HSADebugProps.mWavefrontPrivateSegmentOffsetSGPR =
+ ProgramInfo.DebuggerWavefrontPrivateSegmentOffsetSGPR;
+ }
+
+ return HSADebugProps;
+}
+
void MetadataStreamer::emitVersion() {
auto &Version = HSAMetadata.mVersion;
@@ -255,32 +310,7 @@ void MetadataStreamer::emitKernelArgs(const Function &Func) {
for (auto &Arg : Func.args())
emitKernelArg(Arg);
- // TODO: What about other languages?
- if (!Func.getParent()->getNamedMetadata("opencl.ocl.version"))
- return;
-
- auto &DL = Func.getParent()->getDataLayout();
- auto Int64Ty = Type::getInt64Ty(Func.getContext());
-
- emitKernelArg(DL, Int64Ty, ValueKind::HiddenGlobalOffsetX);
- emitKernelArg(DL, Int64Ty, ValueKind::HiddenGlobalOffsetY);
- emitKernelArg(DL, Int64Ty, ValueKind::HiddenGlobalOffsetZ);
-
- auto Int8PtrTy = Type::getInt8PtrTy(Func.getContext(),
- AMDGPUASI.GLOBAL_ADDRESS);
- auto CallsPrintf = Func.getParent()->getNamedMetadata("llvm.printf.fmts");
- if (CallsPrintf)
- emitKernelArg(DL, Int8PtrTy, ValueKind::HiddenPrintfBuffer);
- if (Func.hasFnAttribute("calls-enqueue-kernel")) {
- if (!CallsPrintf) {
- // Emit a dummy argument so that the remaining hidden arguments
- // have a fixed position relative to the first hidden argument.
- // This is to facilitate library code to access hidden arguments.
- emitKernelArg(DL, Int8PtrTy, ValueKind::HiddenNone);
- }
- emitKernelArg(DL, Int8PtrTy, ValueKind::HiddenDefaultQueue);
- emitKernelArg(DL, Int8PtrTy, ValueKind::HiddenCompletionAction);
- }
+ emitHiddenKernelArgs(Func);
}
void MetadataStreamer::emitKernelArg(const Argument &Arg) {
@@ -320,13 +350,26 @@ void MetadataStreamer::emitKernelArg(const Argument &Arg) {
if (Node && ArgNo < Node->getNumOperands())
TypeQual = cast<MDString>(Node->getOperand(ArgNo))->getString();
- emitKernelArg(Func->getParent()->getDataLayout(), Arg.getType(),
- getValueKind(Arg.getType(), TypeQual, BaseTypeName), Name,
- TypeName, BaseTypeName, AccQual, TypeQual);
+ Type *Ty = Arg.getType();
+ const DataLayout &DL = Func->getParent()->getDataLayout();
+
+ unsigned PointeeAlign = 0;
+ if (auto PtrTy = dyn_cast<PointerType>(Ty)) {
+ if (PtrTy->getAddressSpace() == AMDGPUASI.LOCAL_ADDRESS) {
+ PointeeAlign = Arg.getParamAlignment();
+ if (PointeeAlign == 0)
+ PointeeAlign = DL.getABITypeAlignment(PtrTy->getElementType());
+ }
+ }
+
+ emitKernelArg(DL, Ty, getValueKind(Arg.getType(), TypeQual, BaseTypeName),
+ PointeeAlign, Name, TypeName, BaseTypeName, AccQual, TypeQual);
}
void MetadataStreamer::emitKernelArg(const DataLayout &DL, Type *Ty,
- ValueKind ValueKind, StringRef Name,
+ ValueKind ValueKind,
+ unsigned PointeeAlign,
+ StringRef Name,
StringRef TypeName, StringRef BaseTypeName,
StringRef AccQual, StringRef TypeQual) {
HSAMetadata.mKernels.back().mArgs.push_back(Kernel::Arg::Metadata());
@@ -338,12 +381,7 @@ void MetadataStreamer::emitKernelArg(const DataLayout &DL, Type *Ty,
Arg.mAlign = DL.getABITypeAlignment(Ty);
Arg.mValueKind = ValueKind;
Arg.mValueType = getValueType(Ty, BaseTypeName);
-
- if (auto PtrTy = dyn_cast<PointerType>(Ty)) {
- auto ElTy = PtrTy->getElementType();
- if (PtrTy->getAddressSpace() == AMDGPUASI.LOCAL_ADDRESS && ElTy->isSized())
- Arg.mPointeeAlign = DL.getABITypeAlignment(ElTy);
- }
+ Arg.mPointeeAlign = PointeeAlign;
if (auto PtrTy = dyn_cast<PointerType>(Ty))
Arg.mAddrSpaceQual = getAddressSpaceQualifer(PtrTy->getAddressSpace());
@@ -366,6 +404,48 @@ void MetadataStreamer::emitKernelArg(const DataLayout &DL, Type *Ty,
}
}
+void MetadataStreamer::emitHiddenKernelArgs(const Function &Func) {
+ int HiddenArgNumBytes =
+ getIntegerAttribute(Func, "amdgpu-implicitarg-num-bytes", 0);
+
+ if (!HiddenArgNumBytes)
+ return;
+
+ auto &DL = Func.getParent()->getDataLayout();
+ auto Int64Ty = Type::getInt64Ty(Func.getContext());
+
+ if (HiddenArgNumBytes >= 8)
+ emitKernelArg(DL, Int64Ty, ValueKind::HiddenGlobalOffsetX);
+ if (HiddenArgNumBytes >= 16)
+ emitKernelArg(DL, Int64Ty, ValueKind::HiddenGlobalOffsetY);
+ if (HiddenArgNumBytes >= 24)
+ emitKernelArg(DL, Int64Ty, ValueKind::HiddenGlobalOffsetZ);
+
+ auto Int8PtrTy = Type::getInt8PtrTy(Func.getContext(),
+ AMDGPUASI.GLOBAL_ADDRESS);
+
+ // Emit "printf buffer" argument if printf is used, otherwise emit dummy
+ // "none" argument.
+ if (HiddenArgNumBytes >= 32) {
+ if (Func.getParent()->getNamedMetadata("llvm.printf.fmts"))
+ emitKernelArg(DL, Int8PtrTy, ValueKind::HiddenPrintfBuffer);
+ else
+ emitKernelArg(DL, Int8PtrTy, ValueKind::HiddenNone);
+ }
+
+ // Emit "default queue" and "completion action" arguments if enqueue kernel is
+ // used, otherwise emit dummy "none" arguments.
+ if (HiddenArgNumBytes >= 48) {
+ if (Func.hasFnAttribute("calls-enqueue-kernel")) {
+ emitKernelArg(DL, Int8PtrTy, ValueKind::HiddenDefaultQueue);
+ emitKernelArg(DL, Int8PtrTy, ValueKind::HiddenCompletionAction);
+ } else {
+ emitKernelArg(DL, Int8PtrTy, ValueKind::HiddenNone);
+ emitKernelArg(DL, Int8PtrTy, ValueKind::HiddenNone);
+ }
+ }
+}
+
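
The thresholds in emitHiddenKernelArgs follow the fixed layout of the implicit kernel-argument block: three i64 global offsets in bytes 0-23, a printf-buffer pointer slot in bytes 24-31, and default-queue plus completion-action pointer slots in bytes 32-47, with "none" placeholders keeping later slots at stable offsets. The following is a minimal standalone sketch of that selection logic; the HiddenArg enum and hiddenArgs helper are illustrative names, and the real code emits metadata through emitKernelArg rather than returning a list.

    // Standalone sketch of the hidden-argument selection in emitHiddenKernelArgs().
    #include <cstdio>
    #include <vector>

    enum class HiddenArg {
      GlobalOffsetX, GlobalOffsetY, GlobalOffsetZ,
      PrintfBuffer, DefaultQueue, CompletionAction, None
    };

    static std::vector<HiddenArg> hiddenArgs(int HiddenArgNumBytes, bool UsesPrintf,
                                             bool CallsEnqueueKernel) {
      std::vector<HiddenArg> Args;
      // Three i64 global offsets occupy bytes 0-23.
      if (HiddenArgNumBytes >= 8)  Args.push_back(HiddenArg::GlobalOffsetX);
      if (HiddenArgNumBytes >= 16) Args.push_back(HiddenArg::GlobalOffsetY);
      if (HiddenArgNumBytes >= 24) Args.push_back(HiddenArg::GlobalOffsetZ);
      // Printf buffer pointer (or a placeholder) occupies bytes 24-31.
      if (HiddenArgNumBytes >= 32)
        Args.push_back(UsesPrintf ? HiddenArg::PrintfBuffer : HiddenArg::None);
      // Default queue and completion action (or placeholders) occupy bytes 32-47.
      if (HiddenArgNumBytes >= 48) {
        if (CallsEnqueueKernel) {
          Args.push_back(HiddenArg::DefaultQueue);
          Args.push_back(HiddenArg::CompletionAction);
        } else {
          Args.push_back(HiddenArg::None);
          Args.push_back(HiddenArg::None);
        }
      }
      return Args;
    }

    int main() {
      // A 48-byte implicit block for a kernel that uses printf but not enqueue.
      auto Args = hiddenArgs(48, /*UsesPrintf=*/true, /*CallsEnqueueKernel=*/false);
      std::printf("emitted %zu hidden arguments\n", Args.size()); // 6
      return 0;
    }
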
void MetadataStreamer::begin(const Module &Mod) {
AMDGPUASI = getAMDGPUAS(Mod);
emitVersion();
@@ -383,13 +463,14 @@ void MetadataStreamer::end() {
verify(HSAMetadataString);
}
-void MetadataStreamer::emitKernel(
- const Function &Func,
- const Kernel::CodeProps::Metadata &CodeProps,
- const Kernel::DebugProps::Metadata &DebugProps) {
+void MetadataStreamer::emitKernel(const MachineFunction &MF,
+                                  const SIProgramInfo &ProgramInfo) {
+ auto &Func = MF.getFunction();
if (Func.getCallingConv() != CallingConv::AMDGPU_KERNEL)
return;
+ auto CodeProps = getHSACodeProps(MF, ProgramInfo);
+ auto DebugProps = getHSADebugProps(MF, ProgramInfo);
+
HSAMetadata.mKernels.push_back(Kernel::Metadata());
auto &Kernel = HSAMetadata.mKernels.back();
diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUHSAMetadataStreamer.h b/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h
index bd6515521a74..3424c956d781 100644
--- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUHSAMetadataStreamer.h
+++ b/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
//
/// \file
-/// \brief AMDGPU HSA Metadata Streamer.
+/// AMDGPU HSA Metadata Streamer.
///
//
//===----------------------------------------------------------------------===//
@@ -28,6 +28,7 @@ class DataLayout;
class Function;
class MDNode;
class Module;
+struct SIProgramInfo;
class Type;
namespace AMDGPU {
@@ -55,6 +56,13 @@ private:
std::vector<uint32_t> getWorkGroupDimensions(MDNode *Node) const;
+ Kernel::CodeProps::Metadata getHSACodeProps(
+ const MachineFunction &MF,
+ const SIProgramInfo &ProgramInfo) const;
+ Kernel::DebugProps::Metadata getHSADebugProps(
+ const MachineFunction &MF,
+ const SIProgramInfo &ProgramInfo) const;
+
void emitVersion();
void emitPrintf(const Module &Mod);
@@ -68,10 +76,13 @@ private:
void emitKernelArg(const Argument &Arg);
void emitKernelArg(const DataLayout &DL, Type *Ty, ValueKind ValueKind,
+ unsigned PointeeAlign = 0,
StringRef Name = "", StringRef TypeName = "",
StringRef BaseTypeName = "", StringRef AccQual = "",
StringRef TypeQual = "");
+ void emitHiddenKernelArgs(const Function &Func);
+
public:
MetadataStreamer() = default;
~MetadataStreamer() = default;
@@ -84,9 +95,7 @@ public:
void end();
- void emitKernel(const Function &Func,
- const Kernel::CodeProps::Metadata &CodeProps,
- const Kernel::DebugProps::Metadata &DebugProps);
+ void emitKernel(const MachineFunction &MF, const SIProgramInfo &ProgramInfo);
};
} // end namespace HSAMD
diff --git a/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index f4776adb069c..f25f4d4693ea 100644
--- a/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -8,7 +8,7 @@
//==-----------------------------------------------------------------------===//
//
/// \file
-/// \brief Defines an instruction selector for the AMDGPU target.
+/// Defines an instruction selector for the AMDGPU target.
//
//===----------------------------------------------------------------------===//
@@ -16,6 +16,7 @@
#include "AMDGPUArgumentUsageInfo.h"
#include "AMDGPUISelLowering.h" // For AMDGPUISD
#include "AMDGPUInstrInfo.h"
+#include "AMDGPUPerfHintAnalysis.h"
#include "AMDGPURegisterInfo.h"
#include "AMDGPUSubtarget.h"
#include "AMDGPUTargetMachine.h"
@@ -24,15 +25,16 @@
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
+#include "llvm/Analysis/DivergenceAnalysis.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/FunctionLoweringInfo.h"
#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/CodeGen/MachineValueType.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGISel.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
@@ -43,6 +45,7 @@
#include "llvm/Support/Casting.h"
#include "llvm/Support/CodeGen.h"
#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MachineValueType.h"
#include "llvm/Support/MathExtras.h"
#include <cassert>
#include <cstdint>
@@ -68,7 +71,7 @@ namespace {
class AMDGPUDAGToDAGISel : public SelectionDAGISel {
// Subtarget - Keep a pointer to the AMDGPU Subtarget around so that we can
// make the right decision when generating code for different targets.
- const AMDGPUSubtarget *Subtarget;
+ const GCNSubtarget *Subtarget;
AMDGPUAS AMDGPUASI;
bool EnableLateStructurizeCFG;
@@ -83,6 +86,8 @@ public:
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<AMDGPUArgumentUsageInfo>();
+ AU.addRequired<AMDGPUPerfHintAnalysis>();
+ AU.addRequired<DivergenceAnalysis>();
SelectionDAGISel::getAnalysisUsage(AU);
}
@@ -98,20 +103,12 @@ private:
std::pair<SDValue, SDValue> foldFrameIndex(SDValue N) const;
bool isNoNanSrc(SDValue N) const;
bool isInlineImmediate(const SDNode *N) const;
- bool FoldOperand(SDValue &Src, SDValue &Sel, SDValue &Neg, SDValue &Abs,
- const R600InstrInfo *TII);
- bool FoldOperands(unsigned, const R600InstrInfo *, std::vector<SDValue> &);
- bool FoldDotOperands(unsigned, const R600InstrInfo *, std::vector<SDValue> &);
- bool isConstantLoad(const MemSDNode *N, int cbID) const;
bool isUniformBr(const SDNode *N) const;
SDNode *glueCopyToM0(SDNode *N) const;
const TargetRegisterClass *getOperandRegClass(SDNode *N, unsigned OpNo) const;
- bool SelectGlobalValueConstantOffset(SDValue Addr, SDValue& IntPtr);
- bool SelectGlobalValueVariableOffset(SDValue Addr, SDValue &BaseReg,
- SDValue& Offset);
virtual bool SelectADDRVTX_READ(SDValue Addr, SDValue &Base, SDValue &Offset);
virtual bool SelectADDRIndirect(SDValue Addr, SDValue &Base, SDValue &Offset);
bool isDSOffsetLegal(const SDValue &Base, unsigned Offset,
@@ -162,6 +159,7 @@ private:
bool SelectSMRDOffset(SDValue ByteOffsetNode, SDValue &Offset,
bool &Imm) const;
+ SDValue Expand32BitAddress(SDValue Addr) const;
bool SelectSMRD(SDValue Addr, SDValue &SBase, SDValue &Offset,
bool &Imm) const;
bool SelectSMRDImm(SDValue Addr, SDValue &SBase, SDValue &Offset) const;
@@ -216,7 +214,7 @@ private:
void SelectS_BFE(SDNode *N);
bool isCBranchSCC(const SDNode *N) const;
void SelectBRCOND(SDNode *N);
- void SelectFMAD(SDNode *N);
+ void SelectFMAD_FMA(SDNode *N);
void SelectATOMIC_CMP_SWAP(SDNode *N);
protected:
@@ -225,9 +223,18 @@ protected:
};
class R600DAGToDAGISel : public AMDGPUDAGToDAGISel {
+ const R600Subtarget *Subtarget;
+ AMDGPUAS AMDGPUASI;
+
+ bool isConstantLoad(const MemSDNode *N, int cbID) const;
+ bool SelectGlobalValueConstantOffset(SDValue Addr, SDValue& IntPtr);
+ bool SelectGlobalValueVariableOffset(SDValue Addr, SDValue &BaseReg,
+ SDValue& Offset);
public:
explicit R600DAGToDAGISel(TargetMachine *TM, CodeGenOpt::Level OptLevel) :
- AMDGPUDAGToDAGISel(TM, OptLevel) {}
+ AMDGPUDAGToDAGISel(TM, OptLevel) {
+ AMDGPUASI = AMDGPU::getAMDGPUAS(*TM);
+ }
void Select(SDNode *N) override;
@@ -235,6 +242,11 @@ public:
SDValue &Offset) override;
bool SelectADDRVTX_READ(SDValue Addr, SDValue &Base,
SDValue &Offset) override;
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+protected:
+ // Include the pieces autogenerated from the target description.
+#include "R600GenDAGISel.inc"
};
} // end anonymous namespace
@@ -242,17 +254,19 @@ public:
INITIALIZE_PASS_BEGIN(AMDGPUDAGToDAGISel, "isel",
"AMDGPU DAG->DAG Pattern Instruction Selection", false, false)
INITIALIZE_PASS_DEPENDENCY(AMDGPUArgumentUsageInfo)
+INITIALIZE_PASS_DEPENDENCY(AMDGPUPerfHintAnalysis)
+INITIALIZE_PASS_DEPENDENCY(DivergenceAnalysis)
INITIALIZE_PASS_END(AMDGPUDAGToDAGISel, "isel",
"AMDGPU DAG->DAG Pattern Instruction Selection", false, false)
-/// \brief This pass converts a legalized DAG into a AMDGPU-specific
+/// This pass converts a legalized DAG into an AMDGPU-specific
// DAG, ready for instruction scheduling.
FunctionPass *llvm::createAMDGPUISelDag(TargetMachine *TM,
CodeGenOpt::Level OptLevel) {
return new AMDGPUDAGToDAGISel(TM, OptLevel);
}
-/// \brief This pass converts a legalized DAG into a R600-specific
+/// This pass converts a legalized DAG into a R600-specific
// DAG, ready for instruction scheduling.
FunctionPass *llvm::createR600ISelDag(TargetMachine *TM,
CodeGenOpt::Level OptLevel) {
@@ -260,7 +274,7 @@ FunctionPass *llvm::createR600ISelDag(TargetMachine *TM,
}
bool AMDGPUDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) {
- Subtarget = &MF.getSubtarget<AMDGPUSubtarget>();
+ Subtarget = &MF.getSubtarget<GCNSubtarget>();
return SelectionDAGISel::runOnMachineFunction(MF);
}
@@ -276,8 +290,7 @@ bool AMDGPUDAGToDAGISel::isNoNanSrc(SDValue N) const {
}
bool AMDGPUDAGToDAGISel::isInlineImmediate(const SDNode *N) const {
- const SIInstrInfo *TII
- = static_cast<const SISubtarget *>(Subtarget)->getInstrInfo();
+ const SIInstrInfo *TII = Subtarget->getInstrInfo();
if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(N))
return TII->isInlineConstant(C->getAPIntValue());
@@ -288,7 +301,7 @@ bool AMDGPUDAGToDAGISel::isInlineImmediate(const SDNode *N) const {
return false;
}
-/// \brief Determine the register class for \p OpNo
+/// Determine the register class for \p OpNo
/// \returns The register class of the virtual register that will be used for
/// the given operand number \OpNo or NULL if the register class cannot be
/// determined.
@@ -303,7 +316,7 @@ const TargetRegisterClass *AMDGPUDAGToDAGISel::getOperandRegClass(SDNode *N,
}
const SIRegisterInfo *TRI
- = static_cast<const SISubtarget *>(Subtarget)->getRegisterInfo();
+ = static_cast<const GCNSubtarget *>(Subtarget)->getRegisterInfo();
return TRI->getPhysRegClass(Reg);
}
@@ -394,7 +407,6 @@ void AMDGPUDAGToDAGISel::SelectBuildVector(SDNode *N, unsigned RegClassID) {
EVT VT = N->getValueType(0);
unsigned NumVectorElts = VT.getVectorNumElements();
EVT EltVT = VT.getVectorElementType();
- const AMDGPURegisterInfo *TRI = Subtarget->getRegisterInfo();
SDLoc DL(N);
SDValue RegClass = CurDAG->getTargetConstant(RegClassID, DL, MVT::i32);
@@ -420,10 +432,9 @@ void AMDGPUDAGToDAGISel::SelectBuildVector(SDNode *N, unsigned RegClassID) {
IsRegSeq = false;
break;
}
+ unsigned Sub = AMDGPURegisterInfo::getSubRegFromChannel(i);
RegSeqArgs[1 + (2 * i)] = N->getOperand(i);
- RegSeqArgs[1 + (2 * i) + 1] =
- CurDAG->getTargetConstant(TRI->getSubRegFromChannel(i), DL,
- MVT::i32);
+ RegSeqArgs[1 + (2 * i) + 1] = CurDAG->getTargetConstant(Sub, DL, MVT::i32);
}
if (NOps != NumVectorElts) {
// Fill in the missing undef elements if this was a scalar_to_vector.
@@ -431,9 +442,10 @@ void AMDGPUDAGToDAGISel::SelectBuildVector(SDNode *N, unsigned RegClassID) {
MachineSDNode *ImpDef = CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF,
DL, EltVT);
for (unsigned i = NOps; i < NumVectorElts; ++i) {
+ unsigned Sub = AMDGPURegisterInfo::getSubRegFromChannel(i);
RegSeqArgs[1 + (2 * i)] = SDValue(ImpDef, 0);
RegSeqArgs[1 + (2 * i) + 1] =
- CurDAG->getTargetConstant(TRI->getSubRegFromChannel(i), DL, MVT::i32);
+ CurDAG->getTargetConstant(Sub, DL, MVT::i32);
}
}
@@ -450,7 +462,10 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) {
}
if (isa<AtomicSDNode>(N) ||
- (Opc == AMDGPUISD::ATOMIC_INC || Opc == AMDGPUISD::ATOMIC_DEC))
+ (Opc == AMDGPUISD::ATOMIC_INC || Opc == AMDGPUISD::ATOMIC_DEC ||
+ Opc == AMDGPUISD::ATOMIC_LOAD_FADD ||
+ Opc == AMDGPUISD::ATOMIC_LOAD_FMIN ||
+ Opc == AMDGPUISD::ATOMIC_LOAD_FMAX))
N = glueCopyToM0(N);
switch (Opc) {
@@ -487,9 +502,8 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) {
case ISD::BUILD_VECTOR: {
EVT VT = N->getValueType(0);
unsigned NumVectorElts = VT.getVectorNumElements();
-
- if (VT == MVT::v2i16 || VT == MVT::v2f16) {
- if (Opc == ISD::BUILD_VECTOR) {
+ if (VT.getScalarSizeInBits() == 16) {
+ if (Opc == ISD::BUILD_VECTOR && NumVectorElts == 2) {
uint32_t LHSVal, RHSVal;
if (getConstantValue(N->getOperand(0), LHSVal) &&
getConstantValue(N->getOperand(1), RHSVal)) {
@@ -559,7 +573,9 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) {
return;
}
case ISD::LOAD:
- case ISD::STORE: {
+ case ISD::STORE:
+ case ISD::ATOMIC_LOAD:
+ case ISD::ATOMIC_STORE: {
N = glueCopyToM0(N);
break;
}
@@ -619,7 +635,8 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) {
SelectBRCOND(N);
return;
case ISD::FMAD:
- SelectFMAD(N);
+ case ISD::FMA:
+ SelectFMAD_FMA(N);
return;
case AMDGPUISD::ATOMIC_CMP_SWAP:
SelectATOMIC_CMP_SWAP(N);
@@ -629,15 +646,6 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) {
SelectCode(N);
}
-bool AMDGPUDAGToDAGISel::isConstantLoad(const MemSDNode *N, int CbId) const {
- if (!N->readMem())
- return false;
- if (CbId == -1)
- return N->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS;
-
- return N->getAddressSpace() == AMDGPUASI.CONSTANT_BUFFER_0 + CbId;
-}
-
bool AMDGPUDAGToDAGISel::isUniformBr(const SDNode *N) const {
const BasicBlock *BB = FuncInfo->MBB->getBasicBlock();
const Instruction *Term = BB->getTerminator();
@@ -653,26 +661,6 @@ StringRef AMDGPUDAGToDAGISel::getPassName() const {
// Complex Patterns
//===----------------------------------------------------------------------===//
-bool AMDGPUDAGToDAGISel::SelectGlobalValueConstantOffset(SDValue Addr,
- SDValue& IntPtr) {
- if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Addr)) {
- IntPtr = CurDAG->getIntPtrConstant(Cst->getZExtValue() / 4, SDLoc(Addr),
- true);
- return true;
- }
- return false;
-}
-
-bool AMDGPUDAGToDAGISel::SelectGlobalValueVariableOffset(SDValue Addr,
- SDValue& BaseReg, SDValue &Offset) {
- if (!isa<ConstantSDNode>(Addr)) {
- BaseReg = Addr;
- Offset = CurDAG->getIntPtrConstant(0, SDLoc(Addr), true);
- return true;
- }
- return false;
-}
-
bool AMDGPUDAGToDAGISel::SelectADDRVTX_READ(SDValue Addr, SDValue &Base,
SDValue &Offset) {
return false;
@@ -684,11 +672,11 @@ bool AMDGPUDAGToDAGISel::SelectADDRIndirect(SDValue Addr, SDValue &Base,
SDLoc DL(Addr);
if ((C = dyn_cast<ConstantSDNode>(Addr))) {
- Base = CurDAG->getRegister(AMDGPU::INDIRECT_BASE_ADDR, MVT::i32);
+ Base = CurDAG->getRegister(R600::INDIRECT_BASE_ADDR, MVT::i32);
Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32);
} else if ((Addr.getOpcode() == AMDGPUISD::DWORDADDR) &&
(C = dyn_cast<ConstantSDNode>(Addr.getOperand(0)))) {
- Base = CurDAG->getRegister(AMDGPU::INDIRECT_BASE_ADDR, MVT::i32);
+ Base = CurDAG->getRegister(R600::INDIRECT_BASE_ADDR, MVT::i32);
Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32);
} else if ((Addr.getOpcode() == ISD::ADD || Addr.getOpcode() == ISD::OR) &&
(C = dyn_cast<ConstantSDNode>(Addr.getOperand(1)))) {
@@ -759,12 +747,11 @@ void AMDGPUDAGToDAGISel::SelectADD_SUB_I64(SDNode *N) {
if (ProduceCarry) {
// Replace the carry-use
- CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 1), SDValue(AddHi, 1));
+ ReplaceUses(SDValue(N, 1), SDValue(AddHi, 1));
}
// Replace the remaining uses.
- CurDAG->ReplaceAllUsesWith(N, RegSequence);
- CurDAG->RemoveDeadNode(N);
+ ReplaceNode(N, RegSequence);
}
void AMDGPUDAGToDAGISel::SelectUADDO_USUBO(SDNode *N) {
@@ -1410,7 +1397,7 @@ bool AMDGPUDAGToDAGISel::SelectSMRDOffset(SDValue ByteOffsetNode,
return false;
SDLoc SL(ByteOffsetNode);
- AMDGPUSubtarget::Generation Gen = Subtarget->getGeneration();
+ GCNSubtarget::Generation Gen = Subtarget->getGeneration();
int64_t ByteOffset = C->getSExtValue();
int64_t EncodedOffset = AMDGPU::getSMRDEncodedOffset(*Subtarget, ByteOffset);
@@ -1435,19 +1422,45 @@ bool AMDGPUDAGToDAGISel::SelectSMRDOffset(SDValue ByteOffsetNode,
return true;
}
+SDValue AMDGPUDAGToDAGISel::Expand32BitAddress(SDValue Addr) const {
+ if (Addr.getValueType() != MVT::i32)
+ return Addr;
+
+ // Zero-extend a 32-bit address.
+ SDLoc SL(Addr);
+
+ const MachineFunction &MF = CurDAG->getMachineFunction();
+ const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
+ unsigned AddrHiVal = Info->get32BitAddressHighBits();
+ SDValue AddrHi = CurDAG->getTargetConstant(AddrHiVal, SL, MVT::i32);
+
+ const SDValue Ops[] = {
+ CurDAG->getTargetConstant(AMDGPU::SReg_64_XEXECRegClassID, SL, MVT::i32),
+ Addr,
+ CurDAG->getTargetConstant(AMDGPU::sub0, SL, MVT::i32),
+ SDValue(CurDAG->getMachineNode(AMDGPU::S_MOV_B32, SL, MVT::i32, AddrHi),
+ 0),
+ CurDAG->getTargetConstant(AMDGPU::sub1, SL, MVT::i32),
+ };
+
+ return SDValue(CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, SL, MVT::i64,
+ Ops), 0);
+}
+
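
Expand32BitAddress above widens a 32-bit SMRD base by building a REG_SEQUENCE whose sub0 is the original address and whose sub1 is an S_MOV_B32 of the function's fixed high bits. Numerically that is just placing the 32-bit address in the low dword of a 64-bit base; the tiny standalone sketch below shows the equivalence and is not part of the patch (expand32BitAddress here is an illustrative name).

    // Sketch of the widening done by Expand32BitAddress: sub0 = Addr,
    // sub1 = the per-function high-bits constant, assembled into one 64-bit base.
    #include <cassert>
    #include <cstdint>

    static uint64_t expand32BitAddress(uint32_t Addr, uint32_t AddrHiVal) {
      return (uint64_t(AddrHiVal) << 32) | Addr;
    }

    int main() {
      // With a zero high word this is a plain zero-extension of the address.
      assert(expand32BitAddress(0x1000u, 0) == 0x1000u);
      assert((expand32BitAddress(0x1000u, 0xFFFF8000u) >> 32) == 0xFFFF8000u);
      return 0;
    }
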
bool AMDGPUDAGToDAGISel::SelectSMRD(SDValue Addr, SDValue &SBase,
SDValue &Offset, bool &Imm) const {
SDLoc SL(Addr);
+
if (CurDAG->isBaseWithConstantOffset(Addr)) {
SDValue N0 = Addr.getOperand(0);
SDValue N1 = Addr.getOperand(1);
if (SelectSMRDOffset(N1, Offset, Imm)) {
- SBase = N0;
+ SBase = Expand32BitAddress(N0);
return true;
}
}
- SBase = Addr;
+ SBase = Expand32BitAddress(Addr);
Offset = CurDAG->getTargetConstant(0, SL, MVT::i32);
Imm = true;
return true;
@@ -1651,7 +1664,7 @@ bool AMDGPUDAGToDAGISel::isCBranchSCC(const SDNode *N) const {
return true;
if (VT == MVT::i64) {
- auto ST = static_cast<const SISubtarget *>(Subtarget);
+ auto ST = static_cast<const GCNSubtarget *>(Subtarget);
ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
return (CC == ISD::SETEQ || CC == ISD::SETNE) && ST->hasScalarCompareEq64();
@@ -1674,15 +1687,39 @@ void AMDGPUDAGToDAGISel::SelectBRCOND(SDNode *N) {
unsigned CondReg = UseSCCBr ? AMDGPU::SCC : AMDGPU::VCC;
SDLoc SL(N);
+ if (!UseSCCBr) {
+ // This is the case that we are selecting to S_CBRANCH_VCCNZ. We have not
+ // analyzed what generates the vcc value, so we do not know whether vcc
+ // bits for disabled lanes are 0. Thus we need to mask out bits for
+ // disabled lanes.
+ //
+    // In the case that we select S_CBRANCH_SCC1 and it later gets changed to
+    // S_CBRANCH_VCCNZ in SIFixSGPRCopies, SIFixSGPRCopies calls
+    // SIInstrInfo::moveToVALU, which inserts the S_AND.
+    //
+    // We could add an analysis of what generates the vcc value here and omit
+    // the S_AND when it is unnecessary. But it would be better to add a
+    // separate pass after SIFixSGPRCopies to do the unnecessary S_AND removal,
+    // so it catches both cases.
+ Cond = SDValue(CurDAG->getMachineNode(AMDGPU::S_AND_B64, SL, MVT::i1,
+ CurDAG->getRegister(AMDGPU::EXEC, MVT::i1),
+ Cond),
+ 0);
+ }
+
SDValue VCC = CurDAG->getCopyToReg(N->getOperand(0), SL, CondReg, Cond);
CurDAG->SelectNodeTo(N, BrOp, MVT::Other,
N->getOperand(2), // Basic Block
VCC.getValue(0));
}
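
The S_AND inserted above exists because the vcc bits for lanes disabled in the exec mask are not known to be zero, and S_CBRANCH_VCCNZ takes the branch whenever any bit of vcc is set. A standalone sketch of the masking, for illustration only (branchTaken is a hypothetical helper):

    // Bits belonging to disabled lanes must be cleared before testing vcc,
    // which is what the S_AND_B64 with EXEC does above.
    #include <cassert>
    #include <cstdint>

    static bool branchTaken(uint64_t Vcc, uint64_t Exec) {
      return (Vcc & Exec) != 0; // S_CBRANCH_VCCNZ on the masked value
    }

    int main() {
      uint64_t Exec = 0x1;                    // only lane 0 is active
      uint64_t Vcc  = 0xFFFFFFFFFFFFFFFEull;  // stale bits only in disabled lanes
      assert(!branchTaken(Vcc, Exec));        // unmasked vcc would have branched
      return 0;
    }
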
-void AMDGPUDAGToDAGISel::SelectFMAD(SDNode *N) {
+void AMDGPUDAGToDAGISel::SelectFMAD_FMA(SDNode *N) {
MVT VT = N->getSimpleValueType(0);
- if (VT != MVT::f32 || !Subtarget->hasMadMixInsts()) {
+ bool IsFMA = N->getOpcode() == ISD::FMA;
+ if (VT != MVT::f32 || (!Subtarget->hasMadMixInsts() &&
+ !Subtarget->hasFmaMixInsts()) ||
+ ((IsFMA && Subtarget->hasMadMixInsts()) ||
+ (!IsFMA && Subtarget->hasFmaMixInsts()))) {
SelectCode(N);
return;
}
@@ -1692,13 +1729,13 @@ void AMDGPUDAGToDAGISel::SelectFMAD(SDNode *N) {
SDValue Src2 = N->getOperand(2);
unsigned Src0Mods, Src1Mods, Src2Mods;
- // Avoid using v_mad_mix_f32 unless there is actually an operand using the
- // conversion from f16.
+ // Avoid using v_mad_mix_f32/v_fma_mix_f32 unless there is actually an operand
+ // using the conversion from f16.
bool Sel0 = SelectVOP3PMadMixModsImpl(Src0, Src0, Src0Mods);
bool Sel1 = SelectVOP3PMadMixModsImpl(Src1, Src1, Src1Mods);
bool Sel2 = SelectVOP3PMadMixModsImpl(Src2, Src2, Src2Mods);
- assert(!Subtarget->hasFP32Denormals() &&
+ assert((IsFMA || !Subtarget->hasFP32Denormals()) &&
"fmad selected with denormals enabled");
// TODO: We can select this with f32 denormals enabled if all the sources are
// converted from f16 (in which case fmad isn't legal).
@@ -1714,7 +1751,9 @@ void AMDGPUDAGToDAGISel::SelectFMAD(SDNode *N) {
Zero, Zero
};
- CurDAG->SelectNodeTo(N, AMDGPU::V_MAD_MIX_F32, MVT::f32, Ops);
+ CurDAG->SelectNodeTo(N,
+ IsFMA ? AMDGPU::V_FMA_MIX_F32 : AMDGPU::V_MAD_MIX_F32,
+ MVT::f32, Ops);
} else {
SelectCode(N);
}
@@ -2100,6 +2139,41 @@ void AMDGPUDAGToDAGISel::PostprocessISelDAG() {
} while (IsModified);
}
+bool R600DAGToDAGISel::runOnMachineFunction(MachineFunction &MF) {
+ Subtarget = &MF.getSubtarget<R600Subtarget>();
+ return SelectionDAGISel::runOnMachineFunction(MF);
+}
+
+bool R600DAGToDAGISel::isConstantLoad(const MemSDNode *N, int CbId) const {
+ if (!N->readMem())
+ return false;
+ if (CbId == -1)
+ return N->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS ||
+ N->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS_32BIT;
+
+ return N->getAddressSpace() == AMDGPUASI.CONSTANT_BUFFER_0 + CbId;
+}
+
+bool R600DAGToDAGISel::SelectGlobalValueConstantOffset(SDValue Addr,
+ SDValue& IntPtr) {
+ if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Addr)) {
+ IntPtr = CurDAG->getIntPtrConstant(Cst->getZExtValue() / 4, SDLoc(Addr),
+ true);
+ return true;
+ }
+ return false;
+}
+
+bool R600DAGToDAGISel::SelectGlobalValueVariableOffset(SDValue Addr,
+ SDValue& BaseReg, SDValue &Offset) {
+ if (!isa<ConstantSDNode>(Addr)) {
+ BaseReg = Addr;
+ Offset = CurDAG->getIntPtrConstant(0, SDLoc(Addr), true);
+ return true;
+ }
+ return false;
+}
+
void R600DAGToDAGISel::Select(SDNode *N) {
unsigned int Opc = N->getOpcode();
if (N->isMachineOpcode()) {
@@ -2120,12 +2194,12 @@ void R600DAGToDAGISel::Select(SDNode *N) {
// pass. We want to avoid 128 bits copies as much as possible because they
// can't be bundled by our scheduler.
switch(NumVectorElts) {
- case 2: RegClassID = AMDGPU::R600_Reg64RegClassID; break;
+ case 2: RegClassID = R600::R600_Reg64RegClassID; break;
case 4:
if (Opc == AMDGPUISD::BUILD_VERTICAL_VECTOR)
- RegClassID = AMDGPU::R600_Reg128VerticalRegClassID;
+ RegClassID = R600::R600_Reg128VerticalRegClassID;
else
- RegClassID = AMDGPU::R600_Reg128RegClassID;
+ RegClassID = R600::R600_Reg128RegClassID;
break;
default: llvm_unreachable("Do not know how to lower this BUILD_VECTOR");
}
@@ -2143,11 +2217,11 @@ bool R600DAGToDAGISel::SelectADDRIndirect(SDValue Addr, SDValue &Base,
SDLoc DL(Addr);
if ((C = dyn_cast<ConstantSDNode>(Addr))) {
- Base = CurDAG->getRegister(AMDGPU::INDIRECT_BASE_ADDR, MVT::i32);
+ Base = CurDAG->getRegister(R600::INDIRECT_BASE_ADDR, MVT::i32);
Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32);
} else if ((Addr.getOpcode() == AMDGPUISD::DWORDADDR) &&
(C = dyn_cast<ConstantSDNode>(Addr.getOperand(0)))) {
- Base = CurDAG->getRegister(AMDGPU::INDIRECT_BASE_ADDR, MVT::i32);
+ Base = CurDAG->getRegister(R600::INDIRECT_BASE_ADDR, MVT::i32);
Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32);
} else if ((Addr.getOpcode() == ISD::ADD || Addr.getOpcode() == ISD::OR) &&
(C = dyn_cast<ConstantSDNode>(Addr.getOperand(1)))) {
@@ -2178,7 +2252,7 @@ bool R600DAGToDAGISel::SelectADDRVTX_READ(SDValue Addr, SDValue &Base,
&& isInt<16>(IMMOffset->getZExtValue())) {
Base = CurDAG->getCopyFromReg(CurDAG->getEntryNode(),
SDLoc(CurDAG->getEntryNode()),
- AMDGPU::ZERO, MVT::i32);
+ R600::ZERO, MVT::i32);
Offset = CurDAG->getTargetConstant(IMMOffset->getZExtValue(), SDLoc(Addr),
MVT::i32);
return true;
diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 49929441ef21..b201126c593b 100644
--- a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
//
/// \file
-/// \brief This is the parent TargetLowering class for hardware code gen
+/// This is the parent TargetLowering class for hardware code gen
/// targets.
//
//===----------------------------------------------------------------------===//
@@ -25,9 +25,12 @@
#include "AMDGPURegisterInfo.h"
#include "AMDGPUSubtarget.h"
#include "AMDGPUTargetMachine.h"
+#include "Utils/AMDGPUBaseInfo.h"
#include "R600MachineFunctionInfo.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "llvm/CodeGen/Analysis.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
@@ -38,18 +41,6 @@
#include "llvm/Support/KnownBits.h"
using namespace llvm;
-static bool allocateKernArg(unsigned ValNo, MVT ValVT, MVT LocVT,
- CCValAssign::LocInfo LocInfo,
- ISD::ArgFlagsTy ArgFlags, CCState &State) {
- MachineFunction &MF = State.getMachineFunction();
- AMDGPUMachineFunction *MFI = MF.getInfo<AMDGPUMachineFunction>();
-
- uint64_t Offset = MFI->allocateKernArg(LocVT.getStoreSize(),
- ArgFlags.getOrigAlign());
- State.addLoc(CCValAssign::getCustomMem(ValNo, ValVT, Offset, LocVT, LocInfo));
- return true;
-}
-
static bool allocateCCRegs(unsigned ValNo, MVT ValVT, MVT LocVT,
CCValAssign::LocInfo LocInfo,
ISD::ArgFlagsTy ArgFlags, CCState &State,
@@ -71,7 +62,9 @@ static bool allocateSGPRTuple(unsigned ValNo, MVT ValVT, MVT LocVT,
case MVT::i64:
case MVT::f64:
case MVT::v2i32:
- case MVT::v2f32: {
+ case MVT::v2f32:
+ case MVT::v4i16:
+ case MVT::v4f16: {
// Up to SGPR0-SGPR39
return allocateCCRegs(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State,
&AMDGPU::SGPR_64RegClass, 20);
@@ -92,7 +85,9 @@ static bool allocateVGPRTuple(unsigned ValNo, MVT ValVT, MVT LocVT,
case MVT::i64:
case MVT::f64:
case MVT::v2i32:
- case MVT::v2f32: {
+ case MVT::v2f32:
+ case MVT::v4i16:
+ case MVT::v4f16: {
return allocateCCRegs(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State,
&AMDGPU::VReg_64RegClass, 31);
}
@@ -324,10 +319,6 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::FLOG, MVT::f32, Custom);
setOperationAction(ISD::FLOG10, MVT::f32, Custom);
- if (Subtarget->has16BitInsts()) {
- setOperationAction(ISD::FLOG, MVT::f16, Custom);
- setOperationAction(ISD::FLOG10, MVT::f16, Custom);
- }
setOperationAction(ISD::FNEARBYINT, MVT::f32, Custom);
setOperationAction(ISD::FNEARBYINT, MVT::f64, Custom);
@@ -335,10 +326,6 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::FREM, MVT::f32, Custom);
setOperationAction(ISD::FREM, MVT::f64, Custom);
- // v_mad_f32 does not support denormals according to some sources.
- if (!Subtarget->hasFP32Denormals())
- setOperationAction(ISD::FMAD, MVT::f32, Legal);
-
// Expand to fneg + fadd.
setOperationAction(ISD::FSUB, MVT::f64, Expand);
@@ -353,19 +340,6 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8f32, Custom);
setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8i32, Custom);
- if (Subtarget->getGeneration() < AMDGPUSubtarget::SEA_ISLANDS) {
- setOperationAction(ISD::FCEIL, MVT::f64, Custom);
- setOperationAction(ISD::FTRUNC, MVT::f64, Custom);
- setOperationAction(ISD::FRINT, MVT::f64, Custom);
- setOperationAction(ISD::FFLOOR, MVT::f64, Custom);
- }
-
- if (!Subtarget->hasBFI()) {
- // fcopysign can be done in a single instruction with BFI.
- setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
- setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
- }
-
setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
setOperationAction(ISD::FP_TO_FP16, MVT::f64, Custom);
setOperationAction(ISD::FP_TO_FP16, MVT::f32, Custom);
@@ -389,13 +363,13 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::BSWAP, VT, Expand);
setOperationAction(ISD::CTTZ, VT, Expand);
setOperationAction(ISD::CTLZ, VT, Expand);
- }
-
- if (!Subtarget->hasBCNT(32))
- setOperationAction(ISD::CTPOP, MVT::i32, Expand);
- if (!Subtarget->hasBCNT(64))
- setOperationAction(ISD::CTPOP, MVT::i64, Expand);
+ // AMDGPU uses ADDC/SUBC/ADDE/SUBE
+ setOperationAction(ISD::ADDC, VT, Legal);
+ setOperationAction(ISD::SUBC, VT, Legal);
+ setOperationAction(ISD::ADDE, VT, Legal);
+ setOperationAction(ISD::SUBE, VT, Legal);
+ }
// The hardware supports 32-bit ROTR, but not ROTL.
setOperationAction(ISD::ROTL, MVT::i32, Expand);
@@ -416,28 +390,11 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::SMAX, MVT::i32, Legal);
setOperationAction(ISD::UMAX, MVT::i32, Legal);
- if (Subtarget->hasFFBH())
- setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Custom);
-
- if (Subtarget->hasFFBL())
- setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Custom);
-
setOperationAction(ISD::CTTZ, MVT::i64, Custom);
setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Custom);
setOperationAction(ISD::CTLZ, MVT::i64, Custom);
setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom);
- // We only really have 32-bit BFE instructions (and 16-bit on VI).
- //
- // On SI+ there are 64-bit BFEs, but they are scalar only and there isn't any
- // effort to match them now. We want this to be false for i64 cases when the
- // extraction isn't restricted to the upper or lower half. Ideally we would
- // have some pass reduce 64-bit extracts to 32-bit if possible. Extracts that
- // span the midpoint are probably relatively rare, so don't worry about them
- // for now.
- if (Subtarget->hasBFE())
- setHasExtractBitsInsn(true);
-
static const MVT::SimpleValueType VectorIntTypes[] = {
MVT::v2i32, MVT::v4i32
};
@@ -468,10 +425,6 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::UMUL_LOHI, VT, Expand);
setOperationAction(ISD::SDIVREM, VT, Custom);
setOperationAction(ISD::UDIVREM, VT, Expand);
- setOperationAction(ISD::ADDC, VT, Expand);
- setOperationAction(ISD::SUBC, VT, Expand);
- setOperationAction(ISD::ADDE, VT, Expand);
- setOperationAction(ISD::SUBE, VT, Expand);
setOperationAction(ISD::SELECT, VT, Expand);
setOperationAction(ISD::VSELECT, VT, Expand);
setOperationAction(ISD::SELECT_CC, VT, Expand);
@@ -546,11 +499,6 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
// vector compares until that is fixed.
setHasMultipleConditionRegisters(true);
- // SI at least has hardware support for floating point exceptions, but no way
- // of using or handling them is implemented. They are also optional in OpenCL
- // (Section 7.3)
- setHasFloatingPointExceptions(Subtarget->hasFPExceptions());
-
PredictableSelectIsExpensive = false;
// We want to find all load dependencies for long chains of stores to enable
@@ -573,6 +521,7 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
setTargetDAGCombine(ISD::SHL);
setTargetDAGCombine(ISD::SRA);
setTargetDAGCombine(ISD::SRL);
+ setTargetDAGCombine(ISD::TRUNCATE);
setTargetDAGCombine(ISD::MUL);
setTargetDAGCombine(ISD::MULHU);
setTargetDAGCombine(ISD::MULHS);
@@ -607,6 +556,7 @@ static bool fnegFoldsIntoOp(unsigned Opc) {
case ISD::FNEARBYINT:
case AMDGPUISD::RCP:
case AMDGPUISD::RCP_LEGACY:
+ case AMDGPUISD::RCP_IFLAG:
case AMDGPUISD::SIN_HW:
case AMDGPUISD::FMUL_LEGACY:
case AMDGPUISD::FMIN_LEGACY:
@@ -748,6 +698,37 @@ bool AMDGPUTargetLowering::isCheapToSpeculateCtlz() const {
return true;
}
+bool AMDGPUTargetLowering::isSDNodeAlwaysUniform(const SDNode * N) const {
+ switch (N->getOpcode()) {
+ default:
+ return false;
+ case ISD::EntryToken:
+ case ISD::TokenFactor:
+ return true;
+ case ISD::INTRINSIC_WO_CHAIN:
+ {
+ unsigned IntrID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
+ switch (IntrID) {
+ default:
+ return false;
+ case Intrinsic::amdgcn_readfirstlane:
+ case Intrinsic::amdgcn_readlane:
+ return true;
+ }
+ }
+ break;
+ case ISD::LOAD:
+ {
+ const LoadSDNode * L = dyn_cast<LoadSDNode>(N);
+ if (L->getMemOperand()->getAddrSpace()
+ == AMDGPUASI.CONSTANT_ADDRESS_32BIT)
+ return true;
+ return false;
+ }
+ break;
+ }
+}
+
//===---------------------------------------------------------------------===//
// Target Properties
//===---------------------------------------------------------------------===//
@@ -832,17 +813,6 @@ bool AMDGPUTargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
return isZExtFree(Val.getValueType(), VT2);
}
-// v_mad_mix* support a conversion from f16 to f32.
-//
-// There is only one special case when denormals are enabled we don't currently,
-// where this is OK to use.
-bool AMDGPUTargetLowering::isFPExtFoldable(unsigned Opcode,
- EVT DestVT, EVT SrcVT) const {
- return Opcode == ISD::FMAD && Subtarget->hasMadMixInsts() &&
- DestVT.getScalarType() == MVT::f32 && !Subtarget->hasFP32Denormals() &&
- SrcVT.getScalarType() == MVT::f16;
-}
-
bool AMDGPUTargetLowering::isNarrowingProfitable(EVT SrcVT, EVT DestVT) const {
// There aren't really 64-bit registers, but pairs of 32-bit ones and only a
// limited number of native 64-bit operations. Shrinking an operation to fit
@@ -862,7 +832,7 @@ CCAssignFn *AMDGPUCallLowering::CCAssignFnForCall(CallingConv::ID CC,
switch (CC) {
case CallingConv::AMDGPU_KERNEL:
case CallingConv::SPIR_KERNEL:
- return CC_AMDGPU_Kernel;
+ llvm_unreachable("kernels should not be handled here");
case CallingConv::AMDGPU_VS:
case CallingConv::AMDGPU_GS:
case CallingConv::AMDGPU_PS:
@@ -885,7 +855,7 @@ CCAssignFn *AMDGPUCallLowering::CCAssignFnForReturn(CallingConv::ID CC,
switch (CC) {
case CallingConv::AMDGPU_KERNEL:
case CallingConv::SPIR_KERNEL:
- return CC_AMDGPU_Kernel;
+ llvm_unreachable("kernels should not be handled here");
case CallingConv::AMDGPU_VS:
case CallingConv::AMDGPU_GS:
case CallingConv::AMDGPU_PS:
@@ -929,74 +899,118 @@ CCAssignFn *AMDGPUCallLowering::CCAssignFnForReturn(CallingConv::ID CC,
/// for each individual part is i8. We pass the memory type as LocVT to the
/// calling convention analysis function and the register type (Ins[x].VT) as
/// the ValVT.
-void AMDGPUTargetLowering::analyzeFormalArgumentsCompute(CCState &State,
- const SmallVectorImpl<ISD::InputArg> &Ins) const {
- for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
- const ISD::InputArg &In = Ins[i];
- EVT MemVT;
-
- unsigned NumRegs = getNumRegisters(State.getContext(), In.ArgVT);
-
- if (!Subtarget->isAmdHsaOS() &&
- (In.ArgVT == MVT::i16 || In.ArgVT == MVT::i8 || In.ArgVT == MVT::f16)) {
- // The ABI says the caller will extend these values to 32-bits.
- MemVT = In.ArgVT.isInteger() ? MVT::i32 : MVT::f32;
- } else if (NumRegs == 1) {
- // This argument is not split, so the IR type is the memory type.
- assert(!In.Flags.isSplit());
- if (In.ArgVT.isExtended()) {
- // We have an extended type, like i24, so we should just use the register type
- MemVT = In.VT;
- } else {
- MemVT = In.ArgVT;
- }
- } else if (In.ArgVT.isVector() && In.VT.isVector() &&
- In.ArgVT.getScalarType() == In.VT.getScalarType()) {
- assert(In.ArgVT.getVectorNumElements() > In.VT.getVectorNumElements());
- // We have a vector value which has been split into a vector with
- // the same scalar type, but fewer elements. This should handle
- // all the floating-point vector types.
- MemVT = In.VT;
- } else if (In.ArgVT.isVector() &&
- In.ArgVT.getVectorNumElements() == NumRegs) {
- // This arg has been split so that each element is stored in a separate
- // register.
- MemVT = In.ArgVT.getScalarType();
- } else if (In.ArgVT.isExtended()) {
- // We have an extended type, like i65.
- MemVT = In.VT;
- } else {
- unsigned MemoryBits = In.ArgVT.getStoreSizeInBits() / NumRegs;
- assert(In.ArgVT.getStoreSizeInBits() % NumRegs == 0);
- if (In.VT.isInteger()) {
- MemVT = EVT::getIntegerVT(State.getContext(), MemoryBits);
- } else if (In.VT.isVector()) {
- assert(!In.VT.getScalarType().isFloatingPoint());
- unsigned NumElements = In.VT.getVectorNumElements();
- assert(MemoryBits % NumElements == 0);
- // This vector type has been split into another vector type with
- // a different elements size.
- EVT ScalarVT = EVT::getIntegerVT(State.getContext(),
- MemoryBits / NumElements);
- MemVT = EVT::getVectorVT(State.getContext(), ScalarVT, NumElements);
+void AMDGPUTargetLowering::analyzeFormalArgumentsCompute(
+ CCState &State,
+ const SmallVectorImpl<ISD::InputArg> &Ins) const {
+ const MachineFunction &MF = State.getMachineFunction();
+ const Function &Fn = MF.getFunction();
+ LLVMContext &Ctx = Fn.getParent()->getContext();
+ const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(MF);
+ const unsigned ExplicitOffset = ST.getExplicitKernelArgOffset(Fn);
+
+ unsigned MaxAlign = 1;
+ uint64_t ExplicitArgOffset = 0;
+ const DataLayout &DL = Fn.getParent()->getDataLayout();
+
+ unsigned InIndex = 0;
+
+ for (const Argument &Arg : Fn.args()) {
+ Type *BaseArgTy = Arg.getType();
+ unsigned Align = DL.getABITypeAlignment(BaseArgTy);
+ MaxAlign = std::max(Align, MaxAlign);
+ unsigned AllocSize = DL.getTypeAllocSize(BaseArgTy);
+
+ uint64_t ArgOffset = alignTo(ExplicitArgOffset, Align) + ExplicitOffset;
+ ExplicitArgOffset = alignTo(ExplicitArgOffset, Align) + AllocSize;
+
+ // We're basically throwing away everything passed into us and starting over
+ // to get accurate in-memory offsets. The "PartOffset" is completely useless
+ // to us as computed in Ins.
+ //
+ // We also need to figure out what type legalization is trying to do to get
+ // the correct memory offsets.
+
+ SmallVector<EVT, 16> ValueVTs;
+ SmallVector<uint64_t, 16> Offsets;
+ ComputeValueVTs(*this, DL, BaseArgTy, ValueVTs, &Offsets, ArgOffset);
+
+ for (unsigned Value = 0, NumValues = ValueVTs.size();
+ Value != NumValues; ++Value) {
+ uint64_t BasePartOffset = Offsets[Value];
+
+ EVT ArgVT = ValueVTs[Value];
+ EVT MemVT = ArgVT;
+ MVT RegisterVT =
+ getRegisterTypeForCallingConv(Ctx, ArgVT);
+ unsigned NumRegs =
+ getNumRegistersForCallingConv(Ctx, ArgVT);
+
+ if (!Subtarget->isAmdHsaOS() &&
+ (ArgVT == MVT::i16 || ArgVT == MVT::i8 || ArgVT == MVT::f16)) {
+ // The ABI says the caller will extend these values to 32-bits.
+ MemVT = ArgVT.isInteger() ? MVT::i32 : MVT::f32;
+ } else if (NumRegs == 1) {
+ // This argument is not split, so the IR type is the memory type.
+ if (ArgVT.isExtended()) {
+ // We have an extended type, like i24, so we should just use the
+ // register type.
+ MemVT = RegisterVT;
+ } else {
+ MemVT = ArgVT;
+ }
+ } else if (ArgVT.isVector() && RegisterVT.isVector() &&
+ ArgVT.getScalarType() == RegisterVT.getScalarType()) {
+ assert(ArgVT.getVectorNumElements() > RegisterVT.getVectorNumElements());
+ // We have a vector value which has been split into a vector with
+ // the same scalar type, but fewer elements. This should handle
+ // all the floating-point vector types.
+ MemVT = RegisterVT;
+ } else if (ArgVT.isVector() &&
+ ArgVT.getVectorNumElements() == NumRegs) {
+ // This arg has been split so that each element is stored in a separate
+ // register.
+ MemVT = ArgVT.getScalarType();
+ } else if (ArgVT.isExtended()) {
+ // We have an extended type, like i65.
+ MemVT = RegisterVT;
} else {
- llvm_unreachable("cannot deduce memory type.");
+ unsigned MemoryBits = ArgVT.getStoreSizeInBits() / NumRegs;
+ assert(ArgVT.getStoreSizeInBits() % NumRegs == 0);
+ if (RegisterVT.isInteger()) {
+ MemVT = EVT::getIntegerVT(State.getContext(), MemoryBits);
+ } else if (RegisterVT.isVector()) {
+ assert(!RegisterVT.getScalarType().isFloatingPoint());
+ unsigned NumElements = RegisterVT.getVectorNumElements();
+ assert(MemoryBits % NumElements == 0);
+ // This vector type has been split into another vector type with
+          // a different element size.
+ EVT ScalarVT = EVT::getIntegerVT(State.getContext(),
+ MemoryBits / NumElements);
+ MemVT = EVT::getVectorVT(State.getContext(), ScalarVT, NumElements);
+ } else {
+ llvm_unreachable("cannot deduce memory type.");
+ }
}
- }
- // Convert one element vectors to scalar.
- if (MemVT.isVector() && MemVT.getVectorNumElements() == 1)
- MemVT = MemVT.getScalarType();
+ // Convert one element vectors to scalar.
+ if (MemVT.isVector() && MemVT.getVectorNumElements() == 1)
+ MemVT = MemVT.getScalarType();
- if (MemVT.isExtended()) {
- // This should really only happen if we have vec3 arguments
- assert(MemVT.isVector() && MemVT.getVectorNumElements() == 3);
- MemVT = MemVT.getPow2VectorType(State.getContext());
- }
+ if (MemVT.isExtended()) {
+ // This should really only happen if we have vec3 arguments
+ assert(MemVT.isVector() && MemVT.getVectorNumElements() == 3);
+ MemVT = MemVT.getPow2VectorType(State.getContext());
+ }
- assert(MemVT.isSimple());
- allocateKernArg(i, In.VT, MemVT.getSimpleVT(), CCValAssign::Full, In.Flags,
- State);
+ unsigned PartOffset = 0;
+ for (unsigned i = 0; i != NumRegs; ++i) {
+ State.addLoc(CCValAssign::getCustomMem(InIndex++, RegisterVT,
+ BasePartOffset + PartOffset,
+ MemVT.getSimpleVT(),
+ CCValAssign::Full));
+ PartOffset += MemVT.getStoreSize();
+ }
+ }
}
}
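
The rewritten analyzeFormalArgumentsCompute derives each kernel argument's in-memory offset itself: the running explicit-argument offset is aligned up to the argument's ABI alignment, the explicit kernarg base offset is added to form the argument's offset, and the running offset then advances by the allocation size. Here is a minimal standalone sketch of that walk, assuming a simple list of (size, alignment) pairs; alignTo mirrors llvm::alignTo and kernArgOffsets is an illustrative name.

    // Standalone sketch of the kernarg offset computation above:
    // align, record the offset, then advance past the argument.
    #include <cstdint>
    #include <cstdio>
    #include <vector>

    static uint64_t alignTo(uint64_t Value, uint64_t Align) {
      return (Value + Align - 1) / Align * Align;
    }

    struct Arg { uint64_t Size, Align; };

    static std::vector<uint64_t> kernArgOffsets(const std::vector<Arg> &Args,
                                                uint64_t ExplicitOffset) {
      std::vector<uint64_t> Offsets;
      uint64_t ExplicitArgOffset = 0;
      for (const Arg &A : Args) {
        // Offset of this argument within the kernarg segment.
        Offsets.push_back(alignTo(ExplicitArgOffset, A.Align) + ExplicitOffset);
        // Advance for the next argument.
        ExplicitArgOffset = alignTo(ExplicitArgOffset, A.Align) + A.Size;
      }
      return Offsets;
    }

    int main() {
      // i8, i32, i64: padding is inserted before the i32 and the i64 slots.
      std::vector<Arg> Args = {{1, 1}, {4, 4}, {8, 8}};
      for (uint64_t Off : kernArgOffsets(Args, /*ExplicitOffset=*/0))
        std::printf("%llu\n", (unsigned long long)Off); // prints 0, 4, 8
      return 0;
    }
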
@@ -1178,7 +1192,15 @@ SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI,
GlobalAddressSDNode *G = cast<GlobalAddressSDNode>(Op);
const GlobalValue *GV = G->getGlobal();
- if (G->getAddressSpace() == AMDGPUASI.LOCAL_ADDRESS) {
+ if (G->getAddressSpace() == AMDGPUASI.LOCAL_ADDRESS ||
+ G->getAddressSpace() == AMDGPUASI.REGION_ADDRESS) {
+ if (!MFI->isEntryFunction()) {
+ const Function &Fn = DAG.getMachineFunction().getFunction();
+ DiagnosticInfoUnsupported BadLDSDecl(
+ Fn, "local memory global used by non-kernel function", SDLoc(Op).getDebugLoc());
+ DAG.getContext()->diagnose(BadLDSDecl);
+ }
+
// XXX: What does the value of G->getOffset() mean?
assert(G->getOffset() == 0 &&
"Do not know what to do with an non-zero offset");
@@ -1201,6 +1223,16 @@ SDValue AMDGPUTargetLowering::LowerCONCAT_VECTORS(SDValue Op,
SelectionDAG &DAG) const {
SmallVector<SDValue, 8> Args;
+ EVT VT = Op.getValueType();
+ if (VT == MVT::v4i16 || VT == MVT::v4f16) {
+ SDLoc SL(Op);
+ SDValue Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Op.getOperand(0));
+ SDValue Hi = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Op.getOperand(1));
+
+ SDValue BV = DAG.getBuildVector(MVT::v2i32, SL, { Lo, Hi });
+ return DAG.getNode(ISD::BITCAST, SL, VT, BV);
+ }
+
for (const SDUse &U : Op->ops())
DAG.ExtractVectorElements(U.get(), Args);
@@ -1219,7 +1251,7 @@ SDValue AMDGPUTargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op,
return DAG.getBuildVector(Op.getValueType(), SDLoc(Op), Args);
}
-/// \brief Generate Min/Max node
+/// Generate Min/Max node
SDValue AMDGPUTargetLowering::combineFMinMaxLegacy(const SDLoc &DL, EVT VT,
SDValue LHS, SDValue RHS,
SDValue True, SDValue False,
@@ -1985,7 +2017,7 @@ SDValue AMDGPUTargetLowering::LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const {
const SDValue SignBitMask = DAG.getConstant(UINT32_C(1) << 31, SL, MVT::i32);
SDValue SignBit = DAG.getNode(ISD::AND, SL, MVT::i32, Hi, SignBitMask);
- // Extend back to to 64-bits.
+ // Extend back to 64-bits.
SDValue SignBit64 = DAG.getBuildVector(MVT::v2i32, SL, {Zero, SignBit});
SignBit64 = DAG.getNode(ISD::BITCAST, SL, MVT::i64, SignBit64);
@@ -2806,28 +2838,6 @@ SDValue AMDGPUTargetLowering::performStoreCombine(SDNode *N,
SN->getBasePtr(), SN->getMemOperand());
}
-SDValue AMDGPUTargetLowering::performClampCombine(SDNode *N,
- DAGCombinerInfo &DCI) const {
- ConstantFPSDNode *CSrc = dyn_cast<ConstantFPSDNode>(N->getOperand(0));
- if (!CSrc)
- return SDValue();
-
- const APFloat &F = CSrc->getValueAPF();
- APFloat Zero = APFloat::getZero(F.getSemantics());
- APFloat::cmpResult Cmp0 = F.compare(Zero);
- if (Cmp0 == APFloat::cmpLessThan ||
- (Cmp0 == APFloat::cmpUnordered && Subtarget->enableDX10Clamp())) {
- return DCI.DAG.getConstantFP(Zero, SDLoc(N), N->getValueType(0));
- }
-
- APFloat One(F.getSemantics(), "1.0");
- APFloat::cmpResult Cmp1 = F.compare(One);
- if (Cmp1 == APFloat::cmpGreaterThan)
- return DCI.DAG.getConstantFP(One, SDLoc(N), N->getValueType(0));
-
- return SDValue(CSrc, 0);
-}
-
// FIXME: This should go in generic DAG combiner with an isTruncateFree check,
// but isTruncateFree is inaccurate for i16 now because of SALU vs. VALU
// issues.
@@ -2903,7 +2913,7 @@ SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N,
SDValue X = LHS->getOperand(0);
if (VT == MVT::i32 && RHSVal == 16 && X.getValueType() == MVT::i16 &&
- isTypeLegal(MVT::v2i16)) {
+ isOperationLegal(ISD::BUILD_VECTOR, MVT::v2i16)) {
// Prefer build_vector as the canonical form if packed types are legal.
// (shl ([asz]ext i16:x), 16 -> build_vector 0, x
SDValue Vec = DAG.getBuildVector(MVT::v2i16, SL,
@@ -3017,6 +3027,92 @@ SDValue AMDGPUTargetLowering::performSrlCombine(SDNode *N,
return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildPair);
}
+SDValue AMDGPUTargetLowering::performTruncateCombine(
+ SDNode *N, DAGCombinerInfo &DCI) const {
+ SDLoc SL(N);
+ SelectionDAG &DAG = DCI.DAG;
+ EVT VT = N->getValueType(0);
+ SDValue Src = N->getOperand(0);
+
+ // vt1 (truncate (bitcast (build_vector vt0:x, ...))) -> vt1 (bitcast vt0:x)
+ if (Src.getOpcode() == ISD::BITCAST) {
+ SDValue Vec = Src.getOperand(0);
+ if (Vec.getOpcode() == ISD::BUILD_VECTOR) {
+ SDValue Elt0 = Vec.getOperand(0);
+ EVT EltVT = Elt0.getValueType();
+ if (VT.getSizeInBits() <= EltVT.getSizeInBits()) {
+ if (EltVT.isFloatingPoint()) {
+ Elt0 = DAG.getNode(ISD::BITCAST, SL,
+ EltVT.changeTypeToInteger(), Elt0);
+ }
+
+ return DAG.getNode(ISD::TRUNCATE, SL, VT, Elt0);
+ }
+ }
+ }
+
+ // Equivalent of above for accessing the high element of a vector as an
+ // integer operation.
+  // trunc (srl (bitcast (build_vector x, y)), 16) -> trunc (bitcast y)
+ if (Src.getOpcode() == ISD::SRL && !VT.isVector()) {
+ if (auto K = isConstOrConstSplat(Src.getOperand(1))) {
+ if (2 * K->getZExtValue() == Src.getValueType().getScalarSizeInBits()) {
+ SDValue BV = stripBitcast(Src.getOperand(0));
+ if (BV.getOpcode() == ISD::BUILD_VECTOR &&
+ BV.getValueType().getVectorNumElements() == 2) {
+ SDValue SrcElt = BV.getOperand(1);
+ EVT SrcEltVT = SrcElt.getValueType();
+ if (SrcEltVT.isFloatingPoint()) {
+ SrcElt = DAG.getNode(ISD::BITCAST, SL,
+ SrcEltVT.changeTypeToInteger(), SrcElt);
+ }
+
+ return DAG.getNode(ISD::TRUNCATE, SL, VT, SrcElt);
+ }
+ }
+ }
+ }
+
+ // Partially shrink 64-bit shifts to 32-bit if reduced to 16-bit.
+ //
+ // i16 (trunc (srl i64:x, K)), K <= 16 ->
+ // i16 (trunc (srl (i32 (trunc x), K)))
+ if (VT.getScalarSizeInBits() < 32) {
+ EVT SrcVT = Src.getValueType();
+ if (SrcVT.getScalarSizeInBits() > 32 &&
+ (Src.getOpcode() == ISD::SRL ||
+ Src.getOpcode() == ISD::SRA ||
+ Src.getOpcode() == ISD::SHL)) {
+ SDValue Amt = Src.getOperand(1);
+ KnownBits Known;
+ DAG.computeKnownBits(Amt, Known);
+ unsigned Size = VT.getScalarSizeInBits();
+ if ((Known.isConstant() && Known.getConstant().ule(Size)) ||
+ (Known.getBitWidth() - Known.countMinLeadingZeros() <= Log2_32(Size))) {
+ EVT MidVT = VT.isVector() ?
+ EVT::getVectorVT(*DAG.getContext(), MVT::i32,
+ VT.getVectorNumElements()) : MVT::i32;
+
+ EVT NewShiftVT = getShiftAmountTy(MidVT, DAG.getDataLayout());
+ SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, MidVT,
+ Src.getOperand(0));
+ DCI.AddToWorklist(Trunc.getNode());
+
+ if (Amt.getValueType() != NewShiftVT) {
+ Amt = DAG.getZExtOrTrunc(Amt, SL, NewShiftVT);
+ DCI.AddToWorklist(Amt.getNode());
+ }
+
+ SDValue ShrunkShift = DAG.getNode(Src.getOpcode(), SL, MidVT,
+ Trunc, Amt);
+ return DAG.getNode(ISD::TRUNCATE, SL, VT, ShrunkShift);
+ }
+ }
+ }
+
+ return SDValue();
+}
+
// We need to specifically handle i64 mul here to avoid unnecessary conversion
// instructions. If we only match on the legalized i64 mul expansion,
// SimplifyDemandedBits will be unable to remove them because there will be
@@ -3058,6 +3154,17 @@ SDValue AMDGPUTargetLowering::performMulCombine(SDNode *N,
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
+
+ // SimplifyDemandedBits has the annoying habit of turning useful zero_extends
+ // in the source into any_extends if the result of the mul is truncated. Since
+ // we can assume the high bits are whatever we want, use the underlying value
+ // to keep the unknown high bits from interfering.
+ if (N0.getOpcode() == ISD::ANY_EXTEND)
+ N0 = N0.getOperand(0);
+
+ if (N1.getOpcode() == ISD::ANY_EXTEND)
+ N1 = N1.getOperand(0);
+
SDValue Mul;
if (Subtarget->hasMulU24() && isU24(N0, DAG) && isU24(N1, DAG)) {
@@ -3495,6 +3602,7 @@ SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N,
case ISD::FSIN:
case AMDGPUISD::RCP:
case AMDGPUISD::RCP_LEGACY:
+ case AMDGPUISD::RCP_IFLAG:
case AMDGPUISD::SIN_HW: {
SDValue CvtSrc = N0.getOperand(0);
if (CvtSrc.getOpcode() == ISD::FNEG) {
@@ -3571,6 +3679,18 @@ SDValue AMDGPUTargetLowering::performFAbsCombine(SDNode *N,
}
}
+SDValue AMDGPUTargetLowering::performRcpCombine(SDNode *N,
+ DAGCombinerInfo &DCI) const {
+ const auto *CFP = dyn_cast<ConstantFPSDNode>(N->getOperand(0));
+ if (!CFP)
+ return SDValue();
+
+ // XXX - Should this flush denormals?
+ const APFloat &Val = CFP->getValueAPF();
+ APFloat One(Val.getSemantics(), "1.0");
+ return DCI.DAG.getConstantFP(One / Val, SDLoc(N), N->getValueType(0));
+}
+
SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
SelectionDAG &DAG = DCI.DAG;
@@ -3617,12 +3737,13 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
// TODO: Generalize and move to DAGCombiner
SDValue Src = N->getOperand(0);
if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Src)) {
- assert(Src.getValueType() == MVT::i64);
- SDLoc SL(N);
- uint64_t CVal = C->getZExtValue();
- return DAG.getNode(ISD::BUILD_VECTOR, SL, DestVT,
- DAG.getConstant(Lo_32(CVal), SL, MVT::i32),
- DAG.getConstant(Hi_32(CVal), SL, MVT::i32));
+ if (Src.getValueType() == MVT::i64) {
+ SDLoc SL(N);
+ uint64_t CVal = C->getZExtValue();
+ return DAG.getNode(ISD::BUILD_VECTOR, SL, DestVT,
+ DAG.getConstant(Lo_32(CVal), SL, MVT::i32),
+ DAG.getConstant(Hi_32(CVal), SL, MVT::i32));
+ }
}
if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Src)) {
@@ -3656,6 +3777,8 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
return performSraCombine(N, DCI);
}
+ case ISD::TRUNCATE:
+ return performTruncateCombine(N, DCI);
case ISD::MUL:
return performMulCombine(N, DCI);
case ISD::MULHS:
@@ -3768,18 +3891,9 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
return performLoadCombine(N, DCI);
case ISD::STORE:
return performStoreCombine(N, DCI);
- case AMDGPUISD::CLAMP:
- return performClampCombine(N, DCI);
- case AMDGPUISD::RCP: {
- if (const auto *CFP = dyn_cast<ConstantFPSDNode>(N->getOperand(0))) {
- // XXX - Should this flush denormals?
- const APFloat &Val = CFP->getValueAPF();
- APFloat One(Val.getSemantics(), "1.0");
- return DAG.getConstantFP(One / Val, SDLoc(N), N->getValueType(0));
- }
-
- break;
- }
+ case AMDGPUISD::RCP:
+ case AMDGPUISD::RCP_IFLAG:
+ return performRcpCombine(N, DCI);
case ISD::AssertZext:
case ISD::AssertSext:
return performAssertSZExtCombine(N, DCI);
@@ -3856,9 +3970,14 @@ SDValue AMDGPUTargetLowering::loadInputValue(SelectionDAG &DAG,
}
uint32_t AMDGPUTargetLowering::getImplicitParameterOffset(
- const AMDGPUMachineFunction *MFI, const ImplicitParameter Param) const {
- unsigned Alignment = Subtarget->getAlignmentForImplicitArgPtr();
- uint64_t ArgOffset = alignTo(MFI->getABIArgOffset(), Alignment);
+ const MachineFunction &MF, const ImplicitParameter Param) const {
+ const AMDGPUMachineFunction *MFI = MF.getInfo<AMDGPUMachineFunction>();
+ const AMDGPUSubtarget &ST =
+ AMDGPUSubtarget::get(getTargetMachine(), MF.getFunction());
+ unsigned ExplicitArgOffset = ST.getExplicitKernelArgOffset(MF.getFunction());
+ unsigned Alignment = ST.getAlignmentForImplicitArgPtr();
+ uint64_t ArgOffset = alignTo(MFI->getExplicitKernArgSize(), Alignment) +
+ ExplicitArgOffset;
switch (Param) {
case GRID_DIM:
return ArgOffset;
@@ -3907,6 +4026,7 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(FMED3)
NODE_NAME_CASE(SMED3)
NODE_NAME_CASE(UMED3)
+ NODE_NAME_CASE(FDOT2)
NODE_NAME_CASE(URECIP)
NODE_NAME_CASE(DIV_SCALE)
NODE_NAME_CASE(DIV_FMAS)
@@ -3917,6 +4037,7 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(RSQ)
NODE_NAME_CASE(RCP_LEGACY)
NODE_NAME_CASE(RSQ_LEGACY)
+ NODE_NAME_CASE(RCP_IFLAG)
NODE_NAME_CASE(FMUL_LEGACY)
NODE_NAME_CASE(RSQ_CLAMP)
NODE_NAME_CASE(LDEXP)
@@ -3941,6 +4062,7 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(MAD_I24)
NODE_NAME_CASE(MAD_I64_I32)
NODE_NAME_CASE(MAD_U64_U32)
+ NODE_NAME_CASE(PERM)
NODE_NAME_CASE(TEXTURE_FETCH)
NODE_NAME_CASE(EXPORT)
NODE_NAME_CASE(EXPORT_DONE)
@@ -3957,6 +4079,10 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(CVT_F32_UBYTE2)
NODE_NAME_CASE(CVT_F32_UBYTE3)
NODE_NAME_CASE(CVT_PKRTZ_F16_F32)
+ NODE_NAME_CASE(CVT_PKNORM_I16_F32)
+ NODE_NAME_CASE(CVT_PKNORM_U16_F32)
+ NODE_NAME_CASE(CVT_PK_I16_I32)
+ NODE_NAME_CASE(CVT_PK_U16_U32)
NODE_NAME_CASE(FP_TO_FP16)
NODE_NAME_CASE(FP16_ZEXT)
NODE_NAME_CASE(BUILD_VERTICAL_VECTOR)
@@ -3976,14 +4102,21 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(LOAD_CONSTANT)
NODE_NAME_CASE(TBUFFER_STORE_FORMAT)
NODE_NAME_CASE(TBUFFER_STORE_FORMAT_X3)
+ NODE_NAME_CASE(TBUFFER_STORE_FORMAT_D16)
NODE_NAME_CASE(TBUFFER_LOAD_FORMAT)
+ NODE_NAME_CASE(TBUFFER_LOAD_FORMAT_D16)
NODE_NAME_CASE(ATOMIC_CMP_SWAP)
NODE_NAME_CASE(ATOMIC_INC)
NODE_NAME_CASE(ATOMIC_DEC)
+ NODE_NAME_CASE(ATOMIC_LOAD_FADD)
+ NODE_NAME_CASE(ATOMIC_LOAD_FMIN)
+ NODE_NAME_CASE(ATOMIC_LOAD_FMAX)
NODE_NAME_CASE(BUFFER_LOAD)
NODE_NAME_CASE(BUFFER_LOAD_FORMAT)
+ NODE_NAME_CASE(BUFFER_LOAD_FORMAT_D16)
NODE_NAME_CASE(BUFFER_STORE)
NODE_NAME_CASE(BUFFER_STORE_FORMAT)
+ NODE_NAME_CASE(BUFFER_STORE_FORMAT_D16)
NODE_NAME_CASE(BUFFER_ATOMIC_SWAP)
NODE_NAME_CASE(BUFFER_ATOMIC_ADD)
NODE_NAME_CASE(BUFFER_ATOMIC_SUB)
@@ -3995,6 +4128,7 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(BUFFER_ATOMIC_OR)
NODE_NAME_CASE(BUFFER_ATOMIC_XOR)
NODE_NAME_CASE(BUFFER_ATOMIC_CMPSWAP)
+
case AMDGPUISD::LAST_AMDGPU_ISD_NUMBER: break;
}
return nullptr;
@@ -4108,14 +4242,45 @@ void AMDGPUTargetLowering::computeKnownBitsForTargetNode(
Known.Zero.setHighBits(32 - MaxValBits);
break;
}
+ case AMDGPUISD::PERM: {
+ ConstantSDNode *CMask = dyn_cast<ConstantSDNode>(Op.getOperand(2));
+ if (!CMask)
+ return;
+
+ KnownBits LHSKnown, RHSKnown;
+ DAG.computeKnownBits(Op.getOperand(0), LHSKnown, Depth + 1);
+ DAG.computeKnownBits(Op.getOperand(1), RHSKnown, Depth + 1);
+ unsigned Sel = CMask->getZExtValue();
+
+ for (unsigned I = 0; I < 32; I += 8) {
+ unsigned SelBits = Sel & 0xff;
+ if (SelBits < 4) {
+ SelBits *= 8;
+ Known.One |= ((RHSKnown.One.getZExtValue() >> SelBits) & 0xff) << I;
+ Known.Zero |= ((RHSKnown.Zero.getZExtValue() >> SelBits) & 0xff) << I;
+ } else if (SelBits < 7) {
+ SelBits = (SelBits & 3) * 8;
+ Known.One |= ((LHSKnown.One.getZExtValue() >> SelBits) & 0xff) << I;
+ Known.Zero |= ((LHSKnown.Zero.getZExtValue() >> SelBits) & 0xff) << I;
+ } else if (SelBits == 0x0c) {
+ Known.Zero |= 0xff << I;
+ } else if (SelBits > 0x0c) {
+ Known.One |= 0xff << I;
+ }
+ Sel >>= 8;
+ }
+ break;
+ }
case ISD::INTRINSIC_WO_CHAIN: {
unsigned IID = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
switch (IID) {
case Intrinsic::amdgcn_mbcnt_lo:
case Intrinsic::amdgcn_mbcnt_hi: {
+ const GCNSubtarget &ST =
+ DAG.getMachineFunction().getSubtarget<GCNSubtarget>();
// These return at most the wavefront size - 1.
unsigned Size = Op.getValueType().getSizeInBits();
- Known.Zero.setHighBits(Size - Subtarget->getWavefrontSizeLog2());
+ Known.Zero.setHighBits(Size - ST.getWavefrontSizeLog2());
break;
}
default:
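
For reference, the per-byte selection that the new AMDGPUISD::PERM known-bits handling above encodes can be written as a tiny standalone model. It mirrors only the selector values that code handles (0x07-0x0b are deliberately left out, as in the combine) and is not part of the patch:

#include <cstdint>

// Selectors 0-3 pick a byte of the second operand, 4-6 pick a byte of the
// first operand, 0x0c yields a zero byte, and anything above 0x0c yields an
// all-ones byte; these are exactly the cases the known-bits code asserts.
static bool permByte(uint32_t Lhs, uint32_t Rhs, uint8_t Sel, uint8_t &Out) {
  if (Sel < 4) {
    Out = (Rhs >> (Sel * 8)) & 0xff;
    return true;
  }
  if (Sel < 7) {
    Out = (Lhs >> ((Sel & 3) * 8)) & 0xff;
    return true;
  }
  if (Sel == 0x0c) { Out = 0x00; return true; }
  if (Sel > 0x0c)  { Out = 0xff; return true; }
  return false; // 0x07-0x0b: not modelled here
}

int main() {
  uint8_t B = 0;
  return permByte(0xddccbbaa, 0x44332211, 0x05, B) && B == 0xbb ? 0 : 1;
}
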
diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.h b/lib/Target/AMDGPU/AMDGPUISelLowering.h
index 5c31bddd9b1a..a4c3b413e103 100644
--- a/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ b/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
//
/// \file
-/// \brief Interface definition of the TargetLowering class that is common
+/// Interface definition of the TargetLowering class that is common
/// to all AMD GPUs.
//
//===----------------------------------------------------------------------===//
@@ -28,6 +28,8 @@ struct ArgDescriptor;
class AMDGPUTargetLowering : public TargetLowering {
private:
+ const AMDGPUSubtarget *Subtarget;
+
/// \returns AMDGPUISD::FFBH_U32 node if the incoming \p Op may have been
/// legalized from a smaller type VT. Need to match pre-legalized type because
/// the generic legalization inserts the add/sub between the select and
@@ -39,12 +41,11 @@ public:
static unsigned numBitsSigned(SDValue Op, SelectionDAG &DAG);
protected:
- const AMDGPUSubtarget *Subtarget;
AMDGPUAS AMDGPUASI;
SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const;
- /// \brief Split a vector store into multiple scalar stores.
+ /// Split a vector store into multiple scalar stores.
/// \returns The resulting chain.
SDValue LowerFREM(SDValue Op, SelectionDAG &DAG) const;
@@ -78,7 +79,6 @@ protected:
bool shouldCombineMemoryType(EVT VT) const;
SDValue performLoadCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performStoreCombine(SDNode *N, DAGCombinerInfo &DCI) const;
- SDValue performClampCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performAssertSZExtCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue splitBinaryBitConstantOpImpl(DAGCombinerInfo &DCI, const SDLoc &SL,
@@ -87,6 +87,7 @@ protected:
SDValue performShlCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performSraCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performSrlCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+ SDValue performTruncateCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performMulCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performMulhsCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performMulhuCombine(SDNode *N, DAGCombinerInfo &DCI) const;
@@ -96,6 +97,7 @@ protected:
SDValue performSelectCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performFNegCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performFAbsCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+ SDValue performRcpCombine(SDNode *N, DAGCombinerInfo &DCI) const;
static EVT getEquivalentMemType(LLVMContext &Context, EVT VT);
@@ -108,10 +110,10 @@ protected:
SDValue getLoHalf64(SDValue Op, SelectionDAG &DAG) const;
SDValue getHiHalf64(SDValue Op, SelectionDAG &DAG) const;
- /// \brief Split a vector load into 2 loads of half the vector.
+ /// Split a vector load into 2 loads of half the vector.
SDValue SplitVectorLoad(SDValue Op, SelectionDAG &DAG) const;
- /// \brief Split a vector store into 2 stores of half the vector.
+ /// Split a vector store into 2 stores of half the vector.
SDValue SplitVectorStore(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const;
@@ -120,8 +122,11 @@ protected:
SDValue LowerDIVREM24(SDValue Op, SelectionDAG &DAG, bool sign) const;
void LowerUDIVREM64(SDValue Op, SelectionDAG &DAG,
SmallVectorImpl<SDValue> &Results) const;
- void analyzeFormalArgumentsCompute(CCState &State,
- const SmallVectorImpl<ISD::InputArg> &Ins) const;
+
+ void analyzeFormalArgumentsCompute(
+ CCState &State,
+ const SmallVectorImpl<ISD::InputArg> &Ins) const;
+
public:
AMDGPUTargetLowering(const TargetMachine &TM, const AMDGPUSubtarget &STI);
@@ -136,6 +141,10 @@ public:
return false;
}
+ static inline SDValue stripBitcast(SDValue Val) {
+ return Val.getOpcode() == ISD::BITCAST ? Val.getOperand(0) : Val;
+ }
+
static bool allUsesHaveSourceMods(const SDNode *N,
unsigned CostThreshold = 4);
bool isFAbsFree(EVT VT) const override;
@@ -146,7 +155,6 @@ public:
bool isZExtFree(Type *Src, Type *Dest) const override;
bool isZExtFree(EVT Src, EVT Dest) const override;
bool isZExtFree(SDValue Val, EVT VT2) const override;
- bool isFPExtFoldable(unsigned Opcode, EVT DestVT, EVT SrcVT) const override;
bool isNarrowingProfitable(EVT VT1, EVT VT2) const override;
@@ -168,6 +176,7 @@ public:
bool isCheapToSpeculateCttz() const override;
bool isCheapToSpeculateCtlz() const override;
+ bool isSDNodeAlwaysUniform(const SDNode *N) const override;
static CCAssignFn *CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg);
static CCAssignFn *CCAssignFnForReturn(CallingConv::ID CC, bool IsVarArg);
@@ -224,7 +233,7 @@ public:
virtual SDNode *PostISelFolding(MachineSDNode *N,
SelectionDAG &DAG) const = 0;
- /// \brief Determine which of the bits specified in \p Mask are known to be
+ /// Determine which of the bits specified in \p Mask are known to be
/// either zero or one and return them in the \p KnownZero and \p KnownOne
/// bitsets.
void computeKnownBitsForTargetNode(const SDValue Op,
@@ -237,7 +246,7 @@ public:
const SelectionDAG &DAG,
unsigned Depth = 0) const override;
- /// \brief Helper function that adds Reg to the LiveIn list of the DAG's
+ /// Helper function that adds Reg to the LiveIn list of the DAG's
/// MachineFunction.
///
/// \returns a RegisterSDNode representing Reg if \p RawReg is true, otherwise
@@ -285,9 +294,9 @@ public:
GRID_OFFSET,
};
- /// \brief Helper function that returns the byte offset of the given
+ /// Helper function that returns the byte offset of the given
/// type of implicit parameter.
- uint32_t getImplicitParameterOffset(const AMDGPUMachineFunction *MFI,
+ uint32_t getImplicitParameterOffset(const MachineFunction &MF,
const ImplicitParameter Param) const;
AMDGPUAS getAMDGPUAS() const {
@@ -357,6 +366,7 @@ enum NodeType : unsigned {
FMED3,
SMED3,
UMED3,
+ FDOT2,
URECIP,
DIV_SCALE,
DIV_FMAS,
@@ -372,6 +382,7 @@ enum NodeType : unsigned {
RSQ,
RCP_LEGACY,
RSQ_LEGACY,
+ RCP_IFLAG,
FMUL_LEGACY,
RSQ_CLAMP,
LDEXP,
@@ -396,6 +407,7 @@ enum NodeType : unsigned {
MAD_I64_I32,
MUL_LOHI_I24,
MUL_LOHI_U24,
+ PERM,
TEXTURE_FETCH,
EXPORT, // exp on SI+
EXPORT_DONE, // exp on SI+ with done bit set
@@ -417,6 +429,10 @@ enum NodeType : unsigned {
// Convert two float 32 numbers into a single register holding two packed f16
// with round to zero.
CVT_PKRTZ_F16_F32,
+ CVT_PKNORM_I16_F32,
+ CVT_PKNORM_U16_F32,
+ CVT_PK_I16_I32,
+ CVT_PK_U16_U32,
// Same as the standard node, except the high bits of the resulting integer
// are known 0.
@@ -451,14 +467,21 @@ enum NodeType : unsigned {
LOAD_CONSTANT,
TBUFFER_STORE_FORMAT,
TBUFFER_STORE_FORMAT_X3,
+ TBUFFER_STORE_FORMAT_D16,
TBUFFER_LOAD_FORMAT,
+ TBUFFER_LOAD_FORMAT_D16,
ATOMIC_CMP_SWAP,
ATOMIC_INC,
ATOMIC_DEC,
+ ATOMIC_LOAD_FADD,
+ ATOMIC_LOAD_FMIN,
+ ATOMIC_LOAD_FMAX,
BUFFER_LOAD,
BUFFER_LOAD_FORMAT,
+ BUFFER_LOAD_FORMAT_D16,
BUFFER_STORE,
BUFFER_STORE_FORMAT,
+ BUFFER_STORE_FORMAT_D16,
BUFFER_ATOMIC_SWAP,
BUFFER_ATOMIC_ADD,
BUFFER_ATOMIC_SUB,
@@ -470,6 +493,7 @@ enum NodeType : unsigned {
BUFFER_ATOMIC_OR,
BUFFER_ATOMIC_XOR,
BUFFER_ATOMIC_CMPSWAP,
+
LAST_AMDGPU_ISD_NUMBER
};
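
The reworked getImplicitParameterOffset (declared above, implemented earlier in this diff) now places the implicit arguments at the explicit kernel-argument size rounded up to the implicit-argument alignment, plus the target's explicit-argument base offset. A standalone sketch of that arithmetic with hypothetical sizes; it is not the backend's code:

#include <cstdint>

// Round Value up to a multiple of Align, as llvm::alignTo does.
static uint64_t alignTo(uint64_t Value, uint64_t Align) {
  return (Value + Align - 1) / Align * Align;
}

int main() {
  // Hypothetical numbers, chosen only to show the arithmetic.
  uint64_t ExplicitKernArgSize = 36; // bytes of explicit kernel arguments
  uint64_t ExplicitArgOffset   = 0;  // target-dependent base offset
  uint64_t Alignment           = 8;  // alignment for the implicit arg pointer
  uint64_t GridDimOffset =
      alignTo(ExplicitKernArgSize, Alignment) + ExplicitArgOffset; // 40 here
  return GridDimOffset == 40 ? 0 : 1;
}
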
diff --git a/lib/Target/AMDGPU/AMDGPUInline.cpp b/lib/Target/AMDGPU/AMDGPUInline.cpp
index ff9e7b50ed5c..35dd9eb0a478 100644
--- a/lib/Target/AMDGPU/AMDGPUInline.cpp
+++ b/lib/Target/AMDGPU/AMDGPUInline.cpp
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
//
/// \file
-/// \brief This is AMDGPU specific replacement of the standard inliner.
+/// This is AMDGPU specific replacement of the standard inliner.
/// The main purpose is to account for the fact that calls are not only expensive
/// on the AMDGPU, but much more expensive if a private memory pointer is
/// passed to a function as an argument. In this situation, we are unable to
@@ -161,8 +161,8 @@ static bool isWrapperOnlyCall(CallSite CS) {
return false;
}
if (isa<ReturnInst>(*std::next(I->getIterator()))) {
- DEBUG(dbgs() << " Wrapper only call detected: "
- << Callee->getName() << '\n');
+ LLVM_DEBUG(dbgs() << " Wrapper only call detected: "
+ << Callee->getName() << '\n');
return true;
}
}
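
A hypothetical C++ shape of the "wrapper only call" that isWrapperOnlyCall (context above) reports: the callee does nothing but forward to another call and immediately return its result. This example is illustrative and not taken from the patch:

// worker() does the real work; wrapper() is a call immediately followed by a
// return, which is the shape the debug message above refers to.
static int worker(int X) { return X * 2; }
static int wrapper(int X) { return worker(X); }

int main() { return wrapper(21) == 42 ? 0 : 1; }
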
diff --git a/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp b/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp
index 8156599528c2..07aa7c2cc8ad 100644
--- a/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp
+++ b/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp
@@ -16,95 +16,36 @@
#include "AMDGPUInstrInfo.h"
#include "AMDGPURegisterInfo.h"
#include "AMDGPUTargetMachine.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
using namespace llvm;
-#define GET_INSTRINFO_CTOR_DTOR
-#include "AMDGPUGenInstrInfo.inc"
-
// Pin the vtable to this file.
-void AMDGPUInstrInfo::anchor() {}
-
-AMDGPUInstrInfo::AMDGPUInstrInfo(const AMDGPUSubtarget &ST)
- : AMDGPUGenInstrInfo(AMDGPU::ADJCALLSTACKUP, AMDGPU::ADJCALLSTACKDOWN),
- ST(ST),
- AMDGPUASI(ST.getAMDGPUAS()) {}
-
-// FIXME: This behaves strangely. If, for example, you have 32 load + stores,
-// the first 16 loads will be interleaved with the stores, and the next 16 will
-// be clustered as expected. It should really split into 2 16 store batches.
-//
-// Loads are clustered until this returns false, rather than trying to schedule
-// groups of stores. This also means we have to deal with saying different
-// address space loads should be clustered, and ones which might cause bank
-// conflicts.
-//
-// This might be deprecated so it might not be worth that much effort to fix.
-bool AMDGPUInstrInfo::shouldScheduleLoadsNear(SDNode *Load0, SDNode *Load1,
- int64_t Offset0, int64_t Offset1,
- unsigned NumLoads) const {
- assert(Offset1 > Offset0 &&
- "Second offset should be larger than first offset!");
- // If we have less than 16 loads in a row, and the offsets are within 64
- // bytes, then schedule together.
-
- // A cacheline is 64 bytes (for global memory).
- return (NumLoads <= 16 && (Offset1 - Offset0) < 64);
-}
-
-// This must be kept in sync with the SIEncodingFamily class in SIInstrInfo.td
-enum SIEncodingFamily {
- SI = 0,
- VI = 1,
- SDWA = 2,
- SDWA9 = 3,
- GFX9 = 4
-};
-
-static SIEncodingFamily subtargetEncodingFamily(const AMDGPUSubtarget &ST) {
- switch (ST.getGeneration()) {
- case AMDGPUSubtarget::SOUTHERN_ISLANDS:
- case AMDGPUSubtarget::SEA_ISLANDS:
- return SIEncodingFamily::SI;
- case AMDGPUSubtarget::VOLCANIC_ISLANDS:
- case AMDGPUSubtarget::GFX9:
- return SIEncodingFamily::VI;
-
- // FIXME: This should never be called for r600 GPUs.
- case AMDGPUSubtarget::R600:
- case AMDGPUSubtarget::R700:
- case AMDGPUSubtarget::EVERGREEN:
- case AMDGPUSubtarget::NORTHERN_ISLANDS:
- return SIEncodingFamily::SI;
- }
-
- llvm_unreachable("Unknown subtarget generation!");
-}
-
-int AMDGPUInstrInfo::pseudoToMCOpcode(int Opcode) const {
- SIEncodingFamily Gen = subtargetEncodingFamily(ST);
+//void AMDGPUInstrInfo::anchor() {}
- if ((get(Opcode).TSFlags & SIInstrFlags::renamedInGFX9) != 0 &&
- ST.getGeneration() >= AMDGPUSubtarget::GFX9)
- Gen = SIEncodingFamily::GFX9;
+AMDGPUInstrInfo::AMDGPUInstrInfo(const GCNSubtarget &ST) { }
- if (get(Opcode).TSFlags & SIInstrFlags::SDWA)
- Gen = ST.getGeneration() == AMDGPUSubtarget::GFX9 ? SIEncodingFamily::SDWA9
- : SIEncodingFamily::SDWA;
- int MCOp = AMDGPU::getMCOpcode(Opcode, Gen);
+// TODO: Should largely merge with AMDGPUTTIImpl::isSourceOfDivergence.
+bool AMDGPUInstrInfo::isUniformMMO(const MachineMemOperand *MMO) {
+ const Value *Ptr = MMO->getValue();
+ // UndefValue means this is a load of a kernel input. These are uniform.
+ // Sometimes LDS instructions have constant pointers.
+ // If Ptr is null, then that means this mem operand contains a
+ // PseudoSourceValue like GOT.
+ if (!Ptr || isa<UndefValue>(Ptr) ||
+ isa<Constant>(Ptr) || isa<GlobalValue>(Ptr))
+ return true;
- // -1 means that Opcode is already a native instruction.
- if (MCOp == -1)
- return Opcode;
+ if (MMO->getAddrSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
+ return true;
- // (uint16_t)-1 means that Opcode is a pseudo instruction that has
- // no encoding in the given subtarget generation.
- if (MCOp == (uint16_t)-1)
- return -1;
+ if (const Argument *Arg = dyn_cast<Argument>(Ptr))
+ return AMDGPU::isArgPassedInSGPR(Arg);
- return MCOp;
+ const Instruction *I = dyn_cast<Instruction>(Ptr);
+ return I && I->getMetadata("amdgpu.uniform");
}
diff --git a/lib/Target/AMDGPU/AMDGPUInstrInfo.h b/lib/Target/AMDGPU/AMDGPUInstrInfo.h
index a9fcd4834638..2f8166da0d33 100644
--- a/lib/Target/AMDGPU/AMDGPUInstrInfo.h
+++ b/lib/Target/AMDGPU/AMDGPUInstrInfo.h
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
//
/// \file
-/// \brief Contains the definition of a TargetInstrInfo class that is common
+/// Contains the definition of a TargetInstrInfo class that is common
/// to all AMD GPUs.
//
//===----------------------------------------------------------------------===//
@@ -20,37 +20,43 @@
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
-#define GET_INSTRINFO_HEADER
-#include "AMDGPUGenInstrInfo.inc"
-#undef GET_INSTRINFO_HEADER
-
namespace llvm {
-class AMDGPUSubtarget;
+class GCNSubtarget;
class MachineFunction;
class MachineInstr;
class MachineInstrBuilder;
-class AMDGPUInstrInfo : public AMDGPUGenInstrInfo {
-private:
- const AMDGPUSubtarget &ST;
+class AMDGPUInstrInfo {
+public:
+ explicit AMDGPUInstrInfo(const GCNSubtarget &st);
- virtual void anchor();
-protected:
- AMDGPUAS AMDGPUASI;
+ static bool isUniformMMO(const MachineMemOperand *MMO);
+};
-public:
- explicit AMDGPUInstrInfo(const AMDGPUSubtarget &st);
+namespace AMDGPU {
- bool shouldScheduleLoadsNear(SDNode *Load1, SDNode *Load2,
- int64_t Offset1, int64_t Offset2,
- unsigned NumLoads) const override;
+struct RsrcIntrinsic {
+ unsigned Intr;
+ uint8_t RsrcArg;
+ bool IsImage;
+};
+const RsrcIntrinsic *lookupRsrcIntrinsic(unsigned Intr);
+
+struct D16ImageDimIntrinsic {
+ unsigned Intr;
+ unsigned D16HelperIntr;
+};
+const D16ImageDimIntrinsic *lookupD16ImageDimIntrinsic(unsigned Intr);
- /// \brief Return a target-specific opcode if Opcode is a pseudo instruction.
- /// Return -1 if the target-specific opcode for the pseudo instruction does
- /// not exist. If Opcode is not a pseudo instruction, this is identity.
- int pseudoToMCOpcode(int Opcode) const;
+struct ImageDimIntrinsicInfo {
+ unsigned Intr;
+ unsigned BaseOpcode;
+ MIMGDim Dim;
};
+const ImageDimIntrinsicInfo *getImageDimIntrinsicInfo(unsigned Intr);
+
+} // end AMDGPU namespace
} // End llvm namespace
#endif
diff --git a/lib/Target/AMDGPU/AMDGPUInstrInfo.td b/lib/Target/AMDGPU/AMDGPUInstrInfo.td
index c024010f3e96..96b7568eec1f 100644
--- a/lib/Target/AMDGPU/AMDGPUInstrInfo.td
+++ b/lib/Target/AMDGPU/AMDGPUInstrInfo.td
@@ -35,6 +35,10 @@ def AMDGPUFPPackOp : SDTypeProfile<1, 2,
[SDTCisFP<1>, SDTCisSameAs<1, 2>]
>;
+def AMDGPUIntPackOp : SDTypeProfile<1, 2,
+ [SDTCisInt<1>, SDTCisSameAs<1, 2>]
+>;
+
def AMDGPUDivScaleOp : SDTypeProfile<2, 3,
[SDTCisFP<0>, SDTCisInt<1>, SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>, SDTCisSameAs<0, 4>]
>;
@@ -136,12 +140,18 @@ def AMDGPUrsq : SDNode<"AMDGPUISD::RSQ", SDTFPUnaryOp>;
def AMDGPUrcp_legacy : SDNode<"AMDGPUISD::RCP_LEGACY", SDTFPUnaryOp>;
def AMDGPUrsq_legacy : SDNode<"AMDGPUISD::RSQ_LEGACY", SDTFPUnaryOp>;
+def AMDGPUrcp_iflag : SDNode<"AMDGPUISD::RCP_IFLAG", SDTFPUnaryOp>;
+
// out = 1.0 / sqrt(a) result clamped to +/- max_float.
def AMDGPUrsq_clamp : SDNode<"AMDGPUISD::RSQ_CLAMP", SDTFPUnaryOp>;
def AMDGPUldexp : SDNode<"AMDGPUISD::LDEXP", AMDGPULdExpOp>;
def AMDGPUpkrtz_f16_f32 : SDNode<"AMDGPUISD::CVT_PKRTZ_F16_F32", AMDGPUFPPackOp>;
+def AMDGPUpknorm_i16_f32 : SDNode<"AMDGPUISD::CVT_PKNORM_I16_F32", AMDGPUFPPackOp>;
+def AMDGPUpknorm_u16_f32 : SDNode<"AMDGPUISD::CVT_PKNORM_U16_F32", AMDGPUFPPackOp>;
+def AMDGPUpk_i16_i32 : SDNode<"AMDGPUISD::CVT_PK_I16_I32", AMDGPUIntPackOp>;
+def AMDGPUpk_u16_u32 : SDNode<"AMDGPUISD::CVT_PK_U16_U32", AMDGPUIntPackOp>;
def AMDGPUfp_to_f16 : SDNode<"AMDGPUISD::FP_TO_FP16" , SDTFPToIntOp>;
def AMDGPUfp16_zext : SDNode<"AMDGPUISD::FP16_ZEXT" , SDTFPToIntOp>;
@@ -160,8 +170,6 @@ def AMDGPUfmul_legacy : SDNode<"AMDGPUISD::FMUL_LEGACY", SDTFPBinOp,
[SDNPCommutative, SDNPAssociative]
>;
-def AMDGPUclamp : SDNode<"AMDGPUISD::CLAMP", SDTFPUnaryOp>;
-
// out = min(a, b) a and b are floats, where a nan comparison fails.
def AMDGPUfmin_legacy : SDNode<"AMDGPUISD::FMIN_LEGACY", SDTFPBinOp,
[]
@@ -333,6 +341,13 @@ def AMDGPUumed3 : SDNode<"AMDGPUISD::UMED3", AMDGPUDTIntTernaryOp,
def AMDGPUfmed3 : SDNode<"AMDGPUISD::FMED3", SDTFPTernaryOp, []>;
+def AMDGPUfdot2 : SDNode<"AMDGPUISD::FDOT2",
+ SDTypeProfile<1, 3, [SDTCisSameAs<0, 3>, SDTCisSameAs<1, 2>,
+ SDTCisFP<0>, SDTCisVec<1>]>,
+ []>;
+
+def AMDGPUperm : SDNode<"AMDGPUISD::PERM", AMDGPUDTIntTernaryOp, []>;
+
def AMDGPUinit_exec : SDNode<"AMDGPUISD::INIT_EXEC",
SDTypeProfile<0, 1, [SDTCisInt<0>]>,
[SDNPHasChain, SDNPInGlue]>;
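
The new CVT_PKNORM_* and CVT_PK_* nodes above appear to follow the same packed-result shape as the existing CVT_PKRTZ_F16_F32: two 16-bit results placed in one 32-bit register. A standalone sketch of just the packing step, with the conversions themselves deliberately omitted; this is an assumption-labelled illustration, not the backend's code:

#include <cassert>
#include <cstdint>

// Assumed layout: the first result goes in bits [15:0] and the second result
// in bits [31:16] of a single 32-bit value.
static uint32_t pack16(uint16_t Lo, uint16_t Hi) {
  return static_cast<uint32_t>(Lo) | (static_cast<uint32_t>(Hi) << 16);
}

int main() {
  assert(pack16(0x1234, 0xabcd) == 0xabcd1234u);
  return 0;
}
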
diff --git a/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index 16d240e96196..219d430fbb39 100644
--- a/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -17,6 +17,12 @@
#include "AMDGPURegisterBankInfo.h"
#include "AMDGPURegisterInfo.h"
#include "AMDGPUSubtarget.h"
+#include "AMDGPUTargetMachine.h"
+#include "SIMachineFunctionInfo.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "llvm/CodeGen/GlobalISel/InstructionSelector.h"
+#include "llvm/CodeGen/GlobalISel/InstructionSelectorImpl.h"
+#include "llvm/CodeGen/GlobalISel/Utils.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstr.h"
@@ -30,10 +36,48 @@
using namespace llvm;
+#define GET_GLOBALISEL_IMPL
+#define AMDGPUSubtarget GCNSubtarget
+#include "AMDGPUGenGlobalISel.inc"
+#undef GET_GLOBALISEL_IMPL
+#undef AMDGPUSubtarget
+
AMDGPUInstructionSelector::AMDGPUInstructionSelector(
- const SISubtarget &STI, const AMDGPURegisterBankInfo &RBI)
+ const GCNSubtarget &STI, const AMDGPURegisterBankInfo &RBI,
+ const AMDGPUTargetMachine &TM)
: InstructionSelector(), TII(*STI.getInstrInfo()),
- TRI(*STI.getRegisterInfo()), RBI(RBI), AMDGPUASI(STI.getAMDGPUAS()) {}
+ TRI(*STI.getRegisterInfo()), RBI(RBI), TM(TM),
+ STI(STI),
+ EnableLateStructurizeCFG(AMDGPUTargetMachine::EnableLateStructurizeCFG),
+#define GET_GLOBALISEL_PREDICATES_INIT
+#include "AMDGPUGenGlobalISel.inc"
+#undef GET_GLOBALISEL_PREDICATES_INIT
+#define GET_GLOBALISEL_TEMPORARIES_INIT
+#include "AMDGPUGenGlobalISel.inc"
+#undef GET_GLOBALISEL_TEMPORARIES_INIT
+ ,AMDGPUASI(STI.getAMDGPUAS())
+{
+}
+
+const char *AMDGPUInstructionSelector::getName() { return DEBUG_TYPE; }
+
+bool AMDGPUInstructionSelector::selectCOPY(MachineInstr &I) const {
+ MachineBasicBlock *BB = I.getParent();
+ MachineFunction *MF = BB->getParent();
+ MachineRegisterInfo &MRI = MF->getRegInfo();
+ I.setDesc(TII.get(TargetOpcode::COPY));
+ for (const MachineOperand &MO : I.operands()) {
+ if (TargetRegisterInfo::isPhysicalRegister(MO.getReg()))
+ continue;
+
+ const TargetRegisterClass *RC =
+ TRI.getConstrainedRegClassForOperand(MO, MRI);
+ if (!RC)
+ continue;
+ RBI.constrainGenericRegister(MO.getReg(), *RC, MRI);
+ }
+ return true;
+}
MachineOperand
AMDGPUInstructionSelector::getSubOperand64(MachineOperand &MO,
@@ -71,6 +115,10 @@ AMDGPUInstructionSelector::getSubOperand64(MachineOperand &MO,
}
}
+static int64_t getConstant(const MachineInstr *MI) {
+ return MI->getOperand(1).getCImm()->getSExtValue();
+}
+
bool AMDGPUInstructionSelector::selectG_ADD(MachineInstr &I) const {
MachineBasicBlock *BB = I.getParent();
MachineFunction *MF = BB->getParent();
@@ -118,12 +166,144 @@ bool AMDGPUInstructionSelector::selectG_GEP(MachineInstr &I) const {
return selectG_ADD(I);
}
+bool AMDGPUInstructionSelector::selectG_IMPLICIT_DEF(MachineInstr &I) const {
+ MachineBasicBlock *BB = I.getParent();
+ MachineFunction *MF = BB->getParent();
+ MachineRegisterInfo &MRI = MF->getRegInfo();
+ const MachineOperand &MO = I.getOperand(0);
+ const TargetRegisterClass *RC =
+ TRI.getConstrainedRegClassForOperand(MO, MRI);
+ if (RC)
+ RBI.constrainGenericRegister(MO.getReg(), *RC, MRI);
+ I.setDesc(TII.get(TargetOpcode::IMPLICIT_DEF));
+ return true;
+}
+
+bool AMDGPUInstructionSelector::selectG_INTRINSIC(MachineInstr &I,
+ CodeGenCoverage &CoverageInfo) const {
+ unsigned IntrinsicID = I.getOperand(1).getIntrinsicID();
+
+ switch (IntrinsicID) {
+ default:
+ break;
+ case Intrinsic::maxnum:
+ case Intrinsic::minnum:
+ case Intrinsic::amdgcn_cvt_pkrtz:
+ return selectImpl(I, CoverageInfo);
+
+ case Intrinsic::amdgcn_kernarg_segment_ptr: {
+ MachineFunction *MF = I.getParent()->getParent();
+ MachineRegisterInfo &MRI = MF->getRegInfo();
+ const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
+ const ArgDescriptor *InputPtrReg;
+ const TargetRegisterClass *RC;
+ const DebugLoc &DL = I.getDebugLoc();
+
+ std::tie(InputPtrReg, RC)
+ = MFI->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
+ if (!InputPtrReg)
+ report_fatal_error("missing kernarg segment ptr");
+
+ BuildMI(*I.getParent(), &I, DL, TII.get(AMDGPU::COPY))
+ .add(I.getOperand(0))
+ .addReg(MRI.getLiveInVirtReg(InputPtrReg->getRegister()));
+ I.eraseFromParent();
+ return true;
+ }
+ }
+ return false;
+}
+
+static MachineInstr *
+buildEXP(const TargetInstrInfo &TII, MachineInstr *Insert, unsigned Tgt,
+ unsigned Reg0, unsigned Reg1, unsigned Reg2, unsigned Reg3,
+ unsigned VM, bool Compr, unsigned Enabled, bool Done) {
+ const DebugLoc &DL = Insert->getDebugLoc();
+ MachineBasicBlock &BB = *Insert->getParent();
+ unsigned Opcode = Done ? AMDGPU::EXP_DONE : AMDGPU::EXP;
+ return BuildMI(BB, Insert, DL, TII.get(Opcode))
+ .addImm(Tgt)
+ .addReg(Reg0)
+ .addReg(Reg1)
+ .addReg(Reg2)
+ .addReg(Reg3)
+ .addImm(VM)
+ .addImm(Compr)
+ .addImm(Enabled);
+}
+
+bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
+ MachineInstr &I,
+ CodeGenCoverage &CoverageInfo) const {
+ MachineBasicBlock *BB = I.getParent();
+ MachineFunction *MF = BB->getParent();
+ MachineRegisterInfo &MRI = MF->getRegInfo();
+
+ unsigned IntrinsicID = I.getOperand(0).getIntrinsicID();
+ switch (IntrinsicID) {
+ case Intrinsic::amdgcn_exp: {
+ int64_t Tgt = getConstant(MRI.getVRegDef(I.getOperand(1).getReg()));
+ int64_t Enabled = getConstant(MRI.getVRegDef(I.getOperand(2).getReg()));
+ int64_t Done = getConstant(MRI.getVRegDef(I.getOperand(7).getReg()));
+ int64_t VM = getConstant(MRI.getVRegDef(I.getOperand(8).getReg()));
+
+ MachineInstr *Exp = buildEXP(TII, &I, Tgt, I.getOperand(3).getReg(),
+ I.getOperand(4).getReg(),
+ I.getOperand(5).getReg(),
+ I.getOperand(6).getReg(),
+ VM, false, Enabled, Done);
+
+ I.eraseFromParent();
+ return constrainSelectedInstRegOperands(*Exp, TII, TRI, RBI);
+ }
+ case Intrinsic::amdgcn_exp_compr: {
+ const DebugLoc &DL = I.getDebugLoc();
+ int64_t Tgt = getConstant(MRI.getVRegDef(I.getOperand(1).getReg()));
+ int64_t Enabled = getConstant(MRI.getVRegDef(I.getOperand(2).getReg()));
+ unsigned Reg0 = I.getOperand(3).getReg();
+ unsigned Reg1 = I.getOperand(4).getReg();
+ unsigned Undef = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ int64_t Done = getConstant(MRI.getVRegDef(I.getOperand(5).getReg()));
+ int64_t VM = getConstant(MRI.getVRegDef(I.getOperand(6).getReg()));
+
+ BuildMI(*BB, &I, DL, TII.get(AMDGPU::IMPLICIT_DEF), Undef);
+ MachineInstr *Exp = buildEXP(TII, &I, Tgt, Reg0, Reg1, Undef, Undef, VM,
+ true, Enabled, Done);
+
+ I.eraseFromParent();
+ return constrainSelectedInstRegOperands(*Exp, TII, TRI, RBI);
+ }
+ }
+ return false;
+}
+
bool AMDGPUInstructionSelector::selectG_STORE(MachineInstr &I) const {
MachineBasicBlock *BB = I.getParent();
+ MachineFunction *MF = BB->getParent();
+ MachineRegisterInfo &MRI = MF->getRegInfo();
DebugLoc DL = I.getDebugLoc();
+ unsigned StoreSize = RBI.getSizeInBits(I.getOperand(0).getReg(), MRI, TRI);
+ unsigned Opcode;
// FIXME: Select store instruction based on address space
- MachineInstr *Flat = BuildMI(*BB, &I, DL, TII.get(AMDGPU::FLAT_STORE_DWORD))
+ switch (StoreSize) {
+ default:
+ return false;
+ case 32:
+ Opcode = AMDGPU::FLAT_STORE_DWORD;
+ break;
+ case 64:
+ Opcode = AMDGPU::FLAT_STORE_DWORDX2;
+ break;
+ case 96:
+ Opcode = AMDGPU::FLAT_STORE_DWORDX3;
+ break;
+ case 128:
+ Opcode = AMDGPU::FLAT_STORE_DWORDX4;
+ break;
+ }
+
+ MachineInstr *Flat = BuildMI(*BB, &I, DL, TII.get(Opcode))
.add(I.getOperand(1))
.add(I.getOperand(0))
.addImm(0) // offset
@@ -143,36 +323,67 @@ bool AMDGPUInstructionSelector::selectG_CONSTANT(MachineInstr &I) const {
MachineBasicBlock *BB = I.getParent();
MachineFunction *MF = BB->getParent();
MachineRegisterInfo &MRI = MF->getRegInfo();
+ MachineOperand &ImmOp = I.getOperand(1);
+
+ // The AMDGPU backend only supports Imm operands and not CImm or FPImm.
+ if (ImmOp.isFPImm()) {
+ const APInt &Imm = ImmOp.getFPImm()->getValueAPF().bitcastToAPInt();
+ ImmOp.ChangeToImmediate(Imm.getZExtValue());
+ } else if (ImmOp.isCImm()) {
+ ImmOp.ChangeToImmediate(ImmOp.getCImm()->getZExtValue());
+ }
+
unsigned DstReg = I.getOperand(0).getReg();
- unsigned Size = RBI.getSizeInBits(DstReg, MRI, TRI);
+ unsigned Size;
+ bool IsSgpr;
+ const RegisterBank *RB = MRI.getRegBankOrNull(I.getOperand(0).getReg());
+ if (RB) {
+ IsSgpr = RB->getID() == AMDGPU::SGPRRegBankID;
+ Size = MRI.getType(DstReg).getSizeInBits();
+ } else {
+ const TargetRegisterClass *RC = TRI.getRegClassForReg(MRI, DstReg);
+ IsSgpr = TRI.isSGPRClass(RC);
+ Size = TRI.getRegSizeInBits(*RC);
+ }
+ if (Size != 32 && Size != 64)
+ return false;
+
+ unsigned Opcode = IsSgpr ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
if (Size == 32) {
- I.setDesc(TII.get(AMDGPU::S_MOV_B32));
+ I.setDesc(TII.get(Opcode));
+ I.addImplicitDefUseOperands(*MF);
return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
}
- assert(Size == 64);
-
DebugLoc DL = I.getDebugLoc();
- unsigned LoReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
- unsigned HiReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
- const APInt &Imm = I.getOperand(1).getCImm()->getValue();
+ const TargetRegisterClass *RC = IsSgpr ? &AMDGPU::SReg_32_XM0RegClass :
+ &AMDGPU::VGPR_32RegClass;
+ unsigned LoReg = MRI.createVirtualRegister(RC);
+ unsigned HiReg = MRI.createVirtualRegister(RC);
+ const APInt &Imm = APInt(Size, I.getOperand(1).getImm());
- BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_MOV_B32), LoReg)
+ BuildMI(*BB, &I, DL, TII.get(Opcode), LoReg)
.addImm(Imm.trunc(32).getZExtValue());
- BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_MOV_B32), HiReg)
+ BuildMI(*BB, &I, DL, TII.get(Opcode), HiReg)
.addImm(Imm.ashr(32).getZExtValue());
- BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
- .addReg(LoReg)
- .addImm(AMDGPU::sub0)
- .addReg(HiReg)
- .addImm(AMDGPU::sub1);
+ const MachineInstr *RS =
+ BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
+ .addReg(LoReg)
+ .addImm(AMDGPU::sub0)
+ .addReg(HiReg)
+ .addImm(AMDGPU::sub1);
+
// We can't call constrainSelectedInstRegOperands here, because it doesn't
// work for target independent opcodes
I.eraseFromParent();
- return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_64RegClass, MRI);
+ const TargetRegisterClass *DstRC =
+ TRI.getConstrainedRegClassForOperand(RS->getOperand(0), MRI);
+ if (!DstRC)
+ return true;
+ return RBI.constrainGenericRegister(DstReg, *DstRC, MRI);
}
static bool isConstant(const MachineInstr &MI) {
@@ -228,6 +439,9 @@ static bool isInstrUniform(const MachineInstr &MI) {
isa<Constant>(Ptr) || isa<GlobalValue>(Ptr))
return true;
+ if (MMO->getAddrSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
+ return true;
+
const Instruction *I = dyn_cast<Instruction>(Ptr);
return I && I->getMetadata("amdgpu.uniform");
}
@@ -292,7 +506,8 @@ bool AMDGPUInstructionSelector::selectSMRD(MachineInstr &I,
if (!I.hasOneMemOperand())
return false;
- if ((*I.memoperands_begin())->getAddrSpace() != AMDGPUASI.CONSTANT_ADDRESS)
+ if ((*I.memoperands_begin())->getAddrSpace() != AMDGPUASI.CONSTANT_ADDRESS &&
+ (*I.memoperands_begin())->getAddrSpace() != AMDGPUASI.CONSTANT_ADDRESS_32BIT)
return false;
if (!isInstrUniform(I))
@@ -303,7 +518,7 @@ bool AMDGPUInstructionSelector::selectSMRD(MachineInstr &I,
MachineBasicBlock *BB = I.getParent();
MachineFunction *MF = BB->getParent();
- const SISubtarget &Subtarget = MF->getSubtarget<SISubtarget>();
+ const GCNSubtarget &Subtarget = MF->getSubtarget<GCNSubtarget>();
MachineRegisterInfo &MRI = MF->getRegInfo();
unsigned DstReg = I.getOperand(0).getReg();
const DebugLoc &DL = I.getDebugLoc();
@@ -405,18 +620,30 @@ bool AMDGPUInstructionSelector::selectG_LOAD(MachineInstr &I) const {
bool AMDGPUInstructionSelector::select(MachineInstr &I,
CodeGenCoverage &CoverageInfo) const {
- if (!isPreISelGenericOpcode(I.getOpcode()))
+ if (!isPreISelGenericOpcode(I.getOpcode())) {
+ if (I.isCopy())
+ return selectCOPY(I);
return true;
+ }
switch (I.getOpcode()) {
default:
- break;
+ return selectImpl(I, CoverageInfo);
case TargetOpcode::G_ADD:
return selectG_ADD(I);
+ case TargetOpcode::G_BITCAST:
+ return selectCOPY(I);
case TargetOpcode::G_CONSTANT:
+ case TargetOpcode::G_FCONSTANT:
return selectG_CONSTANT(I);
case TargetOpcode::G_GEP:
return selectG_GEP(I);
+ case TargetOpcode::G_IMPLICIT_DEF:
+ return selectG_IMPLICIT_DEF(I);
+ case TargetOpcode::G_INTRINSIC:
+ return selectG_INTRINSIC(I, CoverageInfo);
+ case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS:
+ return selectG_INTRINSIC_W_SIDE_EFFECTS(I, CoverageInfo);
case TargetOpcode::G_LOAD:
return selectG_LOAD(I);
case TargetOpcode::G_STORE:
@@ -424,3 +651,47 @@ bool AMDGPUInstructionSelector::select(MachineInstr &I,
}
return false;
}
+
+InstructionSelector::ComplexRendererFns
+AMDGPUInstructionSelector::selectVCSRC(MachineOperand &Root) const {
+ return {{
+ [=](MachineInstrBuilder &MIB) { MIB.add(Root); }
+ }};
+
+}
+
+///
+/// This will select either an SGPR or VGPR operand and will save us from
+/// having to write an extra tablegen pattern.
+InstructionSelector::ComplexRendererFns
+AMDGPUInstructionSelector::selectVSRC0(MachineOperand &Root) const {
+ return {{
+ [=](MachineInstrBuilder &MIB) { MIB.add(Root); }
+ }};
+}
+
+InstructionSelector::ComplexRendererFns
+AMDGPUInstructionSelector::selectVOP3Mods0(MachineOperand &Root) const {
+ return {{
+ [=](MachineInstrBuilder &MIB) { MIB.add(Root); },
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // src0_mods
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // omod
+ }};
+}
+InstructionSelector::ComplexRendererFns
+AMDGPUInstructionSelector::selectVOP3OMods(MachineOperand &Root) const {
+ return {{
+ [=](MachineInstrBuilder &MIB) { MIB.add(Root); },
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // omod
+ }};
+}
+
+InstructionSelector::ComplexRendererFns
+AMDGPUInstructionSelector::selectVOP3Mods(MachineOperand &Root) const {
+ return {{
+ [=](MachineInstrBuilder &MIB) { MIB.add(Root); },
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // src_mods
+ }};
+}
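
selectG_CONSTANT above materializes a 64-bit immediate as two 32-bit moves joined by a REG_SEQUENCE, so the split is simply the low and high words of the constant. A standalone sketch where plain integers stand in for APInt (not the selector's code):

#include <cassert>
#include <cstdint>

int main() {
  uint64_t Imm = 0x1234567890abcdefULL;            // hypothetical constant
  uint32_t Lo  = static_cast<uint32_t>(Imm);       // Imm.trunc(32)
  uint32_t Hi  = static_cast<uint32_t>(Imm >> 32); // low word of Imm.ashr(32)
  // REG_SEQUENCE places Lo in sub0 and Hi in sub1, reassembling the value.
  assert(((static_cast<uint64_t>(Hi) << 32) | Lo) == Imm);
  return 0;
}
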
diff --git a/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
index 715c4882f380..68b40b20aca2 100644
--- a/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
+++ b/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
@@ -15,27 +15,39 @@
#define LLVM_LIB_TARGET_AMDGPU_AMDGPUINSTRUCTIONSELECTOR_H
#include "AMDGPU.h"
+#include "AMDGPUArgumentUsageInfo.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/GlobalISel/InstructionSelector.h"
+namespace {
+#define GET_GLOBALISEL_PREDICATE_BITSET
+#define AMDGPUSubtarget GCNSubtarget
+#include "AMDGPUGenGlobalISel.inc"
+#undef GET_GLOBALISEL_PREDICATE_BITSET
+#undef AMDGPUSubtarget
+}
+
namespace llvm {
class AMDGPUInstrInfo;
class AMDGPURegisterBankInfo;
+class GCNSubtarget;
class MachineInstr;
class MachineOperand;
class MachineRegisterInfo;
class SIInstrInfo;
+class SIMachineFunctionInfo;
class SIRegisterInfo;
-class SISubtarget;
class AMDGPUInstructionSelector : public InstructionSelector {
public:
- AMDGPUInstructionSelector(const SISubtarget &STI,
- const AMDGPURegisterBankInfo &RBI);
+ AMDGPUInstructionSelector(const GCNSubtarget &STI,
+ const AMDGPURegisterBankInfo &RBI,
+ const AMDGPUTargetMachine &TM);
bool select(MachineInstr &I, CodeGenCoverage &CoverageInfo) const override;
+ static const char *getName();
private:
struct GEPInfo {
@@ -46,10 +58,18 @@ private:
GEPInfo(const MachineInstr &GEP) : GEP(GEP), Imm(0) { }
};
+ /// tblgen-erated 'select' implementation.
+ bool selectImpl(MachineInstr &I, CodeGenCoverage &CoverageInfo) const;
+
MachineOperand getSubOperand64(MachineOperand &MO, unsigned SubIdx) const;
+ bool selectCOPY(MachineInstr &I) const;
bool selectG_CONSTANT(MachineInstr &I) const;
bool selectG_ADD(MachineInstr &I) const;
bool selectG_GEP(MachineInstr &I) const;
+ bool selectG_IMPLICIT_DEF(MachineInstr &I) const;
+ bool selectG_INTRINSIC(MachineInstr &I, CodeGenCoverage &CoverageInfo) const;
+ bool selectG_INTRINSIC_W_SIDE_EFFECTS(MachineInstr &I,
+ CodeGenCoverage &CoverageInfo) const;
bool hasVgprParts(ArrayRef<GEPInfo> AddrInfo) const;
void getAddrModeInfo(const MachineInstr &Load, const MachineRegisterInfo &MRI,
SmallVectorImpl<GEPInfo> &AddrInfo) const;
@@ -57,9 +77,35 @@ private:
bool selectG_LOAD(MachineInstr &I) const;
bool selectG_STORE(MachineInstr &I) const;
+ InstructionSelector::ComplexRendererFns
+ selectVCSRC(MachineOperand &Root) const;
+
+ InstructionSelector::ComplexRendererFns
+ selectVSRC0(MachineOperand &Root) const;
+
+ InstructionSelector::ComplexRendererFns
+ selectVOP3Mods0(MachineOperand &Root) const;
+ InstructionSelector::ComplexRendererFns
+ selectVOP3OMods(MachineOperand &Root) const;
+ InstructionSelector::ComplexRendererFns
+ selectVOP3Mods(MachineOperand &Root) const;
+
const SIInstrInfo &TII;
const SIRegisterInfo &TRI;
const AMDGPURegisterBankInfo &RBI;
+ const AMDGPUTargetMachine &TM;
+ const GCNSubtarget &STI;
+ bool EnableLateStructurizeCFG;
+#define GET_GLOBALISEL_PREDICATES_DECL
+#define AMDGPUSubtarget GCNSubtarget
+#include "AMDGPUGenGlobalISel.inc"
+#undef GET_GLOBALISEL_PREDICATES_DECL
+#undef AMDGPUSubtarget
+
+#define GET_GLOBALISEL_TEMPORARIES_DECL
+#include "AMDGPUGenGlobalISel.inc"
+#undef GET_GLOBALISEL_TEMPORARIES_DECL
+
protected:
AMDGPUAS AMDGPUASI;
};
diff --git a/lib/Target/AMDGPU/AMDGPUInstructions.td b/lib/Target/AMDGPU/AMDGPUInstructions.td
index 31f728b0c22f..9426df399597 100644
--- a/lib/Target/AMDGPU/AMDGPUInstructions.td
+++ b/lib/Target/AMDGPU/AMDGPUInstructions.td
@@ -42,6 +42,47 @@ class AMDGPUShaderInst <dag outs, dag ins, string asm = "",
field bits<32> Inst = 0xffffffff;
}
+//===---------------------------------------------------------------------===//
+// Return instruction
+//===---------------------------------------------------------------------===//
+
+class ILFormat<dag outs, dag ins, string asmstr, list<dag> pattern>
+: Instruction {
+
+ let Namespace = "AMDGPU";
+ dag OutOperandList = outs;
+ dag InOperandList = ins;
+ let Pattern = pattern;
+ let AsmString = !strconcat(asmstr, "\n");
+ let isPseudo = 1;
+ let Itinerary = NullALU;
+ bit hasIEEEFlag = 0;
+ bit hasZeroOpFlag = 0;
+ let mayLoad = 0;
+ let mayStore = 0;
+ let hasSideEffects = 0;
+ let isCodeGenOnly = 1;
+}
+
+def TruePredicate : Predicate<"true">;
+
+// Exists to help track down where SubtargetPredicate isn't set rather
+// than letting tablegen crash with an unhelpful error.
+def InvalidPred : Predicate<"predicate not set on instruction or pattern">;
+
+class PredicateControl {
+ Predicate SubtargetPredicate = InvalidPred;
+ list<Predicate> AssemblerPredicates = [];
+ Predicate AssemblerPredicate = TruePredicate;
+ list<Predicate> OtherPredicates = [];
+ list<Predicate> Predicates = !listconcat([SubtargetPredicate,
+ AssemblerPredicate],
+ AssemblerPredicates,
+ OtherPredicates);
+}
+class AMDGPUPat<dag pattern, dag result> : Pat<pattern, result>,
+ PredicateControl;
+
def FP16Denormals : Predicate<"Subtarget->hasFP16Denormals()">;
def FP32Denormals : Predicate<"Subtarget->hasFP32Denormals()">;
def FP64Denormals : Predicate<"Subtarget->hasFP64Denormals()">;
@@ -52,7 +93,6 @@ def UnsafeFPMath : Predicate<"TM.Options.UnsafeFPMath">;
def FMA : Predicate<"Subtarget->hasFMA()">;
def InstFlag : OperandWithDefaultOps <i32, (ops (i32 0))>;
-def ADDRIndirect : ComplexPattern<iPTR, 2, "SelectADDRIndirect", [], []>;
def u16ImmTarget : AsmOperandClass {
let Name = "U16Imm";
@@ -95,12 +135,6 @@ def brtarget : Operand<OtherVT>;
// Misc. PatFrags
//===----------------------------------------------------------------------===//
-class HasOneUseUnaryOp<SDPatternOperator op> : PatFrag<
- (ops node:$src0),
- (op $src0),
- [{ return N->hasOneUse(); }]
->;
-
class HasOneUseBinOp<SDPatternOperator op> : PatFrag<
(ops node:$src0, node:$src1),
(op $src0, $src1),
@@ -113,8 +147,6 @@ class HasOneUseTernaryOp<SDPatternOperator op> : PatFrag<
[{ return N->hasOneUse(); }]
>;
-def trunc_oneuse : HasOneUseUnaryOp<trunc>;
-
let Properties = [SDNPCommutative, SDNPAssociative] in {
def smax_oneuse : HasOneUseBinOp<smax>;
def smin_oneuse : HasOneUseBinOp<smin>;
@@ -127,6 +159,7 @@ def or_oneuse : HasOneUseBinOp<or>;
def xor_oneuse : HasOneUseBinOp<xor>;
} // Properties = [SDNPCommutative, SDNPAssociative]
+def add_oneuse : HasOneUseBinOp<add>;
def sub_oneuse : HasOneUseBinOp<sub>;
def srl_oneuse : HasOneUseBinOp<srl>;
@@ -240,6 +273,37 @@ def COND_NULL : PatLeaf <
[{(void)N; return false;}]
>;
+//===----------------------------------------------------------------------===//
+// PatLeafs for Texture Constants
+//===----------------------------------------------------------------------===//
+
+def TEX_ARRAY : PatLeaf<
+ (imm),
+ [{uint32_t TType = (uint32_t)N->getZExtValue();
+ return TType == 9 || TType == 10 || TType == 16;
+ }]
+>;
+
+def TEX_RECT : PatLeaf<
+ (imm),
+ [{uint32_t TType = (uint32_t)N->getZExtValue();
+ return TType == 5;
+ }]
+>;
+
+def TEX_SHADOW : PatLeaf<
+ (imm),
+ [{uint32_t TType = (uint32_t)N->getZExtValue();
+ return (TType >= 6 && TType <= 8) || TType == 13;
+ }]
+>;
+
+def TEX_SHADOW_ARRAY : PatLeaf<
+ (imm),
+ [{uint32_t TType = (uint32_t)N->getZExtValue();
+ return TType == 11 || TType == 12 || TType == 17;
+ }]
+>;
//===----------------------------------------------------------------------===//
// Load/Store Pattern Fragments
@@ -249,6 +313,10 @@ class Aligned8Bytes <dag ops, dag frag> : PatFrag <ops, frag, [{
return cast<MemSDNode>(N)->getAlignment() % 8 == 0;
}]>;
+class Aligned16Bytes <dag ops, dag frag> : PatFrag <ops, frag, [{
+ return cast<MemSDNode>(N)->getAlignment() >= 16;
+}]>;
+
class LoadFrag <SDPatternOperator op> : PatFrag<(ops node:$ptr), (op node:$ptr)>;
class StoreFrag<SDPatternOperator op> : PatFrag <
@@ -361,21 +429,31 @@ def az_extloadi8_local : LocalLoad <az_extloadi8>;
def sextloadi8_local : LocalLoad <sextloadi8>;
def az_extloadi16_local : LocalLoad <az_extloadi16>;
def sextloadi16_local : LocalLoad <sextloadi16>;
+def atomic_load_32_local : LocalLoad<atomic_load_32>;
+def atomic_load_64_local : LocalLoad<atomic_load_64>;
def store_local : LocalStore <store>;
def truncstorei8_local : LocalStore <truncstorei8>;
def truncstorei16_local : LocalStore <truncstorei16>;
def store_local_hi16 : StoreHi16 <truncstorei16>, LocalAddress;
def truncstorei8_local_hi16 : StoreHi16<truncstorei8>, LocalAddress;
+def atomic_store_local : LocalStore <atomic_store>;
def load_align8_local : Aligned8Bytes <
(ops node:$ptr), (load_local node:$ptr)
>;
+def load_align16_local : Aligned16Bytes <
+ (ops node:$ptr), (load_local node:$ptr)
+>;
+
def store_align8_local : Aligned8Bytes <
(ops node:$val, node:$ptr), (store_local node:$val, node:$ptr)
>;
+def store_align16_local : Aligned16Bytes <
+ (ops node:$val, node:$ptr), (store_local node:$val, node:$ptr)
+>;
def load_flat : FlatLoad <load>;
def az_extloadi8_flat : FlatLoad <az_extloadi8>;
@@ -571,6 +649,18 @@ multiclass BFIPatterns <Instruction BFI_INT,
(BFI_INT $x, $y, $z)
>;
+ // 64-bit version
+ def : AMDGPUPat <
+ (or (and i64:$y, i64:$x), (and i64:$z, (not i64:$x))),
+ (REG_SEQUENCE RC64,
+ (BFI_INT (i32 (EXTRACT_SUBREG $x, sub0)),
+ (i32 (EXTRACT_SUBREG $y, sub0)),
+ (i32 (EXTRACT_SUBREG $z, sub0))), sub0,
+ (BFI_INT (i32 (EXTRACT_SUBREG $x, sub1)),
+ (i32 (EXTRACT_SUBREG $y, sub1)),
+ (i32 (EXTRACT_SUBREG $z, sub1))), sub1)
+ >;
+
// SHA-256 Ch function
// z ^ (x & (y ^ z))
def : AMDGPUPat <
@@ -578,6 +668,18 @@ multiclass BFIPatterns <Instruction BFI_INT,
(BFI_INT $x, $y, $z)
>;
+ // 64-bit version
+ def : AMDGPUPat <
+ (xor i64:$z, (and i64:$x, (xor i64:$y, i64:$z))),
+ (REG_SEQUENCE RC64,
+ (BFI_INT (i32 (EXTRACT_SUBREG $x, sub0)),
+ (i32 (EXTRACT_SUBREG $y, sub0)),
+ (i32 (EXTRACT_SUBREG $z, sub0))), sub0,
+ (BFI_INT (i32 (EXTRACT_SUBREG $x, sub1)),
+ (i32 (EXTRACT_SUBREG $y, sub1)),
+ (i32 (EXTRACT_SUBREG $z, sub1))), sub1)
+ >;
+
def : AMDGPUPat <
(fcopysign f32:$src0, f32:$src1),
(BFI_INT (LoadImm32 (i32 0x7fffffff)), $src0, $src1)
@@ -611,10 +713,25 @@ multiclass BFIPatterns <Instruction BFI_INT,
// SHA-256 Ma patterns
// ((x & z) | (y & (x | z))) -> BFI_INT (XOR x, y), z, y
-class SHA256MaPattern <Instruction BFI_INT, Instruction XOR> : AMDGPUPat <
- (or (and i32:$x, i32:$z), (and i32:$y, (or i32:$x, i32:$z))),
- (BFI_INT (XOR i32:$x, i32:$y), i32:$z, i32:$y)
->;
+multiclass SHA256MaPattern <Instruction BFI_INT, Instruction XOR, RegisterClass RC64> {
+ def : AMDGPUPat <
+ (or (and i32:$x, i32:$z), (and i32:$y, (or i32:$x, i32:$z))),
+ (BFI_INT (XOR i32:$x, i32:$y), i32:$z, i32:$y)
+ >;
+
+ def : AMDGPUPat <
+ (or (and i64:$x, i64:$z), (and i64:$y, (or i64:$x, i64:$z))),
+ (REG_SEQUENCE RC64,
+ (BFI_INT (XOR (i32 (EXTRACT_SUBREG $x, sub0)),
+ (i32 (EXTRACT_SUBREG $y, sub0))),
+ (i32 (EXTRACT_SUBREG $z, sub0)),
+ (i32 (EXTRACT_SUBREG $y, sub0))), sub0,
+ (BFI_INT (XOR (i32 (EXTRACT_SUBREG $x, sub1)),
+ (i32 (EXTRACT_SUBREG $y, sub1))),
+ (i32 (EXTRACT_SUBREG $z, sub1)),
+ (i32 (EXTRACT_SUBREG $y, sub1))), sub1)
+ >;
+}
// Bitfield extract patterns
@@ -633,14 +750,33 @@ multiclass BFEPattern <Instruction UBFE, Instruction SBFE, Instruction MOV> {
(UBFE $src, $rshift, (MOV (i32 (IMMPopCount $mask))))
>;
+ // x & ((1 << y) - 1)
+ def : AMDGPUPat <
+ (and i32:$src, (add_oneuse (shl_oneuse 1, i32:$width), -1)),
+ (UBFE $src, (MOV (i32 0)), $width)
+ >;
+
+ // x & ~(-1 << y)
+ def : AMDGPUPat <
+ (and i32:$src, (xor_oneuse (shl_oneuse -1, i32:$width), -1)),
+ (UBFE $src, (MOV (i32 0)), $width)
+ >;
+
+ // x & (-1 >> (bitwidth - y))
+ def : AMDGPUPat <
+ (and i32:$src, (srl_oneuse -1, (sub 32, i32:$width))),
+ (UBFE $src, (MOV (i32 0)), $width)
+ >;
+
+ // x << (bitwidth - y) >> (bitwidth - y)
def : AMDGPUPat <
(srl (shl_oneuse i32:$src, (sub 32, i32:$width)), (sub 32, i32:$width)),
- (UBFE $src, (i32 0), $width)
+ (UBFE $src, (MOV (i32 0)), $width)
>;
def : AMDGPUPat <
(sra (shl_oneuse i32:$src, (sub 32, i32:$width)), (sub 32, i32:$width)),
- (SBFE $src, (i32 0), $width)
+ (SBFE $src, (MOV (i32 0)), $width)
>;
}
@@ -697,11 +833,3 @@ class RsqPat<Instruction RsqInst, ValueType vt> : AMDGPUPat <
(AMDGPUrcp (fsqrt vt:$src)),
(RsqInst $src)
>;
-
-include "R600Instructions.td"
-include "R700Instructions.td"
-include "EvergreenInstructions.td"
-include "CaymanInstructions.td"
-
-include "SIInstrInfo.td"
-
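
The three new mask idioms added to BFEPattern above all reduce to an unsigned bitfield extract of width bits at offset 0. A standalone check for widths 1 through 31 (the boundary widths are not exercised here); this is a verification sketch, not backend code:

#include <cassert>
#include <cstdint>

// Reference extract: Width bits of Src starting at Offset.
static uint32_t ubfe(uint32_t Src, uint32_t Offset, uint32_t Width) {
  return (Src >> Offset) & ((1u << Width) - 1u);
}

int main() {
  const uint32_t Samples[] = {0u, 0xffffffffu, 0x12345678u, 0x80000001u};
  for (uint32_t W = 1; W < 32; ++W) {
    for (uint32_t X : Samples) {
      uint32_t R = ubfe(X, 0, W);
      assert((X & ((1u << W) - 1u)) == R);          // x & ((1 << y) - 1)
      assert((X & ~(0xffffffffu << W)) == R);       // x & ~(-1 << y)
      assert((X & (0xffffffffu >> (32 - W))) == R); // x & (-1 >> (32 - y))
    }
  }
  return 0;
}
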
diff --git a/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.cpp b/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.cpp
index 86dc9bd9ea74..896e2055cf62 100644
--- a/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.cpp
+++ b/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.cpp
@@ -8,7 +8,7 @@
//==-----------------------------------------------------------------------===//
//
/// \file
-/// \brief AMDGPU Implementation of the IntrinsicInfo class.
+/// AMDGPU Implementation of the IntrinsicInfo class.
//
//===-----------------------------------------------------------------------===//
@@ -25,13 +25,13 @@ AMDGPUIntrinsicInfo::AMDGPUIntrinsicInfo()
static const char *const IntrinsicNameTable[] = {
#define GET_INTRINSIC_NAME_TABLE
-#include "AMDGPUGenIntrinsics.inc"
+#include "AMDGPUGenIntrinsicImpl.inc"
#undef GET_INTRINSIC_NAME_TABLE
};
namespace {
#define GET_INTRINSIC_ATTRIBUTES
-#include "AMDGPUGenIntrinsics.inc"
+#include "AMDGPUGenIntrinsicImpl.inc"
#undef GET_INTRINSIC_ATTRIBUTES
}
@@ -80,7 +80,7 @@ unsigned AMDGPUIntrinsicInfo::lookupName(const char *NameData,
bool AMDGPUIntrinsicInfo::isOverloaded(unsigned id) const {
// Overload Table
#define GET_INTRINSIC_OVERLOAD_TABLE
-#include "AMDGPUGenIntrinsics.inc"
+#include "AMDGPUGenIntrinsicImpl.inc"
#undef GET_INTRINSIC_OVERLOAD_TABLE
}
diff --git a/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.h b/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.h
index 6cb8b9644642..ef42f9a319af 100644
--- a/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.h
+++ b/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.h
@@ -8,7 +8,7 @@
//==-----------------------------------------------------------------------===//
//
/// \file
-/// \brief Interface for the AMDGPU Implementation of the Intrinsic Info class.
+/// Interface for the AMDGPU Implementation of the Intrinsic Info class.
//
//===-----------------------------------------------------------------------===//
#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUINTRINSICINFO_H
@@ -24,7 +24,7 @@ namespace AMDGPUIntrinsic {
enum ID {
last_non_AMDGPU_intrinsic = Intrinsic::num_intrinsics - 1,
#define GET_INTRINSIC_ENUM_VALUES
-#include "AMDGPUGenIntrinsics.inc"
+#include "AMDGPUGenIntrinsicEnums.inc"
#undef GET_INTRINSIC_ENUM_VALUES
, num_AMDGPU_intrinsics
};
diff --git a/lib/Target/AMDGPU/AMDGPUIntrinsics.td b/lib/Target/AMDGPU/AMDGPUIntrinsics.td
index 18c9bd933af2..230a04628504 100644
--- a/lib/Target/AMDGPU/AMDGPUIntrinsics.td
+++ b/lib/Target/AMDGPU/AMDGPUIntrinsics.td
@@ -13,7 +13,4 @@
let TargetPrefix = "AMDGPU", isTarget = 1 in {
def int_AMDGPU_kill : Intrinsic<[], [llvm_float_ty], []>;
- def int_AMDGPU_kilp : Intrinsic<[], [], []>;
}
-
-include "SIIntrinsics.td"
diff --git a/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index b4704f6feb92..87b072c9ea20 100644
--- a/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -12,7 +12,9 @@
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//
+#include "AMDGPU.h"
#include "AMDGPULegalizerInfo.h"
+#include "AMDGPUTargetMachine.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/DerivedTypes.h"
@@ -20,19 +22,46 @@
#include "llvm/Support/Debug.h"
using namespace llvm;
+using namespace LegalizeActions;
-AMDGPULegalizerInfo::AMDGPULegalizerInfo() {
+AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST,
+ const GCNTargetMachine &TM) {
using namespace TargetOpcode;
- const LLT S1= LLT::scalar(1);
+ auto GetAddrSpacePtr = [&TM](unsigned AS) {
+ return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
+ };
+
+ auto AMDGPUAS = ST.getAMDGPUAS();
+
+ const LLT S1 = LLT::scalar(1);
const LLT V2S16 = LLT::vector(2, 16);
+
const LLT S32 = LLT::scalar(32);
const LLT S64 = LLT::scalar(64);
- const LLT P1 = LLT::pointer(1, 64);
- const LLT P2 = LLT::pointer(2, 64);
+ const LLT S512 = LLT::scalar(512);
+
+ const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
+ const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
+ const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
+ const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS.FLAT_ADDRESS);
+ const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS.PRIVATE_ADDRESS);
+
+ const LLT AddrSpaces[] = {
+ GlobalPtr,
+ ConstantPtr,
+ LocalPtr,
+ FlatPtr,
+ PrivatePtr
+ };
setAction({G_ADD, S32}, Legal);
+ setAction({G_ASHR, S32}, Legal);
+ setAction({G_SUB, S32}, Legal);
+ setAction({G_MUL, S32}, Legal);
setAction({G_AND, S32}, Legal);
+ setAction({G_OR, S32}, Legal);
+ setAction({G_XOR, S32}, Legal);
setAction({G_BITCAST, V2S16}, Legal);
setAction({G_BITCAST, 1, S32}, Legal);
@@ -40,41 +69,88 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo() {
setAction({G_BITCAST, S32}, Legal);
setAction({G_BITCAST, 1, V2S16}, Legal);
+ getActionDefinitionsBuilder(G_FCONSTANT)
+ .legalFor({S32, S64});
+
+ // G_IMPLICIT_DEF is a no-op so we can make it legal for any value type that
+ // can fit in a register.
+ // FIXME: We need to legalize several more operations before we can add
+ // a test case for size > 512.
+ getActionDefinitionsBuilder(G_IMPLICIT_DEF)
+ .legalIf([=](const LegalityQuery &Query) {
+ return Query.Types[0].getSizeInBits() <= 512;
+ })
+ .clampScalar(0, S1, S512);
+
+ getActionDefinitionsBuilder(G_CONSTANT)
+ .legalFor({S1, S32, S64});
+
// FIXME: i1 operands to intrinsics should always be legal, but other i1
// values may not be legal. We need to figure out how to distinguish
// between these two scenarios.
setAction({G_CONSTANT, S1}, Legal);
- setAction({G_CONSTANT, S32}, Legal);
- setAction({G_CONSTANT, S64}, Legal);
-
- setAction({G_FCONSTANT, S32}, Legal);
setAction({G_FADD, S32}, Legal);
+ setAction({G_FCMP, S1}, Legal);
+ setAction({G_FCMP, 1, S32}, Legal);
+ setAction({G_FCMP, 1, S64}, Legal);
+
setAction({G_FMUL, S32}, Legal);
- setAction({G_GEP, P1}, Legal);
- setAction({G_GEP, P2}, Legal);
- setAction({G_GEP, 1, S64}, Legal);
+ setAction({G_ZEXT, S64}, Legal);
+ setAction({G_ZEXT, 1, S32}, Legal);
+
+ setAction({G_FPTOSI, S32}, Legal);
+ setAction({G_FPTOSI, 1, S32}, Legal);
+
+ setAction({G_SITOFP, S32}, Legal);
+ setAction({G_SITOFP, 1, S32}, Legal);
+
+ setAction({G_FPTOUI, S32}, Legal);
+ setAction({G_FPTOUI, 1, S32}, Legal);
+
+ for (LLT PtrTy : AddrSpaces) {
+ LLT IdxTy = LLT::scalar(PtrTy.getSizeInBits());
+ setAction({G_GEP, PtrTy}, Legal);
+ setAction({G_GEP, 1, IdxTy}, Legal);
+ }
setAction({G_ICMP, S1}, Legal);
setAction({G_ICMP, 1, S32}, Legal);
- setAction({G_LOAD, P1}, Legal);
- setAction({G_LOAD, P2}, Legal);
- setAction({G_LOAD, S32}, Legal);
- setAction({G_LOAD, 1, P1}, Legal);
- setAction({G_LOAD, 1, P2}, Legal);
- setAction({G_OR, S32}, Legal);
+ getActionDefinitionsBuilder({G_LOAD, G_STORE})
+ .legalIf([=, &ST](const LegalityQuery &Query) {
+ const LLT &Ty0 = Query.Types[0];
+
+ // TODO: Decompose private loads into 4-byte components.
+ // TODO: Illegal flat loads on SI
+ switch (Ty0.getSizeInBits()) {
+ case 32:
+ case 64:
+ case 128:
+ return true;
+
+ case 96:
+ // XXX hasLoadX3
+ return (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS);
+
+ case 256:
+ case 512:
+ // TODO: constant loads
+ default:
+ return false;
+ }
+ });
+
+
setAction({G_SELECT, S32}, Legal);
setAction({G_SELECT, 1, S1}, Legal);
setAction({G_SHL, S32}, Legal);
- setAction({G_STORE, S32}, Legal);
- setAction({G_STORE, 1, P1}, Legal);
// FIXME: When RegBankSelect inserts copies, it will only create new
// registers with scalar types. This means we can end up with
@@ -83,8 +159,54 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo() {
// if it sees a generic instruction which isn't legal, so we need to
// tell it that scalar types are legal for pointer operands
setAction({G_GEP, S64}, Legal);
- setAction({G_LOAD, 1, S64}, Legal);
- setAction({G_STORE, 1, S64}, Legal);
+
+ for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
+ getActionDefinitionsBuilder(Op)
+ .legalIf([=](const LegalityQuery &Query) {
+ const LLT &VecTy = Query.Types[1];
+ const LLT &IdxTy = Query.Types[2];
+ return VecTy.getSizeInBits() % 32 == 0 &&
+ VecTy.getSizeInBits() <= 512 &&
+ IdxTy.getSizeInBits() == 32;
+ });
+ }
+
+ // FIXME: Doesn't handle extract of illegal sizes.
+ getActionDefinitionsBuilder({G_EXTRACT, G_INSERT})
+ .legalIf([=](const LegalityQuery &Query) {
+ const LLT &Ty0 = Query.Types[0];
+ const LLT &Ty1 = Query.Types[1];
+ return (Ty0.getSizeInBits() % 32 == 0) &&
+ (Ty1.getSizeInBits() % 32 == 0);
+ });
+
+ // Merge/Unmerge
+ for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
+ unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
+ unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
+
+ getActionDefinitionsBuilder(Op)
+ .legalIf([=](const LegalityQuery &Query) {
+ const LLT &BigTy = Query.Types[BigTyIdx];
+ const LLT &LitTy = Query.Types[LitTyIdx];
+ return BigTy.getSizeInBits() % 32 == 0 &&
+ LitTy.getSizeInBits() % 32 == 0 &&
+ BigTy.getSizeInBits() <= 512;
+ })
+ // Any vectors left are the wrong size. Scalarize them.
+ .fewerElementsIf([](const LegalityQuery &Query) { return true; },
+ [](const LegalityQuery &Query) {
+ return std::make_pair(
+ 0, Query.Types[0].getElementType());
+ })
+ .fewerElementsIf([](const LegalityQuery &Query) { return true; },
+ [](const LegalityQuery &Query) {
+ return std::make_pair(
+ 1, Query.Types[1].getElementType());
+ });
+
+ }
computeTables();
+ verify(*ST.getInstrInfo());
}
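
The constructor above mixes the older per-type setAction() calls with the newer rule-based getActionDefinitionsBuilder() API. A minimal, hypothetical sketch of the rule-based style on its own (the opcodes, sizes, and predicate are illustrative, not the exact AMDGPU rules):

    // Sketch: declarative legalization rules for a made-up target.
    #include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
    #include "llvm/CodeGen/TargetOpcodes.h"

    using namespace llvm;

    struct ExampleLegalizerInfo : public LegalizerInfo {
      ExampleLegalizerInfo() {
        using namespace TargetOpcode;
        const LLT S32 = LLT::scalar(32);
        const LLT S64 = LLT::scalar(64);

        // Legal for the listed types; otherwise clamp the scalar width into
        // the supported range.
        getActionDefinitionsBuilder(G_ADD)
            .legalFor({S32, S64})
            .clampScalar(0, S32, S64);

        // Predicate-based rule, mirroring the size checks used for loads above.
        getActionDefinitionsBuilder(G_LOAD)
            .legalIf([](const LegalityQuery &Query) {
              unsigned Size = Query.Types[0].getSizeInBits();
              return Size == 32 || Size == 64 || Size == 128;
            });

        computeTables();
      }
    };
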
diff --git a/lib/Target/AMDGPU/AMDGPULegalizerInfo.h b/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
index 291e3361f163..1cbd37c42c4b 100644
--- a/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
+++ b/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
@@ -19,12 +19,15 @@
namespace llvm {
+class GCNTargetMachine;
class LLVMContext;
+class GCNSubtarget;
/// This class provides the information for the target register banks.
class AMDGPULegalizerInfo : public LegalizerInfo {
public:
- AMDGPULegalizerInfo();
+ AMDGPULegalizerInfo(const GCNSubtarget &ST,
+ const GCNTargetMachine &TM);
};
} // End llvm namespace.
#endif
diff --git a/lib/Target/AMDGPU/AMDGPULibCalls.cpp b/lib/Target/AMDGPU/AMDGPULibCalls.cpp
index f594767c8edb..7a7ed7a4f065 100644
--- a/lib/Target/AMDGPU/AMDGPULibCalls.cpp
+++ b/lib/Target/AMDGPU/AMDGPULibCalls.cpp
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
//
/// \file
-/// \brief This file does AMD library function optimizations.
+/// This file does AMD library function optimizations.
//
//===----------------------------------------------------------------------===//
@@ -765,8 +765,7 @@ bool AMDGPULibCalls::TDOFold(CallInst *CI, const FuncInfo &FInfo) {
ArrayRef<double> tmp(DVal);
nval = ConstantDataVector::get(context, tmp);
}
- DEBUG(errs() << "AMDIC: " << *CI
- << " ---> " << *nval << "\n");
+ LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> " << *nval << "\n");
replaceCall(nval);
return true;
}
@@ -776,8 +775,7 @@ bool AMDGPULibCalls::TDOFold(CallInst *CI, const FuncInfo &FInfo) {
for (int i = 0; i < sz; ++i) {
if (CF->isExactlyValue(ftbl[i].input)) {
Value *nval = ConstantFP::get(CF->getType(), ftbl[i].result);
- DEBUG(errs() << "AMDIC: " << *CI
- << " ---> " << *nval << "\n");
+ LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> " << *nval << "\n");
replaceCall(nval);
return true;
}
@@ -798,11 +796,11 @@ bool AMDGPULibCalls::replaceWithNative(CallInst *CI, const FuncInfo &FInfo) {
AMDGPULibFunc nf = FInfo;
nf.setPrefix(AMDGPULibFunc::NATIVE);
if (Constant *FPExpr = getFunction(M, nf)) {
- DEBUG(dbgs() << "AMDIC: " << *CI << " ---> ");
+ LLVM_DEBUG(dbgs() << "AMDIC: " << *CI << " ---> ");
CI->setCalledFunction(FPExpr);
- DEBUG(dbgs() << *CI << '\n');
+ LLVM_DEBUG(dbgs() << *CI << '\n');
return true;
}
@@ -820,8 +818,7 @@ bool AMDGPULibCalls::fold_recip(CallInst *CI, IRBuilder<> &B,
Value *nval = B.CreateFDiv(ConstantFP::get(CF->getType(), 1.0),
opr0,
"recip2div");
- DEBUG(errs() << "AMDIC: " << *CI
- << " ---> " << *nval << "\n");
+ LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> " << *nval << "\n");
replaceCall(nval);
return true;
}
@@ -899,7 +896,7 @@ bool AMDGPULibCalls::fold_pow(CallInst *CI, IRBuilder<> &B,
if ((CF && CF->isZero()) || (CINT && ci_opr1 == 0) || CZero) {
// pow/powr/pown(x, 0) == 1
- DEBUG(errs() << "AMDIC: " << *CI << " ---> 1\n");
+ LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> 1\n");
Constant *cnval = ConstantFP::get(eltType, 1.0);
if (getVecSize(FInfo) > 1) {
cnval = ConstantDataVector::getSplat(getVecSize(FInfo), cnval);
@@ -909,23 +906,21 @@ bool AMDGPULibCalls::fold_pow(CallInst *CI, IRBuilder<> &B,
}
if ((CF && CF->isExactlyValue(1.0)) || (CINT && ci_opr1 == 1)) {
// pow/powr/pown(x, 1.0) = x
- DEBUG(errs() << "AMDIC: " << *CI
- << " ---> " << *opr0 << "\n");
+ LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> " << *opr0 << "\n");
replaceCall(opr0);
return true;
}
if ((CF && CF->isExactlyValue(2.0)) || (CINT && ci_opr1 == 2)) {
// pow/powr/pown(x, 2.0) = x*x
- DEBUG(errs() << "AMDIC: " << *CI
- << " ---> " << *opr0 << " * " << *opr0 << "\n");
+ LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> " << *opr0 << " * " << *opr0
+ << "\n");
Value *nval = B.CreateFMul(opr0, opr0, "__pow2");
replaceCall(nval);
return true;
}
if ((CF && CF->isExactlyValue(-1.0)) || (CINT && ci_opr1 == -1)) {
// pow/powr/pown(x, -1.0) = 1.0/x
- DEBUG(errs() << "AMDIC: " << *CI
- << " ---> 1 / " << *opr0 << "\n");
+ LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> 1 / " << *opr0 << "\n");
Constant *cnval = ConstantFP::get(eltType, 1.0);
if (getVecSize(FInfo) > 1) {
cnval = ConstantDataVector::getSplat(getVecSize(FInfo), cnval);
@@ -942,8 +937,8 @@ bool AMDGPULibCalls::fold_pow(CallInst *CI, IRBuilder<> &B,
if (Constant *FPExpr = getFunction(M,
AMDGPULibFunc(issqrt ? AMDGPULibFunc::EI_SQRT
: AMDGPULibFunc::EI_RSQRT, FInfo))) {
- DEBUG(errs() << "AMDIC: " << *CI << " ---> "
- << FInfo.getName().c_str() << "(" << *opr0 << ")\n");
+ LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> "
+ << FInfo.getName().c_str() << "(" << *opr0 << ")\n");
Value *nval = CreateCallEx(B,FPExpr, opr0, issqrt ? "__pow2sqrt"
: "__pow2rsqrt");
replaceCall(nval);
@@ -999,8 +994,9 @@ bool AMDGPULibCalls::fold_pow(CallInst *CI, IRBuilder<> &B,
}
nval = B.CreateFDiv(cnval, nval, "__1powprod");
}
- DEBUG(errs() << "AMDIC: " << *CI << " ---> "
- << ((ci_opr1 < 0) ? "1/prod(" : "prod(") << *opr0 << ")\n");
+ LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> "
+ << ((ci_opr1 < 0) ? "1/prod(" : "prod(") << *opr0
+ << ")\n");
replaceCall(nval);
return true;
}
@@ -1137,8 +1133,8 @@ bool AMDGPULibCalls::fold_pow(CallInst *CI, IRBuilder<> &B,
nval = B.CreateBitCast(nval, opr0->getType());
}
- DEBUG(errs() << "AMDIC: " << *CI << " ---> "
- << "exp2(" << *opr1 << " * log2(" << *opr0 << "))\n");
+ LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> "
+ << "exp2(" << *opr1 << " * log2(" << *opr0 << "))\n");
replaceCall(nval);
return true;
@@ -1155,8 +1151,7 @@ bool AMDGPULibCalls::fold_rootn(CallInst *CI, IRBuilder<> &B,
}
int ci_opr1 = (int)CINT->getSExtValue();
if (ci_opr1 == 1) { // rootn(x, 1) = x
- DEBUG(errs() << "AMDIC: " << *CI
- << " ---> " << *opr0 << "\n");
+ LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> " << *opr0 << "\n");
replaceCall(opr0);
return true;
}
@@ -1166,7 +1161,7 @@ bool AMDGPULibCalls::fold_rootn(CallInst *CI, IRBuilder<> &B,
Module *M = CI->getModule();
if (Constant *FPExpr = getFunction(M, AMDGPULibFunc(AMDGPULibFunc::EI_SQRT,
FInfo))) {
- DEBUG(errs() << "AMDIC: " << *CI << " ---> sqrt(" << *opr0 << ")\n");
+ LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> sqrt(" << *opr0 << ")\n");
Value *nval = CreateCallEx(B,FPExpr, opr0, "__rootn2sqrt");
replaceCall(nval);
return true;
@@ -1175,13 +1170,13 @@ bool AMDGPULibCalls::fold_rootn(CallInst *CI, IRBuilder<> &B,
Module *M = CI->getModule();
if (Constant *FPExpr = getFunction(M, AMDGPULibFunc(AMDGPULibFunc::EI_CBRT,
FInfo))) {
- DEBUG(errs() << "AMDIC: " << *CI << " ---> cbrt(" << *opr0 << ")\n");
+ LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> cbrt(" << *opr0 << ")\n");
Value *nval = CreateCallEx(B,FPExpr, opr0, "__rootn2cbrt");
replaceCall(nval);
return true;
}
} else if (ci_opr1 == -1) { // rootn(x, -1) = 1.0/x
- DEBUG(errs() << "AMDIC: " << *CI << " ---> 1.0 / " << *opr0 << "\n");
+ LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> 1.0 / " << *opr0 << "\n");
Value *nval = B.CreateFDiv(ConstantFP::get(opr0->getType(), 1.0),
opr0,
"__rootn2div");
@@ -1193,7 +1188,8 @@ bool AMDGPULibCalls::fold_rootn(CallInst *CI, IRBuilder<> &B,
Module *M = CI->getModule();
if (Constant *FPExpr = getFunction(M, AMDGPULibFunc(AMDGPULibFunc::EI_RSQRT,
FInfo))) {
- DEBUG(errs() << "AMDIC: " << *CI << " ---> rsqrt(" << *opr0 << ")\n");
+ LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> rsqrt(" << *opr0
+ << ")\n");
Value *nval = CreateCallEx(B,FPExpr, opr0, "__rootn2rsqrt");
replaceCall(nval);
return true;
@@ -1212,22 +1208,22 @@ bool AMDGPULibCalls::fold_fma_mad(CallInst *CI, IRBuilder<> &B,
ConstantFP *CF1 = dyn_cast<ConstantFP>(opr1);
if ((CF0 && CF0->isZero()) || (CF1 && CF1->isZero())) {
// fma/mad(a, b, c) = c if a=0 || b=0
- DEBUG(errs() << "AMDIC: " << *CI << " ---> " << *opr2 << "\n");
+ LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> " << *opr2 << "\n");
replaceCall(opr2);
return true;
}
if (CF0 && CF0->isExactlyValue(1.0f)) {
// fma/mad(a, b, c) = b+c if a=1
- DEBUG(errs() << "AMDIC: " << *CI << " ---> "
- << *opr1 << " + " << *opr2 << "\n");
+ LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> " << *opr1 << " + " << *opr2
+ << "\n");
Value *nval = B.CreateFAdd(opr1, opr2, "fmaadd");
replaceCall(nval);
return true;
}
if (CF1 && CF1->isExactlyValue(1.0f)) {
// fma/mad(a, b, c) = a+c if b=1
- DEBUG(errs() << "AMDIC: " << *CI << " ---> "
- << *opr0 << " + " << *opr2 << "\n");
+ LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> " << *opr0 << " + " << *opr2
+ << "\n");
Value *nval = B.CreateFAdd(opr0, opr2, "fmaadd");
replaceCall(nval);
return true;
@@ -1235,8 +1231,8 @@ bool AMDGPULibCalls::fold_fma_mad(CallInst *CI, IRBuilder<> &B,
if (ConstantFP *CF = dyn_cast<ConstantFP>(opr2)) {
if (CF->isZero()) {
// fma/mad(a, b, c) = a*b if c=0
- DEBUG(errs() << "AMDIC: " << *CI << " ---> "
- << *opr0 << " * " << *opr1 << "\n");
+ LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> " << *opr0 << " * "
+ << *opr1 << "\n");
Value *nval = B.CreateFMul(opr0, opr1, "fmamul");
replaceCall(nval);
return true;
@@ -1263,8 +1259,8 @@ bool AMDGPULibCalls::fold_sqrt(CallInst *CI, IRBuilder<> &B,
if (Constant *FPExpr = getNativeFunction(
CI->getModule(), AMDGPULibFunc(AMDGPULibFunc::EI_SQRT, FInfo))) {
Value *opr0 = CI->getArgOperand(0);
- DEBUG(errs() << "AMDIC: " << *CI << " ---> "
- << "sqrt(" << *opr0 << ")\n");
+ LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> "
+ << "sqrt(" << *opr0 << ")\n");
Value *nval = CreateCallEx(B,FPExpr, opr0, "__sqrt");
replaceCall(nval);
return true;
@@ -1355,8 +1351,8 @@ bool AMDGPULibCalls::fold_sincos(CallInst *CI, IRBuilder<> &B,
P = B.CreateAddrSpaceCast(Alloc, PTy);
CallInst *Call = CreateCallEx2(B, Fsincos, UI->getArgOperand(0), P);
- DEBUG(errs() << "AMDIC: fold_sincos (" << *CI << ", " << *UI
- << ") with " << *Call << "\n");
+ LLVM_DEBUG(errs() << "AMDIC: fold_sincos (" << *CI << ", " << *UI << ") with "
+ << *Call << "\n");
if (!isSin) { // CI->cos, UI->sin
B.SetInsertPoint(&*ItOld);
@@ -1719,9 +1715,8 @@ bool AMDGPUSimplifyLibCalls::runOnFunction(Function &F) {
bool Changed = false;
auto AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
- DEBUG(dbgs() << "AMDIC: process function ";
- F.printAsOperand(dbgs(), false, F.getParent());
- dbgs() << '\n';);
+ LLVM_DEBUG(dbgs() << "AMDIC: process function ";
+ F.printAsOperand(dbgs(), false, F.getParent()); dbgs() << '\n';);
if (!EnablePreLink)
Changed |= setFastFlags(F, Options);
@@ -1737,8 +1732,8 @@ bool AMDGPUSimplifyLibCalls::runOnFunction(Function &F) {
Function *Callee = CI->getCalledFunction();
if (Callee == 0) continue;
- DEBUG(dbgs() << "AMDIC: try folding " << *CI << "\n";
- dbgs().flush());
+ LLVM_DEBUG(dbgs() << "AMDIC: try folding " << *CI << "\n";
+ dbgs().flush());
if(Simplifier.fold(CI, AA))
Changed = true;
}
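
Beyond the DEBUG -> LLVM_DEBUG macro rename, the folds in this file follow one pattern: match a constant operand, build the cheaper expression with IRBuilder, and replace the call. A self-contained sketch of the pow(x, 2.0) -> x*x case (a simplified stand-in, not the pass code; the DEBUG_TYPE string here is illustrative):

    #include "llvm/IR/Constants.h"
    #include "llvm/IR/IRBuilder.h"
    #include "llvm/IR/Instructions.h"
    #include "llvm/Support/Debug.h"
    #include "llvm/Support/raw_ostream.h"

    #define DEBUG_TYPE "amdgpu-simplify-libcall"

    using namespace llvm;

    // Returns the replacement value, or nullptr if CI is not pow(x, 2.0).
    static Value *foldPowOfTwo(CallInst *CI, IRBuilder<> &B) {
      Value *Base = CI->getArgOperand(0);
      auto *Exp = dyn_cast<ConstantFP>(CI->getArgOperand(1));
      if (!Exp || !Exp->isExactlyValue(2.0))
        return nullptr;
      Value *Sq = B.CreateFMul(Base, Base, "__pow2");
      // Printed only in asserts builds, e.g. under -debug-only=amdgpu-simplify-libcall.
      LLVM_DEBUG(dbgs() << "AMDIC: " << *CI << " ---> " << *Sq << "\n");
      return Sq;
    }
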
diff --git a/lib/Target/AMDGPU/AMDGPULibFunc.h b/lib/Target/AMDGPU/AMDGPULibFunc.h
index 5405bc645714..fe062384800a 100644
--- a/lib/Target/AMDGPU/AMDGPULibFunc.h
+++ b/lib/Target/AMDGPU/AMDGPULibFunc.h
@@ -1,4 +1,4 @@
-//===-- AMDGPULibFunc.h ---------------------------------------------------===//
+//===-- AMDGPULibFunc.h ----------------------------------------*- C++ -*--===//
//
// The LLVM Compiler Infrastructure
//
diff --git a/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp b/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp
index 7e0e9802c0e6..2cec8fe53283 100644
--- a/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp
+++ b/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp
@@ -117,7 +117,6 @@ bool AMDGPULowerIntrinsics::makeLIDRangeMetadata(Function &F) const {
return false;
const TargetMachine &TM = TPC->getTM<TargetMachine>();
- const AMDGPUSubtarget &ST = TM.getSubtarget<AMDGPUSubtarget>(F);
bool Changed = false;
for (auto *U : F.users()) {
@@ -125,7 +124,7 @@ bool AMDGPULowerIntrinsics::makeLIDRangeMetadata(Function &F) const {
if (!CI)
continue;
- Changed |= ST.makeLIDRangeMetadata(CI);
+ Changed |= AMDGPUSubtarget::get(TM, F).makeLIDRangeMetadata(CI);
}
return Changed;
}
diff --git a/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp b/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp
new file mode 100644
index 000000000000..8cc7e38f7b29
--- /dev/null
+++ b/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp
@@ -0,0 +1,264 @@
+//===-- AMDGPULowerKernelArguments.cpp -------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file This pass replaces accesses to kernel arguments with loads from
+/// offsets from the kernarg base pointer.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "AMDGPUSubtarget.h"
+#include "AMDGPUTargetMachine.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Analysis/DivergenceAnalysis.h"
+#include "llvm/Analysis/Loads.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/MDBuilder.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
+
+#define DEBUG_TYPE "amdgpu-lower-kernel-arguments"
+
+using namespace llvm;
+
+namespace {
+
+class AMDGPULowerKernelArguments : public FunctionPass{
+public:
+ static char ID;
+
+ AMDGPULowerKernelArguments() : FunctionPass(ID) {}
+
+ bool runOnFunction(Function &F) override;
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<TargetPassConfig>();
+ AU.setPreservesAll();
+ }
+};
+
+} // end anonymous namespace
+
+bool AMDGPULowerKernelArguments::runOnFunction(Function &F) {
+ CallingConv::ID CC = F.getCallingConv();
+ if (CC != CallingConv::AMDGPU_KERNEL || F.arg_empty())
+ return false;
+
+ auto &TPC = getAnalysis<TargetPassConfig>();
+
+ const TargetMachine &TM = TPC.getTM<TargetMachine>();
+ const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
+ LLVMContext &Ctx = F.getParent()->getContext();
+ const DataLayout &DL = F.getParent()->getDataLayout();
+ BasicBlock &EntryBlock = *F.begin();
+ IRBuilder<> Builder(&*EntryBlock.begin());
+
+ const unsigned KernArgBaseAlign = 16; // FIXME: Increase if necessary
+ const uint64_t BaseOffset = ST.getExplicitKernelArgOffset(F);
+
+ unsigned MaxAlign;
+  // FIXME: Alignment is broken with explicit arg offset.
+ const uint64_t TotalKernArgSize = ST.getKernArgSegmentSize(F, MaxAlign);
+ if (TotalKernArgSize == 0)
+ return false;
+
+ CallInst *KernArgSegment =
+ Builder.CreateIntrinsic(Intrinsic::amdgcn_kernarg_segment_ptr, nullptr,
+ F.getName() + ".kernarg.segment");
+
+ KernArgSegment->addAttribute(AttributeList::ReturnIndex, Attribute::NonNull);
+ KernArgSegment->addAttribute(AttributeList::ReturnIndex,
+ Attribute::getWithDereferenceableBytes(Ctx, TotalKernArgSize));
+
+ unsigned AS = KernArgSegment->getType()->getPointerAddressSpace();
+ uint64_t ExplicitArgOffset = 0;
+
+ for (Argument &Arg : F.args()) {
+ Type *ArgTy = Arg.getType();
+ unsigned Align = DL.getABITypeAlignment(ArgTy);
+ unsigned Size = DL.getTypeSizeInBits(ArgTy);
+ unsigned AllocSize = DL.getTypeAllocSize(ArgTy);
+
+
+ // Clover seems to always pad i8/i16 to i32, but doesn't properly align
+ // them?
+ // Make sure the struct elements have correct size and alignment for ext
+ // args. These seem to be padded up to 4-bytes but not correctly aligned.
+ bool IsExtArg = AllocSize < 32 && (Arg.hasZExtAttr() || Arg.hasSExtAttr()) &&
+ !ST.isAmdHsaOS();
+ if (IsExtArg)
+ AllocSize = 4;
+
+ uint64_t EltOffset = alignTo(ExplicitArgOffset, Align) + BaseOffset;
+ ExplicitArgOffset = alignTo(ExplicitArgOffset, Align) + AllocSize;
+
+ if (Arg.use_empty())
+ continue;
+
+ if (PointerType *PT = dyn_cast<PointerType>(ArgTy)) {
+ // FIXME: Hack. We rely on AssertZext to be able to fold DS addressing
+ // modes on SI to know the high bits are 0 so pointer adds don't wrap. We
+ // can't represent this with range metadata because it's only allowed for
+ // integer types.
+ if (PT->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS &&
+ ST.getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS)
+ continue;
+
+ // FIXME: We can replace this with equivalent alias.scope/noalias
+ // metadata, but this appears to be a lot of work.
+ if (Arg.hasNoAliasAttr())
+ continue;
+ }
+
+ VectorType *VT = dyn_cast<VectorType>(ArgTy);
+ bool IsV3 = VT && VT->getNumElements() == 3;
+ VectorType *V4Ty = nullptr;
+
+ int64_t AlignDownOffset = alignDown(EltOffset, 4);
+ int64_t OffsetDiff = EltOffset - AlignDownOffset;
+ unsigned AdjustedAlign = MinAlign(KernArgBaseAlign, AlignDownOffset);
+
+ Value *ArgPtr;
+ if (Size < 32 && !ArgTy->isAggregateType()) { // FIXME: Handle aggregate types
+ // Since we don't have sub-dword scalar loads, avoid doing an extload by
+ // loading earlier than the argument address, and extracting the relevant
+ // bits.
+ //
+ // Additionally widen any sub-dword load to i32 even if suitably aligned,
+ // so that CSE between different argument loads works easily.
+
+ ArgPtr = Builder.CreateConstInBoundsGEP1_64(
+ KernArgSegment,
+ AlignDownOffset,
+ Arg.getName() + ".kernarg.offset.align.down");
+ ArgPtr = Builder.CreateBitCast(ArgPtr,
+ Builder.getInt32Ty()->getPointerTo(AS),
+ ArgPtr->getName() + ".cast");
+ } else {
+ ArgPtr = Builder.CreateConstInBoundsGEP1_64(
+ KernArgSegment,
+ AlignDownOffset,
+ Arg.getName() + ".kernarg.offset");
+ ArgPtr = Builder.CreateBitCast(ArgPtr, ArgTy->getPointerTo(AS),
+ ArgPtr->getName() + ".cast");
+ }
+
+ assert((!IsExtArg || !IsV3) && "incompatible situation");
+
+ if (IsV3 && Size >= 32) {
+ V4Ty = VectorType::get(VT->getVectorElementType(), 4);
+ // Use the hack that clang uses to avoid SelectionDAG ruining v3 loads
+ ArgPtr = Builder.CreateBitCast(ArgPtr, V4Ty->getPointerTo(AS));
+ }
+
+ LoadInst *Load = Builder.CreateAlignedLoad(ArgPtr, AdjustedAlign);
+ Load->setMetadata(LLVMContext::MD_invariant_load, MDNode::get(Ctx, {}));
+
+ MDBuilder MDB(Ctx);
+
+ if (isa<PointerType>(ArgTy)) {
+ if (Arg.hasNonNullAttr())
+ Load->setMetadata(LLVMContext::MD_nonnull, MDNode::get(Ctx, {}));
+
+ uint64_t DerefBytes = Arg.getDereferenceableBytes();
+ if (DerefBytes != 0) {
+ Load->setMetadata(
+ LLVMContext::MD_dereferenceable,
+ MDNode::get(Ctx,
+ MDB.createConstant(
+ ConstantInt::get(Builder.getInt64Ty(), DerefBytes))));
+ }
+
+ uint64_t DerefOrNullBytes = Arg.getDereferenceableOrNullBytes();
+ if (DerefOrNullBytes != 0) {
+ Load->setMetadata(
+ LLVMContext::MD_dereferenceable_or_null,
+ MDNode::get(Ctx,
+ MDB.createConstant(ConstantInt::get(Builder.getInt64Ty(),
+ DerefOrNullBytes))));
+ }
+
+ unsigned ParamAlign = Arg.getParamAlignment();
+ if (ParamAlign != 0) {
+ Load->setMetadata(
+ LLVMContext::MD_align,
+ MDNode::get(Ctx,
+ MDB.createConstant(ConstantInt::get(Builder.getInt64Ty(),
+ ParamAlign))));
+ }
+ }
+
+ // TODO: Convert noalias arg to !noalias
+
+ if (Size < 32 && !ArgTy->isAggregateType()) {
+ if (IsExtArg && OffsetDiff == 0) {
+ Type *I32Ty = Builder.getInt32Ty();
+ bool IsSext = Arg.hasSExtAttr();
+ Metadata *LowAndHigh[] = {
+ ConstantAsMetadata::get(
+ ConstantInt::get(I32Ty, IsSext ? minIntN(Size) : 0)),
+ ConstantAsMetadata::get(
+ ConstantInt::get(I32Ty,
+ IsSext ? maxIntN(Size) + 1 : maxUIntN(Size) + 1))
+ };
+
+ Load->setMetadata(LLVMContext::MD_range, MDNode::get(Ctx, LowAndHigh));
+ }
+
+ Value *ExtractBits = OffsetDiff == 0 ?
+ Load : Builder.CreateLShr(Load, OffsetDiff * 8);
+
+ IntegerType *ArgIntTy = Builder.getIntNTy(Size);
+ Value *Trunc = Builder.CreateTrunc(ExtractBits, ArgIntTy);
+ Value *NewVal = Builder.CreateBitCast(Trunc, ArgTy,
+ Arg.getName() + ".load");
+ Arg.replaceAllUsesWith(NewVal);
+ } else if (IsV3) {
+ Value *Shuf = Builder.CreateShuffleVector(Load, UndefValue::get(V4Ty),
+ {0, 1, 2},
+ Arg.getName() + ".load");
+ Arg.replaceAllUsesWith(Shuf);
+ } else {
+ Load->setName(Arg.getName() + ".load");
+ Arg.replaceAllUsesWith(Load);
+ }
+ }
+
+ KernArgSegment->addAttribute(
+ AttributeList::ReturnIndex,
+ Attribute::getWithAlignment(Ctx, std::max(KernArgBaseAlign, MaxAlign)));
+
+ return true;
+}
+
+INITIALIZE_PASS_BEGIN(AMDGPULowerKernelArguments, DEBUG_TYPE,
+ "AMDGPU Lower Kernel Arguments", false, false)
+INITIALIZE_PASS_END(AMDGPULowerKernelArguments, DEBUG_TYPE, "AMDGPU Lower Kernel Arguments",
+ false, false)
+
+char AMDGPULowerKernelArguments::ID = 0;
+
+FunctionPass *llvm::createAMDGPULowerKernelArgumentsPass() {
+ return new AMDGPULowerKernelArguments();
+}
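
For sub-dword arguments the pass loads a full dword at an aligned-down offset and shifts out the unwanted low bytes, as the comments above describe. A standalone sketch of that offset arithmetic (the concrete numbers are only an example; alignDown/alignTo stand in for the llvm::MathExtras helpers):

    #include <cassert>
    #include <cstdint>
    #include <iostream>

    // Power-of-two alignment helpers, matching llvm::alignDown / llvm::alignTo.
    static uint64_t alignDown(uint64_t V, uint64_t A) { return V & ~(A - 1); }
    static uint64_t alignTo(uint64_t V, uint64_t A) { return (V + A - 1) & ~(A - 1); }

    int main() {
      const uint64_t BaseOffset = 36;          // hypothetical non-HSA kernarg offset
      uint64_t ExplicitArgOffset = 2;          // bytes already used by earlier args
      const uint64_t Align = 2, AllocSize = 2; // an i16 argument

      uint64_t EltOffset = alignTo(ExplicitArgOffset, Align) + BaseOffset;    // 38
      ExplicitArgOffset = alignTo(ExplicitArgOffset, Align) + AllocSize;      // 4

      uint64_t AlignDownOffset = alignDown(EltOffset, 4); // 36: dword-aligned load
      uint64_t OffsetDiff = EltOffset - AlignDownOffset;  // 2: bytes to shift out
      assert(OffsetDiff < 4);

      std::cout << "load i32 at +" << AlignDownOffset << ", lshr by "
                << OffsetDiff * 8 << " bits, trunc to i16\n";
      return 0;
    }
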
diff --git a/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp b/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp
new file mode 100644
index 000000000000..a43dcef4cf0b
--- /dev/null
+++ b/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp
@@ -0,0 +1,270 @@
+//===-- AMDGPULowerKernelAttributes.cpp ------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file This pass attempts to make use of reqd_work_group_size metadata
+/// to eliminate loads from the dispatch packet and to constant fold OpenCL
+/// get_local_size-like functions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "AMDGPUTargetMachine.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/Pass.h"
+
+#define DEBUG_TYPE "amdgpu-lower-kernel-attributes"
+
+using namespace llvm;
+
+namespace {
+
+// Field offsets in hsa_kernel_dispatch_packet_t.
+enum DispatchPackedOffsets {
+ WORKGROUP_SIZE_X = 4,
+ WORKGROUP_SIZE_Y = 6,
+ WORKGROUP_SIZE_Z = 8,
+
+ GRID_SIZE_X = 12,
+ GRID_SIZE_Y = 16,
+ GRID_SIZE_Z = 20
+};
+
+class AMDGPULowerKernelAttributes : public ModulePass {
+ Module *Mod = nullptr;
+
+public:
+ static char ID;
+
+ AMDGPULowerKernelAttributes() : ModulePass(ID) {}
+
+ bool processUse(CallInst *CI);
+
+ bool doInitialization(Module &M) override;
+ bool runOnModule(Module &M) override;
+
+ StringRef getPassName() const override {
+ return "AMDGPU Kernel Attributes";
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesAll();
+ }
+};
+
+} // end anonymous namespace
+
+bool AMDGPULowerKernelAttributes::doInitialization(Module &M) {
+ Mod = &M;
+ return false;
+}
+
+bool AMDGPULowerKernelAttributes::processUse(CallInst *CI) {
+ Function *F = CI->getParent()->getParent();
+
+ auto MD = F->getMetadata("reqd_work_group_size");
+ const bool HasReqdWorkGroupSize = MD && MD->getNumOperands() == 3;
+
+ const bool HasUniformWorkGroupSize =
+ F->getFnAttribute("uniform-work-group-size").getValueAsString() == "true";
+
+ if (!HasReqdWorkGroupSize && !HasUniformWorkGroupSize)
+ return false;
+
+ Value *WorkGroupSizeX = nullptr;
+ Value *WorkGroupSizeY = nullptr;
+ Value *WorkGroupSizeZ = nullptr;
+
+ Value *GridSizeX = nullptr;
+ Value *GridSizeY = nullptr;
+ Value *GridSizeZ = nullptr;
+
+ const DataLayout &DL = Mod->getDataLayout();
+
+ // We expect to see several GEP users, casted to the appropriate type and
+ // loaded.
+ for (User *U : CI->users()) {
+ if (!U->hasOneUse())
+ continue;
+
+ int64_t Offset = 0;
+ if (GetPointerBaseWithConstantOffset(U, Offset, DL) != CI)
+ continue;
+
+ auto *BCI = dyn_cast<BitCastInst>(*U->user_begin());
+ if (!BCI || !BCI->hasOneUse())
+ continue;
+
+ auto *Load = dyn_cast<LoadInst>(*BCI->user_begin());
+ if (!Load || !Load->isSimple())
+ continue;
+
+ unsigned LoadSize = DL.getTypeStoreSize(Load->getType());
+
+ // TODO: Handle merged loads.
+ switch (Offset) {
+ case WORKGROUP_SIZE_X:
+ if (LoadSize == 2)
+ WorkGroupSizeX = Load;
+ break;
+ case WORKGROUP_SIZE_Y:
+ if (LoadSize == 2)
+ WorkGroupSizeY = Load;
+ break;
+ case WORKGROUP_SIZE_Z:
+ if (LoadSize == 2)
+ WorkGroupSizeZ = Load;
+ break;
+ case GRID_SIZE_X:
+ if (LoadSize == 4)
+ GridSizeX = Load;
+ break;
+ case GRID_SIZE_Y:
+ if (LoadSize == 4)
+ GridSizeY = Load;
+ break;
+ case GRID_SIZE_Z:
+ if (LoadSize == 4)
+ GridSizeZ = Load;
+ break;
+ default:
+ break;
+ }
+ }
+
+ // Pattern match the code used to handle partial workgroup dispatches in the
+ // library implementation of get_local_size, so the entire function can be
+ // constant folded with a known group size.
+ //
+ // uint r = grid_size - group_id * group_size;
+ // get_local_size = (r < group_size) ? r : group_size;
+ //
+ // If we have uniform-work-group-size (which is the default in OpenCL 1.2),
+  // the grid_size is required to be a multiple of group_size. In this case:
+ //
+ // grid_size - (group_id * group_size) < group_size
+ // ->
+ // grid_size < group_size + (group_id * group_size)
+ //
+ // (grid_size / group_size) < 1 + group_id
+ //
+ // grid_size / group_size is at least 1, so we can conclude the select
+ // condition is false (except for group_id == 0, where the select result is
+ // the same).
+
+ bool MadeChange = false;
+ Value *WorkGroupSizes[3] = { WorkGroupSizeX, WorkGroupSizeY, WorkGroupSizeZ };
+ Value *GridSizes[3] = { GridSizeX, GridSizeY, GridSizeZ };
+
+ for (int I = 0; HasUniformWorkGroupSize && I < 3; ++I) {
+ Value *GroupSize = WorkGroupSizes[I];
+ Value *GridSize = GridSizes[I];
+ if (!GroupSize || !GridSize)
+ continue;
+
+ for (User *U : GroupSize->users()) {
+ auto *ZextGroupSize = dyn_cast<ZExtInst>(U);
+ if (!ZextGroupSize)
+ continue;
+
+ for (User *ZextUser : ZextGroupSize->users()) {
+ auto *SI = dyn_cast<SelectInst>(ZextUser);
+ if (!SI)
+ continue;
+
+ using namespace llvm::PatternMatch;
+ auto GroupIDIntrin = I == 0 ?
+ m_Intrinsic<Intrinsic::amdgcn_workgroup_id_x>() :
+ (I == 1 ? m_Intrinsic<Intrinsic::amdgcn_workgroup_id_y>() :
+ m_Intrinsic<Intrinsic::amdgcn_workgroup_id_z>());
+
+ auto SubExpr = m_Sub(m_Specific(GridSize),
+ m_Mul(GroupIDIntrin, m_Specific(ZextGroupSize)));
+
+ ICmpInst::Predicate Pred;
+ if (match(SI,
+ m_Select(m_ICmp(Pred, SubExpr, m_Specific(ZextGroupSize)),
+ SubExpr,
+ m_Specific(ZextGroupSize))) &&
+ Pred == ICmpInst::ICMP_ULT) {
+ if (HasReqdWorkGroupSize) {
+ ConstantInt *KnownSize
+ = mdconst::extract<ConstantInt>(MD->getOperand(I));
+ SI->replaceAllUsesWith(ConstantExpr::getIntegerCast(KnownSize,
+ SI->getType(),
+ false));
+ } else {
+ SI->replaceAllUsesWith(ZextGroupSize);
+ }
+
+ MadeChange = true;
+ }
+ }
+ }
+ }
+
+ if (!HasReqdWorkGroupSize)
+ return MadeChange;
+
+ // Eliminate any other loads we can from the dispatch packet.
+ for (int I = 0; I < 3; ++I) {
+ Value *GroupSize = WorkGroupSizes[I];
+ if (!GroupSize)
+ continue;
+
+ ConstantInt *KnownSize = mdconst::extract<ConstantInt>(MD->getOperand(I));
+ GroupSize->replaceAllUsesWith(
+ ConstantExpr::getIntegerCast(KnownSize,
+ GroupSize->getType(),
+ false));
+ MadeChange = true;
+ }
+
+ return MadeChange;
+}
+
+// TODO: Move makeLIDRangeMetadata usage into here. We don't seem to get
+// TargetPassConfig for the subtarget.
+bool AMDGPULowerKernelAttributes::runOnModule(Module &M) {
+ StringRef DispatchPtrName
+ = Intrinsic::getName(Intrinsic::amdgcn_dispatch_ptr);
+
+ Function *DispatchPtr = Mod->getFunction(DispatchPtrName);
+ if (!DispatchPtr) // Dispatch ptr not used.
+ return false;
+
+ bool MadeChange = false;
+
+ SmallPtrSet<Instruction *, 4> HandledUses;
+ for (auto *U : DispatchPtr->users()) {
+ CallInst *CI = cast<CallInst>(U);
+ if (HandledUses.insert(CI).second) {
+ if (processUse(CI))
+ MadeChange = true;
+ }
+ }
+
+ return MadeChange;
+}
+
+INITIALIZE_PASS_BEGIN(AMDGPULowerKernelAttributes, DEBUG_TYPE,
+ "AMDGPU IR optimizations", false, false)
+INITIALIZE_PASS_END(AMDGPULowerKernelAttributes, DEBUG_TYPE, "AMDGPU IR optimizations",
+ false, false)
+
+char AMDGPULowerKernelAttributes::ID = 0;
+
+ModulePass *llvm::createAMDGPULowerKernelAttributesPass() {
+ return new AMDGPULowerKernelAttributes();
+}
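
The fold in processUse leans on the arithmetic argument spelled out in its comment block: with a uniform work-group size, the partial-group select can never pick the remainder. A tiny numeric check of that reasoning (values chosen purely for illustration):

    #include <cassert>
    #include <cstdint>

    int main() {
      const uint32_t Grid = 256, Group = 64;    // grid is a multiple of group
      for (uint32_t Id = 0; Id < Grid / Group; ++Id) {
        uint32_t R = Grid - Id * Group;         // r = grid_size - group_id * group_size
        uint32_t Local = (R < Group) ? R : Group;
        assert(Local == Group);                 // the select condition never fires
      }
      return 0;
    }
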
diff --git a/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp b/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
index 23fd8113932c..1876dc3f7122 100644
--- a/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
+++ b/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
@@ -8,16 +8,17 @@
//===----------------------------------------------------------------------===//
//
/// \file
-/// \brief Code to lower AMDGPU MachineInstrs to their corresponding MCInst.
+/// Code to lower AMDGPU MachineInstrs to their corresponding MCInst.
//
//===----------------------------------------------------------------------===//
//
-#include "AMDGPUMCInstLower.h"
#include "AMDGPUAsmPrinter.h"
#include "AMDGPUSubtarget.h"
#include "AMDGPUTargetMachine.h"
#include "InstPrinter/AMDGPUInstPrinter.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "R600AsmPrinter.h"
#include "SIInstrInfo.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineInstr.h"
@@ -36,9 +37,43 @@
using namespace llvm;
+namespace {
+
+class AMDGPUMCInstLower {
+ MCContext &Ctx;
+ const TargetSubtargetInfo &ST;
+ const AsmPrinter &AP;
+
+ const MCExpr *getLongBranchBlockExpr(const MachineBasicBlock &SrcBB,
+ const MachineOperand &MO) const;
+
+public:
+ AMDGPUMCInstLower(MCContext &ctx, const TargetSubtargetInfo &ST,
+ const AsmPrinter &AP);
+
+ bool lowerOperand(const MachineOperand &MO, MCOperand &MCOp) const;
+
+ /// Lower a MachineInstr to an MCInst
+ void lower(const MachineInstr *MI, MCInst &OutMI) const;
+
+};
+
+class R600MCInstLower : public AMDGPUMCInstLower {
+public:
+ R600MCInstLower(MCContext &ctx, const R600Subtarget &ST,
+ const AsmPrinter &AP);
+
+ /// Lower a MachineInstr to an MCInst
+ void lower(const MachineInstr *MI, MCInst &OutMI) const;
+};
+
+
+} // End anonymous namespace
+
#include "AMDGPUGenMCPseudoLowering.inc"
-AMDGPUMCInstLower::AMDGPUMCInstLower(MCContext &ctx, const AMDGPUSubtarget &st,
+AMDGPUMCInstLower::AMDGPUMCInstLower(MCContext &ctx,
+ const TargetSubtargetInfo &st,
const AsmPrinter &ap):
Ctx(ctx), ST(st), AP(ap) { }
@@ -129,7 +164,7 @@ bool AMDGPUMCInstLower::lowerOperand(const MachineOperand &MO,
void AMDGPUMCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const {
unsigned Opcode = MI->getOpcode();
- const auto *TII = ST.getInstrInfo();
+ const auto *TII = static_cast<const SIInstrInfo*>(ST.getInstrInfo());
// FIXME: Should be able to handle this with emitPseudoExpansionLowering. We
// need to select it to the subtarget specific version, and there's no way to
@@ -169,16 +204,18 @@ void AMDGPUMCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const {
bool AMDGPUAsmPrinter::lowerOperand(const MachineOperand &MO,
MCOperand &MCOp) const {
- const AMDGPUSubtarget &STI = MF->getSubtarget<AMDGPUSubtarget>();
+ const GCNSubtarget &STI = MF->getSubtarget<GCNSubtarget>();
AMDGPUMCInstLower MCInstLowering(OutContext, STI, *this);
return MCInstLowering.lowerOperand(MO, MCOp);
}
-const MCExpr *AMDGPUAsmPrinter::lowerConstant(const Constant *CV) {
+static const MCExpr *lowerAddrSpaceCast(const TargetMachine &TM,
+ const Constant *CV,
+ MCContext &OutContext) {
// TargetMachine does not support llvm-style cast. Use C++-style cast.
// This is safe since TM is always of type AMDGPUTargetMachine or its
// derived class.
- auto *AT = static_cast<AMDGPUTargetMachine*>(&TM);
+ auto &AT = static_cast<const AMDGPUTargetMachine&>(TM);
auto *CE = dyn_cast<ConstantExpr>(CV);
// Lower null pointers in private and local address space.
@@ -187,12 +224,18 @@ const MCExpr *AMDGPUAsmPrinter::lowerConstant(const Constant *CV) {
if (CE && CE->getOpcode() == Instruction::AddrSpaceCast) {
auto Op = CE->getOperand(0);
auto SrcAddr = Op->getType()->getPointerAddressSpace();
- if (Op->isNullValue() && AT->getNullPointerValue(SrcAddr) == 0) {
+ if (Op->isNullValue() && AT.getNullPointerValue(SrcAddr) == 0) {
auto DstAddr = CE->getType()->getPointerAddressSpace();
- return MCConstantExpr::create(AT->getNullPointerValue(DstAddr),
+ return MCConstantExpr::create(AT.getNullPointerValue(DstAddr),
OutContext);
}
}
+ return nullptr;
+}
+
+const MCExpr *AMDGPUAsmPrinter::lowerConstant(const Constant *CV) {
+ if (const MCExpr *E = lowerAddrSpaceCast(TM, CV, OutContext))
+ return E;
return AsmPrinter::lowerConstant(CV);
}
@@ -200,7 +243,7 @@ void AMDGPUAsmPrinter::EmitInstruction(const MachineInstr *MI) {
if (emitPseudoExpansionLowering(*OutStreamer, MI))
return;
- const AMDGPUSubtarget &STI = MF->getSubtarget<AMDGPUSubtarget>();
+ const GCNSubtarget &STI = MF->getSubtarget<GCNSubtarget>();
AMDGPUMCInstLower MCInstLowering(OutContext, STI, *this);
StringRef Err;
@@ -292,3 +335,47 @@ void AMDGPUAsmPrinter::EmitInstruction(const MachineInstr *MI) {
}
}
}
+
+R600MCInstLower::R600MCInstLower(MCContext &Ctx, const R600Subtarget &ST,
+ const AsmPrinter &AP) :
+ AMDGPUMCInstLower(Ctx, ST, AP) { }
+
+void R600MCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const {
+ OutMI.setOpcode(MI->getOpcode());
+ for (const MachineOperand &MO : MI->explicit_operands()) {
+ MCOperand MCOp;
+ lowerOperand(MO, MCOp);
+ OutMI.addOperand(MCOp);
+ }
+}
+
+void R600AsmPrinter::EmitInstruction(const MachineInstr *MI) {
+ const R600Subtarget &STI = MF->getSubtarget<R600Subtarget>();
+ R600MCInstLower MCInstLowering(OutContext, STI, *this);
+
+ StringRef Err;
+ if (!STI.getInstrInfo()->verifyInstruction(*MI, Err)) {
+ LLVMContext &C = MI->getParent()->getParent()->getFunction().getContext();
+ C.emitError("Illegal instruction detected: " + Err);
+ MI->print(errs());
+ }
+
+ if (MI->isBundle()) {
+ const MachineBasicBlock *MBB = MI->getParent();
+ MachineBasicBlock::const_instr_iterator I = ++MI->getIterator();
+ while (I != MBB->instr_end() && I->isInsideBundle()) {
+ EmitInstruction(&*I);
+ ++I;
+ }
+ } else {
+ MCInst TmpInst;
+ MCInstLowering.lower(MI, TmpInst);
+ EmitToStreamer(*OutStreamer, TmpInst);
+ }
+}
+
+const MCExpr *R600AsmPrinter::lowerConstant(const Constant *CV) {
+ if (const MCExpr *E = lowerAddrSpaceCast(TM, CV, OutContext))
+ return E;
+ return AsmPrinter::lowerConstant(CV);
+}
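
R600MCInstLower::lower above simply walks the explicit operands and hands each one to lowerOperand. A minimal, self-contained sketch of that shape, handling only register and immediate operands (a simplification of the real lowerOperand, which also deals with globals, block addresses, and long-branch expressions):

    #include "llvm/CodeGen/MachineInstr.h"
    #include "llvm/CodeGen/MachineOperand.h"
    #include "llvm/MC/MCInst.h"
    #include "llvm/Support/ErrorHandling.h"

    using namespace llvm;

    static void lowerSimple(const MachineInstr &MI, MCInst &OutMI) {
      OutMI.setOpcode(MI.getOpcode());
      for (const MachineOperand &MO : MI.explicit_operands()) {
        switch (MO.getType()) {
        case MachineOperand::MO_Register:
          OutMI.addOperand(MCOperand::createReg(MO.getReg()));
          break;
        case MachineOperand::MO_Immediate:
          OutMI.addOperand(MCOperand::createImm(MO.getImm()));
          break;
        default:
          llvm_unreachable("operand kind not handled in this sketch");
        }
      }
    }
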
diff --git a/lib/Target/AMDGPU/AMDGPUMCInstLower.h b/lib/Target/AMDGPU/AMDGPUMCInstLower.h
deleted file mode 100644
index 57d2d85daecd..000000000000
--- a/lib/Target/AMDGPU/AMDGPUMCInstLower.h
+++ /dev/null
@@ -1,46 +0,0 @@
-//===- AMDGPUMCInstLower.h MachineInstr Lowering Interface ------*- C++ -*-===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUMCINSTLOWER_H
-#define LLVM_LIB_TARGET_AMDGPU_AMDGPUMCINSTLOWER_H
-
-namespace llvm {
-
-class AMDGPUSubtarget;
-class AsmPrinter;
-class MachineBasicBlock;
-class MachineInstr;
-class MachineOperand;
-class MCContext;
-class MCExpr;
-class MCInst;
-class MCOperand;
-
-class AMDGPUMCInstLower {
- MCContext &Ctx;
- const AMDGPUSubtarget &ST;
- const AsmPrinter &AP;
-
- const MCExpr *getLongBranchBlockExpr(const MachineBasicBlock &SrcBB,
- const MachineOperand &MO) const;
-
-public:
- AMDGPUMCInstLower(MCContext &ctx, const AMDGPUSubtarget &ST,
- const AsmPrinter &AP);
-
- bool lowerOperand(const MachineOperand &MO, MCOperand &MCOp) const;
-
- /// \brief Lower a MachineInstr to an MCInst
- void lower(const MachineInstr *MI, MCInst &OutMI) const;
-
-};
-
-} // End namespace llvm
-
-#endif
diff --git a/lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp b/lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp
index 20918233e447..6f44e2dbb2d5 100644
--- a/lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp
+++ b/lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp
@@ -31,6 +31,7 @@
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/Config/llvm-config.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/Pass.h"
#include "llvm/Support/Compiler.h"
@@ -658,7 +659,7 @@ RegionMRT *MRT::buildMRT(MachineFunction &MF,
continue;
}
- DEBUG(dbgs() << "Visiting " << printMBBReference(*MBB) << "\n");
+ LLVM_DEBUG(dbgs() << "Visiting " << printMBBReference(*MBB) << "\n");
MBBMRT *NewMBB = new MBBMRT(MBB);
MachineRegion *Region = RegionInfo->getRegionFor(MBB);
@@ -695,18 +696,19 @@ void LinearizedRegion::storeLiveOutReg(MachineBasicBlock *MBB, unsigned Reg,
const TargetRegisterInfo *TRI,
PHILinearize &PHIInfo) {
if (TRI->isVirtualRegister(Reg)) {
- DEBUG(dbgs() << "Considering Register: " << printReg(Reg, TRI) << "\n");
+ LLVM_DEBUG(dbgs() << "Considering Register: " << printReg(Reg, TRI)
+ << "\n");
// If this is a source register to a PHI we are chaining, it
// must be live out.
if (PHIInfo.isSource(Reg)) {
- DEBUG(dbgs() << "Add LiveOut (PHI): " << printReg(Reg, TRI) << "\n");
+ LLVM_DEBUG(dbgs() << "Add LiveOut (PHI): " << printReg(Reg, TRI) << "\n");
addLiveOut(Reg);
} else {
// If this is live out of the MBB
for (auto &UI : MRI->use_operands(Reg)) {
if (UI.getParent()->getParent() != MBB) {
- DEBUG(dbgs() << "Add LiveOut (MBB " << printMBBReference(*MBB)
- << "): " << printReg(Reg, TRI) << "\n");
+ LLVM_DEBUG(dbgs() << "Add LiveOut (MBB " << printMBBReference(*MBB)
+ << "): " << printReg(Reg, TRI) << "\n");
addLiveOut(Reg);
} else {
// If the use is in the same MBB we have to make sure
@@ -717,8 +719,8 @@ void LinearizedRegion::storeLiveOutReg(MachineBasicBlock *MBB, unsigned Reg,
MIE = UseInstr->getParent()->instr_end();
MII != MIE; ++MII) {
if ((&(*MII)) == DefInstr) {
- DEBUG(dbgs() << "Add LiveOut (Loop): " << printReg(Reg, TRI)
- << "\n");
+ LLVM_DEBUG(dbgs() << "Add LiveOut (Loop): " << printReg(Reg, TRI)
+ << "\n");
addLiveOut(Reg);
}
}
@@ -734,11 +736,12 @@ void LinearizedRegion::storeLiveOutRegRegion(RegionMRT *Region, unsigned Reg,
const TargetRegisterInfo *TRI,
PHILinearize &PHIInfo) {
if (TRI->isVirtualRegister(Reg)) {
- DEBUG(dbgs() << "Considering Register: " << printReg(Reg, TRI) << "\n");
+ LLVM_DEBUG(dbgs() << "Considering Register: " << printReg(Reg, TRI)
+ << "\n");
for (auto &UI : MRI->use_operands(Reg)) {
if (!Region->contains(UI.getParent()->getParent())) {
- DEBUG(dbgs() << "Add LiveOut (Region " << (void *)Region
- << "): " << printReg(Reg, TRI) << "\n");
+ LLVM_DEBUG(dbgs() << "Add LiveOut (Region " << (void *)Region
+ << "): " << printReg(Reg, TRI) << "\n");
addLiveOut(Reg);
}
}
@@ -749,8 +752,8 @@ void LinearizedRegion::storeLiveOuts(MachineBasicBlock *MBB,
const MachineRegisterInfo *MRI,
const TargetRegisterInfo *TRI,
PHILinearize &PHIInfo) {
- DEBUG(dbgs() << "-Store Live Outs Begin (" << printMBBReference(*MBB)
- << ")-\n");
+ LLVM_DEBUG(dbgs() << "-Store Live Outs Begin (" << printMBBReference(*MBB)
+ << ")-\n");
for (auto &II : *MBB) {
for (auto &RI : II.defs()) {
storeLiveOutReg(MBB, RI.getReg(), RI.getParent(), MRI, TRI, PHIInfo);
@@ -774,9 +777,10 @@ void LinearizedRegion::storeLiveOuts(MachineBasicBlock *MBB,
for (int i = 0; i < numPreds; ++i) {
if (getPHIPred(PHI, i) == MBB) {
unsigned PHIReg = getPHISourceReg(PHI, i);
- DEBUG(dbgs() << "Add LiveOut (PhiSource " << printMBBReference(*MBB)
- << " -> " << printMBBReference(*(*SI))
- << "): " << printReg(PHIReg, TRI) << "\n");
+ LLVM_DEBUG(dbgs()
+ << "Add LiveOut (PhiSource " << printMBBReference(*MBB)
+ << " -> " << printMBBReference(*(*SI))
+ << "): " << printReg(PHIReg, TRI) << "\n");
addLiveOut(PHIReg);
}
}
@@ -784,7 +788,7 @@ void LinearizedRegion::storeLiveOuts(MachineBasicBlock *MBB,
}
}
- DEBUG(dbgs() << "-Store Live Outs Endn-\n");
+  LLVM_DEBUG(dbgs() << "-Store Live Outs End-\n");
}
void LinearizedRegion::storeMBBLiveOuts(MachineBasicBlock *MBB,
@@ -844,8 +848,8 @@ void LinearizedRegion::storeLiveOuts(RegionMRT *Region,
for (int i = 0; i < numPreds; ++i) {
if (Region->contains(getPHIPred(PHI, i))) {
unsigned PHIReg = getPHISourceReg(PHI, i);
- DEBUG(dbgs() << "Add Region LiveOut (" << (void *)Region
- << "): " << printReg(PHIReg, TRI) << "\n");
+ LLVM_DEBUG(dbgs() << "Add Region LiveOut (" << (void *)Region
+ << "): " << printReg(PHIReg, TRI) << "\n");
addLiveOut(PHIReg);
}
}
@@ -909,20 +913,21 @@ void LinearizedRegion::replaceRegister(unsigned Register, unsigned NewRegister,
bool IncludeLoopPHI) {
assert(Register != NewRegister && "Cannot replace a reg with itself");
- DEBUG(dbgs() << "Pepareing to replace register (region): "
- << printReg(Register, MRI->getTargetRegisterInfo()) << " with "
- << printReg(NewRegister, MRI->getTargetRegisterInfo()) << "\n");
+ LLVM_DEBUG(
+      dbgs() << "Preparing to replace register (region): "
+ << printReg(Register, MRI->getTargetRegisterInfo()) << " with "
+ << printReg(NewRegister, MRI->getTargetRegisterInfo()) << "\n");
// If we are replacing outside, we also need to update the LiveOuts
if (ReplaceOutside &&
(isLiveOut(Register) || this->getParent()->isLiveOut(Register))) {
LinearizedRegion *Current = this;
while (Current != nullptr && Current->getEntry() != nullptr) {
- DEBUG(dbgs() << "Region before register replace\n");
- DEBUG(Current->print(dbgs(), MRI->getTargetRegisterInfo()));
+ LLVM_DEBUG(dbgs() << "Region before register replace\n");
+ LLVM_DEBUG(Current->print(dbgs(), MRI->getTargetRegisterInfo()));
Current->replaceLiveOut(Register, NewRegister);
- DEBUG(dbgs() << "Region after register replace\n");
- DEBUG(Current->print(dbgs(), MRI->getTargetRegisterInfo()));
+ LLVM_DEBUG(dbgs() << "Region after register replace\n");
+ LLVM_DEBUG(Current->print(dbgs(), MRI->getTargetRegisterInfo()));
Current = Current->getParent();
}
}
@@ -946,16 +951,16 @@ void LinearizedRegion::replaceRegister(unsigned Register, unsigned NewRegister,
if (ShouldReplace) {
if (TargetRegisterInfo::isPhysicalRegister(NewRegister)) {
- DEBUG(dbgs() << "Trying to substitute physical register: "
- << printReg(NewRegister, MRI->getTargetRegisterInfo())
- << "\n");
+ LLVM_DEBUG(dbgs() << "Trying to substitute physical register: "
+ << printReg(NewRegister, MRI->getTargetRegisterInfo())
+ << "\n");
llvm_unreachable("Cannot substitute physical registers");
} else {
- DEBUG(dbgs() << "Replacing register (region): "
- << printReg(Register, MRI->getTargetRegisterInfo())
- << " with "
- << printReg(NewRegister, MRI->getTargetRegisterInfo())
- << "\n");
+ LLVM_DEBUG(dbgs() << "Replacing register (region): "
+ << printReg(Register, MRI->getTargetRegisterInfo())
+ << " with "
+ << printReg(NewRegister, MRI->getTargetRegisterInfo())
+ << "\n");
O.setReg(NewRegister);
}
}
@@ -1022,18 +1027,18 @@ void LinearizedRegion::removeFalseRegisterKills(MachineRegisterInfo *MRI) {
if (hasNoDef(Reg, MRI))
continue;
if (!MRI->hasOneDef(Reg)) {
- DEBUG(this->getEntry()->getParent()->dump());
- DEBUG(dbgs() << printReg(Reg, TRI) << "\n");
+ LLVM_DEBUG(this->getEntry()->getParent()->dump());
+ LLVM_DEBUG(dbgs() << printReg(Reg, TRI) << "\n");
}
if (MRI->def_begin(Reg) == MRI->def_end()) {
- DEBUG(dbgs() << "Register "
- << printReg(Reg, MRI->getTargetRegisterInfo())
- << " has NO defs\n");
+ LLVM_DEBUG(dbgs() << "Register "
+ << printReg(Reg, MRI->getTargetRegisterInfo())
+ << " has NO defs\n");
} else if (!MRI->hasOneDef(Reg)) {
- DEBUG(dbgs() << "Register "
- << printReg(Reg, MRI->getTargetRegisterInfo())
- << " has multiple defs\n");
+ LLVM_DEBUG(dbgs() << "Register "
+ << printReg(Reg, MRI->getTargetRegisterInfo())
+ << " has multiple defs\n");
}
assert(MRI->hasOneDef(Reg) && "Register has multiple definitions");
@@ -1041,8 +1046,8 @@ void LinearizedRegion::removeFalseRegisterKills(MachineRegisterInfo *MRI) {
MachineOperand *UseOperand = &(RI);
bool UseIsOutsideDefMBB = Def->getParent()->getParent() != MBB;
if (UseIsOutsideDefMBB && UseOperand->isKill()) {
- DEBUG(dbgs() << "Removing kill flag on register: "
- << printReg(Reg, TRI) << "\n");
+ LLVM_DEBUG(dbgs() << "Removing kill flag on register: "
+ << printReg(Reg, TRI) << "\n");
UseOperand->setIsKill(false);
}
}
@@ -1415,8 +1420,8 @@ void AMDGPUMachineCFGStructurizer::extractKilledPHIs(MachineBasicBlock *MBB) {
MachineInstr &Instr = *I;
if (Instr.isPHI()) {
unsigned PHIDestReg = getPHIDestReg(Instr);
- DEBUG(dbgs() << "Extractking killed phi:\n");
- DEBUG(Instr.dump());
+      LLVM_DEBUG(dbgs() << "Extracting killed phi:\n");
+ LLVM_DEBUG(Instr.dump());
PHIs.insert(&Instr);
PHIInfo.addDest(PHIDestReg, Instr.getDebugLoc());
storePHILinearizationInfoDest(PHIDestReg, Instr);
@@ -1448,9 +1453,10 @@ bool AMDGPUMachineCFGStructurizer::shrinkPHI(MachineInstr &PHI,
MachineBasicBlock *SourceMBB,
SmallVector<unsigned, 2> &PHIIndices,
unsigned *ReplaceReg) {
- DEBUG(dbgs() << "Shrink PHI: ");
- DEBUG(PHI.dump());
- DEBUG(dbgs() << " to " << printReg(getPHIDestReg(PHI), TRI) << " = PHI(");
+ LLVM_DEBUG(dbgs() << "Shrink PHI: ");
+ LLVM_DEBUG(PHI.dump());
+ LLVM_DEBUG(dbgs() << " to " << printReg(getPHIDestReg(PHI), TRI)
+ << " = PHI(");
bool Replaced = false;
unsigned NumInputs = getPHINumInputs(PHI);
@@ -1480,8 +1486,8 @@ bool AMDGPUMachineCFGStructurizer::shrinkPHI(MachineInstr &PHI,
if (SourceMBB) {
MIB.addReg(CombinedSourceReg);
MIB.addMBB(SourceMBB);
- DEBUG(dbgs() << printReg(CombinedSourceReg, TRI) << ", "
- << printMBBReference(*SourceMBB));
+ LLVM_DEBUG(dbgs() << printReg(CombinedSourceReg, TRI) << ", "
+ << printMBBReference(*SourceMBB));
}
for (unsigned i = 0; i < NumInputs; ++i) {
@@ -1492,10 +1498,10 @@ bool AMDGPUMachineCFGStructurizer::shrinkPHI(MachineInstr &PHI,
MachineBasicBlock *SourcePred = getPHIPred(PHI, i);
MIB.addReg(SourceReg);
MIB.addMBB(SourcePred);
- DEBUG(dbgs() << printReg(SourceReg, TRI) << ", "
- << printMBBReference(*SourcePred));
+ LLVM_DEBUG(dbgs() << printReg(SourceReg, TRI) << ", "
+ << printMBBReference(*SourcePred));
}
- DEBUG(dbgs() << ")\n");
+ LLVM_DEBUG(dbgs() << ")\n");
}
PHI.eraseFromParent();
return Replaced;
@@ -1504,9 +1510,10 @@ bool AMDGPUMachineCFGStructurizer::shrinkPHI(MachineInstr &PHI,
void AMDGPUMachineCFGStructurizer::replacePHI(
MachineInstr &PHI, unsigned CombinedSourceReg, MachineBasicBlock *LastMerge,
SmallVector<unsigned, 2> &PHIRegionIndices) {
- DEBUG(dbgs() << "Replace PHI: ");
- DEBUG(PHI.dump());
- DEBUG(dbgs() << " with " << printReg(getPHIDestReg(PHI), TRI) << " = PHI(");
+ LLVM_DEBUG(dbgs() << "Replace PHI: ");
+ LLVM_DEBUG(PHI.dump());
+ LLVM_DEBUG(dbgs() << " with " << printReg(getPHIDestReg(PHI), TRI)
+ << " = PHI(");
bool HasExternalEdge = false;
unsigned NumInputs = getPHINumInputs(PHI);
@@ -1523,8 +1530,8 @@ void AMDGPUMachineCFGStructurizer::replacePHI(
getPHIDestReg(PHI));
MIB.addReg(CombinedSourceReg);
MIB.addMBB(LastMerge);
- DEBUG(dbgs() << printReg(CombinedSourceReg, TRI) << ", "
- << printMBBReference(*LastMerge));
+ LLVM_DEBUG(dbgs() << printReg(CombinedSourceReg, TRI) << ", "
+ << printMBBReference(*LastMerge));
for (unsigned i = 0; i < NumInputs; ++i) {
if (isPHIRegionIndex(PHIRegionIndices, i)) {
continue;
@@ -1533,10 +1540,10 @@ void AMDGPUMachineCFGStructurizer::replacePHI(
MachineBasicBlock *SourcePred = getPHIPred(PHI, i);
MIB.addReg(SourceReg);
MIB.addMBB(SourcePred);
- DEBUG(dbgs() << printReg(SourceReg, TRI) << ", "
- << printMBBReference(*SourcePred));
+ LLVM_DEBUG(dbgs() << printReg(SourceReg, TRI) << ", "
+ << printMBBReference(*SourcePred));
}
- DEBUG(dbgs() << ")\n");
+ LLVM_DEBUG(dbgs() << ")\n");
} else {
replaceRegisterWith(getPHIDestReg(PHI), CombinedSourceReg);
}
@@ -1546,9 +1553,9 @@ void AMDGPUMachineCFGStructurizer::replacePHI(
void AMDGPUMachineCFGStructurizer::replaceEntryPHI(
MachineInstr &PHI, unsigned CombinedSourceReg, MachineBasicBlock *IfMBB,
SmallVector<unsigned, 2> &PHIRegionIndices) {
- DEBUG(dbgs() << "Replace entry PHI: ");
- DEBUG(PHI.dump());
- DEBUG(dbgs() << " with ");
+ LLVM_DEBUG(dbgs() << "Replace entry PHI: ");
+ LLVM_DEBUG(PHI.dump());
+ LLVM_DEBUG(dbgs() << " with ");
unsigned NumInputs = getPHINumInputs(PHI);
unsigned NumNonRegionInputs = NumInputs;
@@ -1561,18 +1568,19 @@ void AMDGPUMachineCFGStructurizer::replaceEntryPHI(
if (NumNonRegionInputs == 0) {
auto DestReg = getPHIDestReg(PHI);
replaceRegisterWith(DestReg, CombinedSourceReg);
- DEBUG(dbgs() << " register " << printReg(CombinedSourceReg, TRI) << "\n");
+ LLVM_DEBUG(dbgs() << " register " << printReg(CombinedSourceReg, TRI)
+ << "\n");
PHI.eraseFromParent();
} else {
- DEBUG(dbgs() << printReg(getPHIDestReg(PHI), TRI) << " = PHI(");
+ LLVM_DEBUG(dbgs() << printReg(getPHIDestReg(PHI), TRI) << " = PHI(");
MachineBasicBlock *MBB = PHI.getParent();
MachineInstrBuilder MIB =
BuildMI(*MBB, PHI, PHI.getDebugLoc(), TII->get(TargetOpcode::PHI),
getPHIDestReg(PHI));
MIB.addReg(CombinedSourceReg);
MIB.addMBB(IfMBB);
- DEBUG(dbgs() << printReg(CombinedSourceReg, TRI) << ", "
- << printMBBReference(*IfMBB));
+ LLVM_DEBUG(dbgs() << printReg(CombinedSourceReg, TRI) << ", "
+ << printMBBReference(*IfMBB));
unsigned NumInputs = getPHINumInputs(PHI);
for (unsigned i = 0; i < NumInputs; ++i) {
if (isPHIRegionIndex(PHIRegionIndices, i)) {
@@ -1582,10 +1590,10 @@ void AMDGPUMachineCFGStructurizer::replaceEntryPHI(
MachineBasicBlock *SourcePred = getPHIPred(PHI, i);
MIB.addReg(SourceReg);
MIB.addMBB(SourcePred);
- DEBUG(dbgs() << printReg(SourceReg, TRI) << ", "
- << printMBBReference(*SourcePred));
+ LLVM_DEBUG(dbgs() << printReg(SourceReg, TRI) << ", "
+ << printMBBReference(*SourcePred));
}
- DEBUG(dbgs() << ")\n");
+ LLVM_DEBUG(dbgs() << ")\n");
PHI.eraseFromParent();
}
}
@@ -1607,8 +1615,9 @@ void AMDGPUMachineCFGStructurizer::replaceLiveOutRegs(
}
}
- DEBUG(dbgs() << "Register " << printReg(Reg, TRI) << " is "
- << (IsDead ? "dead" : "alive") << " after PHI replace\n");
+ LLVM_DEBUG(dbgs() << "Register " << printReg(Reg, TRI) << " is "
+ << (IsDead ? "dead" : "alive")
+ << " after PHI replace\n");
if (IsDead) {
LRegion->removeLiveOut(Reg);
}
@@ -1682,8 +1691,8 @@ void AMDGPUMachineCFGStructurizer::rewriteRegionEntryPHIs(LinearizedRegion *Regi
void AMDGPUMachineCFGStructurizer::insertUnconditionalBranch(MachineBasicBlock *MBB,
MachineBasicBlock *Dest,
const DebugLoc &DL) {
- DEBUG(dbgs() << "Inserting unconditional branch: " << MBB->getNumber()
- << " -> " << Dest->getNumber() << "\n");
+ LLVM_DEBUG(dbgs() << "Inserting unconditional branch: " << MBB->getNumber()
+ << " -> " << Dest->getNumber() << "\n");
MachineBasicBlock::instr_iterator Terminator = MBB->getFirstInstrTerminator();
bool HasTerminator = Terminator != MBB->instr_end();
if (HasTerminator) {
@@ -1732,7 +1741,8 @@ AMDGPUMachineCFGStructurizer::createLinearizedExitBlock(RegionMRT *Region) {
MF->insert(ExitIter, LastMerge);
LastMerge->addSuccessor(Exit);
insertUnconditionalBranch(LastMerge, Exit);
- DEBUG(dbgs() << "Created exit block: " << LastMerge->getNumber() << "\n");
+ LLVM_DEBUG(dbgs() << "Created exit block: " << LastMerge->getNumber()
+ << "\n");
}
return LastMerge;
}
@@ -1748,11 +1758,12 @@ void AMDGPUMachineCFGStructurizer::insertMergePHI(MachineBasicBlock *IfBB,
if (MergeBB->succ_begin() == MergeBB->succ_end()) {
return;
}
- DEBUG(dbgs() << "Merge PHI (" << printMBBReference(*MergeBB)
- << "): " << printReg(DestRegister, TRI) << " = PHI("
- << printReg(IfSourceRegister, TRI) << ", "
- << printMBBReference(*IfBB) << printReg(CodeSourceRegister, TRI)
- << ", " << printMBBReference(*CodeBB) << ")\n");
+ LLVM_DEBUG(dbgs() << "Merge PHI (" << printMBBReference(*MergeBB)
+ << "): " << printReg(DestRegister, TRI) << " = PHI("
+ << printReg(IfSourceRegister, TRI) << ", "
+ << printMBBReference(*IfBB)
+ << printReg(CodeSourceRegister, TRI) << ", "
+ << printMBBReference(*CodeBB) << ")\n");
const DebugLoc &DL = MergeBB->findDebugLoc(MergeBB->begin());
MachineInstrBuilder MIB = BuildMI(*MergeBB, MergeBB->instr_begin(), DL,
TII->get(TargetOpcode::PHI), DestRegister);
@@ -1810,8 +1821,8 @@ static void removeExternalCFGEdges(MachineBasicBlock *StartMBB,
for (auto SI : Succs) {
std::pair<MachineBasicBlock *, MachineBasicBlock *> Edge = SI;
- DEBUG(dbgs() << "Removing edge: " << printMBBReference(*Edge.first)
- << " -> " << printMBBReference(*Edge.second) << "\n");
+ LLVM_DEBUG(dbgs() << "Removing edge: " << printMBBReference(*Edge.first)
+ << " -> " << printMBBReference(*Edge.second) << "\n");
Edge.first->removeSuccessor(Edge.second);
}
}
@@ -1844,13 +1855,13 @@ MachineBasicBlock *AMDGPUMachineCFGStructurizer::createIfBlock(
IfBB->addSuccessor(MergeBB);
IfBB->addSuccessor(CodeBBStart);
- DEBUG(dbgs() << "Created If block: " << IfBB->getNumber() << "\n");
+ LLVM_DEBUG(dbgs() << "Created If block: " << IfBB->getNumber() << "\n");
// Ensure that the MergeBB is a successor of the CodeEndBB.
if (!CodeBBEnd->isSuccessor(MergeBB))
CodeBBEnd->addSuccessor(MergeBB);
- DEBUG(dbgs() << "Moved " << printMBBReference(*CodeBBStart) << " through "
- << printMBBReference(*CodeBBEnd) << "\n");
+ LLVM_DEBUG(dbgs() << "Moved " << printMBBReference(*CodeBBStart)
+ << " through " << printMBBReference(*CodeBBEnd) << "\n");
// If we have a single predecessor we can find a reasonable debug location
MachineBasicBlock *SinglePred =
@@ -1935,16 +1946,18 @@ void AMDGPUMachineCFGStructurizer::rewriteCodeBBTerminator(MachineBasicBlock *Co
MachineInstr *AMDGPUMachineCFGStructurizer::getDefInstr(unsigned Reg) {
if (MRI->def_begin(Reg) == MRI->def_end()) {
- DEBUG(dbgs() << "Register " << printReg(Reg, MRI->getTargetRegisterInfo())
- << " has NO defs\n");
+ LLVM_DEBUG(dbgs() << "Register "
+ << printReg(Reg, MRI->getTargetRegisterInfo())
+ << " has NO defs\n");
} else if (!MRI->hasOneDef(Reg)) {
- DEBUG(dbgs() << "Register " << printReg(Reg, MRI->getTargetRegisterInfo())
- << " has multiple defs\n");
- DEBUG(dbgs() << "DEFS BEGIN:\n");
+ LLVM_DEBUG(dbgs() << "Register "
+ << printReg(Reg, MRI->getTargetRegisterInfo())
+ << " has multiple defs\n");
+ LLVM_DEBUG(dbgs() << "DEFS BEGIN:\n");
for (auto DI = MRI->def_begin(Reg), DE = MRI->def_end(); DI != DE; ++DI) {
- DEBUG(DI->getParent()->dump());
+ LLVM_DEBUG(DI->getParent()->dump());
}
- DEBUG(dbgs() << "DEFS END\n");
+ LLVM_DEBUG(dbgs() << "DEFS END\n");
}
assert(MRI->hasOneDef(Reg) && "Register has multiple definitions");
@@ -1986,7 +1999,7 @@ void AMDGPUMachineCFGStructurizer::insertChainedPHI(MachineBasicBlock *IfBB,
const TargetRegisterClass *RegClass = MRI->getRegClass(DestReg);
unsigned NextDestReg = MRI->createVirtualRegister(RegClass);
bool IsLastDef = PHIInfo.getNumSources(DestReg) == 1;
- DEBUG(dbgs() << "Insert Chained PHI\n");
+ LLVM_DEBUG(dbgs() << "Insert Chained PHI\n");
insertMergePHI(IfBB, InnerRegion->getExit(), MergeBB, DestReg, NextDestReg,
SourceReg, IsLastDef);
@@ -2022,16 +2035,16 @@ void AMDGPUMachineCFGStructurizer::rewriteLiveOutRegs(MachineBasicBlock *IfBB,
}
for (auto LI : OldLiveOuts) {
- DEBUG(dbgs() << "LiveOut: " << printReg(LI, TRI));
+ LLVM_DEBUG(dbgs() << "LiveOut: " << printReg(LI, TRI));
if (!containsDef(CodeBB, InnerRegion, LI) ||
(!IsSingleBB && (getDefInstr(LI)->getParent() == LRegion->getExit()))) {
      // If the register simply lives through the CodeBB, we don't have
// to rewrite anything since the register is not defined in this
// part of the code.
- DEBUG(dbgs() << "- through");
+ LLVM_DEBUG(dbgs() << "- through");
continue;
}
- DEBUG(dbgs() << "\n");
+ LLVM_DEBUG(dbgs() << "\n");
unsigned Reg = LI;
if (/*!PHIInfo.isSource(Reg) &&*/ Reg != InnerRegion->getBBSelectRegOut()) {
// If the register is live out, we do want to create a phi,
@@ -2048,12 +2061,12 @@ void AMDGPUMachineCFGStructurizer::rewriteLiveOutRegs(MachineBasicBlock *IfBB,
unsigned IfSourceReg = MRI->createVirtualRegister(RegClass);
// Create initializer, this value is never used, but is needed
// to satisfy SSA.
- DEBUG(dbgs() << "Initializer for reg: " << printReg(Reg) << "\n");
+ LLVM_DEBUG(dbgs() << "Initializer for reg: " << printReg(Reg) << "\n");
TII->materializeImmediate(*IfBB, IfBB->getFirstTerminator(), DebugLoc(),
IfSourceReg, 0);
InnerRegion->replaceRegisterOutsideRegion(Reg, PHIDestReg, true, MRI);
- DEBUG(dbgs() << "Insert Non-Chained Live out PHI\n");
+ LLVM_DEBUG(dbgs() << "Insert Non-Chained Live out PHI\n");
insertMergePHI(IfBB, InnerRegion->getExit(), MergeBB, PHIDestReg,
IfSourceReg, Reg, true);
}
@@ -2063,22 +2076,22 @@ void AMDGPUMachineCFGStructurizer::rewriteLiveOutRegs(MachineBasicBlock *IfBB,
// is a source block for a definition.
SmallVector<unsigned, 4> Sources;
if (PHIInfo.findSourcesFromMBB(CodeBB, Sources)) {
- DEBUG(dbgs() << "Inserting PHI Live Out from " << printMBBReference(*CodeBB)
- << "\n");
+ LLVM_DEBUG(dbgs() << "Inserting PHI Live Out from "
+ << printMBBReference(*CodeBB) << "\n");
for (auto SI : Sources) {
unsigned DestReg;
PHIInfo.findDest(SI, CodeBB, DestReg);
insertChainedPHI(IfBB, CodeBB, MergeBB, InnerRegion, DestReg, SI);
}
- DEBUG(dbgs() << "Insertion done.\n");
+ LLVM_DEBUG(dbgs() << "Insertion done.\n");
}
- DEBUG(PHIInfo.dump(MRI));
+ LLVM_DEBUG(PHIInfo.dump(MRI));
}
void AMDGPUMachineCFGStructurizer::prunePHIInfo(MachineBasicBlock *MBB) {
- DEBUG(dbgs() << "Before PHI Prune\n");
- DEBUG(PHIInfo.dump(MRI));
+ LLVM_DEBUG(dbgs() << "Before PHI Prune\n");
+ LLVM_DEBUG(PHIInfo.dump(MRI));
SmallVector<std::tuple<unsigned, unsigned, MachineBasicBlock *>, 4>
ElimiatedSources;
for (auto DRI = PHIInfo.dests_begin(), DE = PHIInfo.dests_end(); DRI != DE;
@@ -2118,8 +2131,8 @@ void AMDGPUMachineCFGStructurizer::prunePHIInfo(MachineBasicBlock *MBB) {
PHIInfo.removeSource(std::get<0>(SourceInfo), std::get<1>(SourceInfo),
std::get<2>(SourceInfo));
}
- DEBUG(dbgs() << "After PHI Prune\n");
- DEBUG(PHIInfo.dump(MRI));
+ LLVM_DEBUG(dbgs() << "After PHI Prune\n");
+ LLVM_DEBUG(PHIInfo.dump(MRI));
}
void AMDGPUMachineCFGStructurizer::createEntryPHI(LinearizedRegion *CurrentRegion,
@@ -2127,8 +2140,8 @@ void AMDGPUMachineCFGStructurizer::createEntryPHI(LinearizedRegion *CurrentRegio
MachineBasicBlock *Entry = CurrentRegion->getEntry();
MachineBasicBlock *Exit = CurrentRegion->getExit();
- DEBUG(dbgs() << "RegionExit: " << Exit->getNumber()
- << " Pred: " << (*(Entry->pred_begin()))->getNumber() << "\n");
+ LLVM_DEBUG(dbgs() << "RegionExit: " << Exit->getNumber() << " Pred: "
+ << (*(Entry->pred_begin()))->getNumber() << "\n");
int NumSources = 0;
auto SE = PHIInfo.sources_end(DestReg);
@@ -2145,7 +2158,7 @@ void AMDGPUMachineCFGStructurizer::createEntryPHI(LinearizedRegion *CurrentRegio
const DebugLoc &DL = Entry->findDebugLoc(Entry->begin());
MachineInstrBuilder MIB = BuildMI(*Entry, Entry->instr_begin(), DL,
TII->get(TargetOpcode::PHI), DestReg);
- DEBUG(dbgs() << "Entry PHI " << printReg(DestReg, TRI) << " = PHI(");
+ LLVM_DEBUG(dbgs() << "Entry PHI " << printReg(DestReg, TRI) << " = PHI(");
unsigned CurrentBackedgeReg = 0;
@@ -2169,19 +2182,19 @@ void AMDGPUMachineCFGStructurizer::createEntryPHI(LinearizedRegion *CurrentRegio
BackedgePHI.addReg(getPHISourceReg(*PHIDefInstr, 1));
BackedgePHI.addMBB((*SRI).second);
CurrentBackedgeReg = NewBackedgeReg;
- DEBUG(dbgs() << "Inserting backedge PHI: "
- << printReg(NewBackedgeReg, TRI) << " = PHI("
- << printReg(CurrentBackedgeReg, TRI) << ", "
- << printMBBReference(*getPHIPred(*PHIDefInstr, 0))
- << ", "
- << printReg(getPHISourceReg(*PHIDefInstr, 1), TRI)
- << ", " << printMBBReference(*(*SRI).second));
+ LLVM_DEBUG(dbgs()
+ << "Inserting backedge PHI: "
+ << printReg(NewBackedgeReg, TRI) << " = PHI("
+ << printReg(CurrentBackedgeReg, TRI) << ", "
+ << printMBBReference(*getPHIPred(*PHIDefInstr, 0)) << ", "
+ << printReg(getPHISourceReg(*PHIDefInstr, 1), TRI) << ", "
+ << printMBBReference(*(*SRI).second));
}
} else {
MIB.addReg(SourceReg);
MIB.addMBB((*SRI).second);
- DEBUG(dbgs() << printReg(SourceReg, TRI) << ", "
- << printMBBReference(*(*SRI).second) << ", ");
+ LLVM_DEBUG(dbgs() << printReg(SourceReg, TRI) << ", "
+ << printMBBReference(*(*SRI).second) << ", ");
}
}
@@ -2189,16 +2202,16 @@ void AMDGPUMachineCFGStructurizer::createEntryPHI(LinearizedRegion *CurrentRegio
if (CurrentBackedgeReg != 0) {
MIB.addReg(CurrentBackedgeReg);
MIB.addMBB(Exit);
- DEBUG(dbgs() << printReg(CurrentBackedgeReg, TRI) << ", "
- << printMBBReference(*Exit) << ")\n");
+ LLVM_DEBUG(dbgs() << printReg(CurrentBackedgeReg, TRI) << ", "
+ << printMBBReference(*Exit) << ")\n");
} else {
- DEBUG(dbgs() << ")\n");
+ LLVM_DEBUG(dbgs() << ")\n");
}
}
}
void AMDGPUMachineCFGStructurizer::createEntryPHIs(LinearizedRegion *CurrentRegion) {
- DEBUG(PHIInfo.dump(MRI));
+ LLVM_DEBUG(PHIInfo.dump(MRI));
for (auto DRI = PHIInfo.dests_begin(), DE = PHIInfo.dests_end(); DRI != DE;
++DRI) {
@@ -2219,19 +2232,19 @@ void AMDGPUMachineCFGStructurizer::replaceRegisterWith(unsigned Register,
MachineOperand &O = *I;
++I;
if (TargetRegisterInfo::isPhysicalRegister(NewRegister)) {
- DEBUG(dbgs() << "Trying to substitute physical register: "
- << printReg(NewRegister, MRI->getTargetRegisterInfo())
- << "\n");
+ LLVM_DEBUG(dbgs() << "Trying to substitute physical register: "
+ << printReg(NewRegister, MRI->getTargetRegisterInfo())
+ << "\n");
llvm_unreachable("Cannot substitute physical registers");
// We don't handle physical registers, but if we need to
        // in the future, this is how we do it:
// O.substPhysReg(NewRegister, *TRI);
} else {
- DEBUG(dbgs() << "Replacing register: "
- << printReg(Register, MRI->getTargetRegisterInfo())
- << " with "
- << printReg(NewRegister, MRI->getTargetRegisterInfo())
- << "\n");
+ LLVM_DEBUG(dbgs() << "Replacing register: "
+ << printReg(Register, MRI->getTargetRegisterInfo())
+ << " with "
+ << printReg(NewRegister, MRI->getTargetRegisterInfo())
+ << "\n");
O.setReg(NewRegister);
}
}
@@ -2239,20 +2252,20 @@ void AMDGPUMachineCFGStructurizer::replaceRegisterWith(unsigned Register,
getRegionMRT()->replaceLiveOutReg(Register, NewRegister);
- DEBUG(PHIInfo.dump(MRI));
+ LLVM_DEBUG(PHIInfo.dump(MRI));
}
void AMDGPUMachineCFGStructurizer::resolvePHIInfos(MachineBasicBlock *FunctionEntry) {
- DEBUG(dbgs() << "Resolve PHI Infos\n");
- DEBUG(PHIInfo.dump(MRI));
+ LLVM_DEBUG(dbgs() << "Resolve PHI Infos\n");
+ LLVM_DEBUG(PHIInfo.dump(MRI));
for (auto DRI = PHIInfo.dests_begin(), DE = PHIInfo.dests_end(); DRI != DE;
++DRI) {
unsigned DestReg = *DRI;
- DEBUG(dbgs() << "DestReg: " << printReg(DestReg, TRI) << "\n");
+ LLVM_DEBUG(dbgs() << "DestReg: " << printReg(DestReg, TRI) << "\n");
auto SRI = PHIInfo.sources_begin(DestReg);
unsigned SourceReg = (*SRI).first;
- DEBUG(dbgs() << "DestReg: " << printReg(DestReg, TRI)
- << " SourceReg: " << printReg(SourceReg, TRI) << "\n");
+ LLVM_DEBUG(dbgs() << "DestReg: " << printReg(DestReg, TRI)
+ << " SourceReg: " << printReg(SourceReg, TRI) << "\n");
assert(PHIInfo.sources_end(DestReg) == ++SRI &&
"More than one phi source in entry node");
@@ -2326,9 +2339,9 @@ MachineBasicBlock *AMDGPUMachineCFGStructurizer::createIfRegion(
MachineOperand RegOp =
MachineOperand::CreateReg(Reg, false, false, true);
ArrayRef<MachineOperand> Cond(RegOp);
- DEBUG(dbgs() << "RegionExitReg: ");
- DEBUG(Cond[0].print(dbgs(), TRI));
- DEBUG(dbgs() << "\n");
+ LLVM_DEBUG(dbgs() << "RegionExitReg: ");
+ LLVM_DEBUG(Cond[0].print(dbgs(), TRI));
+ LLVM_DEBUG(dbgs() << "\n");
TII->insertBranch(*RegionExit, CurrentRegion->getEntry(), RegionExit,
Cond, DebugLoc());
RegionExit->addSuccessor(CurrentRegion->getEntry());
@@ -2338,12 +2351,12 @@ MachineBasicBlock *AMDGPUMachineCFGStructurizer::createIfRegion(
LinearizedRegion InnerRegion(CodeBB, MRI, TRI, PHIInfo);
InnerRegion.setParent(CurrentRegion);
- DEBUG(dbgs() << "Insert BB Select PHI (BB)\n");
+ LLVM_DEBUG(dbgs() << "Insert BB Select PHI (BB)\n");
insertMergePHI(IfBB, CodeBB, MergeBB, BBSelectRegOut, BBSelectRegIn,
CodeBBSelectReg);
InnerRegion.addMBB(MergeBB);
- DEBUG(InnerRegion.print(dbgs(), TRI));
+ LLVM_DEBUG(InnerRegion.print(dbgs(), TRI));
rewriteLiveOutRegs(IfBB, CodeBB, MergeBB, &InnerRegion, CurrentRegion);
extractKilledPHIs(CodeBB);
if (IsRegionEntryBB) {
@@ -2384,16 +2397,16 @@ MachineBasicBlock *AMDGPUMachineCFGStructurizer::createIfRegion(
CurrentRegion->getRegionMRT()->getEntry()->getNumber());
MachineOperand RegOp = MachineOperand::CreateReg(Reg, false, false, true);
ArrayRef<MachineOperand> Cond(RegOp);
- DEBUG(dbgs() << "RegionExitReg: ");
- DEBUG(Cond[0].print(dbgs(), TRI));
- DEBUG(dbgs() << "\n");
+ LLVM_DEBUG(dbgs() << "RegionExitReg: ");
+ LLVM_DEBUG(Cond[0].print(dbgs(), TRI));
+ LLVM_DEBUG(dbgs() << "\n");
TII->insertBranch(*RegionExit, CurrentRegion->getEntry(), RegionExit,
Cond, DebugLoc());
RegionExit->addSuccessor(IfBB);
}
}
CurrentRegion->addMBBs(InnerRegion);
- DEBUG(dbgs() << "Insert BB Select PHI (region)\n");
+ LLVM_DEBUG(dbgs() << "Insert BB Select PHI (region)\n");
insertMergePHI(IfBB, CodeExitBB, MergeBB, BBSelectRegOut, BBSelectRegIn,
CodeBBSelectReg);
@@ -2439,15 +2452,16 @@ void AMDGPUMachineCFGStructurizer::splitLoopPHI(MachineInstr &PHI,
MachineInstrBuilder MIB =
BuildMI(*EntrySucc, EntrySucc->instr_begin(), PHI.getDebugLoc(),
TII->get(TargetOpcode::PHI), NewDestReg);
- DEBUG(dbgs() << "Split Entry PHI " << printReg(NewDestReg, TRI) << " = PHI(");
+ LLVM_DEBUG(dbgs() << "Split Entry PHI " << printReg(NewDestReg, TRI)
+ << " = PHI(");
MIB.addReg(PHISource);
MIB.addMBB(Entry);
- DEBUG(dbgs() << printReg(PHISource, TRI) << ", "
- << printMBBReference(*Entry));
+ LLVM_DEBUG(dbgs() << printReg(PHISource, TRI) << ", "
+ << printMBBReference(*Entry));
MIB.addReg(RegionSourceReg);
MIB.addMBB(RegionSourceMBB);
- DEBUG(dbgs() << " ," << printReg(RegionSourceReg, TRI) << ", "
- << printMBBReference(*RegionSourceMBB) << ")\n");
+ LLVM_DEBUG(dbgs() << " ," << printReg(RegionSourceReg, TRI) << ", "
+ << printMBBReference(*RegionSourceMBB) << ")\n");
}
void AMDGPUMachineCFGStructurizer::splitLoopPHIs(MachineBasicBlock *Entry,
@@ -2480,7 +2494,8 @@ AMDGPUMachineCFGStructurizer::splitExit(LinearizedRegion *LRegion) {
LRegion->addMBB(NewExit);
LRegion->setExit(NewExit);
- DEBUG(dbgs() << "Created new exit block: " << NewExit->getNumber() << "\n");
+ LLVM_DEBUG(dbgs() << "Created new exit block: " << NewExit->getNumber()
+ << "\n");
// Replace any PHI Predecessors in the successor with NewExit
for (auto &II : *Succ) {
@@ -2528,9 +2543,9 @@ AMDGPUMachineCFGStructurizer::splitEntry(LinearizedRegion *LRegion) {
MachineBasicBlock *EntrySucc = split(Entry->getFirstNonPHI());
MachineBasicBlock *Exit = LRegion->getExit();
- DEBUG(dbgs() << "Split " << printMBBReference(*Entry) << " to "
- << printMBBReference(*Entry) << " -> "
- << printMBBReference(*EntrySucc) << "\n");
+ LLVM_DEBUG(dbgs() << "Split " << printMBBReference(*Entry) << " to "
+ << printMBBReference(*Entry) << " -> "
+ << printMBBReference(*EntrySucc) << "\n");
LRegion->addMBB(EntrySucc);
// Make the backedge go to Entry Succ
@@ -2621,21 +2636,21 @@ bool AMDGPUMachineCFGStructurizer::structurizeComplexRegion(RegionMRT *Region) {
rewriteRegionExitPHIs(Region, LastMerge, LRegion);
removeOldExitPreds(Region);
- DEBUG(PHIInfo.dump(MRI));
+ LLVM_DEBUG(PHIInfo.dump(MRI));
SetVector<MRT *> *Children = Region->getChildren();
- DEBUG(dbgs() << "===========If Region Start===============\n");
+ LLVM_DEBUG(dbgs() << "===========If Region Start===============\n");
if (LRegion->getHasLoop()) {
- DEBUG(dbgs() << "Has Backedge: Yes\n");
+ LLVM_DEBUG(dbgs() << "Has Backedge: Yes\n");
} else {
- DEBUG(dbgs() << "Has Backedge: No\n");
+ LLVM_DEBUG(dbgs() << "Has Backedge: No\n");
}
unsigned BBSelectRegIn;
unsigned BBSelectRegOut;
for (auto CI = Children->begin(), CE = Children->end(); CI != CE; ++CI) {
- DEBUG(dbgs() << "CurrentRegion: \n");
- DEBUG(LRegion->print(dbgs(), TRI));
+ LLVM_DEBUG(dbgs() << "CurrentRegion: \n");
+ LLVM_DEBUG(LRegion->print(dbgs(), TRI));
auto CNI = CI;
++CNI;
@@ -2649,9 +2664,9 @@ bool AMDGPUMachineCFGStructurizer::structurizeComplexRegion(RegionMRT *Region) {
// We found the block is the exit of an inner region, we need
// to put it in the current linearized region.
- DEBUG(dbgs() << "Linearizing region: ");
- DEBUG(InnerLRegion->print(dbgs(), TRI));
- DEBUG(dbgs() << "\n");
+ LLVM_DEBUG(dbgs() << "Linearizing region: ");
+ LLVM_DEBUG(InnerLRegion->print(dbgs(), TRI));
+ LLVM_DEBUG(dbgs() << "\n");
MachineBasicBlock *InnerEntry = InnerLRegion->getEntry();
if ((&(*(InnerEntry->getParent()->begin()))) == InnerEntry) {
@@ -2669,10 +2684,10 @@ bool AMDGPUMachineCFGStructurizer::structurizeComplexRegion(RegionMRT *Region) {
BBSelectRegOut = Child->getBBSelectRegOut();
BBSelectRegIn = Child->getBBSelectRegIn();
- DEBUG(dbgs() << "BBSelectRegIn: " << printReg(BBSelectRegIn, TRI)
- << "\n");
- DEBUG(dbgs() << "BBSelectRegOut: " << printReg(BBSelectRegOut, TRI)
- << "\n");
+ LLVM_DEBUG(dbgs() << "BBSelectRegIn: " << printReg(BBSelectRegIn, TRI)
+ << "\n");
+ LLVM_DEBUG(dbgs() << "BBSelectRegOut: " << printReg(BBSelectRegOut, TRI)
+ << "\n");
MachineBasicBlock *IfEnd = CurrentMerge;
CurrentMerge = createIfRegion(CurrentMerge, InnerLRegion, LRegion,
@@ -2681,7 +2696,7 @@ bool AMDGPUMachineCFGStructurizer::structurizeComplexRegion(RegionMRT *Region) {
TII->convertNonUniformIfRegion(CurrentMerge, IfEnd);
} else {
MachineBasicBlock *MBB = Child->getMBBMRT()->getMBB();
- DEBUG(dbgs() << "Linearizing block: " << MBB->getNumber() << "\n");
+ LLVM_DEBUG(dbgs() << "Linearizing block: " << MBB->getNumber() << "\n");
if (MBB == getSingleExitNode(*(MBB->getParent()))) {
// If this is the exit block then we need to skip to the next.
@@ -2693,10 +2708,10 @@ bool AMDGPUMachineCFGStructurizer::structurizeComplexRegion(RegionMRT *Region) {
BBSelectRegOut = Child->getBBSelectRegOut();
BBSelectRegIn = Child->getBBSelectRegIn();
- DEBUG(dbgs() << "BBSelectRegIn: " << printReg(BBSelectRegIn, TRI)
- << "\n");
- DEBUG(dbgs() << "BBSelectRegOut: " << printReg(BBSelectRegOut, TRI)
- << "\n");
+ LLVM_DEBUG(dbgs() << "BBSelectRegIn: " << printReg(BBSelectRegIn, TRI)
+ << "\n");
+ LLVM_DEBUG(dbgs() << "BBSelectRegOut: " << printReg(BBSelectRegOut, TRI)
+ << "\n");
MachineBasicBlock *IfEnd = CurrentMerge;
// This is a basic block that is not part of an inner region, we
@@ -2707,7 +2722,7 @@ bool AMDGPUMachineCFGStructurizer::structurizeComplexRegion(RegionMRT *Region) {
TII->convertNonUniformIfRegion(CurrentMerge, IfEnd);
}
- DEBUG(PHIInfo.dump(MRI));
+ LLVM_DEBUG(PHIInfo.dump(MRI));
}
}
@@ -2728,7 +2743,7 @@ bool AMDGPUMachineCFGStructurizer::structurizeComplexRegion(RegionMRT *Region) {
NewInReg, Region->getEntry()->getNumber());
// Need to be careful about updating the registers inside the region.
LRegion->replaceRegisterInsideRegion(InReg, InnerSelectReg, false, MRI);
- DEBUG(dbgs() << "Loop BBSelect Merge PHI:\n");
+ LLVM_DEBUG(dbgs() << "Loop BBSelect Merge PHI:\n");
insertMergePHI(LRegion->getEntry(), LRegion->getExit(), NewSucc,
InnerSelectReg, NewInReg,
LRegion->getRegionMRT()->getInnerOutputRegister());
@@ -2740,11 +2755,11 @@ bool AMDGPUMachineCFGStructurizer::structurizeComplexRegion(RegionMRT *Region) {
TII->insertReturn(*LastMerge);
}
- DEBUG(Region->getEntry()->getParent()->dump());
- DEBUG(LRegion->print(dbgs(), TRI));
- DEBUG(PHIInfo.dump(MRI));
+ LLVM_DEBUG(Region->getEntry()->getParent()->dump());
+ LLVM_DEBUG(LRegion->print(dbgs(), TRI));
+ LLVM_DEBUG(PHIInfo.dump(MRI));
- DEBUG(dbgs() << "===========If Region End===============\n");
+ LLVM_DEBUG(dbgs() << "===========If Region End===============\n");
Region->setLinearizedRegion(LRegion);
return true;
@@ -2784,12 +2799,12 @@ bool AMDGPUMachineCFGStructurizer::structurizeRegions(RegionMRT *Region,
}
void AMDGPUMachineCFGStructurizer::initFallthroughMap(MachineFunction &MF) {
- DEBUG(dbgs() << "Fallthrough Map:\n");
+ LLVM_DEBUG(dbgs() << "Fallthrough Map:\n");
for (auto &MBBI : MF) {
MachineBasicBlock *MBB = MBBI.getFallThrough();
if (MBB != nullptr) {
- DEBUG(dbgs() << "Fallthrough: " << MBBI.getNumber() << " -> "
- << MBB->getNumber() << "\n");
+ LLVM_DEBUG(dbgs() << "Fallthrough: " << MBBI.getNumber() << " -> "
+ << MBB->getNumber() << "\n");
}
FallthroughMap[&MBBI] = MBB;
}
@@ -2800,8 +2815,8 @@ void AMDGPUMachineCFGStructurizer::createLinearizedRegion(RegionMRT *Region,
LinearizedRegion *LRegion = new LinearizedRegion();
if (SelectOut) {
LRegion->addLiveOut(SelectOut);
- DEBUG(dbgs() << "Add LiveOut (BBSelect): " << printReg(SelectOut, TRI)
- << "\n");
+ LLVM_DEBUG(dbgs() << "Add LiveOut (BBSelect): " << printReg(SelectOut, TRI)
+ << "\n");
}
LRegion->setRegionMRT(Region);
Region->setLinearizedRegion(LRegion);
@@ -2856,26 +2871,26 @@ static void checkRegOnlyPHIInputs(MachineFunction &MF) {
}
bool AMDGPUMachineCFGStructurizer::runOnMachineFunction(MachineFunction &MF) {
- const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
const SIInstrInfo *TII = ST.getInstrInfo();
TRI = ST.getRegisterInfo();
MRI = &(MF.getRegInfo());
initFallthroughMap(MF);
checkRegOnlyPHIInputs(MF);
- DEBUG(dbgs() << "----STRUCTURIZER START----\n");
- DEBUG(MF.dump());
+ LLVM_DEBUG(dbgs() << "----STRUCTURIZER START----\n");
+ LLVM_DEBUG(MF.dump());
Regions = &(getAnalysis<MachineRegionInfoPass>().getRegionInfo());
- DEBUG(Regions->dump());
+ LLVM_DEBUG(Regions->dump());
RegionMRT *RTree = MRT::buildMRT(MF, Regions, TII, MRI);
setRegionMRT(RTree);
initializeSelectRegisters(RTree, 0, MRI, TII);
- DEBUG(RTree->dump(TRI));
+ LLVM_DEBUG(RTree->dump(TRI));
bool result = structurizeRegions(RTree, true);
delete RTree;
- DEBUG(dbgs() << "----STRUCTURIZER END----\n");
+ LLVM_DEBUG(dbgs() << "----STRUCTURIZER END----\n");
initFallthroughMap(MF);
return result;
}
diff --git a/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp b/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
index b7c8c1213537..13b4b50149ce 100644
--- a/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
+++ b/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
@@ -9,20 +9,38 @@
#include "AMDGPUMachineFunction.h"
#include "AMDGPUSubtarget.h"
+#include "AMDGPUPerfHintAnalysis.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
using namespace llvm;
AMDGPUMachineFunction::AMDGPUMachineFunction(const MachineFunction &MF) :
MachineFunctionInfo(),
LocalMemoryObjects(),
- KernArgSize(0),
+ ExplicitKernArgSize(0),
MaxKernArgAlign(0),
LDSSize(0),
- ABIArgOffset(0),
IsEntryFunction(AMDGPU::isEntryFunctionCC(MF.getFunction().getCallingConv())),
- NoSignedZerosFPMath(MF.getTarget().Options.NoSignedZerosFPMath) {
+ NoSignedZerosFPMath(MF.getTarget().Options.NoSignedZerosFPMath),
+ MemoryBound(false),
+ WaveLimiter(false) {
+ const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(MF);
+
// FIXME: Should initialize KernArgSize based on ExplicitKernelArgOffset,
// except reserved size is not correctly aligned.
+ const Function &F = MF.getFunction();
+
+ if (auto *Resolver = MF.getMMI().getResolver()) {
+ if (AMDGPUPerfHintAnalysis *PHA = static_cast<AMDGPUPerfHintAnalysis*>(
+ Resolver->getAnalysisIfAvailable(&AMDGPUPerfHintAnalysisID, true))) {
+ MemoryBound = PHA->isMemoryBound(&F);
+ WaveLimiter = PHA->needsWaveLimiter(&F);
+ }
+ }
+
+ CallingConv::ID CC = F.getCallingConv();
+ if (CC == CallingConv::AMDGPU_KERNEL || CC == CallingConv::SPIR_KERNEL)
+ ExplicitKernArgSize = ST.getExplicitKernArgSize(F, MaxKernArgAlign);
}
unsigned AMDGPUMachineFunction::allocateLDSGlobal(const DataLayout &DL,
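The constructor above caches MemoryBound and WaveLimiter from AMDGPUPerfHintAnalysis when the analysis results are reachable through the MachineModuleInfo resolver, so machine passes can consult them without re-running the IR analysis. A minimal consumer sketch, assuming the target-internal header path and a made-up helper name; MachineFunction::getInfo and the two accessors added in AMDGPUMachineFunction.h below are the only real APIs used:

#include "AMDGPUMachineFunction.h"        // target-internal header (assumption)
#include "llvm/CodeGen/MachineFunction.h"

// Hypothetical helper: read the cached hints back out of the function info.
static bool shouldThrottleWaves(const llvm::MachineFunction &MF) {
  const auto *MFI = MF.getInfo<llvm::AMDGPUMachineFunction>();
  return MFI->needsWaveLimiter() || MFI->isMemoryBound();
}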
diff --git a/lib/Target/AMDGPU/AMDGPUMachineFunction.h b/lib/Target/AMDGPU/AMDGPUMachineFunction.h
index 99bb61b21db0..8d6b871bc03e 100644
--- a/lib/Target/AMDGPU/AMDGPUMachineFunction.h
+++ b/lib/Target/AMDGPU/AMDGPUMachineFunction.h
@@ -15,57 +15,43 @@
namespace llvm {
+class GCNSubtarget;
+
class AMDGPUMachineFunction : public MachineFunctionInfo {
/// A map to keep track of local memory objects and their offsets within the
/// local memory space.
SmallDenseMap<const GlobalValue *, unsigned, 4> LocalMemoryObjects;
- uint64_t KernArgSize;
- unsigned MaxKernArgAlign;
+protected:
+ uint64_t ExplicitKernArgSize; // Cache for this.
+ unsigned MaxKernArgAlign; // Cache for this.
/// Number of bytes in the LDS that are being used.
unsigned LDSSize;
- // FIXME: This should probably be removed.
- /// Start of implicit kernel args
- unsigned ABIArgOffset;
-
- // Kernels + shaders. i.e. functions called by the driver and not not called
+ // Kernels + shaders. i.e. functions called by the driver and not called
// by other functions.
bool IsEntryFunction;
bool NoSignedZerosFPMath;
-public:
- AMDGPUMachineFunction(const MachineFunction &MF);
-
- uint64_t allocateKernArg(uint64_t Size, unsigned Align) {
- assert(isPowerOf2_32(Align));
- KernArgSize = alignTo(KernArgSize, Align);
+ // Function may be memory bound.
+ bool MemoryBound;
- uint64_t Result = KernArgSize;
- KernArgSize += Size;
+ // Kernel may need limited waves per EU for better performance.
+ bool WaveLimiter;
- MaxKernArgAlign = std::max(Align, MaxKernArgAlign);
- return Result;
- }
+public:
+ AMDGPUMachineFunction(const MachineFunction &MF);
- uint64_t getKernArgSize() const {
- return KernArgSize;
+ uint64_t getExplicitKernArgSize() const {
+ return ExplicitKernArgSize;
}
unsigned getMaxKernArgAlign() const {
return MaxKernArgAlign;
}
- void setABIArgOffset(unsigned NewOffset) {
- ABIArgOffset = NewOffset;
- }
-
- unsigned getABIArgOffset() const {
- return ABIArgOffset;
- }
-
unsigned getLDSSize() const {
return LDSSize;
}
@@ -78,6 +64,14 @@ public:
return NoSignedZerosFPMath;
}
+ bool isMemoryBound() const {
+ return MemoryBound;
+ }
+
+ bool needsWaveLimiter() const {
+ return WaveLimiter;
+ }
+
unsigned allocateLDSGlobal(const DataLayout &DL, const GlobalValue &GV);
};
diff --git a/lib/Target/AMDGPU/AMDGPUMachineModuleInfo.cpp b/lib/Target/AMDGPU/AMDGPUMachineModuleInfo.cpp
index 3164140abe29..7b9f673c418c 100644
--- a/lib/Target/AMDGPU/AMDGPUMachineModuleInfo.cpp
+++ b/lib/Target/AMDGPU/AMDGPUMachineModuleInfo.cpp
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
//
/// \file
-/// \brief AMDGPU Machine Module Info.
+/// AMDGPU Machine Module Info.
///
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/AMDGPU/AMDGPUMachineModuleInfo.h b/lib/Target/AMDGPU/AMDGPUMachineModuleInfo.h
index 1a728c6bd04a..1219ab26fb69 100644
--- a/lib/Target/AMDGPU/AMDGPUMachineModuleInfo.h
+++ b/lib/Target/AMDGPU/AMDGPUMachineModuleInfo.h
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
//
/// \file
-/// \brief AMDGPU Machine Module Info.
+/// AMDGPU Machine Module Info.
///
//
//===----------------------------------------------------------------------===//
@@ -30,14 +30,14 @@ private:
// All supported memory/synchronization scopes can be found here:
// http://llvm.org/docs/AMDGPUUsage.html#memory-scopes
- /// \brief Agent synchronization scope ID.
+ /// Agent synchronization scope ID.
SyncScope::ID AgentSSID;
- /// \brief Workgroup synchronization scope ID.
+ /// Workgroup synchronization scope ID.
SyncScope::ID WorkgroupSSID;
- /// \brief Wavefront synchronization scope ID.
+ /// Wavefront synchronization scope ID.
SyncScope::ID WavefrontSSID;
- /// \brief In AMDGPU target synchronization scopes are inclusive, meaning a
+ /// In AMDGPU target synchronization scopes are inclusive, meaning a
/// larger synchronization scope is inclusive of a smaller synchronization
/// scope.
///
@@ -74,7 +74,7 @@ public:
return WavefrontSSID;
}
- /// \brief In AMDGPU target synchronization scopes are inclusive, meaning a
+ /// In AMDGPU target synchronization scopes are inclusive, meaning a
/// larger synchronization scope is inclusive of a smaller synchronization
/// scope.
///
diff --git a/lib/Target/AMDGPU/AMDGPUMacroFusion.cpp b/lib/Target/AMDGPU/AMDGPUMacroFusion.cpp
index 7263ba73d155..995d9ae3907f 100644
--- a/lib/Target/AMDGPU/AMDGPUMacroFusion.cpp
+++ b/lib/Target/AMDGPU/AMDGPUMacroFusion.cpp
@@ -15,6 +15,7 @@
#include "AMDGPUMacroFusion.h"
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/CodeGen/MacroFusion.h"
@@ -22,7 +23,7 @@ using namespace llvm;
namespace {
-/// \brief Check if the instr pair, FirstMI and SecondMI, should be fused
+/// Check if the instr pair, FirstMI and SecondMI, should be fused
/// together. Given SecondMI, when FirstMI is unspecified, then check if
/// SecondMI may be part of a fused pair at all.
static bool shouldScheduleAdjacent(const TargetInstrInfo &TII_,
diff --git a/lib/Target/AMDGPU/AMDGPUOpenCLEnqueuedBlockLowering.cpp b/lib/Target/AMDGPU/AMDGPUOpenCLEnqueuedBlockLowering.cpp
index bb65636f15af..7bd8533a0ccf 100644
--- a/lib/Target/AMDGPU/AMDGPUOpenCLEnqueuedBlockLowering.cpp
+++ b/lib/Target/AMDGPU/AMDGPUOpenCLEnqueuedBlockLowering.cpp
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
//
// \file
-// \brief This post-linking pass replaces the function pointer of enqueued
+// This post-linking pass replaces the function pointer of enqueued
// block kernel with a global variable (runtime handle) and adds
// "runtime-handle" attribute to the enqueued block kernel.
//
@@ -36,7 +36,9 @@
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Mangler.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/User.h"
#include "llvm/Pass.h"
@@ -49,7 +51,7 @@ using namespace llvm;
namespace {
-/// \brief Lower enqueued blocks.
+/// Lower enqueued blocks.
class AMDGPUOpenCLEnqueuedBlockLowering : public ModulePass {
public:
static char ID;
@@ -80,49 +82,63 @@ static void collectCallers(Function *F, DenseSet<Function *> &Callers) {
for (auto U : F->users()) {
if (auto *CI = dyn_cast<CallInst>(&*U)) {
auto *Caller = CI->getParent()->getParent();
- if (Callers.count(Caller))
- continue;
- Callers.insert(Caller);
- collectCallers(Caller, Callers);
+ if (Callers.insert(Caller).second)
+ collectCallers(Caller, Callers);
}
}
}
+/// If \p U is instruction or constant, collect functions which directly or
+/// indirectly use it.
+static void collectFunctionUsers(User *U, DenseSet<Function *> &Funcs) {
+ if (auto *I = dyn_cast<Instruction>(U)) {
+ auto *F = I->getParent()->getParent();
+ if (Funcs.insert(F).second)
+ collectCallers(F, Funcs);
+ return;
+ }
+ if (!isa<Constant>(U))
+ return;
+ for (auto UU : U->users())
+ collectFunctionUsers(&*UU, Funcs);
+}
+
bool AMDGPUOpenCLEnqueuedBlockLowering::runOnModule(Module &M) {
DenseSet<Function *> Callers;
auto &C = M.getContext();
bool Changed = false;
for (auto &F : M.functions()) {
if (F.hasFnAttribute("enqueued-block")) {
- if (!F.hasOneUse() || !F.user_begin()->hasOneUse() ||
- !isa<ConstantExpr>(*F.user_begin()) ||
- !isa<ConstantExpr>(*F.user_begin()->user_begin())) {
- continue;
+ if (!F.hasName()) {
+ SmallString<64> Name;
+ Mangler::getNameWithPrefix(Name, "__amdgpu_enqueued_kernel",
+ M.getDataLayout());
+ F.setName(Name);
}
- auto *BitCast = cast<ConstantExpr>(*F.user_begin());
- auto *AddrCast = cast<ConstantExpr>(*BitCast->user_begin());
- auto RuntimeHandle = (F.getName() + "_runtime_handle").str();
+ LLVM_DEBUG(dbgs() << "found enqueued kernel: " << F.getName() << '\n');
+ auto RuntimeHandle = (F.getName() + ".runtime_handle").str();
+ auto T = ArrayType::get(Type::getInt64Ty(C), 2);
auto *GV = new GlobalVariable(
- M, Type::getInt8Ty(C)->getPointerTo(AMDGPUAS::GLOBAL_ADDRESS),
- /*IsConstant=*/true, GlobalValue::ExternalLinkage,
- /*Initializer=*/nullptr, RuntimeHandle, /*InsertBefore=*/nullptr,
- GlobalValue::NotThreadLocal, AMDGPUAS::GLOBAL_ADDRESS,
- /*IsExternallyInitialized=*/true);
- DEBUG(dbgs() << "runtime handle created: " << *GV << '\n');
- auto *NewPtr = ConstantExpr::getPointerCast(GV, AddrCast->getType());
- AddrCast->replaceAllUsesWith(NewPtr);
- F.addFnAttr("runtime-handle", RuntimeHandle);
- F.setLinkage(GlobalValue::ExternalLinkage);
-
- // Collect direct or indirect callers of enqueue_kernel.
- for (auto U : NewPtr->users()) {
- if (auto *I = dyn_cast<Instruction>(&*U)) {
- auto *F = I->getParent()->getParent();
- Callers.insert(F);
- collectCallers(F, Callers);
- }
+ M, T,
+ /*IsConstant=*/false, GlobalValue::ExternalLinkage,
+ /*Initializer=*/Constant::getNullValue(T), RuntimeHandle,
+ /*InsertBefore=*/nullptr, GlobalValue::NotThreadLocal,
+ AMDGPUAS::GLOBAL_ADDRESS,
+ /*IsExternallyInitialized=*/false);
+ LLVM_DEBUG(dbgs() << "runtime handle created: " << *GV << '\n');
+
+ for (auto U : F.users()) {
+ auto *UU = &*U;
+ if (!isa<ConstantExpr>(UU))
+ continue;
+ collectFunctionUsers(UU, Callers);
+ auto *BitCast = cast<ConstantExpr>(UU);
+ auto *NewPtr = ConstantExpr::getPointerCast(GV, BitCast->getType());
+ BitCast->replaceAllUsesWith(NewPtr);
+ F.addFnAttr("runtime-handle", RuntimeHandle);
+ F.setLinkage(GlobalValue::ExternalLinkage);
+ Changed = true;
}
- Changed = true;
}
}
@@ -130,6 +146,7 @@ bool AMDGPUOpenCLEnqueuedBlockLowering::runOnModule(Module &M) {
if (F->getCallingConv() != CallingConv::AMDGPU_KERNEL)
continue;
F->addFnAttr("calls-enqueue-kernel");
+ LLVM_DEBUG(dbgs() << "mark enqueue_kernel caller:" << F->getName() << '\n');
}
return Changed;
}
diff --git a/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp b/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp
new file mode 100644
index 000000000000..3cfdccc9fe51
--- /dev/null
+++ b/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp
@@ -0,0 +1,397 @@
+//===- AMDGPUPerfHintAnalysis.cpp - analysis of functions memory traffic --===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief Analyzes if a function is potentially memory bound and if a kernel
+/// may benefit from limiting the number of waves to reduce cache thrashing.
+///
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "AMDGPUPerfHintAnalysis.h"
+#include "Utils/AMDGPUBaseInfo.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/CodeGen/TargetLowering.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/ValueMap.h"
+#include "llvm/Support/CommandLine.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "amdgpu-perf-hint"
+
+static cl::opt<unsigned>
+ MemBoundThresh("amdgpu-membound-threshold", cl::init(50), cl::Hidden,
+ cl::desc("Function mem bound threshold in %"));
+
+static cl::opt<unsigned>
+ LimitWaveThresh("amdgpu-limit-wave-threshold", cl::init(50), cl::Hidden,
+ cl::desc("Kernel limit wave threshold in %"));
+
+static cl::opt<unsigned>
+ IAWeight("amdgpu-indirect-access-weight", cl::init(1000), cl::Hidden,
+ cl::desc("Indirect access memory instruction weight"));
+
+static cl::opt<unsigned>
+ LSWeight("amdgpu-large-stride-weight", cl::init(1000), cl::Hidden,
+ cl::desc("Large stride memory access weight"));
+
+static cl::opt<unsigned>
+ LargeStrideThresh("amdgpu-large-stride-threshold", cl::init(64), cl::Hidden,
+ cl::desc("Large stride memory access threshold"));
+
+STATISTIC(NumMemBound, "Number of functions marked as memory bound");
+STATISTIC(NumLimitWave, "Number of functions marked as needing limit wave");
+
+char llvm::AMDGPUPerfHintAnalysis::ID = 0;
+char &llvm::AMDGPUPerfHintAnalysisID = AMDGPUPerfHintAnalysis::ID;
+
+INITIALIZE_PASS(AMDGPUPerfHintAnalysis, DEBUG_TYPE,
+ "Analysis if a function is memory bound", true, true)
+
+namespace {
+
+struct AMDGPUPerfHint {
+ friend AMDGPUPerfHintAnalysis;
+
+public:
+ AMDGPUPerfHint(AMDGPUPerfHintAnalysis::FuncInfoMap &FIM_,
+ const TargetLowering *TLI_)
+ : FIM(FIM_), DL(nullptr), TLI(TLI_) {}
+
+ void runOnFunction(Function &F);
+
+private:
+ struct MemAccessInfo {
+ const Value *V;
+ const Value *Base;
+ int64_t Offset;
+ MemAccessInfo() : V(nullptr), Base(nullptr), Offset(0) {}
+ bool isLargeStride(MemAccessInfo &Reference) const;
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+ Printable print() const {
+ return Printable([this](raw_ostream &OS) {
+ OS << "Value: " << *V << '\n'
+ << "Base: " << *Base << " Offset: " << Offset << '\n';
+ });
+ }
+#endif
+ };
+
+ MemAccessInfo makeMemAccessInfo(Instruction *) const;
+
+ MemAccessInfo LastAccess; // Last memory access info
+
+ AMDGPUPerfHintAnalysis::FuncInfoMap &FIM;
+
+ const DataLayout *DL;
+
+ AMDGPUAS AS;
+
+ const TargetLowering *TLI;
+
+ void visit(const Function &F);
+ static bool isMemBound(const AMDGPUPerfHintAnalysis::FuncInfo &F);
+ static bool needLimitWave(const AMDGPUPerfHintAnalysis::FuncInfo &F);
+
+ bool isIndirectAccess(const Instruction *Inst) const;
+
+ /// Check if the instruction is large stride.
+ /// The purpose is to identify memory access pattern like:
+ /// x = a[i];
+ /// y = a[i+1000];
+ /// z = a[i+2000];
+ /// In the above example, the second and third memory access will be marked
+ /// large stride memory access.
+ bool isLargeStride(const Instruction *Inst);
+
+ bool isGlobalAddr(const Value *V) const;
+ bool isLocalAddr(const Value *V) const;
+ bool isConstantAddr(const Value *V) const;
+};
+
+static const Value *getMemoryInstrPtr(const Instruction *Inst) {
+ if (auto LI = dyn_cast<LoadInst>(Inst)) {
+ return LI->getPointerOperand();
+ }
+ if (auto SI = dyn_cast<StoreInst>(Inst)) {
+ return SI->getPointerOperand();
+ }
+ if (auto AI = dyn_cast<AtomicCmpXchgInst>(Inst)) {
+ return AI->getPointerOperand();
+ }
+ if (auto AI = dyn_cast<AtomicRMWInst>(Inst)) {
+ return AI->getPointerOperand();
+ }
+ if (auto MI = dyn_cast<AnyMemIntrinsic>(Inst)) {
+ return MI->getRawDest();
+ }
+
+ return nullptr;
+}
+
+bool AMDGPUPerfHint::isIndirectAccess(const Instruction *Inst) const {
+ LLVM_DEBUG(dbgs() << "[isIndirectAccess] " << *Inst << '\n');
+ SmallSet<const Value *, 32> WorkSet;
+ SmallSet<const Value *, 32> Visited;
+ if (const Value *MO = getMemoryInstrPtr(Inst)) {
+ if (isGlobalAddr(MO))
+ WorkSet.insert(MO);
+ }
+
+ while (!WorkSet.empty()) {
+ const Value *V = *WorkSet.begin();
+ WorkSet.erase(*WorkSet.begin());
+ if (!Visited.insert(V).second)
+ continue;
+ LLVM_DEBUG(dbgs() << " check: " << *V << '\n');
+
+ if (auto LD = dyn_cast<LoadInst>(V)) {
+ auto M = LD->getPointerOperand();
+ if (isGlobalAddr(M) || isLocalAddr(M) || isConstantAddr(M)) {
+ LLVM_DEBUG(dbgs() << " is IA\n");
+ return true;
+ }
+ continue;
+ }
+
+ if (auto GEP = dyn_cast<GetElementPtrInst>(V)) {
+ auto P = GEP->getPointerOperand();
+ WorkSet.insert(P);
+ for (unsigned I = 1, E = GEP->getNumIndices() + 1; I != E; ++I)
+ WorkSet.insert(GEP->getOperand(I));
+ continue;
+ }
+
+ if (auto U = dyn_cast<UnaryInstruction>(V)) {
+ WorkSet.insert(U->getOperand(0));
+ continue;
+ }
+
+ if (auto BO = dyn_cast<BinaryOperator>(V)) {
+ WorkSet.insert(BO->getOperand(0));
+ WorkSet.insert(BO->getOperand(1));
+ continue;
+ }
+
+ if (auto S = dyn_cast<SelectInst>(V)) {
+ WorkSet.insert(S->getFalseValue());
+ WorkSet.insert(S->getTrueValue());
+ continue;
+ }
+
+ if (auto E = dyn_cast<ExtractElementInst>(V)) {
+ WorkSet.insert(E->getVectorOperand());
+ continue;
+ }
+
+ LLVM_DEBUG(dbgs() << " dropped\n");
+ }
+
+ LLVM_DEBUG(dbgs() << " is not IA\n");
+ return false;
+}
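For intuition, here is a standalone C++ sketch (not part of the patch) of the access shape isIndirectAccess is meant to flag: the pointer feeding the final load is itself loaded from memory, so the hardware must chase a pointer before the real access can issue.

#include <cstdio>

int main() {
  static int Data[4] = {1, 2, 3, 4};
  static int *Table[1] = {Data};
  int *P = Table[0];  // first load: fetch a pointer out of memory
  int V = *P;         // second load: indirect access through the just-loaded pointer
  std::printf("%d\n", V);
  return 0;
}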
+
+void AMDGPUPerfHint::visit(const Function &F) {
+ auto FIP = FIM.insert(std::make_pair(&F, AMDGPUPerfHintAnalysis::FuncInfo()));
+ if (!FIP.second)
+ return;
+
+ AMDGPUPerfHintAnalysis::FuncInfo &FI = FIP.first->second;
+
+ LLVM_DEBUG(dbgs() << "[AMDGPUPerfHint] process " << F.getName() << '\n');
+
+ for (auto &B : F) {
+ LastAccess = MemAccessInfo();
+ for (auto &I : B) {
+ if (getMemoryInstrPtr(&I)) {
+ if (isIndirectAccess(&I))
+ ++FI.IAMInstCount;
+ if (isLargeStride(&I))
+ ++FI.LSMInstCount;
+ ++FI.MemInstCount;
+ ++FI.InstCount;
+ continue;
+ }
+ CallSite CS(const_cast<Instruction *>(&I));
+ if (CS) {
+ Function *Callee = CS.getCalledFunction();
+ if (!Callee || Callee->isDeclaration()) {
+ ++FI.InstCount;
+ continue;
+ }
+ if (&F == Callee) // Handle immediate recursion
+ continue;
+
+ visit(*Callee);
+ auto Loc = FIM.find(Callee);
+
+ assert(Loc != FIM.end() && "No func info");
+ FI.MemInstCount += Loc->second.MemInstCount;
+ FI.InstCount += Loc->second.InstCount;
+ FI.IAMInstCount += Loc->second.IAMInstCount;
+ FI.LSMInstCount += Loc->second.LSMInstCount;
+ } else if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) {
+ TargetLoweringBase::AddrMode AM;
+ auto *Ptr = GetPointerBaseWithConstantOffset(GEP, AM.BaseOffs, *DL);
+ AM.BaseGV = dyn_cast_or_null<GlobalValue>(const_cast<Value *>(Ptr));
+ AM.HasBaseReg = !AM.BaseGV;
+ if (TLI->isLegalAddressingMode(*DL, AM, GEP->getResultElementType(),
+ GEP->getPointerAddressSpace()))
+ // Offset will likely be folded into load or store
+ continue;
+ ++FI.InstCount;
+ } else {
+ ++FI.InstCount;
+ }
+ }
+ }
+}
+
+void AMDGPUPerfHint::runOnFunction(Function &F) {
+ if (FIM.find(&F) != FIM.end())
+ return;
+
+ const Module &M = *F.getParent();
+ DL = &M.getDataLayout();
+ AS = AMDGPU::getAMDGPUAS(M);
+
+ visit(F);
+ auto Loc = FIM.find(&F);
+
+ assert(Loc != FIM.end() && "No func info");
+ LLVM_DEBUG(dbgs() << F.getName() << " MemInst: " << Loc->second.MemInstCount
+ << '\n'
+ << " IAMInst: " << Loc->second.IAMInstCount << '\n'
+ << " LSMInst: " << Loc->second.LSMInstCount << '\n'
+ << " TotalInst: " << Loc->second.InstCount << '\n');
+
+ auto &FI = Loc->second;
+
+ if (isMemBound(FI)) {
+ LLVM_DEBUG(dbgs() << F.getName() << " is memory bound\n");
+ NumMemBound++;
+ }
+
+ if (AMDGPU::isEntryFunctionCC(F.getCallingConv()) && needLimitWave(FI)) {
+ LLVM_DEBUG(dbgs() << F.getName() << " needs limit wave\n");
+ NumLimitWave++;
+ }
+}
+
+bool AMDGPUPerfHint::isMemBound(const AMDGPUPerfHintAnalysis::FuncInfo &FI) {
+ return FI.MemInstCount * 100 / FI.InstCount > MemBoundThresh;
+}
+
+bool AMDGPUPerfHint::needLimitWave(const AMDGPUPerfHintAnalysis::FuncInfo &FI) {
+ return ((FI.MemInstCount + FI.IAMInstCount * IAWeight +
+ FI.LSMInstCount * LSWeight) *
+ 100 / FI.InstCount) > LimitWaveThresh;
+}
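A worked example of the two heuristics above, using the default option values (MemBoundThresh = 50, LimitWaveThresh = 50, IAWeight = LSWeight = 1000). The instruction counts are invented for illustration; this is a standalone sketch, not code from the patch.

#include <cstdio>

int main() {
  // Hypothetical per-function counts gathered by the visitor.
  unsigned MemInstCount = 12, InstCount = 40, IAMInstCount = 1, LSMInstCount = 0;
  bool MemBound = MemInstCount * 100 / InstCount > 50;            // 30% -> false
  bool LimitWave = (MemInstCount + IAMInstCount * 1000 +
                    LSMInstCount * 1000) * 100 / InstCount > 50;  // 2530 -> true
  std::printf("MemBound=%d LimitWave=%d\n", MemBound, LimitWave);
  return 0;
}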
+
+bool AMDGPUPerfHint::isGlobalAddr(const Value *V) const {
+ if (auto PT = dyn_cast<PointerType>(V->getType())) {
+ unsigned As = PT->getAddressSpace();
+ // Flat likely points to global too.
+ return As == AS.GLOBAL_ADDRESS || As == AS.FLAT_ADDRESS;
+ }
+ return false;
+}
+
+bool AMDGPUPerfHint::isLocalAddr(const Value *V) const {
+ if (auto PT = dyn_cast<PointerType>(V->getType()))
+ return PT->getAddressSpace() == AS.LOCAL_ADDRESS;
+ return false;
+}
+
+bool AMDGPUPerfHint::isLargeStride(const Instruction *Inst) {
+ LLVM_DEBUG(dbgs() << "[isLargeStride] " << *Inst << '\n');
+
+ MemAccessInfo MAI = makeMemAccessInfo(const_cast<Instruction *>(Inst));
+ bool IsLargeStride = MAI.isLargeStride(LastAccess);
+ if (MAI.Base)
+ LastAccess = std::move(MAI);
+
+ return IsLargeStride;
+}
+
+AMDGPUPerfHint::MemAccessInfo
+AMDGPUPerfHint::makeMemAccessInfo(Instruction *Inst) const {
+ MemAccessInfo MAI;
+ const Value *MO = getMemoryInstrPtr(Inst);
+
+ LLVM_DEBUG(dbgs() << "[isLargeStride] MO: " << *MO << '\n');
+ // Do not treat local-addr memory access as large stride.
+ if (isLocalAddr(MO))
+ return MAI;
+
+ MAI.V = MO;
+ MAI.Base = GetPointerBaseWithConstantOffset(MO, MAI.Offset, *DL);
+ return MAI;
+}
+
+bool AMDGPUPerfHint::isConstantAddr(const Value *V) const {
+ if (auto PT = dyn_cast<PointerType>(V->getType())) {
+ unsigned As = PT->getAddressSpace();
+ return As == AS.CONSTANT_ADDRESS || As == AS.CONSTANT_ADDRESS_32BIT;
+ }
+ return false;
+}
+
+bool AMDGPUPerfHint::MemAccessInfo::isLargeStride(
+ MemAccessInfo &Reference) const {
+
+ if (!Base || !Reference.Base || Base != Reference.Base)
+ return false;
+
+ uint64_t Diff = Offset > Reference.Offset ? Offset - Reference.Offset
+ : Reference.Offset - Offset;
+ bool Result = Diff > LargeStrideThresh;
+ LLVM_DEBUG(dbgs() << "[isLargeStride compare]\n"
+ << print() << "<=>\n"
+ << Reference.print() << "Result:" << Result << '\n');
+ return Result;
+}
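A worked example of the comparison above, using the default amdgpu-large-stride-threshold of 64 bytes; the offsets are invented, and both accesses are assumed to share the same base, which the code checks before comparing offsets at all.

#include <cstdint>
#include <cstdio>

int main() {
  int64_t PrevOffset = 0, CurOffset = 256;  // e.g. a[i] followed by a[i + 64] for an i32 array
  uint64_t Diff = CurOffset > PrevOffset ? CurOffset - PrevOffset
                                         : PrevOffset - CurOffset;
  std::printf("large stride: %d\n", Diff > 64);  // 256 > 64 -> prints 1
  return 0;
}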
+} // namespace
+
+bool AMDGPUPerfHintAnalysis::runOnFunction(Function &F) {
+ auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
+ if (!TPC)
+ return false;
+
+ const TargetMachine &TM = TPC->getTM<TargetMachine>();
+ const TargetSubtargetInfo *ST = TM.getSubtargetImpl(F);
+
+ AMDGPUPerfHint Analyzer(FIM, ST->getTargetLowering());
+ Analyzer.runOnFunction(F);
+ return false;
+}
+
+bool AMDGPUPerfHintAnalysis::isMemoryBound(const Function *F) const {
+ auto FI = FIM.find(F);
+ if (FI == FIM.end())
+ return false;
+
+ return AMDGPUPerfHint::isMemBound(FI->second);
+}
+
+bool AMDGPUPerfHintAnalysis::needsWaveLimiter(const Function *F) const {
+ auto FI = FIM.find(F);
+ if (FI == FIM.end())
+ return false;
+
+ return AMDGPUPerfHint::needLimitWave(FI->second);
+}
diff --git a/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.h b/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.h
new file mode 100644
index 000000000000..be7f37cb6815
--- /dev/null
+++ b/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.h
@@ -0,0 +1,55 @@
+//===- AMDGPUPerfHintAnalysis.h - analysis of functions memory traffic ----===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief Analyzes if a function is potentially memory bound and if a kernel
+/// may benefit from limiting the number of waves to reduce cache thrashing.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_MDGPUPERFHINTANALYSIS_H
+#define LLVM_LIB_TARGET_AMDGPU_MDGPUPERFHINTANALYSIS_H
+#include "llvm/IR/ValueMap.h"
+#include "llvm/Pass.h"
+
+namespace llvm {
+
+struct AMDGPUPerfHintAnalysis : public FunctionPass {
+ static char ID;
+
+public:
+ AMDGPUPerfHintAnalysis() : FunctionPass(ID) {}
+
+ bool runOnFunction(Function &F) override;
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesAll();
+ }
+
+ bool isMemoryBound(const Function *F) const;
+
+ bool needsWaveLimiter(const Function *F) const;
+
+ struct FuncInfo {
+ unsigned MemInstCount;
+ unsigned InstCount;
+ unsigned IAMInstCount; // Indirect access memory instruction count
+ unsigned LSMInstCount; // Large stride memory instruction count
+ FuncInfo() : MemInstCount(0), InstCount(0), IAMInstCount(0),
+ LSMInstCount(0) {}
+ };
+
+ typedef ValueMap<const Function*, FuncInfo> FuncInfoMap;
+
+private:
+
+ FuncInfoMap FIM;
+};
+} // namespace llvm
+#endif // LLVM_LIB_TARGET_AMDGPU_MDGPUPERFHINTANALYSIS_H
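A hedged usage sketch for the legacy pass manager: MyPass and its body are invented here, but addRequired/getAnalysis are the standard FunctionPass hooks and the two query methods are the ones declared above.

#include "AMDGPUPerfHintAnalysis.h"  // target-internal header (assumption)
#include "llvm/Pass.h"

namespace {
// Hypothetical client pass that consumes the analysis results.
struct MyPass : public llvm::FunctionPass {
  static char ID;
  MyPass() : llvm::FunctionPass(ID) {}

  void getAnalysisUsage(llvm::AnalysisUsage &AU) const override {
    AU.addRequired<llvm::AMDGPUPerfHintAnalysis>();
    AU.setPreservesAll();
  }

  bool runOnFunction(llvm::Function &F) override {
    auto &PHA = getAnalysis<llvm::AMDGPUPerfHintAnalysis>();
    if (PHA.isMemoryBound(&F)) {
      // e.g. bias later scheduling decisions toward latency hiding
    }
    return false;
  }
};
char MyPass::ID = 0;
} // namespace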
diff --git a/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
index 41876ed45c8c..d341fec6296f 100644
--- a/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ b/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -65,6 +65,11 @@ using namespace llvm;
namespace {
+static cl::opt<bool> DisablePromoteAllocaToVector(
+ "disable-promote-alloca-to-vector",
+ cl::desc("Disable promote alloca to vector"),
+ cl::init(false));
+
// FIXME: This can create globals so should be a module pass.
class AMDGPUPromoteAlloca : public FunctionPass {
private:
@@ -147,7 +152,7 @@ bool AMDGPUPromoteAlloca::runOnFunction(Function &F) {
IsAMDGCN = TT.getArch() == Triple::amdgcn;
IsAMDHSA = TT.getOS() == Triple::AMDHSA;
- const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>(F);
+ const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(*TM, F);
if (!ST.isPromoteAllocaEnabled())
return false;
@@ -169,8 +174,8 @@ bool AMDGPUPromoteAlloca::runOnFunction(Function &F) {
std::pair<Value *, Value *>
AMDGPUPromoteAlloca::getLocalSizeYZ(IRBuilder<> &Builder) {
- const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>(
- *Builder.GetInsertBlock()->getParent());
+ const Function &F = *Builder.GetInsertBlock()->getParent();
+ const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(*TM, F);
if (!IsAMDHSA) {
Function *LocalSizeYFn
@@ -256,8 +261,8 @@ AMDGPUPromoteAlloca::getLocalSizeYZ(IRBuilder<> &Builder) {
}
Value *AMDGPUPromoteAlloca::getWorkitemID(IRBuilder<> &Builder, unsigned N) {
- const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>(
- *Builder.GetInsertBlock()->getParent());
+ const AMDGPUSubtarget &ST =
+ AMDGPUSubtarget::get(*TM, *Builder.GetInsertBlock()->getParent());
Intrinsic::ID IntrID = Intrinsic::ID::not_intrinsic;
switch (N) {
@@ -318,18 +323,19 @@ static Value* GEPToVectorIndex(GetElementPtrInst *GEP) {
static bool canVectorizeInst(Instruction *Inst, User *User) {
switch (Inst->getOpcode()) {
case Instruction::Load: {
+ // Currently only handle the case where the Pointer Operand is a GEP.
+    // Also, we cannot vectorize volatile or atomic loads.
LoadInst *LI = cast<LoadInst>(Inst);
- // Currently only handle the case where the Pointer Operand is a GEP so check for that case.
- return isa<GetElementPtrInst>(LI->getPointerOperand()) && !LI->isVolatile();
+ return isa<GetElementPtrInst>(LI->getPointerOperand()) && LI->isSimple();
}
case Instruction::BitCast:
- case Instruction::AddrSpaceCast:
return true;
case Instruction::Store: {
// Must be the stored pointer operand, not a stored value, plus
// since it should be canonical form, the User should be a GEP.
+    // Also, we cannot vectorize volatile or atomic stores.
StoreInst *SI = cast<StoreInst>(Inst);
- return (SI->getPointerOperand() == User) && isa<GetElementPtrInst>(User) && !SI->isVolatile();
+ return (SI->getPointerOperand() == User) && isa<GetElementPtrInst>(User) && SI->isSimple();
}
default:
return false;
@@ -337,19 +343,25 @@ static bool canVectorizeInst(Instruction *Inst, User *User) {
}
static bool tryPromoteAllocaToVector(AllocaInst *Alloca, AMDGPUAS AS) {
+
+ if (DisablePromoteAllocaToVector) {
+ LLVM_DEBUG(dbgs() << " Promotion alloca to vector is disabled\n");
+ return false;
+ }
+
ArrayType *AllocaTy = dyn_cast<ArrayType>(Alloca->getAllocatedType());
- DEBUG(dbgs() << "Alloca candidate for vectorization\n");
+ LLVM_DEBUG(dbgs() << "Alloca candidate for vectorization\n");
// FIXME: There is no reason why we can't support larger arrays, we
// are just being conservative for now.
// FIXME: We also reject alloca's of the form [ 2 x [ 2 x i32 ]] or equivalent. Potentially these
// could also be promoted but we don't currently handle this case
if (!AllocaTy ||
- AllocaTy->getNumElements() > 4 ||
+ AllocaTy->getNumElements() > 16 ||
AllocaTy->getNumElements() < 2 ||
!VectorType::isValidElementType(AllocaTy->getElementType())) {
- DEBUG(dbgs() << " Cannot convert type to vector\n");
+ LLVM_DEBUG(dbgs() << " Cannot convert type to vector\n");
return false;
}
@@ -370,7 +382,8 @@ static bool tryPromoteAllocaToVector(AllocaInst *Alloca, AMDGPUAS AS) {
// If we can't compute a vector index from this GEP, then we can't
// promote this alloca to vector.
if (!Index) {
- DEBUG(dbgs() << " Cannot compute vector index for GEP " << *GEP << '\n');
+ LLVM_DEBUG(dbgs() << " Cannot compute vector index for GEP " << *GEP
+ << '\n');
return false;
}
@@ -385,8 +398,8 @@ static bool tryPromoteAllocaToVector(AllocaInst *Alloca, AMDGPUAS AS) {
VectorType *VectorTy = arrayTypeToVecType(AllocaTy);
- DEBUG(dbgs() << " Converting alloca to vector "
- << *AllocaTy << " -> " << *VectorTy << '\n');
+ LLVM_DEBUG(dbgs() << " Converting alloca to vector " << *AllocaTy << " -> "
+ << *VectorTy << '\n');
for (Value *V : WorkList) {
Instruction *Inst = cast<Instruction>(V);
@@ -443,7 +456,8 @@ static bool isCallPromotable(CallInst *CI) {
case Intrinsic::lifetime_end:
case Intrinsic::invariant_start:
case Intrinsic::invariant_end:
- case Intrinsic::invariant_group_barrier:
+ case Intrinsic::launder_invariant_group:
+ case Intrinsic::strip_invariant_group:
case Intrinsic::objectsize:
return true;
default:
@@ -475,7 +489,8 @@ bool AMDGPUPromoteAlloca::binaryOpIsDerivedFromSameAlloca(Value *BaseAlloca,
// important part is both must have the same address space at
// the end.
if (OtherObj != BaseAlloca) {
- DEBUG(dbgs() << "Found a binary instruction with another alloca object\n");
+ LLVM_DEBUG(
+ dbgs() << "Found a binary instruction with another alloca object\n");
return false;
}
@@ -588,7 +603,7 @@ bool AMDGPUPromoteAlloca::collectUsesWithPtrTypes(
bool AMDGPUPromoteAlloca::hasSufficientLocalMem(const Function &F) {
FunctionType *FTy = F.getFunctionType();
- const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>(F);
+ const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(*TM, F);
// If the function has any arguments in the local address space, then it's
// possible these arguments require the entire local memory space, so
@@ -597,8 +612,8 @@ bool AMDGPUPromoteAlloca::hasSufficientLocalMem(const Function &F) {
PointerType *PtrTy = dyn_cast<PointerType>(ParamTy);
if (PtrTy && PtrTy->getAddressSpace() == AS.LOCAL_ADDRESS) {
LocalMemLimit = 0;
- DEBUG(dbgs() << "Function has local memory argument. Promoting to "
- "local memory disabled.\n");
+ LLVM_DEBUG(dbgs() << "Function has local memory argument. Promoting to "
+ "local memory disabled.\n");
return false;
}
}
@@ -667,13 +682,12 @@ bool AMDGPUPromoteAlloca::hasSufficientLocalMem(const Function &F) {
LocalMemLimit = MaxSizeWithWaveCount;
- DEBUG(
- dbgs() << F.getName() << " uses " << CurrentLocalMemUsage << " bytes of LDS\n"
- << " Rounding size to " << MaxSizeWithWaveCount
- << " with a maximum occupancy of " << MaxOccupancy << '\n'
- << " and " << (LocalMemLimit - CurrentLocalMemUsage)
- << " available for promotion\n"
- );
+ LLVM_DEBUG(dbgs() << F.getName() << " uses " << CurrentLocalMemUsage
+ << " bytes of LDS\n"
+ << " Rounding size to " << MaxSizeWithWaveCount
+ << " with a maximum occupancy of " << MaxOccupancy << '\n'
+ << " and " << (LocalMemLimit - CurrentLocalMemUsage)
+ << " available for promotion\n");
return true;
}
@@ -690,7 +704,7 @@ bool AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I, bool SufficientLDS) {
// First try to replace the alloca with a vector
Type *AllocaTy = I.getAllocatedType();
- DEBUG(dbgs() << "Trying to promote " << I << '\n');
+ LLVM_DEBUG(dbgs() << "Trying to promote " << I << '\n');
if (tryPromoteAllocaToVector(&I, AS))
return true; // Promoted to vector.
@@ -706,7 +720,9 @@ bool AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I, bool SufficientLDS) {
case CallingConv::SPIR_KERNEL:
break;
default:
- DEBUG(dbgs() << " promote alloca to LDS not supported with calling convention.\n");
+ LLVM_DEBUG(
+ dbgs()
+ << " promote alloca to LDS not supported with calling convention.\n");
return false;
}
@@ -714,8 +730,7 @@ bool AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I, bool SufficientLDS) {
if (!SufficientLDS)
return false;
- const AMDGPUSubtarget &ST =
- TM->getSubtarget<AMDGPUSubtarget>(ContainingFunction);
+ const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(*TM, ContainingFunction);
unsigned WorkGroupSize = ST.getFlatWorkGroupSizes(ContainingFunction).second;
const DataLayout &DL = Mod->getDataLayout();
@@ -735,8 +750,8 @@ bool AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I, bool SufficientLDS) {
NewSize += AllocSize;
if (NewSize > LocalMemLimit) {
- DEBUG(dbgs() << " " << AllocSize
- << " bytes of local memory not available to promote\n");
+ LLVM_DEBUG(dbgs() << " " << AllocSize
+ << " bytes of local memory not available to promote\n");
return false;
}
@@ -745,11 +760,11 @@ bool AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I, bool SufficientLDS) {
std::vector<Value*> WorkList;
if (!collectUsesWithPtrTypes(&I, &I, WorkList)) {
- DEBUG(dbgs() << " Do not know how to convert all uses\n");
+ LLVM_DEBUG(dbgs() << " Do not know how to convert all uses\n");
return false;
}
- DEBUG(dbgs() << "Promoting alloca to local memory\n");
+ LLVM_DEBUG(dbgs() << "Promoting alloca to local memory\n");
Function *F = I.getParent()->getParent();
@@ -843,31 +858,32 @@ bool AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I, bool SufficientLDS) {
continue;
case Intrinsic::memcpy: {
MemCpyInst *MemCpy = cast<MemCpyInst>(Intr);
- Builder.CreateMemCpy(MemCpy->getRawDest(), MemCpy->getRawSource(),
- MemCpy->getLength(), MemCpy->getAlignment(),
- MemCpy->isVolatile());
+ Builder.CreateMemCpy(MemCpy->getRawDest(), MemCpy->getDestAlignment(),
+ MemCpy->getRawSource(), MemCpy->getSourceAlignment(),
+ MemCpy->getLength(), MemCpy->isVolatile());
Intr->eraseFromParent();
continue;
}
case Intrinsic::memmove: {
MemMoveInst *MemMove = cast<MemMoveInst>(Intr);
- Builder.CreateMemMove(MemMove->getRawDest(), MemMove->getRawSource(),
- MemMove->getLength(), MemMove->getAlignment(),
- MemMove->isVolatile());
+ Builder.CreateMemMove(MemMove->getRawDest(), MemMove->getDestAlignment(),
+ MemMove->getRawSource(), MemMove->getSourceAlignment(),
+ MemMove->getLength(), MemMove->isVolatile());
Intr->eraseFromParent();
continue;
}
case Intrinsic::memset: {
MemSetInst *MemSet = cast<MemSetInst>(Intr);
Builder.CreateMemSet(MemSet->getRawDest(), MemSet->getValue(),
- MemSet->getLength(), MemSet->getAlignment(),
+ MemSet->getLength(), MemSet->getDestAlignment(),
MemSet->isVolatile());
Intr->eraseFromParent();
continue;
}
case Intrinsic::invariant_start:
case Intrinsic::invariant_end:
- case Intrinsic::invariant_group_barrier:
+ case Intrinsic::launder_invariant_group:
+ case Intrinsic::strip_invariant_group:
Intr->eraseFromParent();
// FIXME: I think the invariant marker should still theoretically apply,
// but the intrinsics need to be changed to accept pointers with any
diff --git a/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index 1ed02fae085a..012e4fe200aa 100644
--- a/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -14,7 +14,9 @@
#include "AMDGPURegisterBankInfo.h"
#include "AMDGPUInstrInfo.h"
+#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/CodeGen/GlobalISel/RegisterBank.h"
#include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
@@ -50,10 +52,38 @@ AMDGPURegisterBankInfo::AMDGPURegisterBankInfo(const TargetRegisterInfo &TRI)
}
-unsigned AMDGPURegisterBankInfo::copyCost(const RegisterBank &A,
- const RegisterBank &B,
- unsigned Size) const {
- return RegisterBankInfo::copyCost(A, B, Size);
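+// Look through COPYs to find a G_CONSTANT defining \p MO; if one is found,
+// return its value in \p C.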
+static bool isConstant(const MachineOperand &MO, int64_t &C) {
+ const MachineFunction *MF = MO.getParent()->getParent()->getParent();
+ const MachineRegisterInfo &MRI = MF->getRegInfo();
+ const MachineInstr *Def = MRI.getVRegDef(MO.getReg());
+ if (!Def)
+ return false;
+
+ if (Def->getOpcode() == AMDGPU::G_CONSTANT) {
+ C = Def->getOperand(1).getCImm()->getSExtValue();
+ return true;
+ }
+
+ if (Def->getOpcode() == AMDGPU::COPY)
+ return isConstant(Def->getOperand(1), C);
+
+ return false;
+}
+
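+// Make copies from VGPR to SGPR (and 1-bit copies from SGPR to SCC)
+// prohibitively expensive so RegBankSelect never chooses a mapping that
+// would introduce them.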
+unsigned AMDGPURegisterBankInfo::copyCost(const RegisterBank &Dst,
+ const RegisterBank &Src,
+ unsigned Size) const {
+ if (Dst.getID() == AMDGPU::SGPRRegBankID &&
+ Src.getID() == AMDGPU::VGPRRegBankID)
+ return std::numeric_limits<unsigned>::max();
+
+ // SGPRRegBank with size 1 is actually vcc or another 64-bit sgpr written by
+ // the valu.
+ if (Size == 1 && Dst.getID() == AMDGPU::SCCRegBankID &&
+ Src.getID() == AMDGPU::SGPRRegBankID)
+ return std::numeric_limits<unsigned>::max();
+
+ return RegisterBankInfo::copyCost(Dst, Src, Size);
}
const RegisterBank &AMDGPURegisterBankInfo::getRegBankFromRegClass(
@@ -72,11 +102,11 @@ AMDGPURegisterBankInfo::getInstrAlternativeMappings(
const MachineFunction &MF = *MI.getParent()->getParent();
const MachineRegisterInfo &MRI = MF.getRegInfo();
- unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
InstructionMappings AltMappings;
switch (MI.getOpcode()) {
case TargetOpcode::G_LOAD: {
+ unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
// FIXME: Should we be hard coding the size for these mappings?
const InstructionMapping &SSMapping = getInstructionMapping(
1, 1, getOperandsMapping(
@@ -104,6 +134,42 @@ AMDGPURegisterBankInfo::getInstrAlternativeMappings(
return AltMappings;
}
+ case TargetOpcode::G_ICMP: {
+ unsigned Size = getSizeInBits(MI.getOperand(2).getReg(), MRI, *TRI);
+ const InstructionMapping &SSMapping = getInstructionMapping(1, 1,
+ getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::SCCRegBankID, 1),
+ nullptr, // Predicate operand.
+ AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
+ AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}),
+ 4); // Num Operands
+ AltMappings.push_back(&SSMapping);
+
+ const InstructionMapping &SVMapping = getInstructionMapping(2, 1,
+ getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1),
+ nullptr, // Predicate operand.
+ AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
+ AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size)}),
+ 4); // Num Operands
+ AltMappings.push_back(&SVMapping);
+
+ const InstructionMapping &VSMapping = getInstructionMapping(3, 1,
+ getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1),
+ nullptr, // Predicate operand.
+ AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
+ AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}),
+ 4); // Num Operands
+ AltMappings.push_back(&VSMapping);
+
+ const InstructionMapping &VVMapping = getInstructionMapping(4, 1,
+ getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1),
+ nullptr, // Predicate operand.
+ AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
+ AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size)}),
+ 4); // Num Operands
+ AltMappings.push_back(&VVMapping);
+
+ return AltMappings;
+ }
default:
break;
}
@@ -120,7 +186,60 @@ static bool isInstrUniform(const MachineInstr &MI) {
return false;
const MachineMemOperand *MMO = *MI.memoperands_begin();
- return AMDGPU::isUniformMMO(MMO);
+ return AMDGPUInstrInfo::isUniformMMO(MMO);
+}
+
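+// Returns true if no register operand of \p MI has already been assigned to
+// a bank other than SGPR, i.e. a scalar (SALU) mapping is possible.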
+bool AMDGPURegisterBankInfo::isSALUMapping(const MachineInstr &MI) const {
+ const MachineFunction &MF = *MI.getParent()->getParent();
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
+ for (unsigned i = 0, e = MI.getNumOperands();i != e; ++i) {
+ unsigned Reg = MI.getOperand(i).getReg();
+ const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI);
+ if (Bank && Bank->getID() != AMDGPU::SGPRRegBankID)
+ return false;
+ }
+ return true;
+}
+
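+// Map every register operand of \p MI to the SGPR bank at its full size.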
+const RegisterBankInfo::InstructionMapping &
+AMDGPURegisterBankInfo::getDefaultMappingSOP(const MachineInstr &MI) const {
+ const MachineFunction &MF = *MI.getParent()->getParent();
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
+ SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
+
+ for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
+ unsigned Size = getSizeInBits(MI.getOperand(i).getReg(), MRI, *TRI);
+ OpdsMapping[i] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
+ }
+ return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping),
+ MI.getNumOperands());
+}
+
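+// Map the result to VGPR, leave the first source register operand in whatever
+// bank it already uses (VALU instructions can typically read one SGPR source
+// directly), and force the remaining sources to VGPR.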
+const RegisterBankInfo::InstructionMapping &
+AMDGPURegisterBankInfo::getDefaultMappingVOP(const MachineInstr &MI) const {
+ const MachineFunction &MF = *MI.getParent()->getParent();
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
+ SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
+ unsigned OpdIdx = 0;
+
+ unsigned Size0 = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
+ OpdsMapping[OpdIdx++] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size0);
+
+ if (MI.getOperand(OpdIdx).isIntrinsicID())
+ OpdsMapping[OpdIdx++] = nullptr;
+
+ unsigned Reg1 = MI.getOperand(OpdIdx).getReg();
+ unsigned Size1 = getSizeInBits(Reg1, MRI, *TRI);
+ unsigned Bank1 = getRegBankID(Reg1, MRI, *TRI);
+ OpdsMapping[OpdIdx++] = AMDGPU::getValueMapping(Bank1, Size1);
+
+ for (unsigned e = MI.getNumOperands(); OpdIdx != e; ++OpdIdx) {
+ unsigned Size = getSizeInBits(MI.getOperand(OpdIdx).getReg(), MRI, *TRI);
+ OpdsMapping[OpdIdx] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
+ }
+
+ return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping),
+ MI.getNumOperands());
}
const RegisterBankInfo::InstructionMapping &
@@ -155,6 +274,22 @@ AMDGPURegisterBankInfo::getInstrMappingForLoad(const MachineInstr &MI) const {
// handle that during instruction selection?
}
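+/// \returns The ID of the register bank already assigned to \p Reg, or
+/// \p Default if no bank has been assigned yet.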
+unsigned
+AMDGPURegisterBankInfo::getRegBankID(unsigned Reg,
+ const MachineRegisterInfo &MRI,
+ const TargetRegisterInfo &TRI,
+ unsigned Default) const {
+
+ const RegisterBank *Bank = getRegBank(Reg, MRI, TRI);
+ return Bank ? Bank->getID() : Default;
+}
+
+///
+/// This function must return a legal mapping, because
+/// AMDGPURegisterBankInfo::getInstrAlternativeMappings() is not called
+/// in RegBankSelect::Mode::Fast. Any mapping that would cause a
+/// VGPR-to-SGPR copy to be generated is illegal.
+///
const RegisterBankInfo::InstructionMapping &
AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
const RegisterBankInfo::InstructionMapping &Mapping = getInstrMappingImpl(MI);
@@ -166,16 +301,102 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
const MachineRegisterInfo &MRI = MF.getRegInfo();
SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
- bool IsComplete = true;
switch (MI.getOpcode()) {
default:
- IsComplete = false;
+ return getInvalidInstructionMapping();
+ case AMDGPU::G_ADD:
+ case AMDGPU::G_SUB:
+ case AMDGPU::G_MUL:
+ case AMDGPU::G_AND:
+ case AMDGPU::G_OR:
+ case AMDGPU::G_XOR:
+ case AMDGPU::G_SHL:
+ if (isSALUMapping(MI))
+ return getDefaultMappingSOP(MI);
+ // Fall-through
+
+ case AMDGPU::G_FADD:
+ case AMDGPU::G_FPTOSI:
+ case AMDGPU::G_FPTOUI:
+ case AMDGPU::G_FMUL:
+ return getDefaultMappingVOP(MI);
+ case AMDGPU::G_IMPLICIT_DEF: {
+ unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
+ OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
break;
+ }
+ case AMDGPU::G_FCONSTANT:
case AMDGPU::G_CONSTANT: {
unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
break;
}
+ case AMDGPU::G_EXTRACT: {
+ unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI);
+ unsigned DstSize = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
+ unsigned SrcSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
+ OpdsMapping[0] = AMDGPU::getValueMapping(BankID, DstSize);
+ OpdsMapping[1] = AMDGPU::getValueMapping(BankID, SrcSize);
+ OpdsMapping[2] = nullptr;
+ break;
+ }
+ case AMDGPU::G_MERGE_VALUES: {
+ unsigned Bank = isSALUMapping(MI) ?
+ AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;
+ unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
+ unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
+
+ OpdsMapping[0] = AMDGPU::getValueMapping(Bank, DstSize);
+ // Op1 and Dst should use the same register bank.
+ for (unsigned i = 1, e = MI.getNumOperands(); i != e; ++i)
+ OpdsMapping[i] = AMDGPU::getValueMapping(Bank, SrcSize);
+ break;
+ }
+ case AMDGPU::G_BITCAST: {
+ unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
+ unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI);
+ OpdsMapping[0] = OpdsMapping[1] = AMDGPU::getValueMapping(BankID, Size);
+ break;
+ }
+ case AMDGPU::G_TRUNC: {
+ unsigned Dst = MI.getOperand(0).getReg();
+ unsigned Src = MI.getOperand(1).getReg();
+ unsigned Bank = getRegBankID(Src, MRI, *TRI);
+ unsigned DstSize = getSizeInBits(Dst, MRI, *TRI);
+ unsigned SrcSize = getSizeInBits(Src, MRI, *TRI);
+ OpdsMapping[0] = AMDGPU::getValueMapping(Bank, DstSize);
+ OpdsMapping[1] = AMDGPU::getValueMapping(Bank, SrcSize);
+ break;
+ }
+ case AMDGPU::G_ZEXT: {
+ unsigned Dst = MI.getOperand(0).getReg();
+ unsigned Src = MI.getOperand(1).getReg();
+ unsigned DstSize = getSizeInBits(Dst, MRI, *TRI);
+ unsigned SrcSize = getSizeInBits(Src, MRI, *TRI);
+ unsigned SrcBank = getRegBankID(Src, MRI, *TRI,
+ SrcSize == 1 ? AMDGPU::SGPRRegBankID :
+ AMDGPU::VGPRRegBankID);
+ unsigned DstBank = SrcBank;
+ if (SrcSize == 1) {
+ if (SrcBank == AMDGPU::SGPRRegBankID)
+ DstBank = AMDGPU::VGPRRegBankID;
+ else
+ DstBank = AMDGPU::SGPRRegBankID;
+ }
+
+ OpdsMapping[0] = AMDGPU::getValueMapping(DstBank, DstSize);
+ OpdsMapping[1] = AMDGPU::getValueMapping(SrcBank, SrcSize);
+ break;
+ }
+ case AMDGPU::G_FCMP: {
+ unsigned Size = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
+ unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI);
+ OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 1);
+ OpdsMapping[1] = nullptr; // Predicate Operand.
+ OpdsMapping[2] = AMDGPU::getValueMapping(Op2Bank, Size);
+ OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
+ break;
+ }
case AMDGPU::G_GEP: {
for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
if (!MI.getOperand(i).isReg())
@@ -204,24 +425,113 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
break;
}
- case AMDGPU::G_LOAD:
- return getInstrMappingForLoad(MI);
+ case AMDGPU::G_ICMP: {
+ unsigned Size = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
+ unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI);
+ unsigned Op3Bank = getRegBankID(MI.getOperand(3).getReg(), MRI, *TRI);
+ unsigned Op0Bank = Op2Bank == AMDGPU::SGPRRegBankID &&
+ Op3Bank == AMDGPU::SGPRRegBankID ?
+ AMDGPU::SCCRegBankID : AMDGPU::VGPRRegBankID;
+ OpdsMapping[0] = AMDGPU::getValueMapping(Op0Bank, 1);
+ OpdsMapping[1] = nullptr; // Predicate Operand.
+ OpdsMapping[2] = AMDGPU::getValueMapping(Op2Bank, Size);
+ OpdsMapping[3] = AMDGPU::getValueMapping(Op3Bank, Size);
+ break;
+ }
+
+
+ case AMDGPU::G_EXTRACT_VECTOR_ELT: {
+ unsigned IdxOp = 2;
+ int64_t Imm;
+ // XXX - Do we really need to fully handle these? The constant case should
+ // be legalized away before RegBankSelect?
+
+ unsigned OutputBankID = isSALUMapping(MI) && isConstant(MI.getOperand(IdxOp), Imm) ?
+ AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;
+
+ unsigned IdxBank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI);
+ OpdsMapping[0] = AMDGPU::getValueMapping(OutputBankID, MRI.getType(MI.getOperand(0).getReg()).getSizeInBits());
+ OpdsMapping[1] = AMDGPU::getValueMapping(OutputBankID, MRI.getType(MI.getOperand(1).getReg()).getSizeInBits());
+
+  // The index can be in either bank if the source vector is in a VGPR.
+ OpdsMapping[2] = AMDGPU::getValueMapping(IdxBank, MRI.getType(MI.getOperand(2).getReg()).getSizeInBits());
+ break;
}
+ case AMDGPU::G_INSERT_VECTOR_ELT: {
+ // XXX - Do we really need to fully handle these? The constant case should
+ // be legalized away before RegBankSelect?
+
+ int64_t Imm;
+
+ unsigned IdxOp = MI.getOpcode() == AMDGPU::G_EXTRACT_VECTOR_ELT ? 2 : 3;
+ unsigned BankID = isSALUMapping(MI) && isConstant(MI.getOperand(IdxOp), Imm) ?
+ AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;
+
+
+
+ // TODO: Can do SGPR indexing, which would obviate the need for the
+ // isConstant check.
+ for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
+ unsigned Size = getSizeInBits(MI.getOperand(i).getReg(), MRI, *TRI);
+ OpdsMapping[i] = AMDGPU::getValueMapping(BankID, Size);
+ }
- if (!IsComplete) {
- unsigned BankID = AMDGPU::SGPRRegBankID;
- unsigned Size = 0;
- for (unsigned Idx = 0; Idx < MI.getNumOperands(); ++Idx) {
- // If the operand is not a register default to the size of the previous
- // operand.
- // FIXME: Can't we pull the types from the MachineInstr rather than the
- // operands.
- if (MI.getOperand(Idx).isReg())
- Size = getSizeInBits(MI.getOperand(Idx).getReg(), MRI, *TRI);
- OpdsMapping.push_back(AMDGPU::getValueMapping(BankID, Size));
+ break;
+ }
+ case AMDGPU::G_INTRINSIC: {
+ switch (MI.getOperand(1).getIntrinsicID()) {
+ default:
+ return getInvalidInstructionMapping();
+ case Intrinsic::maxnum:
+ case Intrinsic::minnum:
+ case Intrinsic::amdgcn_cvt_pkrtz:
+ return getDefaultMappingVOP(MI);
+ case Intrinsic::amdgcn_kernarg_segment_ptr: {
+ unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
+ OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
+ break;
+ }
+ }
+ break;
+ }
+ case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS: {
+ switch (MI.getOperand(0).getIntrinsicID()) {
+ default:
+ return getInvalidInstructionMapping();
+ case Intrinsic::amdgcn_exp_compr:
+ OpdsMapping[0] = nullptr; // IntrinsicID
+ // FIXME: These are immediate values which can't be read from registers.
+ OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
+ OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
+ // FIXME: Could we support packed types here?
+ OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
+ OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
+ // FIXME: These are immediate values which can't be read from registers.
+ OpdsMapping[5] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
+ OpdsMapping[6] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
+ break;
+ case Intrinsic::amdgcn_exp:
+ OpdsMapping[0] = nullptr; // IntrinsicID
+ // FIXME: These are immediate values which can't be read from registers.
+ OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
+ OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
+ // FIXME: Could we support packed types here?
+ OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
+ OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
+ OpdsMapping[5] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
+ OpdsMapping[6] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
+ // FIXME: These are immediate values which can't be read from registers.
+ OpdsMapping[7] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
+ OpdsMapping[8] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
+ break;
}
+ break;
+ }
+ case AMDGPU::G_LOAD:
+ return getInstrMappingForLoad(MI);
}
+
return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping),
MI.getNumOperands());
}
diff --git a/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h b/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h
index 201fdc1974c6..d48a66589873 100644
--- a/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h
+++ b/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h
@@ -16,19 +16,15 @@
#include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h"
+#define GET_REGBANK_DECLARATIONS
+#include "AMDGPUGenRegisterBank.inc"
+#undef GET_REGBANK_DECLARATIONS
+
namespace llvm {
class SIRegisterInfo;
class TargetRegisterInfo;
-namespace AMDGPU {
-enum {
- SGPRRegBankID = 0,
- VGPRRegBankID = 1,
- NumRegisterBanks
-};
-} // End AMDGPU namespace.
-
/// This class provides the information for the target register banks.
class AMDGPUGenRegisterBankInfo : public RegisterBankInfo {
@@ -46,6 +42,13 @@ class AMDGPURegisterBankInfo : public AMDGPUGenRegisterBankInfo {
const RegisterBankInfo::InstructionMapping &
getInstrMappingForLoad(const MachineInstr &MI) const;
+ unsigned getRegBankID(unsigned Reg, const MachineRegisterInfo &MRI,
+ const TargetRegisterInfo &TRI,
+ unsigned Default = AMDGPU::VGPRRegBankID) const;
+
+ bool isSALUMapping(const MachineInstr &MI) const;
+ const InstructionMapping &getDefaultMappingSOP(const MachineInstr &MI) const;
+ const InstructionMapping &getDefaultMappingVOP(const MachineInstr &MI) const;
public:
AMDGPURegisterBankInfo(const TargetRegisterInfo &TRI);
diff --git a/lib/Target/AMDGPU/AMDGPURegisterBanks.td b/lib/Target/AMDGPU/AMDGPURegisterBanks.td
index f4428e56035f..7f7f75f65647 100644
--- a/lib/Target/AMDGPU/AMDGPURegisterBanks.td
+++ b/lib/Target/AMDGPU/AMDGPURegisterBanks.td
@@ -14,3 +14,5 @@ def SGPRRegBank : RegisterBank<"SGPR",
def VGPRRegBank : RegisterBank<"VGPR",
[VGPR_32, VReg_64, VReg_96, VReg_128, VReg_256, VReg_512]
>;
+
+def SCCRegBank : RegisterBank<"SCC", [SCC_CLASS]>;
diff --git a/lib/Target/AMDGPU/AMDGPURegisterInfo.cpp b/lib/Target/AMDGPU/AMDGPURegisterInfo.cpp
index 5e4d33aaa691..50f859addc2b 100644
--- a/lib/Target/AMDGPU/AMDGPURegisterInfo.cpp
+++ b/lib/Target/AMDGPU/AMDGPURegisterInfo.cpp
@@ -8,13 +8,15 @@
//===----------------------------------------------------------------------===//
//
/// \file
-/// \brief Parent TargetRegisterInfo class common to all hw codegen targets.
+/// Parent TargetRegisterInfo class common to all hw codegen targets.
//
//===----------------------------------------------------------------------===//
#include "AMDGPURegisterInfo.h"
#include "AMDGPUTargetMachine.h"
+#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
using namespace llvm;
@@ -25,7 +27,7 @@ AMDGPURegisterInfo::AMDGPURegisterInfo() : AMDGPUGenRegisterInfo(0) {}
// they are not supported at this time.
//===----------------------------------------------------------------------===//
-unsigned AMDGPURegisterInfo::getSubRegFromChannel(unsigned Channel) const {
+unsigned AMDGPURegisterInfo::getSubRegFromChannel(unsigned Channel) {
static const unsigned SubRegs[] = {
AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3, AMDGPU::sub4,
AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7, AMDGPU::sub8, AMDGPU::sub9,
@@ -37,6 +39,13 @@ unsigned AMDGPURegisterInfo::getSubRegFromChannel(unsigned Channel) const {
return SubRegs[Channel];
}
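+// Mark \p Reg and every register that aliases it as reserved.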
+void AMDGPURegisterInfo::reserveRegisterTuples(BitVector &Reserved, unsigned Reg) const {
+ MCRegAliasIterator R(Reg, this, true);
+
+ for (; R.isValid(); ++R)
+ Reserved.set(*R);
+}
+
#define GET_REGINFO_TARGET_DESC
#include "AMDGPUGenRegisterInfo.inc"
@@ -75,5 +84,6 @@ const uint32_t *SIRegisterInfo::getCallPreservedMask(const MachineFunction &MF,
}
unsigned SIRegisterInfo::getFrameRegister(const MachineFunction &MF) const {
- return AMDGPU::NoRegister;
+ const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
+ return FuncInfo->getFrameOffsetReg();
}
diff --git a/lib/Target/AMDGPU/AMDGPURegisterInfo.h b/lib/Target/AMDGPU/AMDGPURegisterInfo.h
index d8604d2590f1..07de5fc549e2 100644
--- a/lib/Target/AMDGPU/AMDGPURegisterInfo.h
+++ b/lib/Target/AMDGPU/AMDGPURegisterInfo.h
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
//
/// \file
-/// \brief TargetRegisterInfo interface that is implemented by all hw codegen
+/// TargetRegisterInfo interface that is implemented by all hw codegen
/// targets.
//
//===----------------------------------------------------------------------===//
@@ -21,15 +21,19 @@
namespace llvm {
-class AMDGPUSubtarget;
+class GCNSubtarget;
class TargetInstrInfo;
struct AMDGPURegisterInfo : public AMDGPUGenRegisterInfo {
AMDGPURegisterInfo();
+ bool enableMultipleCopyHints() const override { return true; }
+
/// \returns the sub reg enum value for the given \p Channel
/// (e.g. getSubRegFromChannel(0) -> AMDGPU::sub0)
- unsigned getSubRegFromChannel(unsigned Channel) const;
+ static unsigned getSubRegFromChannel(unsigned Channel);
+
+ void reserveRegisterTuples(BitVector &, unsigned Reg) const;
};
} // End namespace llvm
diff --git a/lib/Target/AMDGPU/AMDGPURegisterInfo.td b/lib/Target/AMDGPU/AMDGPURegisterInfo.td
index 3bbcba826f63..ceabae524414 100644
--- a/lib/Target/AMDGPU/AMDGPURegisterInfo.td
+++ b/lib/Target/AMDGPU/AMDGPURegisterInfo.td
@@ -19,5 +19,4 @@ foreach Index = 0-15 in {
}
-include "R600RegisterInfo.td"
include "SIRegisterInfo.td"
diff --git a/lib/Target/AMDGPU/AMDGPURewriteOutArguments.cpp b/lib/Target/AMDGPU/AMDGPURewriteOutArguments.cpp
index 83e56a9ab495..a861762a8c9e 100644
--- a/lib/Target/AMDGPU/AMDGPURewriteOutArguments.cpp
+++ b/lib/Target/AMDGPU/AMDGPURewriteOutArguments.cpp
@@ -249,8 +249,8 @@ bool AMDGPURewriteOutArguments::runOnFunction(Function &F) {
SmallVector<Argument *, 4> OutArgs;
for (Argument &Arg : F.args()) {
if (isOutArgumentCandidate(Arg)) {
- DEBUG(dbgs() << "Found possible out argument " << Arg
- << " in function " << F.getName() << '\n');
+ LLVM_DEBUG(dbgs() << "Found possible out argument " << Arg
+ << " in function " << F.getName() << '\n');
OutArgs.push_back(&Arg);
}
}
@@ -310,7 +310,7 @@ bool AMDGPURewriteOutArguments::runOnFunction(Function &F) {
SI = dyn_cast<StoreInst>(Q.getInst());
if (SI) {
- DEBUG(dbgs() << "Found out argument store: " << *SI << '\n');
+ LLVM_DEBUG(dbgs() << "Found out argument store: " << *SI << '\n');
ReplaceableStores.emplace_back(RI, SI);
} else {
ThisReplaceable = false;
@@ -328,7 +328,8 @@ bool AMDGPURewriteOutArguments::runOnFunction(Function &F) {
if (llvm::find_if(ValVec,
[OutArg](const std::pair<Argument *, Value *> &Entry) {
return Entry.first == OutArg;}) != ValVec.end()) {
- DEBUG(dbgs() << "Saw multiple out arg stores" << *OutArg << '\n');
+ LLVM_DEBUG(dbgs()
+ << "Saw multiple out arg stores" << *OutArg << '\n');
// It is possible to see stores to the same argument multiple times,
// but we expect these would have been optimized out already.
ThisReplaceable = false;
@@ -358,7 +359,7 @@ bool AMDGPURewriteOutArguments::runOnFunction(Function &F) {
F.getFunctionType()->params(),
F.isVarArg());
- DEBUG(dbgs() << "Computed new return type: " << *NewRetTy << '\n');
+ LLVM_DEBUG(dbgs() << "Computed new return type: " << *NewRetTy << '\n');
Function *NewFunc = Function::Create(NewFuncTy, Function::PrivateLinkage,
F.getName() + ".body");
diff --git a/lib/Target/AMDGPU/AMDGPUSearchableTables.td b/lib/Target/AMDGPU/AMDGPUSearchableTables.td
new file mode 100644
index 000000000000..9dbd7751b4d8
--- /dev/null
+++ b/lib/Target/AMDGPU/AMDGPUSearchableTables.td
@@ -0,0 +1,77 @@
+//===-- AMDGPUSearchableTables.td - ------------------------*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Resource intrinsics table.
+//===----------------------------------------------------------------------===//
+
+class RsrcIntrinsic<AMDGPURsrcIntrinsic intr> {
+ Intrinsic Intr = !cast<Intrinsic>(intr);
+ bits<8> RsrcArg = intr.RsrcArg;
+ bit IsImage = intr.IsImage;
+}
+
+def RsrcIntrinsics : GenericTable {
+ let FilterClass = "RsrcIntrinsic";
+ let Fields = ["Intr", "RsrcArg", "IsImage"];
+
+ let PrimaryKey = ["Intr"];
+ let PrimaryKeyName = "lookupRsrcIntrinsic";
+}
+
+foreach intr = !listconcat(AMDGPUBufferIntrinsics,
+ AMDGPUImageDimIntrinsics,
+ AMDGPUImageDimAtomicIntrinsics) in {
+ def : RsrcIntrinsic<!cast<AMDGPURsrcIntrinsic>(intr)>;
+}
+
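+// Table of intrinsics whose results are divergent, i.e. may differ between
+// lanes of a wavefront.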
+class SourceOfDivergence<Intrinsic intr> {
+ Intrinsic Intr = intr;
+}
+
+def SourcesOfDivergence : GenericTable {
+ let FilterClass = "SourceOfDivergence";
+ let Fields = ["Intr"];
+
+ let PrimaryKey = ["Intr"];
+ let PrimaryKeyName = "lookupSourceOfDivergence";
+}
+
+def : SourceOfDivergence<int_amdgcn_workitem_id_x>;
+def : SourceOfDivergence<int_amdgcn_workitem_id_y>;
+def : SourceOfDivergence<int_amdgcn_workitem_id_z>;
+def : SourceOfDivergence<int_amdgcn_interp_mov>;
+def : SourceOfDivergence<int_amdgcn_interp_p1>;
+def : SourceOfDivergence<int_amdgcn_interp_p2>;
+def : SourceOfDivergence<int_amdgcn_mbcnt_hi>;
+def : SourceOfDivergence<int_amdgcn_mbcnt_lo>;
+def : SourceOfDivergence<int_r600_read_tidig_x>;
+def : SourceOfDivergence<int_r600_read_tidig_y>;
+def : SourceOfDivergence<int_r600_read_tidig_z>;
+def : SourceOfDivergence<int_amdgcn_atomic_inc>;
+def : SourceOfDivergence<int_amdgcn_atomic_dec>;
+def : SourceOfDivergence<int_amdgcn_ds_fadd>;
+def : SourceOfDivergence<int_amdgcn_ds_fmin>;
+def : SourceOfDivergence<int_amdgcn_ds_fmax>;
+def : SourceOfDivergence<int_amdgcn_buffer_atomic_swap>;
+def : SourceOfDivergence<int_amdgcn_buffer_atomic_add>;
+def : SourceOfDivergence<int_amdgcn_buffer_atomic_sub>;
+def : SourceOfDivergence<int_amdgcn_buffer_atomic_smin>;
+def : SourceOfDivergence<int_amdgcn_buffer_atomic_umin>;
+def : SourceOfDivergence<int_amdgcn_buffer_atomic_smax>;
+def : SourceOfDivergence<int_amdgcn_buffer_atomic_umax>;
+def : SourceOfDivergence<int_amdgcn_buffer_atomic_and>;
+def : SourceOfDivergence<int_amdgcn_buffer_atomic_or>;
+def : SourceOfDivergence<int_amdgcn_buffer_atomic_xor>;
+def : SourceOfDivergence<int_amdgcn_buffer_atomic_cmpswap>;
+def : SourceOfDivergence<int_amdgcn_ps_live>;
+def : SourceOfDivergence<int_amdgcn_ds_swizzle>;
+
+foreach intr = AMDGPUImageDimAtomicIntrinsics in
+def : SourceOfDivergence<intr>;
diff --git a/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
index 80feaa44766f..98b49070fa99 100644
--- a/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
+++ b/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
//
/// \file
-/// \brief Implements the AMDGPU specific subclass of TargetSubtarget.
+/// Implements the AMDGPU specific subclass of TargetSubtarget.
//
//===----------------------------------------------------------------------===//
@@ -20,8 +20,10 @@
#include "AMDGPULegalizerInfo.h"
#include "AMDGPURegisterBankInfo.h"
#include "SIMachineFunctionInfo.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/CodeGen/MachineScheduler.h"
+#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
#include <algorithm>
@@ -32,12 +34,37 @@ using namespace llvm;
#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
+#define AMDGPUSubtarget GCNSubtarget
#include "AMDGPUGenSubtargetInfo.inc"
+#define GET_SUBTARGETINFO_TARGET_DESC
+#define GET_SUBTARGETINFO_CTOR
+#undef AMDGPUSubtarget
+#include "R600GenSubtargetInfo.inc"
-AMDGPUSubtarget::~AMDGPUSubtarget() = default;
+GCNSubtarget::~GCNSubtarget() = default;
+
+R600Subtarget &
+R600Subtarget::initializeSubtargetDependencies(const Triple &TT,
+ StringRef GPU, StringRef FS) {
+ SmallString<256> FullFS("+promote-alloca,+dx10-clamp,");
+ FullFS += FS;
+ ParseSubtargetFeatures(GPU, FullFS);
+
+  // FIXME: I don't think Evergreen has any useful support for
+ // denormals, but should be checked. Should we issue a warning somewhere
+ // if someone tries to enable these?
+ if (getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
+ FP32Denormals = false;
+ }
+
+ HasMulU24 = getGeneration() >= EVERGREEN;
+ HasMulI24 = hasCaymanISA();
+
+ return *this;
+}
-AMDGPUSubtarget &
-AMDGPUSubtarget::initializeSubtargetDependencies(const Triple &TT,
+GCNSubtarget &
+GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
StringRef GPU, StringRef FS) {
// Determine default and user-specified characteristics
// On SI+, we want FP64 denormals to be on by default. FP32 denormals can be
@@ -92,26 +119,43 @@ AMDGPUSubtarget::initializeSubtargetDependencies(const Triple &TT,
HasMovrel = true;
}
+ HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;
+
return *this;
}
-AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
- const TargetMachine &TM)
- : AMDGPUGenSubtargetInfo(TT, GPU, FS),
+AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT,
+ const FeatureBitset &FeatureBits) :
+ TargetTriple(TT),
+ SubtargetFeatureBits(FeatureBits),
+ Has16BitInsts(false),
+ HasMadMixInsts(false),
+ FP32Denormals(false),
+ FPExceptions(false),
+ HasSDWA(false),
+ HasVOP3PInsts(false),
+ HasMulI24(true),
+ HasMulU24(true),
+ HasFminFmaxLegacy(true),
+ EnablePromoteAlloca(false),
+ LocalMemorySize(0),
+ WavefrontSize(0)
+ { }
+
+GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
+ const GCNTargetMachine &TM) :
+ AMDGPUGenSubtargetInfo(TT, GPU, FS),
+ AMDGPUSubtarget(TT, getFeatureBits()),
TargetTriple(TT),
- Gen(TT.getArch() == Triple::amdgcn ? SOUTHERN_ISLANDS : R600),
+ Gen(SOUTHERN_ISLANDS),
IsaVersion(ISAVersion0_0_0),
- WavefrontSize(0),
- LocalMemorySize(0),
LDSBankCount(0),
MaxPrivateElementSize(0),
FastFMAF32(false),
HalfRate64Ops(false),
- FP32Denormals(false),
FP64FP16Denormals(false),
- FPExceptions(false),
DX10Clamp(false),
FlatForGlobal(false),
AutoWaitcntBeforeBarrier(false),
@@ -123,57 +167,56 @@ AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
EnableXNACK(false),
TrapHandler(false),
DebuggerInsertNops(false),
- DebuggerReserveRegs(false),
DebuggerEmitPrologue(false),
EnableHugePrivateBuffer(false),
EnableVGPRSpilling(false),
- EnablePromoteAlloca(false),
EnableLoadStoreOpt(false),
EnableUnsafeDSOffsetFolding(false),
EnableSIScheduler(false),
+ EnableDS128(false),
DumpCode(false),
FP64(false),
- FMA(false),
- IsGCN(false),
GCN3Encoding(false),
CIInsts(false),
GFX9Insts(false),
SGPRInitBug(false),
HasSMemRealTime(false),
- Has16BitInsts(false),
HasIntClamp(false),
- HasVOP3PInsts(false),
- HasMadMixInsts(false),
+ HasFmaMixInsts(false),
HasMovrel(false),
HasVGPRIndexMode(false),
HasScalarStores(false),
+ HasScalarAtomics(false),
HasInv2PiInlineImm(false),
- HasSDWA(false),
HasSDWAOmod(false),
HasSDWAScalar(false),
HasSDWASdst(false),
HasSDWAMac(false),
HasSDWAOutModsVOPC(false),
HasDPP(false),
+ HasDLInsts(false),
+ D16PreservesUnusedBits(false),
FlatAddressSpace(false),
FlatInstOffsets(false),
FlatGlobalInsts(false),
FlatScratchInsts(false),
AddNoCarryInsts(false),
+ HasUnpackedD16VMem(false),
- R600ALUInst(false),
- CaymanISA(false),
- CFALUBug(false),
- HasVertexCache(false),
- TexVTXClauseSize(0),
ScalarizeGlobal(false),
FeatureDisable(false),
- InstrItins(getInstrItineraryForCPU(GPU)) {
+ InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
+ TLInfo(TM, *this),
+ FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) {
AS = AMDGPU::getAMDGPUAS(TT);
- initializeSubtargetDependencies(TT, GPU, FS);
+ CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
+ Legalizer.reset(new AMDGPULegalizerInfo(*this, TM));
+ RegBankInfo.reset(new AMDGPURegisterBankInfo(*getRegisterInfo()));
+ InstSelector.reset(new AMDGPUInstructionSelector(
+ *this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get()), TM));
}
unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
@@ -198,6 +241,12 @@ unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
return NumWaves;
}
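+// Occupancy implied by the LDS usage recorded for this machine function.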
+unsigned
+AMDGPUSubtarget::getOccupancyWithLocalMemSize(const MachineFunction &MF) const {
+ const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
+ return getOccupancyWithLocalMemSize(MFI->getLDSSize(), MF.getFunction());
+}
+
std::pair<unsigned, unsigned>
AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
switch (CC) {
@@ -357,27 +406,64 @@ bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
return true;
}
-R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
- const TargetMachine &TM) :
- AMDGPUSubtarget(TT, GPU, FS, TM),
- InstrInfo(*this),
- FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
- TLInfo(TM, *this) {}
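+// Sum the in-memory sizes of the explicit kernel arguments, placing each one
+// at its ABI type alignment, and report the largest alignment seen in
+// \p MaxAlign.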
+uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F,
+ unsigned &MaxAlign) const {
+ assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
+ F.getCallingConv() == CallingConv::SPIR_KERNEL);
-SISubtarget::SISubtarget(const Triple &TT, StringRef GPU, StringRef FS,
- const TargetMachine &TM)
- : AMDGPUSubtarget(TT, GPU, FS, TM), InstrInfo(*this),
- FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
- TLInfo(TM, *this) {
- CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
- Legalizer.reset(new AMDGPULegalizerInfo());
+ const DataLayout &DL = F.getParent()->getDataLayout();
+ uint64_t ExplicitArgBytes = 0;
+ MaxAlign = 1;
- RegBankInfo.reset(new AMDGPURegisterBankInfo(*getRegisterInfo()));
- InstSelector.reset(new AMDGPUInstructionSelector(
- *this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get())));
+ for (const Argument &Arg : F.args()) {
+ Type *ArgTy = Arg.getType();
+
+ unsigned Align = DL.getABITypeAlignment(ArgTy);
+ uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
+ ExplicitArgBytes = alignTo(ExplicitArgBytes, Align) + AllocSize;
+ MaxAlign = std::max(MaxAlign, Align);
+ }
+
+ return ExplicitArgBytes;
}
-void SISubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
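+// Compute the total kernarg segment size, including any implicit arguments,
+// rounded up to a multiple of 4 bytes.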
+unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function &F,
+ unsigned &MaxAlign) const {
+ uint64_t ExplicitArgBytes = getExplicitKernArgSize(F, MaxAlign);
+
+ unsigned ExplicitOffset = getExplicitKernelArgOffset(F);
+
+ uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes;
+ unsigned ImplicitBytes = getImplicitArgNumBytes(F);
+ if (ImplicitBytes != 0) {
+ unsigned Alignment = getAlignmentForImplicitArgPtr();
+ TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
+ }
+
+ // Being able to dereference past the end is useful for emitting scalar loads.
+ return alignTo(TotalSize, 4);
+}
+
+R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
+ const TargetMachine &TM) :
+ R600GenSubtargetInfo(TT, GPU, FS),
+ AMDGPUSubtarget(TT, getFeatureBits()),
+ InstrInfo(*this),
+ FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
+ FMA(false),
+ CaymanISA(false),
+ CFALUBug(false),
+ DX10Clamp(false),
+ HasVertexCache(false),
+ R600ALUInst(false),
+ FP64(false),
+ TexVTXClauseSize(0),
+ Gen(R600),
+ TLInfo(TM, initializeSubtargetDependencies(TT, GPU, FS)),
+ InstrItins(getInstrItineraryForCPU(GPU)),
+ AS (AMDGPU::getAMDGPUAS(TT)) { }
+
+void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
unsigned NumRegionInstrs) const {
// Track register pressure so the scheduler can try to decrease
// pressure once register usage is above the threshold defined by
@@ -394,22 +480,12 @@ void SISubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
Policy.ShouldTrackLaneMasks = true;
}
-bool SISubtarget::isVGPRSpillingEnabled(const Function& F) const {
+bool GCNSubtarget::isVGPRSpillingEnabled(const Function& F) const {
return EnableVGPRSpilling || !AMDGPU::isShader(F.getCallingConv());
}
-unsigned SISubtarget::getKernArgSegmentSize(const MachineFunction &MF,
- unsigned ExplicitArgBytes) const {
- unsigned ImplicitBytes = getImplicitArgNumBytes(MF);
- if (ImplicitBytes == 0)
- return ExplicitArgBytes;
-
- unsigned Alignment = getAlignmentForImplicitArgPtr();
- return alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
-}
-
-unsigned SISubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
- if (getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
+unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
+ if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
if (SGPRs <= 80)
return 10;
if (SGPRs <= 88)
@@ -431,7 +507,7 @@ unsigned SISubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
return 5;
}
-unsigned SISubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
+unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
if (VGPRs <= 24)
return 10;
if (VGPRs <= 28)
@@ -453,7 +529,7 @@ unsigned SISubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
return 1;
}
-unsigned SISubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
+unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
if (MFI.hasFlatScratchInit()) {
if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
@@ -467,7 +543,7 @@ unsigned SISubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
return 2; // VCC.
}
-unsigned SISubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
+unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
const Function &F = MF.getFunction();
const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
@@ -517,7 +593,7 @@ unsigned SISubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
MaxAddressableNumSGPRs);
}
-unsigned SISubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
+unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
const Function &F = MF.getFunction();
const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
@@ -532,10 +608,6 @@ unsigned SISubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
unsigned Requested = AMDGPU::getIntegerAttribute(
F, "amdgpu-num-vgpr", MaxNumVGPRs);
- // Make sure requested value does not violate subtarget's specifications.
- if (Requested && Requested <= getReservedNumVGPRs(MF))
- Requested = 0;
-
// Make sure requested value is compatible with values implied by
// default/requested minimum/maximum number of waves per execution unit.
if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
@@ -548,7 +620,7 @@ unsigned SISubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
MaxNumVGPRs = Requested;
}
- return MaxNumVGPRs - getReservedNumVGPRs(MF);
+ return MaxNumVGPRs;
}
namespace {
@@ -602,7 +674,21 @@ struct MemOpClusterMutation : ScheduleDAGMutation {
};
} // namespace
-void SISubtarget::getPostRAMutations(
+void GCNSubtarget::getPostRAMutations(
std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
Mutations.push_back(llvm::make_unique<MemOpClusterMutation>(&InstrInfo));
}
+
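+// Return the common AMDGPUSubtarget view of either the GCN or the R600
+// subtarget, selected by the target triple.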
+const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) {
+ if (MF.getTarget().getTargetTriple().getArch() == Triple::amdgcn)
+ return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<GCNSubtarget>());
+ else
+ return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<R600Subtarget>());
+}
+
+const AMDGPUSubtarget &AMDGPUSubtarget::get(const TargetMachine &TM, const Function &F) {
+ if (TM.getTargetTriple().getArch() == Triple::amdgcn)
+ return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<GCNSubtarget>(F));
+ else
+ return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<R600Subtarget>(F));
+}
diff --git a/lib/Target/AMDGPU/AMDGPUSubtarget.h b/lib/Target/AMDGPU/AMDGPUSubtarget.h
index cf4a691d4b58..623109733651 100644
--- a/lib/Target/AMDGPU/AMDGPUSubtarget.h
+++ b/lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -8,7 +8,7 @@
//==-----------------------------------------------------------------------===//
//
/// \file
-/// \brief AMDGPU specific subclass of TargetSubtarget.
+/// AMDGPU specific subclass of TargetSubtarget.
//
//===----------------------------------------------------------------------===//
@@ -23,7 +23,6 @@
#include "SIFrameLowering.h"
#include "SIISelLowering.h"
#include "SIInstrInfo.h"
-#include "SIMachineFunctionInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/Triple.h"
#include "llvm/CodeGen/GlobalISel/InstructionSelector.h"
@@ -40,24 +39,216 @@
#define GET_SUBTARGETINFO_HEADER
#include "AMDGPUGenSubtargetInfo.inc"
+#define GET_SUBTARGETINFO_HEADER
+#include "R600GenSubtargetInfo.inc"
namespace llvm {
class StringRef;
-class AMDGPUSubtarget : public AMDGPUGenSubtargetInfo {
+class AMDGPUSubtarget {
public:
enum Generation {
R600 = 0,
- R700,
- EVERGREEN,
- NORTHERN_ISLANDS,
- SOUTHERN_ISLANDS,
- SEA_ISLANDS,
- VOLCANIC_ISLANDS,
- GFX9,
+ R700 = 1,
+ EVERGREEN = 2,
+ NORTHERN_ISLANDS = 3,
+ SOUTHERN_ISLANDS = 4,
+ SEA_ISLANDS = 5,
+ VOLCANIC_ISLANDS = 6,
+ GFX9 = 7
};
+private:
+ Triple TargetTriple;
+
+protected:
+ const FeatureBitset &SubtargetFeatureBits;
+ bool Has16BitInsts;
+ bool HasMadMixInsts;
+ bool FP32Denormals;
+ bool FPExceptions;
+ bool HasSDWA;
+ bool HasVOP3PInsts;
+ bool HasMulI24;
+ bool HasMulU24;
+ bool HasFminFmaxLegacy;
+ bool EnablePromoteAlloca;
+ int LocalMemorySize;
+ unsigned WavefrontSize;
+
+public:
+ AMDGPUSubtarget(const Triple &TT, const FeatureBitset &FeatureBits);
+
+ static const AMDGPUSubtarget &get(const MachineFunction &MF);
+ static const AMDGPUSubtarget &get(const TargetMachine &TM,
+ const Function &F);
+
+  /// \returns Default flat work group size range for a calling convention.
+ std::pair<unsigned, unsigned> getDefaultFlatWorkGroupSize(CallingConv::ID CC) const;
+
+ /// \returns Subtarget's default pair of minimum/maximum flat work group sizes
+ /// for function \p F, or minimum/maximum flat work group sizes explicitly
+ /// requested using "amdgpu-flat-work-group-size" attribute attached to
+ /// function \p F.
+ ///
+ /// \returns Subtarget's default values if explicitly requested values cannot
+ /// be converted to integer, or violate subtarget's specifications.
+ std::pair<unsigned, unsigned> getFlatWorkGroupSizes(const Function &F) const;
+
+ /// \returns Subtarget's default pair of minimum/maximum number of waves per
+ /// execution unit for function \p F, or minimum/maximum number of waves per
+ /// execution unit explicitly requested using "amdgpu-waves-per-eu" attribute
+ /// attached to function \p F.
+ ///
+ /// \returns Subtarget's default values if explicitly requested values cannot
+ /// be converted to integer, violate subtarget's specifications, or are not
+ /// compatible with minimum/maximum number of waves limited by flat work group
+ /// size, register usage, and/or lds usage.
+ std::pair<unsigned, unsigned> getWavesPerEU(const Function &F) const;
+
+  /// Return the amount of LDS that can be used without reducing the
+  /// occupancy below \p WaveCount.
+ unsigned getMaxLocalMemSizeWithWaveCount(unsigned WaveCount,
+ const Function &) const;
+
+  /// Inverse of getMaxLocalMemSizeWithWaveCount. Return the maximum wave count
+  /// if the given LDS memory size is the only constraint.
+ unsigned getOccupancyWithLocalMemSize(uint32_t Bytes, const Function &) const;
+
+ unsigned getOccupancyWithLocalMemSize(const MachineFunction &MF) const;
+
+ bool isAmdHsaOS() const {
+ return TargetTriple.getOS() == Triple::AMDHSA;
+ }
+
+ bool isAmdPalOS() const {
+ return TargetTriple.getOS() == Triple::AMDPAL;
+ }
+
+ bool isMesa3DOS() const {
+ return TargetTriple.getOS() == Triple::Mesa3D;
+ }
+
+ bool isMesaKernel(const Function &F) const {
+ return isMesa3DOS() && !AMDGPU::isShader(F.getCallingConv());
+ }
+
+ bool isAmdCodeObjectV2(const Function &F) const {
+ return isAmdHsaOS() || isMesaKernel(F);
+ }
+
+ bool has16BitInsts() const {
+ return Has16BitInsts;
+ }
+
+ bool hasMadMixInsts() const {
+ return HasMadMixInsts;
+ }
+
+ bool hasFP32Denormals() const {
+ return FP32Denormals;
+ }
+
+ bool hasFPExceptions() const {
+ return FPExceptions;
+ }
+
+ bool hasSDWA() const {
+ return HasSDWA;
+ }
+
+ bool hasVOP3PInsts() const {
+ return HasVOP3PInsts;
+ }
+
+ bool hasMulI24() const {
+ return HasMulI24;
+ }
+
+ bool hasMulU24() const {
+ return HasMulU24;
+ }
+
+ bool hasFminFmaxLegacy() const {
+ return HasFminFmaxLegacy;
+ }
+
+ bool isPromoteAllocaEnabled() const {
+ return EnablePromoteAlloca;
+ }
+
+ unsigned getWavefrontSize() const {
+ return WavefrontSize;
+ }
+
+ int getLocalMemorySize() const {
+ return LocalMemorySize;
+ }
+
+ unsigned getAlignmentForImplicitArgPtr() const {
+ return isAmdHsaOS() ? 8 : 4;
+ }
+
+ /// Returns the offset in bytes from the start of the input buffer
+ /// of the first explicit kernel argument.
+ unsigned getExplicitKernelArgOffset(const Function &F) const {
+ return isAmdCodeObjectV2(F) ? 0 : 36;
+ }
+
+ /// \returns Maximum number of work groups per compute unit supported by the
+ /// subtarget and limited by given \p FlatWorkGroupSize.
+ unsigned getMaxWorkGroupsPerCU(unsigned FlatWorkGroupSize) const {
+ return AMDGPU::IsaInfo::getMaxWorkGroupsPerCU(SubtargetFeatureBits,
+ FlatWorkGroupSize);
+ }
+
+ /// \returns Minimum flat work group size supported by the subtarget.
+ unsigned getMinFlatWorkGroupSize() const {
+ return AMDGPU::IsaInfo::getMinFlatWorkGroupSize(SubtargetFeatureBits);
+ }
+
+ /// \returns Maximum flat work group size supported by the subtarget.
+ unsigned getMaxFlatWorkGroupSize() const {
+ return AMDGPU::IsaInfo::getMaxFlatWorkGroupSize(SubtargetFeatureBits);
+ }
+
+ /// \returns Maximum number of waves per execution unit supported by the
+ /// subtarget and limited by given \p FlatWorkGroupSize.
+ unsigned getMaxWavesPerEU(unsigned FlatWorkGroupSize) const {
+ return AMDGPU::IsaInfo::getMaxWavesPerEU(SubtargetFeatureBits,
+ FlatWorkGroupSize);
+ }
+
+ /// \returns Minimum number of waves per execution unit supported by the
+ /// subtarget.
+ unsigned getMinWavesPerEU() const {
+ return AMDGPU::IsaInfo::getMinWavesPerEU(SubtargetFeatureBits);
+ }
+
+ unsigned getMaxWavesPerEU() const { return 10; }
+
+  /// Creates value range metadata on a workitemid.* intrinsic call or load.
+ bool makeLIDRangeMetadata(Instruction *I) const;
+
+ /// \returns Number of bytes of arguments that are passed to a shader or
+ /// kernel in addition to the explicit ones declared for the function.
+ unsigned getImplicitArgNumBytes(const Function &F) const {
+ if (isMesaKernel(F))
+ return 16;
+ return AMDGPU::getIntegerAttribute(F, "amdgpu-implicitarg-num-bytes", 0);
+ }
+ uint64_t getExplicitKernArgSize(const Function &F,
+ unsigned &MaxAlign) const;
+ unsigned getKernArgSegmentSize(const Function &F,
+ unsigned &MaxAlign) const;
+
+ virtual ~AMDGPUSubtarget() {}
+};
+
+class GCNSubtarget : public AMDGPUGenSubtargetInfo,
+ public AMDGPUSubtarget {
+public:
enum {
ISAVersion0_0_0,
ISAVersion6_0_0,
@@ -67,13 +258,14 @@ public:
ISAVersion7_0_2,
ISAVersion7_0_3,
ISAVersion7_0_4,
- ISAVersion8_0_0,
ISAVersion8_0_1,
ISAVersion8_0_2,
ISAVersion8_0_3,
ISAVersion8_1_0,
ISAVersion9_0_0,
- ISAVersion9_0_2
+ ISAVersion9_0_2,
+ ISAVersion9_0_4,
+ ISAVersion9_0_6,
};
enum TrapHandlerAbi {
@@ -96,13 +288,18 @@ public:
LLVMTrapHandlerRegValue = 1
};
+private:
+ /// GlobalISel related APIs.
+ std::unique_ptr<AMDGPUCallLowering> CallLoweringInfo;
+ std::unique_ptr<InstructionSelector> InstSelector;
+ std::unique_ptr<LegalizerInfo> Legalizer;
+ std::unique_ptr<RegisterBankInfo> RegBankInfo;
+
protected:
// Basic subtarget description.
Triple TargetTriple;
- Generation Gen;
+ unsigned Gen;
unsigned IsaVersion;
- unsigned WavefrontSize;
- int LocalMemorySize;
int LDSBankCount;
unsigned MaxPrivateElementSize;
@@ -111,9 +308,7 @@ protected:
bool HalfRate64Ops;
  // Dynamically set bits that enable features.
- bool FP32Denormals;
bool FP64FP16Denormals;
- bool FPExceptions;
bool DX10Clamp;
bool FlatForGlobal;
bool AutoWaitcntBeforeBarrier;
@@ -124,47 +319,48 @@ protected:
bool EnableXNACK;
bool TrapHandler;
bool DebuggerInsertNops;
- bool DebuggerReserveRegs;
bool DebuggerEmitPrologue;
// Used as options.
bool EnableHugePrivateBuffer;
bool EnableVGPRSpilling;
- bool EnablePromoteAlloca;
bool EnableLoadStoreOpt;
bool EnableUnsafeDSOffsetFolding;
bool EnableSIScheduler;
+ bool EnableDS128;
bool DumpCode;
  // Subtarget properties statically set by tablegen
bool FP64;
bool FMA;
+ bool MIMG_R128;
bool IsGCN;
bool GCN3Encoding;
bool CIInsts;
bool GFX9Insts;
bool SGPRInitBug;
bool HasSMemRealTime;
- bool Has16BitInsts;
bool HasIntClamp;
- bool HasVOP3PInsts;
- bool HasMadMixInsts;
+ bool HasFmaMixInsts;
bool HasMovrel;
bool HasVGPRIndexMode;
bool HasScalarStores;
+ bool HasScalarAtomics;
bool HasInv2PiInlineImm;
- bool HasSDWA;
bool HasSDWAOmod;
bool HasSDWAScalar;
bool HasSDWASdst;
bool HasSDWAMac;
bool HasSDWAOutModsVOPC;
bool HasDPP;
+ bool HasDLInsts;
+ bool D16PreservesUnusedBits;
bool FlatAddressSpace;
bool FlatInstOffsets;
bool FlatGlobalInsts;
bool FlatScratchInsts;
bool AddNoCarryInsts;
+ bool HasUnpackedD16VMem;
bool R600ALUInst;
bool CaymanISA;
bool CFALUBug;
@@ -175,67 +371,68 @@ protected:
// Dummy feature to use for assembler in tablegen.
bool FeatureDisable;
- InstrItineraryData InstrItins;
SelectionDAGTargetInfo TSInfo;
AMDGPUAS AS;
+private:
+ SIInstrInfo InstrInfo;
+ SITargetLowering TLInfo;
+ SIFrameLowering FrameLowering;
public:
- AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
- const TargetMachine &TM);
- ~AMDGPUSubtarget() override;
+ GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
+ const GCNTargetMachine &TM);
+ ~GCNSubtarget() override;
- AMDGPUSubtarget &initializeSubtargetDependencies(const Triple &TT,
+ GCNSubtarget &initializeSubtargetDependencies(const Triple &TT,
StringRef GPU, StringRef FS);
- const AMDGPUInstrInfo *getInstrInfo() const override = 0;
- const AMDGPUFrameLowering *getFrameLowering() const override = 0;
- const AMDGPUTargetLowering *getTargetLowering() const override = 0;
- const AMDGPURegisterInfo *getRegisterInfo() const override = 0;
+ const SIInstrInfo *getInstrInfo() const override {
+ return &InstrInfo;
+ }
- const InstrItineraryData *getInstrItineraryData() const override {
- return &InstrItins;
+ const SIFrameLowering *getFrameLowering() const override {
+ return &FrameLowering;
}
- // Nothing implemented, just prevent crashes on use.
- const SelectionDAGTargetInfo *getSelectionDAGInfo() const override {
- return &TSInfo;
+ const SITargetLowering *getTargetLowering() const override {
+ return &TLInfo;
}
- void ParseSubtargetFeatures(StringRef CPU, StringRef FS);
+ const SIRegisterInfo *getRegisterInfo() const override {
+ return &InstrInfo.getRegisterInfo();
+ }
- bool isAmdHsaOS() const {
- return TargetTriple.getOS() == Triple::AMDHSA;
+ const CallLowering *getCallLowering() const override {
+ return CallLoweringInfo.get();
}
- bool isMesa3DOS() const {
- return TargetTriple.getOS() == Triple::Mesa3D;
+ const InstructionSelector *getInstructionSelector() const override {
+ return InstSelector.get();
}
- bool isOpenCLEnv() const {
- return TargetTriple.getEnvironment() == Triple::OpenCL ||
- TargetTriple.getEnvironmentName() == "amdgizcl";
+ const LegalizerInfo *getLegalizerInfo() const override {
+ return Legalizer.get();
}
- bool isAmdPalOS() const {
- return TargetTriple.getOS() == Triple::AMDPAL;
+ const RegisterBankInfo *getRegBankInfo() const override {
+ return RegBankInfo.get();
}
- Generation getGeneration() const {
- return Gen;
+ // Nothing implemented, just prevent crashes on use.
+ const SelectionDAGTargetInfo *getSelectionDAGInfo() const override {
+ return &TSInfo;
}
- unsigned getWavefrontSize() const {
- return WavefrontSize;
+ void ParseSubtargetFeatures(StringRef CPU, StringRef FS);
+
+ Generation getGeneration() const {
+ return (Generation)Gen;
}
unsigned getWavefrontSizeLog2() const {
return Log2_32(WavefrontSize);
}
- int getLocalMemorySize() const {
- return LocalMemorySize;
- }
-
int getLDSBankCount() const {
return LDSBankCount;
}
@@ -248,19 +445,19 @@ public:
return AS;
}
- bool has16BitInsts() const {
- return Has16BitInsts;
- }
-
bool hasIntClamp() const {
return HasIntClamp;
}
- bool hasVOP3PInsts() const {
- return HasVOP3PInsts;
+ bool hasFP64() const {
+ return FP64;
}
- bool hasFP64() const {
+ bool hasMIMG_R128() const {
+ return MIMG_R128;
+ }
+
+ bool hasHWFP64() const {
return FP64;
}
@@ -273,15 +470,15 @@ public:
}
bool hasAddr64() const {
- return (getGeneration() < VOLCANIC_ISLANDS);
+ return (getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS);
}
bool hasBFE() const {
- return (getGeneration() >= EVERGREEN);
+ return true;
}
bool hasBFI() const {
- return (getGeneration() >= EVERGREEN);
+ return true;
}
bool hasBFM() const {
@@ -289,62 +486,31 @@ public:
}
bool hasBCNT(unsigned Size) const {
- if (Size == 32)
- return (getGeneration() >= EVERGREEN);
-
- if (Size == 64)
- return (getGeneration() >= SOUTHERN_ISLANDS);
-
- return false;
- }
-
- bool hasMulU24() const {
- return (getGeneration() >= EVERGREEN);
- }
-
- bool hasMulI24() const {
- return (getGeneration() >= SOUTHERN_ISLANDS ||
- hasCaymanISA());
+ return true;
}
bool hasFFBL() const {
- return (getGeneration() >= EVERGREEN);
+ return true;
}
bool hasFFBH() const {
- return (getGeneration() >= EVERGREEN);
+ return true;
}
bool hasMed3_16() const {
- return getGeneration() >= GFX9;
+ return getGeneration() >= AMDGPUSubtarget::GFX9;
}
bool hasMin3Max3_16() const {
- return getGeneration() >= GFX9;
- }
-
- bool hasMadMixInsts() const {
- return HasMadMixInsts;
+ return getGeneration() >= AMDGPUSubtarget::GFX9;
}
- bool hasSBufferLoadStoreAtomicDwordxN() const {
- // Only use the "x1" variants on GFX9 or don't use the buffer variants.
- // For x2 and higher variants, if the accessed region spans 2 VM pages and
- // the second page is unmapped, the hw hangs.
- // TODO: There is one future GFX9 chip that doesn't have this bug.
- return getGeneration() != GFX9;
+ bool hasFmaMixInsts() const {
+ return HasFmaMixInsts;
}
bool hasCARRY() const {
- return (getGeneration() >= EVERGREEN);
- }
-
- bool hasBORROW() const {
- return (getGeneration() >= EVERGREEN);
- }
-
- bool hasCaymanISA() const {
- return CaymanISA;
+ return true;
}
bool hasFMA() const {
@@ -359,10 +525,6 @@ public:
return EnableHugePrivateBuffer;
}
- bool isPromoteAllocaEnabled() const {
- return EnablePromoteAlloca;
- }
-
bool unsafeDSOffsetFoldingEnabled() const {
return EnableUnsafeDSOffsetFolding;
}
@@ -376,23 +538,10 @@ public:
unsigned getMaxLocalMemSizeWithWaveCount(unsigned WaveCount,
const Function &) const;
- /// Inverse of getMaxLocalMemWithWaveCount. Return the maximum wavecount if
- /// the given LDS memory size is the only constraint.
- unsigned getOccupancyWithLocalMemSize(uint32_t Bytes, const Function &) const;
-
- unsigned getOccupancyWithLocalMemSize(const MachineFunction &MF) const {
- const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
- return getOccupancyWithLocalMemSize(MFI->getLDSSize(), MF.getFunction());
- }
-
bool hasFP16Denormals() const {
return FP64FP16Denormals;
}
- bool hasFP32Denormals() const {
- return FP32Denormals;
- }
-
bool hasFP64Denormals() const {
return FP64FP16Denormals;
}
@@ -401,10 +550,6 @@ public:
return getGeneration() >= AMDGPUSubtarget::GFX9;
}
- bool hasFPExceptions() const {
- return FPExceptions;
- }
-
bool enableDX10Clamp() const {
return DX10Clamp;
}
@@ -417,6 +562,12 @@ public:
return FlatForGlobal;
}
+ /// \returns If target supports ds_read/write_b128 and user enables generation
+ /// of ds_read/write_b128.
+ bool useDS128() const {
+ return CIInsts && EnableDS128;
+ }
+
/// \returns If MUBUF instructions always perform range checking, even for
/// buffer resources used for private memory access.
bool privateMemoryResourceIsRangeChecked() const {
@@ -440,7 +591,7 @@ public:
}
bool hasApertureRegs() const {
- return HasApertureRegs;
+ return HasApertureRegs;
}
bool isTrapHandlerEnabled() const {
@@ -467,6 +618,10 @@ public:
return FlatScratchInsts;
}
+ bool hasFlatLgkmVMemCountInOrder() const {
+ return getGeneration() > GFX9;
+ }
+
bool hasD16LoadStore() const {
return getGeneration() >= GFX9;
}
@@ -481,31 +636,19 @@ public:
return AddNoCarryInsts;
}
- bool isMesaKernel(const MachineFunction &MF) const {
- return isMesa3DOS() && !AMDGPU::isShader(MF.getFunction().getCallingConv());
+ bool hasUnpackedD16VMem() const {
+ return HasUnpackedD16VMem;
}
// Covers VS/PS/CS graphics shaders
- bool isMesaGfxShader(const MachineFunction &MF) const {
- return isMesa3DOS() && AMDGPU::isShader(MF.getFunction().getCallingConv());
- }
-
- bool isAmdCodeObjectV2(const MachineFunction &MF) const {
- return isAmdHsaOS() || isMesaKernel(MF);
+ bool isMesaGfxShader(const Function &F) const {
+ return isMesa3DOS() && AMDGPU::isShader(F.getCallingConv());
}
bool hasMad64_32() const {
return getGeneration() >= SEA_ISLANDS;
}
- bool hasFminFmaxLegacy() const {
- return getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;
- }
-
- bool hasSDWA() const {
- return HasSDWA;
- }
-
bool hasSDWAOmod() const {
return HasSDWAOmod;
}
@@ -526,29 +669,28 @@ public:
return HasSDWAOutModsVOPC;
}
- /// \brief Returns the offset in bytes from the start of the input buffer
- /// of the first explicit kernel argument.
- unsigned getExplicitKernelArgOffset(const MachineFunction &MF) const {
- return isAmdCodeObjectV2(MF) ? 0 : 36;
+ bool vmemWriteNeedsExpWaitcnt() const {
+ return getGeneration() < SEA_ISLANDS;
}
- unsigned getAlignmentForImplicitArgPtr() const {
- return isAmdHsaOS() ? 8 : 4;
+ bool hasDLInsts() const {
+ return HasDLInsts;
}
- unsigned getImplicitArgNumBytes(const MachineFunction &MF) const {
- if (isMesaKernel(MF))
- return 16;
- if (isAmdHsaOS() && isOpenCLEnv())
- return 32;
- return 0;
+ bool d16PreservesUnusedBits() const {
+ return D16PreservesUnusedBits;
}
// Scratch is allocated in 256 dword per wave blocks for the entire
 // wavefront. When viewed from the perspective of an arbitrary workitem, this
// is 4-byte aligned.
+ //
+ // Only 4-byte alignment is really needed to access anything. Transformations
+ // on the pointer value itself may rely on the alignment / known low bits of
+ // the pointer. Set this to something above the minimum to avoid needing
+ // dynamic realignment in common cases.
unsigned getStackAlignment() const {
- return 4;
+ return 16;
}
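The layout described in the comment above can be made concrete with a small sketch. The helper below is purely illustrative (it is not part of the subtarget API) and assumes a 64-lane wave with 4-byte element swizzling: consecutive dwords of a lane's stack are strided by the wave size, which is why the hardware only guarantees 4-byte alignment per lane, while the subtarget now reports 16 so common code can avoid dynamic realignment.

unsigned perLaneScratchByteOffset(unsigned Lane, unsigned DwordIndex,
                                  unsigned WaveSize = 64) {
  // Dword D of lane L lives at (D * WaveSize + L) * 4 bytes within the wave's
  // scratch block, so per-lane addresses advance in 4-byte steps.
  return (DwordIndex * WaveSize + Lane) * 4;
}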
bool enableMachineScheduler() const override {
@@ -559,184 +701,43 @@ public:
return true;
}
- void setScalarizeGlobalBehavior(bool b) { ScalarizeGlobal = b;}
- bool getScalarizeGlobalBehavior() const { return ScalarizeGlobal;}
+ void setScalarizeGlobalBehavior(bool b) { ScalarizeGlobal = b; }
+ bool getScalarizeGlobalBehavior() const { return ScalarizeGlobal; }
/// \returns Number of execution units per compute unit supported by the
/// subtarget.
unsigned getEUsPerCU() const {
- return AMDGPU::IsaInfo::getEUsPerCU(getFeatureBits());
- }
-
- /// \returns Maximum number of work groups per compute unit supported by the
- /// subtarget and limited by given \p FlatWorkGroupSize.
- unsigned getMaxWorkGroupsPerCU(unsigned FlatWorkGroupSize) const {
- return AMDGPU::IsaInfo::getMaxWorkGroupsPerCU(getFeatureBits(),
- FlatWorkGroupSize);
+ return AMDGPU::IsaInfo::getEUsPerCU(MCSubtargetInfo::getFeatureBits());
}
/// \returns Maximum number of waves per compute unit supported by the
/// subtarget without any kind of limitation.
unsigned getMaxWavesPerCU() const {
- return AMDGPU::IsaInfo::getMaxWavesPerCU(getFeatureBits());
+ return AMDGPU::IsaInfo::getMaxWavesPerCU(MCSubtargetInfo::getFeatureBits());
}
/// \returns Maximum number of waves per compute unit supported by the
/// subtarget and limited by given \p FlatWorkGroupSize.
unsigned getMaxWavesPerCU(unsigned FlatWorkGroupSize) const {
- return AMDGPU::IsaInfo::getMaxWavesPerCU(getFeatureBits(),
+ return AMDGPU::IsaInfo::getMaxWavesPerCU(MCSubtargetInfo::getFeatureBits(),
FlatWorkGroupSize);
}
- /// \returns Minimum number of waves per execution unit supported by the
- /// subtarget.
- unsigned getMinWavesPerEU() const {
- return AMDGPU::IsaInfo::getMinWavesPerEU(getFeatureBits());
- }
-
/// \returns Maximum number of waves per execution unit supported by the
/// subtarget without any kind of limitation.
unsigned getMaxWavesPerEU() const {
- return AMDGPU::IsaInfo::getMaxWavesPerEU(getFeatureBits());
- }
-
- /// \returns Maximum number of waves per execution unit supported by the
- /// subtarget and limited by given \p FlatWorkGroupSize.
- unsigned getMaxWavesPerEU(unsigned FlatWorkGroupSize) const {
- return AMDGPU::IsaInfo::getMaxWavesPerEU(getFeatureBits(),
- FlatWorkGroupSize);
- }
-
- /// \returns Minimum flat work group size supported by the subtarget.
- unsigned getMinFlatWorkGroupSize() const {
- return AMDGPU::IsaInfo::getMinFlatWorkGroupSize(getFeatureBits());
- }
-
- /// \returns Maximum flat work group size supported by the subtarget.
- unsigned getMaxFlatWorkGroupSize() const {
- return AMDGPU::IsaInfo::getMaxFlatWorkGroupSize(getFeatureBits());
+ return AMDGPU::IsaInfo::getMaxWavesPerEU();
}
/// \returns Number of waves per work group supported by the subtarget and
/// limited by given \p FlatWorkGroupSize.
unsigned getWavesPerWorkGroup(unsigned FlatWorkGroupSize) const {
- return AMDGPU::IsaInfo::getWavesPerWorkGroup(getFeatureBits(),
- FlatWorkGroupSize);
+ return AMDGPU::IsaInfo::getWavesPerWorkGroup(
+ MCSubtargetInfo::getFeatureBits(), FlatWorkGroupSize);
}
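For intuition, the IsaInfo helper essentially performs a ceiling division of the flat work-group size by the wavefront size. A minimal sketch of that arithmetic, assuming a 64-wide wavefront (the helper name is illustrative, not the IsaInfo implementation):

unsigned wavesPerWorkGroup(unsigned FlatWorkGroupSize,
                           unsigned WavefrontSize = 64) {
  // A 256-item work-group on a 64-wide machine needs 4 waves; 65 items need 2.
  return (FlatWorkGroupSize + WavefrontSize - 1) / WavefrontSize;
}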
- /// \returns Default range flat work group size for a calling convention.
- std::pair<unsigned, unsigned> getDefaultFlatWorkGroupSize(CallingConv::ID CC) const;
-
- /// \returns Subtarget's default pair of minimum/maximum flat work group sizes
- /// for function \p F, or minimum/maximum flat work group sizes explicitly
- /// requested using "amdgpu-flat-work-group-size" attribute attached to
- /// function \p F.
- ///
- /// \returns Subtarget's default values if explicitly requested values cannot
- /// be converted to integer, or violate subtarget's specifications.
- std::pair<unsigned, unsigned> getFlatWorkGroupSizes(const Function &F) const;
-
- /// \returns Subtarget's default pair of minimum/maximum number of waves per
- /// execution unit for function \p F, or minimum/maximum number of waves per
- /// execution unit explicitly requested using "amdgpu-waves-per-eu" attribute
- /// attached to function \p F.
- ///
- /// \returns Subtarget's default values if explicitly requested values cannot
- /// be converted to integer, violate subtarget's specifications, or are not
- /// compatible with minimum/maximum number of waves limited by flat work group
- /// size, register usage, and/or lds usage.
- std::pair<unsigned, unsigned> getWavesPerEU(const Function &F) const;
-
- /// Creates value range metadata on a workitemid.* intrinsic call or load.
- bool makeLIDRangeMetadata(Instruction *I) const;
-};
-
-class R600Subtarget final : public AMDGPUSubtarget {
-private:
- R600InstrInfo InstrInfo;
- R600FrameLowering FrameLowering;
- R600TargetLowering TLInfo;
-
-public:
- R600Subtarget(const Triple &TT, StringRef CPU, StringRef FS,
- const TargetMachine &TM);
-
- const R600InstrInfo *getInstrInfo() const override {
- return &InstrInfo;
- }
-
- const R600FrameLowering *getFrameLowering() const override {
- return &FrameLowering;
- }
-
- const R600TargetLowering *getTargetLowering() const override {
- return &TLInfo;
- }
-
- const R600RegisterInfo *getRegisterInfo() const override {
- return &InstrInfo.getRegisterInfo();
- }
-
- bool hasCFAluBug() const {
- return CFALUBug;
- }
-
- bool hasVertexCache() const {
- return HasVertexCache;
- }
-
- short getTexVTXClauseSize() const {
- return TexVTXClauseSize;
- }
-};
-
-class SISubtarget final : public AMDGPUSubtarget {
-private:
- SIInstrInfo InstrInfo;
- SIFrameLowering FrameLowering;
- SITargetLowering TLInfo;
-
- /// GlobalISel related APIs.
- std::unique_ptr<AMDGPUCallLowering> CallLoweringInfo;
- std::unique_ptr<InstructionSelector> InstSelector;
- std::unique_ptr<LegalizerInfo> Legalizer;
- std::unique_ptr<RegisterBankInfo> RegBankInfo;
-
-public:
- SISubtarget(const Triple &TT, StringRef CPU, StringRef FS,
- const TargetMachine &TM);
-
- const SIInstrInfo *getInstrInfo() const override {
- return &InstrInfo;
- }
-
- const SIFrameLowering *getFrameLowering() const override {
- return &FrameLowering;
- }
-
- const SITargetLowering *getTargetLowering() const override {
- return &TLInfo;
- }
-
- const CallLowering *getCallLowering() const override {
- return CallLoweringInfo.get();
- }
-
- const InstructionSelector *getInstructionSelector() const override {
- return InstSelector.get();
- }
-
- const LegalizerInfo *getLegalizerInfo() const override {
- return Legalizer.get();
- }
-
- const RegisterBankInfo *getRegBankInfo() const override {
- return RegBankInfo.get();
- }
-
- const SIRegisterInfo *getRegisterInfo() const override {
- return &InstrInfo.getRegisterInfo();
- }
+ // static wrappers
+ static bool hasHalfRate64Ops(const TargetSubtargetInfo &STI);
// XXX - Why is this here if it isn't in the default pass set?
bool enableEarlyIfConversion() const override {
@@ -746,7 +747,7 @@ public:
void overrideSchedPolicy(MachineSchedPolicy &Policy,
unsigned NumRegionInstrs) const override;
- bool isVGPRSpillingEnabled(const Function& F) const;
+ bool isVGPRSpillingEnabled(const Function &F) const;
unsigned getMaxNumUserSGPRs() const {
return 16;
@@ -776,6 +777,10 @@ public:
return HasScalarStores;
}
+ bool hasScalarAtomics() const {
+ return HasScalarAtomics;
+ }
+
bool hasInv2PiInlineImm() const {
return HasInv2PiInlineImm;
}
@@ -789,18 +794,13 @@ public:
}
bool debuggerSupported() const {
- return debuggerInsertNops() && debuggerReserveRegs() &&
- debuggerEmitPrologue();
+ return debuggerInsertNops() && debuggerEmitPrologue();
}
bool debuggerInsertNops() const {
return DebuggerInsertNops;
}
- bool debuggerReserveRegs() const {
- return DebuggerReserveRegs;
- }
-
bool debuggerEmitPrologue() const {
return DebuggerEmitPrologue;
}
@@ -829,52 +829,61 @@ public:
return getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS;
}
- unsigned getKernArgSegmentSize(const MachineFunction &MF,
- unsigned ExplictArgBytes) const;
-
- /// Return the maximum number of waves per SIMD for kernels using \p SGPRs SGPRs
+ /// Return the maximum number of waves per SIMD for kernels using \p SGPRs
+ /// SGPRs
unsigned getOccupancyWithNumSGPRs(unsigned SGPRs) const;
- /// Return the maximum number of waves per SIMD for kernels using \p VGPRs VGPRs
+ /// Return the maximum number of waves per SIMD for kernels using \p VGPRs
+ /// VGPRs
unsigned getOccupancyWithNumVGPRs(unsigned VGPRs) const;
/// \returns true if the flat_scratch register should be initialized with the
/// pointer to the wave's scratch memory rather than a size and offset.
bool flatScratchIsPointer() const {
+ return getGeneration() >= AMDGPUSubtarget::GFX9;
+ }
+
+ /// \returns true if the machine has merged shaders in which s0-s7 are
+ /// reserved by the hardware and user SGPRs start at s8
+ bool hasMergedShaders() const {
return getGeneration() >= GFX9;
}
/// \returns SGPR allocation granularity supported by the subtarget.
unsigned getSGPRAllocGranule() const {
- return AMDGPU::IsaInfo::getSGPRAllocGranule(getFeatureBits());
+ return AMDGPU::IsaInfo::getSGPRAllocGranule(
+ MCSubtargetInfo::getFeatureBits());
}
/// \returns SGPR encoding granularity supported by the subtarget.
unsigned getSGPREncodingGranule() const {
- return AMDGPU::IsaInfo::getSGPREncodingGranule(getFeatureBits());
+ return AMDGPU::IsaInfo::getSGPREncodingGranule(
+ MCSubtargetInfo::getFeatureBits());
}
/// \returns Total number of SGPRs supported by the subtarget.
unsigned getTotalNumSGPRs() const {
- return AMDGPU::IsaInfo::getTotalNumSGPRs(getFeatureBits());
+ return AMDGPU::IsaInfo::getTotalNumSGPRs(MCSubtargetInfo::getFeatureBits());
}
/// \returns Addressable number of SGPRs supported by the subtarget.
unsigned getAddressableNumSGPRs() const {
- return AMDGPU::IsaInfo::getAddressableNumSGPRs(getFeatureBits());
+ return AMDGPU::IsaInfo::getAddressableNumSGPRs(
+ MCSubtargetInfo::getFeatureBits());
}
/// \returns Minimum number of SGPRs that meets the given number of waves per
/// execution unit requirement supported by the subtarget.
unsigned getMinNumSGPRs(unsigned WavesPerEU) const {
- return AMDGPU::IsaInfo::getMinNumSGPRs(getFeatureBits(), WavesPerEU);
+ return AMDGPU::IsaInfo::getMinNumSGPRs(MCSubtargetInfo::getFeatureBits(),
+ WavesPerEU);
}
/// \returns Maximum number of SGPRs that meets the given number of waves per
/// execution unit requirement supported by the subtarget.
unsigned getMaxNumSGPRs(unsigned WavesPerEU, bool Addressable) const {
- return AMDGPU::IsaInfo::getMaxNumSGPRs(getFeatureBits(), WavesPerEU,
- Addressable);
+ return AMDGPU::IsaInfo::getMaxNumSGPRs(MCSubtargetInfo::getFeatureBits(),
+ WavesPerEU, Addressable);
}
/// \returns Reserved number of SGPRs for given function \p MF.
@@ -892,39 +901,39 @@ public:
/// \returns VGPR allocation granularity supported by the subtarget.
unsigned getVGPRAllocGranule() const {
- return AMDGPU::IsaInfo::getVGPRAllocGranule(getFeatureBits());
+ return AMDGPU::IsaInfo::getVGPRAllocGranule(
+ MCSubtargetInfo::getFeatureBits());
}
/// \returns VGPR encoding granularity supported by the subtarget.
unsigned getVGPREncodingGranule() const {
- return AMDGPU::IsaInfo::getVGPREncodingGranule(getFeatureBits());
+ return AMDGPU::IsaInfo::getVGPREncodingGranule(
+ MCSubtargetInfo::getFeatureBits());
}
/// \returns Total number of VGPRs supported by the subtarget.
unsigned getTotalNumVGPRs() const {
- return AMDGPU::IsaInfo::getTotalNumVGPRs(getFeatureBits());
+ return AMDGPU::IsaInfo::getTotalNumVGPRs(MCSubtargetInfo::getFeatureBits());
}
/// \returns Addressable number of VGPRs supported by the subtarget.
unsigned getAddressableNumVGPRs() const {
- return AMDGPU::IsaInfo::getAddressableNumVGPRs(getFeatureBits());
+ return AMDGPU::IsaInfo::getAddressableNumVGPRs(
+ MCSubtargetInfo::getFeatureBits());
}
/// \returns Minimum number of VGPRs that meets given number of waves per
/// execution unit requirement supported by the subtarget.
unsigned getMinNumVGPRs(unsigned WavesPerEU) const {
- return AMDGPU::IsaInfo::getMinNumVGPRs(getFeatureBits(), WavesPerEU);
+ return AMDGPU::IsaInfo::getMinNumVGPRs(MCSubtargetInfo::getFeatureBits(),
+ WavesPerEU);
}
/// \returns Maximum number of VGPRs that meets given number of waves per
/// execution unit requirement supported by the subtarget.
unsigned getMaxNumVGPRs(unsigned WavesPerEU) const {
- return AMDGPU::IsaInfo::getMaxNumVGPRs(getFeatureBits(), WavesPerEU);
- }
-
- /// \returns Reserved number of VGPRs for given function \p MF.
- unsigned getReservedNumVGPRs(const MachineFunction &MF) const {
- return debuggerReserveRegs() ? 4 : 0;
+ return AMDGPU::IsaInfo::getMaxNumVGPRs(MCSubtargetInfo::getFeatureBits(),
+ WavesPerEU);
}
/// \returns Maximum number of VGPRs that meets number of waves per execution
@@ -942,6 +951,119 @@ public:
const override;
};
+class R600Subtarget final : public R600GenSubtargetInfo,
+ public AMDGPUSubtarget {
+private:
+ R600InstrInfo InstrInfo;
+ R600FrameLowering FrameLowering;
+ bool FMA;
+ bool CaymanISA;
+ bool CFALUBug;
+ bool DX10Clamp;
+ bool HasVertexCache;
+ bool R600ALUInst;
+ bool FP64;
+ short TexVTXClauseSize;
+ Generation Gen;
+ R600TargetLowering TLInfo;
+ InstrItineraryData InstrItins;
+ SelectionDAGTargetInfo TSInfo;
+ AMDGPUAS AS;
+
+public:
+ R600Subtarget(const Triple &TT, StringRef CPU, StringRef FS,
+ const TargetMachine &TM);
+
+ const R600InstrInfo *getInstrInfo() const override { return &InstrInfo; }
+
+ const R600FrameLowering *getFrameLowering() const override {
+ return &FrameLowering;
+ }
+
+ const R600TargetLowering *getTargetLowering() const override {
+ return &TLInfo;
+ }
+
+ const R600RegisterInfo *getRegisterInfo() const override {
+ return &InstrInfo.getRegisterInfo();
+ }
+
+ const InstrItineraryData *getInstrItineraryData() const override {
+ return &InstrItins;
+ }
+
+ // Nothing implemented, just prevent crashes on use.
+ const SelectionDAGTargetInfo *getSelectionDAGInfo() const override {
+ return &TSInfo;
+ }
+
+ void ParseSubtargetFeatures(StringRef CPU, StringRef FS);
+
+ Generation getGeneration() const {
+ return Gen;
+ }
+
+ unsigned getStackAlignment() const {
+ return 4;
+ }
+
+ R600Subtarget &initializeSubtargetDependencies(const Triple &TT,
+ StringRef GPU, StringRef FS);
+
+ bool hasBFE() const {
+ return (getGeneration() >= EVERGREEN);
+ }
+
+ bool hasBFI() const {
+ return (getGeneration() >= EVERGREEN);
+ }
+
+ bool hasBCNT(unsigned Size) const {
+ if (Size == 32)
+ return (getGeneration() >= EVERGREEN);
+
+ return false;
+ }
+
+ bool hasBORROW() const {
+ return (getGeneration() >= EVERGREEN);
+ }
+
+ bool hasCARRY() const {
+ return (getGeneration() >= EVERGREEN);
+ }
+
+ bool hasCaymanISA() const {
+ return CaymanISA;
+ }
+
+ bool hasFFBL() const {
+ return (getGeneration() >= EVERGREEN);
+ }
+
+ bool hasFFBH() const {
+ return (getGeneration() >= EVERGREEN);
+ }
+
+ bool hasFMA() const { return FMA; }
+
+ bool hasCFAluBug() const { return CFALUBug; }
+
+ bool hasVertexCache() const { return HasVertexCache; }
+
+ short getTexVTXClauseSize() const { return TexVTXClauseSize; }
+
+ AMDGPUAS getAMDGPUAS() const { return AS; }
+
+ bool enableMachineScheduler() const override {
+ return true;
+ }
+
+ bool enableSubRegLiveness() const override {
+ return true;
+ }
+};
+
} // end namespace llvm
#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUSUBTARGET_H
diff --git a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 2042dbf6d5e2..2205819c444f 100644
--- a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
//
/// \file
-/// \brief The AMDGPU target machine contains all of the hardware specific
+/// The AMDGPU target machine contains all of the hardware specific
/// information needed to emit code for R600 and SI GPUs.
//
//===----------------------------------------------------------------------===//
@@ -31,7 +31,6 @@
#include "llvm/CodeGen/GlobalISel/Legalizer.h"
#include "llvm/CodeGen/GlobalISel/RegBankSelect.h"
#include "llvm/CodeGen/Passes.h"
-#include "llvm/CodeGen/TargetLoweringObjectFile.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/Function.h"
@@ -40,6 +39,7 @@
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
#include "llvm/Transforms/IPO.h"
#include "llvm/Transforms/IPO/AlwaysInliner.h"
#include "llvm/Transforms/IPO/PassManagerBuilder.h"
@@ -79,7 +79,7 @@ static cl::opt<bool> EnableLoadStoreVectorizer(
cl::init(true),
cl::Hidden);
-// Option to to control global loads scalarization
+// Option to control global loads scalarization
static cl::opt<bool> ScalarizeGlobal(
"amdgpu-scalarize-global-loads",
cl::desc("Enable global load scalarization"),
@@ -110,12 +110,6 @@ static cl::opt<bool> EnableAMDGPUAliasAnalysis("enable-amdgpu-aa", cl::Hidden,
cl::desc("Enable AMDGPU Alias Analysis"),
cl::init(true));
-// Option to enable new waitcnt insertion pass.
-static cl::opt<bool> EnableSIInsertWaitcntsPass(
- "enable-si-insert-waitcnts",
- cl::desc("Use new waitcnt insertion pass"),
- cl::init(true));
-
// Option to run late CFG structurizer
static cl::opt<bool, true> LateCFGStructurize(
"amdgpu-late-structurize",
@@ -123,16 +117,23 @@ static cl::opt<bool, true> LateCFGStructurize(
cl::location(AMDGPUTargetMachine::EnableLateStructurizeCFG),
cl::Hidden);
-static cl::opt<bool> EnableAMDGPUFunctionCalls(
+static cl::opt<bool, true> EnableAMDGPUFunctionCalls(
"amdgpu-function-calls",
- cl::Hidden,
cl::desc("Enable AMDGPU function call support"),
- cl::init(false));
+ cl::location(AMDGPUTargetMachine::EnableFunctionCalls),
+ cl::init(false),
+ cl::Hidden);
// Enable lib calls simplifications
static cl::opt<bool> EnableLibCallSimplify(
"amdgpu-simplify-libcall",
- cl::desc("Enable mdgpu library simplifications"),
+ cl::desc("Enable amdgpu library simplifications"),
+ cl::init(true),
+ cl::Hidden);
+
+static cl::opt<bool> EnableLowerKernelArguments(
+ "amdgpu-ir-lower-kernel-arguments",
+ cl::desc("Lower kernel argument loads in IR pass"),
cl::init(true),
cl::Hidden);
@@ -147,6 +148,7 @@ extern "C" void LLVMInitializeAMDGPUTarget() {
initializeR600PacketizerPass(*PR);
initializeR600ExpandSpecialInstrsPassPass(*PR);
initializeR600VectorRegMergerPass(*PR);
+ initializeGlobalISel(*PR);
initializeAMDGPUDAGToDAGISelPass(*PR);
initializeSILowerI1CopiesPass(*PR);
initializeSIFixSGPRCopiesPass(*PR);
@@ -160,6 +162,8 @@ extern "C" void LLVMInitializeAMDGPUTarget() {
initializeAMDGPUAnnotateKernelFeaturesPass(*PR);
initializeAMDGPUAnnotateUniformValuesPass(*PR);
initializeAMDGPUArgumentUsageInfoPass(*PR);
+ initializeAMDGPULowerKernelArgumentsPass(*PR);
+ initializeAMDGPULowerKernelAttributesPass(*PR);
initializeAMDGPULowerIntrinsicsPass(*PR);
initializeAMDGPUOpenCLEnqueuedBlockLoweringPass(*PR);
initializeAMDGPUPromoteAllocaPass(*PR);
@@ -167,7 +171,6 @@ extern "C" void LLVMInitializeAMDGPUTarget() {
initializeAMDGPURewriteOutArgumentsPass(*PR);
initializeAMDGPUUnifyMetadataPass(*PR);
initializeSIAnnotateControlFlowPass(*PR);
- initializeSIInsertWaitsPass(*PR);
initializeSIInsertWaitcntsPass(*PR);
initializeSIWholeQuadModePass(*PR);
initializeSILowerControlFlowPass(*PR);
@@ -176,6 +179,7 @@ extern "C" void LLVMInitializeAMDGPUTarget() {
initializeSIDebuggerInsertNopsPass(*PR);
initializeSIOptimizeExecMaskingPass(*PR);
initializeSIFixWWMLivenessPass(*PR);
+ initializeSIFormMemoryClausesPass(*PR);
initializeAMDGPUUnifyDivergentExitNodesPass(*PR);
initializeAMDGPUAAWrapperPassPass(*PR);
initializeAMDGPUUseNativeCallsPass(*PR);
@@ -260,24 +264,15 @@ GCNILPSchedRegistry("gcn-ilp",
static StringRef computeDataLayout(const Triple &TT) {
if (TT.getArch() == Triple::r600) {
// 32-bit pointers.
- if (TT.getEnvironmentName() == "amdgiz" ||
- TT.getEnvironmentName() == "amdgizcl")
return "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128"
- "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-A5";
- return "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128"
- "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64";
+ "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5";
}
// 32-bit private, local, and region pointers. 64-bit global, constant and
// flat.
- if (TT.getEnvironmentName() == "amdgiz" ||
- TT.getEnvironmentName() == "amdgizcl")
- return "e-p:64:64-p1:64:64-p2:64:64-p3:32:32-p4:32:32-p5:32:32"
+ return "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32"
"-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128"
- "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-A5";
- return "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32"
- "-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128"
- "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64";
+ "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5";
}
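The address-space layout encoded in the new amdgcn string above can be inspected directly. The sketch below is illustrative only: it builds a DataLayout from that exact string and confirms that generic (flat) pointers are 64-bit while private (address space 5) pointers are 32-bit.

#include "llvm/IR/DataLayout.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;

void dumpAMDGCNPointerWidths() {
  DataLayout DL("e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32"
                "-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128"
                "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64"
                "-S32-A5");
  outs() << DL.getPointerSizeInBits(/*AS=*/0) << '\n'; // 64 (flat/generic)
  outs() << DL.getPointerSizeInBits(/*AS=*/5) << '\n'; // 32 (private)
}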
LLVM_READNONE
@@ -317,9 +312,10 @@ AMDGPUTargetMachine::AMDGPUTargetMachine(const Target &T, const Triple &TT,
initAsmInfo();
}
-AMDGPUTargetMachine::~AMDGPUTargetMachine() = default;
-
bool AMDGPUTargetMachine::EnableLateStructurizeCFG = false;
+bool AMDGPUTargetMachine::EnableFunctionCalls = false;
+
+AMDGPUTargetMachine::~AMDGPUTargetMachine() = default;
StringRef AMDGPUTargetMachine::getGPUName(const Function &F) const {
Attribute GPUAttr = F.getFnAttribute("target-cpu");
@@ -412,6 +408,10 @@ void AMDGPUTargetMachine::adjustPassManager(PassManagerBuilder &Builder) {
// Add infer address spaces pass to the opt pipeline after inlining
// but before SROA to increase SROA opportunities.
PM.add(createInferAddressSpacesPass());
+
+ // This should run after inlining to have any chance of doing anything,
+ // and before other cleanup optimizations.
+ PM.add(createAMDGPULowerKernelAttributesPass());
});
}
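For context, the block above runs inside adjustPassManager, which registers callbacks on PassManagerBuilder extension points. A frontend driving the legacy pass manager picks them up roughly as sketched below; this is an assumption-laden sketch that presumes the TargetMachine already exists and uses only the standard legacy-PM APIs.

#include "llvm/IR/LegacyPassManager.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Transforms/IPO/PassManagerBuilder.h"
using namespace llvm;

static void buildModulePipeline(TargetMachine &TM, legacy::PassManager &MPM) {
  PassManagerBuilder Builder;
  Builder.OptLevel = 2;
  TM.adjustPassManager(Builder); // installs the AMDGPU extension callbacks
  Builder.populateModulePassManager(MPM);
}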
@@ -449,6 +449,11 @@ const R600Subtarget *R600TargetMachine::getSubtargetImpl(
return I.get();
}
+TargetTransformInfo
+R600TargetMachine::getTargetTransformInfo(const Function &F) {
+ return TargetTransformInfo(R600TTIImpl(this, F));
+}
+
//===----------------------------------------------------------------------===//
// GCN Target Machine (SI+)
//===----------------------------------------------------------------------===//
@@ -461,7 +466,7 @@ GCNTargetMachine::GCNTargetMachine(const Target &T, const Triple &TT,
CodeGenOpt::Level OL, bool JIT)
: AMDGPUTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) {}
-const SISubtarget *GCNTargetMachine::getSubtargetImpl(const Function &F) const {
+const GCNSubtarget *GCNTargetMachine::getSubtargetImpl(const Function &F) const {
StringRef GPU = getGPUName(F);
StringRef FS = getFeatureString(F);
@@ -474,7 +479,7 @@ const SISubtarget *GCNTargetMachine::getSubtargetImpl(const Function &F) const {
// creation will depend on the TM and the code generation flags on the
// function that reside in TargetOptions.
resetTargetOptions(F);
- I = llvm::make_unique<SISubtarget>(TargetTriple, GPU, FS, *this);
+ I = llvm::make_unique<GCNSubtarget>(TargetTriple, GPU, FS, *this);
}
I->setScalarizeGlobalBehavior(ScalarizeGlobal);
@@ -482,6 +487,11 @@ const SISubtarget *GCNTargetMachine::getSubtargetImpl(const Function &F) const {
return I.get();
}
+TargetTransformInfo
+GCNTargetMachine::getTargetTransformInfo(const Function &F) {
+ return TargetTransformInfo(GCNTTIImpl(this, F));
+}
+
//===----------------------------------------------------------------------===//
// AMDGPU Pass Setup
//===----------------------------------------------------------------------===//
@@ -571,11 +581,6 @@ public:
} // end anonymous namespace
-TargetTransformInfo
-AMDGPUTargetMachine::getTargetTransformInfo(const Function &F) {
- return TargetTransformInfo(AMDGPUTTIImpl(this, F));
-}
-
void AMDGPUPassConfig::addEarlyCSEOrGVNPass() {
if (getOptLevel() == CodeGenOpt::Aggressive)
addPass(createGVNPass());
@@ -584,6 +589,7 @@ void AMDGPUPassConfig::addEarlyCSEOrGVNPass() {
}
void AMDGPUPassConfig::addStraightLineScalarOptimizationPasses() {
+ addPass(createLICMPass());
addPass(createSeparateConstOffsetFromGEPPass());
addPass(createSpeculativeExecutionPass());
 // ReassociateGEPs exposes more opportunities for SLSR. See
@@ -629,7 +635,8 @@ void AMDGPUPassConfig::addIRPasses() {
}
// Handle uses of OpenCL image2d_t, image3d_t and sampler_t arguments.
- addPass(createAMDGPUOpenCLImageTypeLoweringPass());
+ if (TM.getTargetTriple().getArch() == Triple::r600)
+ addPass(createR600OpenCLImageTypeLoweringPass());
// Replace OpenCL enqueued block function pointers with global variables.
addPass(createAMDGPUOpenCLEnqueuedBlockLoweringPass());
@@ -672,6 +679,10 @@ void AMDGPUPassConfig::addIRPasses() {
}
void AMDGPUPassConfig::addCodeGenPrepare() {
+ if (TM->getTargetTriple().getArch() == Triple::amdgcn &&
+ EnableLowerKernelArguments)
+ addPass(createAMDGPULowerKernelArgumentsPass());
+
TargetPassConfig::addCodeGenPrepare();
if (EnableLoadStoreVectorizer)
@@ -739,7 +750,7 @@ TargetPassConfig *R600TargetMachine::createPassConfig(PassManagerBase &PM) {
ScheduleDAGInstrs *GCNPassConfig::createMachineScheduler(
MachineSchedContext *C) const {
- const SISubtarget &ST = C->MF->getSubtarget<SISubtarget>();
+ const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
if (ST.enableSIScheduler())
return createSIMachineScheduler(C);
return createGCNMaxOccupancyMachineScheduler(C);
@@ -782,7 +793,7 @@ void GCNPassConfig::addMachineSSAOptimization() {
addPass(&SILoadStoreOptimizerID);
if (EnableSDWAPeephole) {
addPass(&SIPeepholeSDWAID);
- addPass(&MachineLICMID);
+ addPass(&EarlyMachineLICMID);
addPass(&MachineCSEID);
addPass(&SIFoldOperandsID);
addPass(&DeadMachineInstructionElimID);
@@ -851,6 +862,8 @@ void GCNPassConfig::addFastRegAlloc(FunctionPass *RegAllocPass) {
void GCNPassConfig::addOptimizedRegAlloc(FunctionPass *RegAllocPass) {
insertPass(&MachineSchedulerID, &SIOptimizeExecMaskingPreRAID);
+ insertPass(&SIOptimizeExecMaskingPreRAID, &SIFormMemoryClausesID);
+
// This must be run immediately after phi elimination and before
// TwoAddressInstructions, otherwise the processing of the tied operand of
// SI_ELSE will introduce a copy of the tied operand source after the else.
@@ -873,6 +886,10 @@ void GCNPassConfig::addPreSched2() {
}
void GCNPassConfig::addPreEmitPass() {
+ addPass(createSIMemoryLegalizerPass());
+ addPass(createSIInsertWaitcntsPass());
+ addPass(createSIShrinkInstructionsPass());
+
// The hazard recognizer that runs as part of the post-ra scheduler does not
 // guarantee to be able to handle all hazards correctly. This is because if there
// are multiple scheduling regions in a basic block, the regions are scheduled
@@ -881,15 +898,12 @@ void GCNPassConfig::addPreEmitPass() {
//
// Here we add a stand-alone hazard recognizer pass which can handle all
// cases.
+ //
+ // FIXME: This stand-alone pass will emit individual S_NOP 0 instructions, as
+ // needed. It would be better for it to emit S_NOP <N> when possible.
addPass(&PostRAHazardRecognizerID);
- if (EnableSIInsertWaitcntsPass)
- addPass(createSIInsertWaitcntsPass());
- else
- addPass(createSIInsertWaitsPass());
- addPass(createSIShrinkInstructionsPass());
addPass(&SIInsertSkipsPassID);
- addPass(createSIMemoryLegalizerPass());
addPass(createSIDebuggerInsertNopsPass());
addPass(&BranchRelaxationPassID);
}
diff --git a/lib/Target/AMDGPU/AMDGPUTargetMachine.h b/lib/Target/AMDGPU/AMDGPUTargetMachine.h
index 5f9b2a7fca20..0fe14493fabd 100644
--- a/lib/Target/AMDGPU/AMDGPUTargetMachine.h
+++ b/lib/Target/AMDGPU/AMDGPUTargetMachine.h
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
//
/// \file
-/// \brief The AMDGPU TargetMachine interface definition for hw codegen targets.
+/// The AMDGPU TargetMachine interface definition for hw codegen targets.
//
//===----------------------------------------------------------------------===//
@@ -34,7 +34,6 @@ namespace llvm {
class AMDGPUTargetMachine : public LLVMTargetMachine {
protected:
std::unique_ptr<TargetLoweringObjectFile> TLOF;
- AMDGPUIntrinsicInfo IntrinsicInfo;
AMDGPUAS AS;
StringRef getGPUName(const Function &F) const;
@@ -42,6 +41,7 @@ protected:
public:
static bool EnableLateStructurizeCFG;
+ static bool EnableFunctionCalls;
AMDGPUTargetMachine(const Target &T, const Triple &TT, StringRef CPU,
StringRef FS, TargetOptions Options,
@@ -49,13 +49,8 @@ public:
CodeGenOpt::Level OL);
~AMDGPUTargetMachine() override;
- const AMDGPUSubtarget *getSubtargetImpl() const;
- const AMDGPUSubtarget *getSubtargetImpl(const Function &) const override = 0;
-
- const AMDGPUIntrinsicInfo *getIntrinsicInfo() const override {
- return &IntrinsicInfo;
- }
- TargetTransformInfo getTargetTransformInfo(const Function &F) override;
+ const TargetSubtargetInfo *getSubtargetImpl() const;
+ const TargetSubtargetInfo *getSubtargetImpl(const Function &) const override = 0;
TargetLoweringObjectFile *getObjFileLowering() const override {
return TLOF.get();
@@ -91,6 +86,8 @@ public:
const R600Subtarget *getSubtargetImpl(const Function &) const override;
+ TargetTransformInfo getTargetTransformInfo(const Function &F) override;
+
bool isMachineVerifierClean() const override {
return false;
}
@@ -102,7 +99,8 @@ public:
class GCNTargetMachine final : public AMDGPUTargetMachine {
private:
- mutable StringMap<std::unique_ptr<SISubtarget>> SubtargetMap;
+ AMDGPUIntrinsicInfo IntrinsicInfo;
+ mutable StringMap<std::unique_ptr<GCNSubtarget>> SubtargetMap;
public:
GCNTargetMachine(const Target &T, const Triple &TT, StringRef CPU,
@@ -112,7 +110,13 @@ public:
TargetPassConfig *createPassConfig(PassManagerBase &PM) override;
- const SISubtarget *getSubtargetImpl(const Function &) const override;
+ const GCNSubtarget *getSubtargetImpl(const Function &) const override;
+
+ TargetTransformInfo getTargetTransformInfo(const Function &F) override;
+
+ const AMDGPUIntrinsicInfo *getIntrinsicInfo() const override {
+ return &IntrinsicInfo;
+ }
bool useIPRA() const override {
return true;
diff --git a/lib/Target/AMDGPU/AMDGPUTargetObjectFile.h b/lib/Target/AMDGPU/AMDGPUTargetObjectFile.h
index ca6210f69298..dd9dc1a88fc2 100644
--- a/lib/Target/AMDGPU/AMDGPUTargetObjectFile.h
+++ b/lib/Target/AMDGPU/AMDGPUTargetObjectFile.h
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
///
/// \file
-/// \brief This file declares the AMDGPU-specific subclass of
+/// This file declares the AMDGPU-specific subclass of
/// TargetLoweringObjectFile.
///
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index 77c2d4b956c6..a68b8d03f06e 100644
--- a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -17,12 +17,12 @@
#include "AMDGPUTargetTransformInfo.h"
#include "AMDGPUSubtarget.h"
+#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/ISDOpcodes.h"
-#include "llvm/CodeGen/MachineValueType.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/Argument.h"
#include "llvm/IR/Attributes.h"
@@ -43,6 +43,7 @@
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MachineValueType.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"
#include <algorithm>
@@ -101,7 +102,7 @@ void AMDGPUTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
unsigned ThresholdPrivate = UnrollThresholdPrivate;
unsigned ThresholdLocal = UnrollThresholdLocal;
unsigned MaxBoost = std::max(ThresholdPrivate, ThresholdLocal);
- AMDGPUAS ASST = ST->getAMDGPUAS();
+ const AMDGPUAS &ASST = AMDGPU::getAMDGPUAS(TargetTriple);
for (const BasicBlock *BB : L->getBlocks()) {
const DataLayout &DL = BB->getModule()->getDataLayout();
unsigned LocalGEPsSeen = 0;
@@ -123,8 +124,9 @@ void AMDGPUTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
continue;
if (dependsOnLocalPhi(L, Br->getCondition())) {
UP.Threshold += UnrollThresholdIf;
- DEBUG(dbgs() << "Set unroll threshold " << UP.Threshold
- << " for loop:\n" << *L << " due to " << *Br << '\n');
+ LLVM_DEBUG(dbgs() << "Set unroll threshold " << UP.Threshold
+ << " for loop:\n"
+ << *L << " due to " << *Br << '\n');
if (UP.Threshold >= MaxBoost)
return;
}
@@ -200,61 +202,76 @@ void AMDGPUTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
// Don't use the maximum allowed value here as it will make some
// programs way too big.
UP.Threshold = Threshold;
- DEBUG(dbgs() << "Set unroll threshold " << Threshold << " for loop:\n"
- << *L << " due to " << *GEP << '\n');
+ LLVM_DEBUG(dbgs() << "Set unroll threshold " << Threshold
+ << " for loop:\n"
+ << *L << " due to " << *GEP << '\n');
if (UP.Threshold >= MaxBoost)
return;
}
}
}
-unsigned AMDGPUTTIImpl::getHardwareNumberOfRegisters(bool Vec) const {
+unsigned GCNTTIImpl::getHardwareNumberOfRegisters(bool Vec) const {
// The concept of vector registers doesn't really exist. Some packed vector
// operations operate on the normal 32-bit registers.
-
- // Number of VGPRs on SI.
- if (ST->getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS)
- return 256;
-
- return 4 * 128; // XXX - 4 channels. Should these count as vector instead?
+ return 256;
}
-unsigned AMDGPUTTIImpl::getNumberOfRegisters(bool Vec) const {
+unsigned GCNTTIImpl::getNumberOfRegisters(bool Vec) const {
// This is really the number of registers to fill when vectorizing /
// interleaving loops, so we lie to avoid trying to use all registers.
return getHardwareNumberOfRegisters(Vec) >> 3;
}
-unsigned AMDGPUTTIImpl::getRegisterBitWidth(bool Vector) const {
+unsigned GCNTTIImpl::getRegisterBitWidth(bool Vector) const {
return 32;
}
-unsigned AMDGPUTTIImpl::getMinVectorRegisterBitWidth() const {
+unsigned GCNTTIImpl::getMinVectorRegisterBitWidth() const {
return 32;
}
-unsigned AMDGPUTTIImpl::getLoadStoreVecRegBitWidth(unsigned AddrSpace) const {
+unsigned GCNTTIImpl::getLoadVectorFactor(unsigned VF, unsigned LoadSize,
+ unsigned ChainSizeInBytes,
+ VectorType *VecTy) const {
+ unsigned VecRegBitWidth = VF * LoadSize;
+ if (VecRegBitWidth > 128 && VecTy->getScalarSizeInBits() < 32)
+ // TODO: Support element-size less than 32bit?
+ return 128 / LoadSize;
+
+ return VF;
+}
+
+unsigned GCNTTIImpl::getStoreVectorFactor(unsigned VF, unsigned StoreSize,
+ unsigned ChainSizeInBytes,
+ VectorType *VecTy) const {
+ unsigned VecRegBitWidth = VF * StoreSize;
+ if (VecRegBitWidth > 128)
+ return 128 / StoreSize;
+
+ return VF;
+}
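A quick worked example of the clamping above, mirrored in an illustrative helper: four 64-bit stores would form a 256-bit vector, which exceeds the 128-bit cap, so the factor drops to 128 / 64 = 2, while four 32-bit stores (exactly 128 bits) keep VF = 4.

// Illustrative only; mirrors the store-factor logic above.
unsigned clampedStoreVF(unsigned VF, unsigned StoreSizeBits) {
  unsigned VecRegBitWidth = VF * StoreSizeBits;
  return VecRegBitWidth > 128 ? 128 / StoreSizeBits : VF;
}
// clampedStoreVF(4, 64) == 2; clampedStoreVF(4, 32) == 4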
+
+unsigned GCNTTIImpl::getLoadStoreVecRegBitWidth(unsigned AddrSpace) const {
AMDGPUAS AS = ST->getAMDGPUAS();
if (AddrSpace == AS.GLOBAL_ADDRESS ||
AddrSpace == AS.CONSTANT_ADDRESS ||
- AddrSpace == AS.FLAT_ADDRESS)
- return 128;
- if (AddrSpace == AS.LOCAL_ADDRESS ||
+ AddrSpace == AS.CONSTANT_ADDRESS_32BIT) {
+ return 512;
+ }
+
+ if (AddrSpace == AS.FLAT_ADDRESS ||
+ AddrSpace == AS.LOCAL_ADDRESS ||
AddrSpace == AS.REGION_ADDRESS)
- return 64;
+ return 128;
+
if (AddrSpace == AS.PRIVATE_ADDRESS)
return 8 * ST->getMaxPrivateElementSize();
- if (ST->getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS &&
- (AddrSpace == AS.PARAM_D_ADDRESS ||
- AddrSpace == AS.PARAM_I_ADDRESS ||
- (AddrSpace >= AS.CONSTANT_BUFFER_0 &&
- AddrSpace <= AS.CONSTANT_BUFFER_15)))
- return 128;
llvm_unreachable("unhandled address space");
}
-bool AMDGPUTTIImpl::isLegalToVectorizeMemChain(unsigned ChainSizeInBytes,
+bool GCNTTIImpl::isLegalToVectorizeMemChain(unsigned ChainSizeInBytes,
unsigned Alignment,
unsigned AddrSpace) const {
// We allow vectorization of flat stores, even though we may need to decompose
@@ -267,19 +284,19 @@ bool AMDGPUTTIImpl::isLegalToVectorizeMemChain(unsigned ChainSizeInBytes,
return true;
}
-bool AMDGPUTTIImpl::isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes,
+bool GCNTTIImpl::isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes,
unsigned Alignment,
unsigned AddrSpace) const {
return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
}
-bool AMDGPUTTIImpl::isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes,
+bool GCNTTIImpl::isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes,
unsigned Alignment,
unsigned AddrSpace) const {
return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
}
-unsigned AMDGPUTTIImpl::getMaxInterleaveFactor(unsigned VF) {
+unsigned GCNTTIImpl::getMaxInterleaveFactor(unsigned VF) {
// Disable unrolling if the loop is not vectorized.
// TODO: Enable this again.
if (VF == 1)
@@ -288,11 +305,14 @@ unsigned AMDGPUTTIImpl::getMaxInterleaveFactor(unsigned VF) {
return 8;
}
-bool AMDGPUTTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
+bool GCNTTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
MemIntrinsicInfo &Info) const {
switch (Inst->getIntrinsicID()) {
case Intrinsic::amdgcn_atomic_inc:
- case Intrinsic::amdgcn_atomic_dec: {
+ case Intrinsic::amdgcn_atomic_dec:
+ case Intrinsic::amdgcn_ds_fadd:
+ case Intrinsic::amdgcn_ds_fmin:
+ case Intrinsic::amdgcn_ds_fmax: {
auto *Ordering = dyn_cast<ConstantInt>(Inst->getArgOperand(2));
auto *Volatile = dyn_cast<ConstantInt>(Inst->getArgOperand(4));
if (!Ordering || !Volatile)
@@ -314,7 +334,7 @@ bool AMDGPUTTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
}
}
-int AMDGPUTTIImpl::getArithmeticInstrCost(
+int GCNTTIImpl::getArithmeticInstrCost(
unsigned Opcode, Type *Ty, TTI::OperandValueKind Opd1Info,
TTI::OperandValueKind Opd2Info, TTI::OperandValueProperties Opd1PropInfo,
TTI::OperandValueProperties Opd2PropInfo, ArrayRef<const Value *> Args ) {
@@ -424,7 +444,7 @@ int AMDGPUTTIImpl::getArithmeticInstrCost(
Opd1PropInfo, Opd2PropInfo);
}
-unsigned AMDGPUTTIImpl::getCFInstrCost(unsigned Opcode) {
+unsigned GCNTTIImpl::getCFInstrCost(unsigned Opcode) {
// XXX - For some reason this isn't called for switch.
switch (Opcode) {
case Instruction::Br:
@@ -435,7 +455,38 @@ unsigned AMDGPUTTIImpl::getCFInstrCost(unsigned Opcode) {
}
}
-int AMDGPUTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
+int GCNTTIImpl::getArithmeticReductionCost(unsigned Opcode, Type *Ty,
+ bool IsPairwise) {
+ EVT OrigTy = TLI->getValueType(DL, Ty);
+
+ // Computes cost on targets that have packed math instructions (which support
+ // 16-bit types only).
+ if (IsPairwise ||
+ !ST->hasVOP3PInsts() ||
+ OrigTy.getScalarSizeInBits() != 16)
+ return BaseT::getArithmeticReductionCost(Opcode, Ty, IsPairwise);
+
+ std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
+ return LT.first * getFullRateInstrCost();
+}
+
+int GCNTTIImpl::getMinMaxReductionCost(Type *Ty, Type *CondTy,
+ bool IsPairwise,
+ bool IsUnsigned) {
+ EVT OrigTy = TLI->getValueType(DL, Ty);
+
+ // Computes cost on targets that have packed math instructions (which support
+ // 16-bit types only).
+ if (IsPairwise ||
+ !ST->hasVOP3PInsts() ||
+ OrigTy.getScalarSizeInBits() != 16)
+ return BaseT::getMinMaxReductionCost(Ty, CondTy, IsPairwise, IsUnsigned);
+
+ std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
+ return LT.first * getHalfRateInstrCost();
+}
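To make the two cost hooks above concrete: a non-pairwise reduction over 16-bit elements on a subtarget with VOP3P is charged LT.first times the full-rate (arithmetic) or half-rate (min/max) instruction cost, and everything else falls back to BasicTTIImplBase. A hedged sketch of querying this through the public TargetTransformInfo interface (the helper name is illustrative):

#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/Type.h"
using namespace llvm;

// Ask the target how expensive an unsigned min/max reduction over VecTy is.
static int minMaxReductionCost(const TargetTransformInfo &TTI, Type *VecTy,
                               Type *CondTy) {
  return TTI.getMinMaxReductionCost(VecTy, CondTy, /*IsPairwiseForm=*/false,
                                    /*IsUnsigned=*/true);
}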
+
+int GCNTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
unsigned Index) {
switch (Opcode) {
case Instruction::ExtractElement:
@@ -460,52 +511,7 @@ int AMDGPUTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
}
}
-static bool isIntrinsicSourceOfDivergence(const IntrinsicInst *I) {
- switch (I->getIntrinsicID()) {
- case Intrinsic::amdgcn_workitem_id_x:
- case Intrinsic::amdgcn_workitem_id_y:
- case Intrinsic::amdgcn_workitem_id_z:
- case Intrinsic::amdgcn_interp_mov:
- case Intrinsic::amdgcn_interp_p1:
- case Intrinsic::amdgcn_interp_p2:
- case Intrinsic::amdgcn_mbcnt_hi:
- case Intrinsic::amdgcn_mbcnt_lo:
- case Intrinsic::r600_read_tidig_x:
- case Intrinsic::r600_read_tidig_y:
- case Intrinsic::r600_read_tidig_z:
- case Intrinsic::amdgcn_atomic_inc:
- case Intrinsic::amdgcn_atomic_dec:
- case Intrinsic::amdgcn_image_atomic_swap:
- case Intrinsic::amdgcn_image_atomic_add:
- case Intrinsic::amdgcn_image_atomic_sub:
- case Intrinsic::amdgcn_image_atomic_smin:
- case Intrinsic::amdgcn_image_atomic_umin:
- case Intrinsic::amdgcn_image_atomic_smax:
- case Intrinsic::amdgcn_image_atomic_umax:
- case Intrinsic::amdgcn_image_atomic_and:
- case Intrinsic::amdgcn_image_atomic_or:
- case Intrinsic::amdgcn_image_atomic_xor:
- case Intrinsic::amdgcn_image_atomic_inc:
- case Intrinsic::amdgcn_image_atomic_dec:
- case Intrinsic::amdgcn_image_atomic_cmpswap:
- case Intrinsic::amdgcn_buffer_atomic_swap:
- case Intrinsic::amdgcn_buffer_atomic_add:
- case Intrinsic::amdgcn_buffer_atomic_sub:
- case Intrinsic::amdgcn_buffer_atomic_smin:
- case Intrinsic::amdgcn_buffer_atomic_umin:
- case Intrinsic::amdgcn_buffer_atomic_smax:
- case Intrinsic::amdgcn_buffer_atomic_umax:
- case Intrinsic::amdgcn_buffer_atomic_and:
- case Intrinsic::amdgcn_buffer_atomic_or:
- case Intrinsic::amdgcn_buffer_atomic_xor:
- case Intrinsic::amdgcn_buffer_atomic_cmpswap:
- case Intrinsic::amdgcn_ps_live:
- case Intrinsic::amdgcn_ds_swizzle:
- return true;
- default:
- return false;
- }
-}
+
static bool isArgPassedInSGPR(const Argument *A) {
const Function *F = A->getParent();
@@ -535,7 +541,7 @@ static bool isArgPassedInSGPR(const Argument *A) {
/// \returns true if the result of the value could potentially be
/// different across workitems in a wavefront.
-bool AMDGPUTTIImpl::isSourceOfDivergence(const Value *V) const {
+bool GCNTTIImpl::isSourceOfDivergence(const Value *V) const {
if (const Argument *A = dyn_cast<Argument>(V))
return !isArgPassedInSGPR(A);
@@ -556,7 +562,7 @@ bool AMDGPUTTIImpl::isSourceOfDivergence(const Value *V) const {
return true;
if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(V))
- return isIntrinsicSourceOfDivergence(Intrinsic);
+ return AMDGPU::isIntrinsicSourceOfDivergence(Intrinsic->getIntrinsicID());
// Assume all function calls are a source of divergence.
if (isa<CallInst>(V) || isa<InvokeInst>(V))
@@ -565,7 +571,7 @@ bool AMDGPUTTIImpl::isSourceOfDivergence(const Value *V) const {
return false;
}
-bool AMDGPUTTIImpl::isAlwaysUniform(const Value *V) const {
+bool GCNTTIImpl::isAlwaysUniform(const Value *V) const {
if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(V)) {
switch (Intrinsic->getIntrinsicID()) {
default:
@@ -578,7 +584,7 @@ bool AMDGPUTTIImpl::isAlwaysUniform(const Value *V) const {
return false;
}
-unsigned AMDGPUTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
+unsigned GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
Type *SubTp) {
if (ST->hasVOP3PInsts()) {
VectorType *VT = cast<VectorType>(Tp);
@@ -601,7 +607,7 @@ unsigned AMDGPUTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Inde
return BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
}
-bool AMDGPUTTIImpl::areInlineCompatible(const Function *Caller,
+bool GCNTTIImpl::areInlineCompatible(const Function *Caller,
const Function *Callee) const {
const TargetMachine &TM = getTLI()->getTargetMachine();
const FeatureBitset &CallerBits =
@@ -613,3 +619,114 @@ bool AMDGPUTTIImpl::areInlineCompatible(const Function *Caller,
FeatureBitset RealCalleeBits = CalleeBits & ~InlineFeatureIgnoreList;
return ((RealCallerBits & RealCalleeBits) == RealCalleeBits);
}
+
+void GCNTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
+ TTI::UnrollingPreferences &UP) {
+ CommonTTI.getUnrollingPreferences(L, SE, UP);
+}
+
+unsigned R600TTIImpl::getHardwareNumberOfRegisters(bool Vec) const {
+ return 4 * 128; // XXX - 4 channels. Should these count as vector instead?
+}
+
+unsigned R600TTIImpl::getNumberOfRegisters(bool Vec) const {
+ return getHardwareNumberOfRegisters(Vec);
+}
+
+unsigned R600TTIImpl::getRegisterBitWidth(bool Vector) const {
+ return 32;
+}
+
+unsigned R600TTIImpl::getMinVectorRegisterBitWidth() const {
+ return 32;
+}
+
+unsigned R600TTIImpl::getLoadStoreVecRegBitWidth(unsigned AddrSpace) const {
+ AMDGPUAS AS = ST->getAMDGPUAS();
+ if (AddrSpace == AS.GLOBAL_ADDRESS ||
+ AddrSpace == AS.CONSTANT_ADDRESS)
+ return 128;
+ if (AddrSpace == AS.LOCAL_ADDRESS ||
+ AddrSpace == AS.REGION_ADDRESS)
+ return 64;
+ if (AddrSpace == AS.PRIVATE_ADDRESS)
+ return 32;
+
+ if ((AddrSpace == AS.PARAM_D_ADDRESS ||
+ AddrSpace == AS.PARAM_I_ADDRESS ||
+ (AddrSpace >= AS.CONSTANT_BUFFER_0 &&
+ AddrSpace <= AS.CONSTANT_BUFFER_15)))
+ return 128;
+ llvm_unreachable("unhandled address space");
+}
+
+bool R600TTIImpl::isLegalToVectorizeMemChain(unsigned ChainSizeInBytes,
+ unsigned Alignment,
+ unsigned AddrSpace) const {
+ // We allow vectorization of flat stores, even though we may need to decompose
+ // them later if they may access private memory. We don't have enough context
+ // here, and legalization can handle it.
+ if (AddrSpace == ST->getAMDGPUAS().PRIVATE_ADDRESS)
+ return false;
+ return true;
+}
+
+bool R600TTIImpl::isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes,
+ unsigned Alignment,
+ unsigned AddrSpace) const {
+ return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
+}
+
+bool R600TTIImpl::isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes,
+ unsigned Alignment,
+ unsigned AddrSpace) const {
+ return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
+}
+
+unsigned R600TTIImpl::getMaxInterleaveFactor(unsigned VF) {
+ // Disable unrolling if the loop is not vectorized.
+ // TODO: Enable this again.
+ if (VF == 1)
+ return 1;
+
+ return 8;
+}
+
+unsigned R600TTIImpl::getCFInstrCost(unsigned Opcode) {
+ // XXX - For some reason this isn't called for switch.
+ switch (Opcode) {
+ case Instruction::Br:
+ case Instruction::Ret:
+ return 10;
+ default:
+ return BaseT::getCFInstrCost(Opcode);
+ }
+}
+
+int R600TTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
+ unsigned Index) {
+ switch (Opcode) {
+ case Instruction::ExtractElement:
+ case Instruction::InsertElement: {
+ unsigned EltSize
+ = DL.getTypeSizeInBits(cast<VectorType>(ValTy)->getElementType());
+ if (EltSize < 32) {
+ return BaseT::getVectorInstrCost(Opcode, ValTy, Index);
+ }
+
+ // Extracts are just reads of a subregister, so are free. Inserts are
+ // considered free because we don't want to have any cost for scalarizing
+ // operations, and we don't have to copy into a different register class.
+
+ // Dynamic indexing isn't free and is best avoided.
+ return Index == ~0u ? 2 : 0;
+ }
+ default:
+ return BaseT::getVectorInstrCost(Opcode, ValTy, Index);
+ }
+}
+
+void R600TTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
+ TTI::UnrollingPreferences &UP) {
+ CommonTTI.getUnrollingPreferences(L, SE, UP);
+}
diff --git a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
index 8899d2c6da8a..8e63d789e17d 100644
--- a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
+++ b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
@@ -21,6 +21,7 @@
#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "AMDGPUTargetMachine.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/Analysis/TargetTransformInfo.h"
@@ -44,8 +45,26 @@ class AMDGPUTTIImpl final : public BasicTTIImplBase<AMDGPUTTIImpl> {
friend BaseT;
- const AMDGPUSubtarget *ST;
+ Triple TargetTriple;
+
+public:
+ explicit AMDGPUTTIImpl(const AMDGPUTargetMachine *TM, const Function &F)
+ : BaseT(TM, F.getParent()->getDataLayout()),
+ TargetTriple(TM->getTargetTriple()) {}
+
+ void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
+ TTI::UnrollingPreferences &UP);
+};
+
+class GCNTTIImpl final : public BasicTTIImplBase<GCNTTIImpl> {
+ using BaseT = BasicTTIImplBase<GCNTTIImpl>;
+ using TTI = TargetTransformInfo;
+
+ friend BaseT;
+
+ const GCNSubtarget *ST;
const AMDGPUTargetLowering *TLI;
+ AMDGPUTTIImpl CommonTTI;
bool IsGraphicsShader;
const FeatureBitset InlineFeatureIgnoreList = {
@@ -61,7 +80,6 @@ class AMDGPUTTIImpl final : public BasicTTIImplBase<AMDGPUTTIImpl> {
AMDGPU::FeatureAutoWaitcntBeforeBarrier,
AMDGPU::FeatureDebuggerEmitPrologue,
AMDGPU::FeatureDebuggerInsertNops,
- AMDGPU::FeatureDebuggerReserveRegs,
// Property of the kernel/environment which can't actually differ.
AMDGPU::FeatureSGPRInitBug,
@@ -73,7 +91,7 @@ class AMDGPUTTIImpl final : public BasicTTIImplBase<AMDGPUTTIImpl> {
AMDGPU::HalfRate64Ops
};
- const AMDGPUSubtarget *getST() const { return ST; }
+ const GCNSubtarget *getST() const { return ST; }
const AMDGPUTargetLowering *getTLI() const { return TLI; }
static inline int getFullRateInstrCost() {
@@ -98,10 +116,11 @@ class AMDGPUTTIImpl final : public BasicTTIImplBase<AMDGPUTTIImpl> {
}
public:
- explicit AMDGPUTTIImpl(const AMDGPUTargetMachine *TM, const Function &F)
+ explicit GCNTTIImpl(const AMDGPUTargetMachine *TM, const Function &F)
: BaseT(TM, F.getParent()->getDataLayout()),
- ST(TM->getSubtargetImpl(F)),
+ ST(static_cast<const GCNSubtarget*>(TM->getSubtargetImpl(F))),
TLI(ST->getTargetLowering()),
+ CommonTTI(TM, F),
IsGraphicsShader(AMDGPU::isShader(F.getCallingConv())) {}
bool hasBranchDivergence() { return true; }
@@ -118,6 +137,12 @@ public:
unsigned getNumberOfRegisters(bool Vector) const;
unsigned getRegisterBitWidth(bool Vector) const;
unsigned getMinVectorRegisterBitWidth() const;
+ unsigned getLoadVectorFactor(unsigned VF, unsigned LoadSize,
+ unsigned ChainSizeInBytes,
+ VectorType *VecTy) const;
+ unsigned getStoreVectorFactor(unsigned VF, unsigned StoreSize,
+ unsigned ChainSizeInBytes,
+ VectorType *VecTy) const;
unsigned getLoadStoreVecRegBitWidth(unsigned AddrSpace) const;
bool isLegalToVectorizeMemChain(unsigned ChainSizeInBytes,
@@ -166,6 +191,53 @@ public:
const Function *Callee) const;
unsigned getInliningThresholdMultiplier() { return 9; }
+
+ int getArithmeticReductionCost(unsigned Opcode,
+ Type *Ty,
+ bool IsPairwise);
+ int getMinMaxReductionCost(Type *Ty, Type *CondTy,
+ bool IsPairwiseForm,
+ bool IsUnsigned);
+};
+
+class R600TTIImpl final : public BasicTTIImplBase<R600TTIImpl> {
+ using BaseT = BasicTTIImplBase<R600TTIImpl>;
+ using TTI = TargetTransformInfo;
+
+ friend BaseT;
+
+ const R600Subtarget *ST;
+ const AMDGPUTargetLowering *TLI;
+ AMDGPUTTIImpl CommonTTI;
+
+public:
+ explicit R600TTIImpl(const AMDGPUTargetMachine *TM, const Function &F)
+ : BaseT(TM, F.getParent()->getDataLayout()),
+ ST(static_cast<const R600Subtarget*>(TM->getSubtargetImpl(F))),
+ TLI(ST->getTargetLowering()),
+ CommonTTI(TM, F) {}
+
+ const R600Subtarget *getST() const { return ST; }
+ const AMDGPUTargetLowering *getTLI() const { return TLI; }
+
+ void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
+ TTI::UnrollingPreferences &UP);
+ unsigned getHardwareNumberOfRegisters(bool Vec) const;
+ unsigned getNumberOfRegisters(bool Vec) const;
+ unsigned getRegisterBitWidth(bool Vector) const;
+ unsigned getMinVectorRegisterBitWidth() const;
+ unsigned getLoadStoreVecRegBitWidth(unsigned AddrSpace) const;
+ bool isLegalToVectorizeMemChain(unsigned ChainSizeInBytes, unsigned Alignment,
+ unsigned AddrSpace) const;
+ bool isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes,
+ unsigned Alignment,
+ unsigned AddrSpace) const;
+ bool isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes,
+ unsigned Alignment,
+ unsigned AddrSpace) const;
+ unsigned getMaxInterleaveFactor(unsigned VF);
+ unsigned getCFInstrCost(unsigned Opcode);
+ int getVectorInstrCost(unsigned Opcode, Type *ValTy, unsigned Index);
};
} // end namespace llvm
diff --git a/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp b/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp
index 6107f3a7dd18..0d3a1673696a 100644
--- a/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp
+++ b/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp
@@ -28,6 +28,7 @@
#include "llvm/Analysis/DivergenceAnalysis.h"
#include "llvm/Analysis/PostDominators.h"
#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/Constants.h"
@@ -39,7 +40,7 @@
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils.h"
using namespace llvm;
@@ -144,7 +145,8 @@ static BasicBlock *unifyReturnBlockSet(Function &F,
if (PN)
PN->addIncoming(BB->getTerminator()->getOperand(0), BB);
- BB->getInstList().pop_back(); // Remove the return insn
+ // Remove and delete the return inst.
+ BB->getTerminator()->eraseFromParent();
BranchInst::Create(NewRetBlock, BB);
}
@@ -168,6 +170,9 @@ bool AMDGPUUnifyDivergentExitNodes::runOnFunction(Function &F) {
SmallVector<BasicBlock *, 4> ReturningBlocks;
SmallVector<BasicBlock *, 4> UnreachableBlocks;
+ // Dummy return block for infinite loop.
+ BasicBlock *DummyReturnBB = nullptr;
+
for (BasicBlock *BB : PDT.getRoots()) {
if (isa<ReturnInst>(BB->getTerminator())) {
if (!isUniformlyReached(DA, *BB))
@@ -175,6 +180,35 @@ bool AMDGPUUnifyDivergentExitNodes::runOnFunction(Function &F) {
} else if (isa<UnreachableInst>(BB->getTerminator())) {
if (!isUniformlyReached(DA, *BB))
UnreachableBlocks.push_back(BB);
+ } else if (BranchInst *BI = dyn_cast<BranchInst>(BB->getTerminator())) {
+
+ ConstantInt *BoolTrue = ConstantInt::getTrue(F.getContext());
+ if (DummyReturnBB == nullptr) {
+ DummyReturnBB = BasicBlock::Create(F.getContext(),
+ "DummyReturnBlock", &F);
+ Type *RetTy = F.getReturnType();
+ Value *RetVal = RetTy->isVoidTy() ? nullptr : UndefValue::get(RetTy);
+ ReturnInst::Create(F.getContext(), RetVal, DummyReturnBB);
+ ReturningBlocks.push_back(DummyReturnBB);
+ }
+
+ if (BI->isUnconditional()) {
+ BasicBlock *LoopHeaderBB = BI->getSuccessor(0);
+ BI->eraseFromParent(); // Delete the unconditional branch.
+ // Add a new conditional branch with a dummy edge to the return block.
+ BranchInst::Create(LoopHeaderBB, DummyReturnBB, BoolTrue, BB);
+ } else { // Conditional branch.
+ // Create a new transition block to hold the conditional branch.
+ BasicBlock *TransitionBB = BasicBlock::Create(F.getContext(),
+ "TransitionBlock", &F);
+
+ // Move BI from BB to the new transition block.
+ BI->removeFromParent();
+ TransitionBB->getInstList().push_back(BI);
+
+ // Create a branch that will always branch to the transition block.
+ BranchInst::Create(TransitionBB, DummyReturnBB, BoolTrue, BB);
+ }
}
}
@@ -189,7 +223,8 @@ bool AMDGPUUnifyDivergentExitNodes::runOnFunction(Function &F) {
new UnreachableInst(F.getContext(), UnreachableBlock);
for (BasicBlock *BB : UnreachableBlocks) {
- BB->getInstList().pop_back(); // Remove the unreachable inst.
+ // Remove and delete the unreachable inst.
+ BB->getTerminator()->eraseFromParent();
BranchInst::Create(UnreachableBlock, BB);
}
}
@@ -200,7 +235,8 @@ bool AMDGPUUnifyDivergentExitNodes::runOnFunction(Function &F) {
Type *RetTy = F.getReturnType();
Value *RetVal = RetTy->isVoidTy() ? nullptr : UndefValue::get(RetTy);
- UnreachableBlock->getInstList().pop_back(); // Remove the unreachable inst.
+ // Remove and delete the unreachable inst.
+ UnreachableBlock->getTerminator()->eraseFromParent();
Function *UnreachableIntrin =
Intrinsic::getDeclaration(F.getParent(), Intrinsic::amdgcn_unreachable);
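The hunks above give the transformation its shape: every divergent block ending in a branch back into a loop gets rerouted through a conditional branch on a constant true, whose false edge targets one shared dummy return block, so the post-dominator tree regains a single, uniformly reached exit. A minimal standalone sketch of that rewrite, assuming a block BB that ends in an unconditional back-edge; the helper name is illustrative, not part of the patch:

#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Instructions.h"
#include "llvm/Support/Casting.h"

using namespace llvm;

// Illustrative helper: reroute an unconditional back-edge through a
// conditional branch whose false edge reaches a shared dummy return block.
static void addDummyExitEdge(Function &F, BasicBlock *BB,
                             BasicBlock *&DummyReturnBB) {
  LLVMContext &Ctx = F.getContext();
  if (!DummyReturnBB) {
    DummyReturnBB = BasicBlock::Create(Ctx, "DummyReturnBlock", &F);
    Type *RetTy = F.getReturnType();
    Value *RetVal = RetTy->isVoidTy() ? nullptr : UndefValue::get(RetTy);
    ReturnInst::Create(Ctx, RetVal, DummyReturnBB);
  }
  BranchInst *BI = cast<BranchInst>(BB->getTerminator());
  BasicBlock *LoopHeaderBB = BI->getSuccessor(0);
  BI->eraseFromParent();
  // The branch is always taken at run time; the dummy edge exists only to
  // give the CFG a reachable exit for the post-dominator analysis.
  BranchInst::Create(LoopHeaderBB, DummyReturnBB,
                     ConstantInt::getTrue(Ctx), BB);
}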
diff --git a/lib/Target/AMDGPU/AMDGPUUnifyMetadata.cpp b/lib/Target/AMDGPU/AMDGPUUnifyMetadata.cpp
index b78568e89cfb..1f6d9234c1ed 100644
--- a/lib/Target/AMDGPU/AMDGPUUnifyMetadata.cpp
+++ b/lib/Target/AMDGPU/AMDGPUUnifyMetadata.cpp
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
//
// \file
-// \brief This pass that unifies multiple OpenCL metadata due to linking.
+// This pass unifies multiple OpenCL metadata due to linking.
//
//===----------------------------------------------------------------------===//
@@ -37,7 +37,7 @@ namespace {
} // end namespace kOCLMD
- /// \brief Unify multiple OpenCL metadata due to linking.
+ /// Unify multiple OpenCL metadata due to linking.
class AMDGPUUnifyMetadata : public ModulePass {
public:
static char ID;
@@ -47,7 +47,7 @@ namespace {
private:
bool runOnModule(Module &M) override;
- /// \brief Unify version metadata.
+ /// Unify version metadata.
/// \return true if changes are made.
/// Assume the named metadata has operands each of which is a pair of
/// integer constant, e.g.
@@ -82,7 +82,7 @@ namespace {
return true;
}
- /// \brief Unify version metadata.
+ /// Unify version metadata.
/// \return true if changes are made.
/// Assume the named metadata has operands each of which is a list e.g.
/// !Name = {!n1, !n2}
diff --git a/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp b/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp
index 0a0e43123ae0..11cd49e5b3dc 100644
--- a/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp
+++ b/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp
@@ -11,6 +11,7 @@
#include "AMDGPUSubtarget.h"
#include "R600InstrInfo.h"
#include "R600RegisterInfo.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/DepthFirstIterator.h"
#include "llvm/ADT/SCCIterator.h"
#include "llvm/ADT/SmallPtrSet.h"
@@ -28,12 +29,12 @@
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachinePostDominators.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/CodeGen/MachineValueType.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/Pass.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MachineValueType.h"
#include "llvm/Support/raw_ostream.h"
#include <cassert>
#include <cstddef>
@@ -78,23 +79,18 @@ namespace {
//
//===----------------------------------------------------------------------===//
-#define SHOWNEWINSTR(i) \
- DEBUG(dbgs() << "New instr: " << *i << "\n");
+#define SHOWNEWINSTR(i) LLVM_DEBUG(dbgs() << "New instr: " << *i << "\n");
-#define SHOWNEWBLK(b, msg) \
-DEBUG( \
- dbgs() << msg << "BB" << b->getNumber() << "size " << b->size(); \
- dbgs() << "\n"; \
-);
+#define SHOWNEWBLK(b, msg) \
+ LLVM_DEBUG(dbgs() << msg << "BB" << b->getNumber() << "size " << b->size(); \
+ dbgs() << "\n";);
-#define SHOWBLK_DETAIL(b, msg) \
-DEBUG( \
- if (b) { \
- dbgs() << msg << "BB" << b->getNumber() << "size " << b->size(); \
- b->print(dbgs()); \
- dbgs() << "\n"; \
- } \
-);
+#define SHOWBLK_DETAIL(b, msg) \
+ LLVM_DEBUG(if (b) { \
+ dbgs() << msg << "BB" << b->getNumber() << "size " << b->size(); \
+ b->print(dbgs()); \
+ dbgs() << "\n"; \
+ });
#define INVALIDSCCNUM -1
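The macro rewrites above are part of the tree-wide DEBUG to LLVM_DEBUG rename; the new macro lives in llvm/Support/Debug.h and still compiles to nothing outside assertion builds. A minimal usage sketch, with an illustrative DEBUG_TYPE string standing in for the one this pass defines:

#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"

#define DEBUG_TYPE "structcfg" // illustrative; each pass defines its own

static void reportProgress(int NumIter, int NumRemainedBlk) {
  // Printed only in builds with assertions, and only when the tool is run
  // with -debug or -debug-only=structcfg.
  LLVM_DEBUG(llvm::dbgs() << "numIter = " << NumIter
                          << ", numRemainedBlk = " << NumRemainedBlk << "\n");
}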
@@ -158,19 +154,19 @@ public:
bool runOnMachineFunction(MachineFunction &MF) override {
TII = MF.getSubtarget<R600Subtarget>().getInstrInfo();
TRI = &TII->getRegisterInfo();
- DEBUG(MF.dump(););
+ LLVM_DEBUG(MF.dump(););
OrderedBlks.clear();
Visited.clear();
FuncRep = &MF;
MLI = &getAnalysis<MachineLoopInfo>();
- DEBUG(dbgs() << "LoopInfo:\n"; PrintLoopinfo(*MLI););
+ LLVM_DEBUG(dbgs() << "LoopInfo:\n"; PrintLoopinfo(*MLI););
MDT = &getAnalysis<MachineDominatorTree>();
- DEBUG(MDT->print(dbgs(), (const Module*)nullptr););
+ LLVM_DEBUG(MDT->print(dbgs(), (const Module *)nullptr););
PDT = &getAnalysis<MachinePostDominatorTree>();
- DEBUG(PDT->print(dbgs()););
+ LLVM_DEBUG(PDT->print(dbgs()););
prepare();
run();
- DEBUG(MF.dump(););
+ LLVM_DEBUG(MF.dump(););
return true;
}
@@ -436,19 +432,19 @@ void AMDGPUCFGStructurizer::reversePredicateSetter(
for (;; --I) {
if (I == MBB.end())
continue;
- if (I->getOpcode() == AMDGPU::PRED_X) {
+ if (I->getOpcode() == R600::PRED_X) {
switch (I->getOperand(2).getImm()) {
- case AMDGPU::PRED_SETE_INT:
- I->getOperand(2).setImm(AMDGPU::PRED_SETNE_INT);
+ case R600::PRED_SETE_INT:
+ I->getOperand(2).setImm(R600::PRED_SETNE_INT);
return;
- case AMDGPU::PRED_SETNE_INT:
- I->getOperand(2).setImm(AMDGPU::PRED_SETE_INT);
+ case R600::PRED_SETNE_INT:
+ I->getOperand(2).setImm(R600::PRED_SETE_INT);
return;
- case AMDGPU::PRED_SETE:
- I->getOperand(2).setImm(AMDGPU::PRED_SETNE);
+ case R600::PRED_SETE:
+ I->getOperand(2).setImm(R600::PRED_SETNE);
return;
- case AMDGPU::PRED_SETNE:
- I->getOperand(2).setImm(AMDGPU::PRED_SETE);
+ case R600::PRED_SETNE:
+ I->getOperand(2).setImm(R600::PRED_SETE);
return;
default:
llvm_unreachable("PRED_X Opcode invalid!");
@@ -517,10 +513,10 @@ void AMDGPUCFGStructurizer::insertCondBranchBefore(
int AMDGPUCFGStructurizer::getBranchNzeroOpcode(int OldOpcode) {
switch(OldOpcode) {
- case AMDGPU::JUMP_COND:
- case AMDGPU::JUMP: return AMDGPU::IF_PREDICATE_SET;
- case AMDGPU::BRANCH_COND_i32:
- case AMDGPU::BRANCH_COND_f32: return AMDGPU::IF_LOGICALNZ_f32;
+ case R600::JUMP_COND:
+ case R600::JUMP: return R600::IF_PREDICATE_SET;
+ case R600::BRANCH_COND_i32:
+ case R600::BRANCH_COND_f32: return R600::IF_LOGICALNZ_f32;
default: llvm_unreachable("internal error");
}
return -1;
@@ -528,10 +524,10 @@ int AMDGPUCFGStructurizer::getBranchNzeroOpcode(int OldOpcode) {
int AMDGPUCFGStructurizer::getBranchZeroOpcode(int OldOpcode) {
switch(OldOpcode) {
- case AMDGPU::JUMP_COND:
- case AMDGPU::JUMP: return AMDGPU::IF_PREDICATE_SET;
- case AMDGPU::BRANCH_COND_i32:
- case AMDGPU::BRANCH_COND_f32: return AMDGPU::IF_LOGICALZ_f32;
+ case R600::JUMP_COND:
+ case R600::JUMP: return R600::IF_PREDICATE_SET;
+ case R600::BRANCH_COND_i32:
+ case R600::BRANCH_COND_f32: return R600::IF_LOGICALZ_f32;
default: llvm_unreachable("internal error");
}
return -1;
@@ -539,8 +535,8 @@ int AMDGPUCFGStructurizer::getBranchZeroOpcode(int OldOpcode) {
int AMDGPUCFGStructurizer::getContinueNzeroOpcode(int OldOpcode) {
switch(OldOpcode) {
- case AMDGPU::JUMP_COND:
- case AMDGPU::JUMP: return AMDGPU::CONTINUE_LOGICALNZ_i32;
+ case R600::JUMP_COND:
+ case R600::JUMP: return R600::CONTINUE_LOGICALNZ_i32;
default: llvm_unreachable("internal error");
}
return -1;
@@ -548,8 +544,8 @@ int AMDGPUCFGStructurizer::getContinueNzeroOpcode(int OldOpcode) {
int AMDGPUCFGStructurizer::getContinueZeroOpcode(int OldOpcode) {
switch(OldOpcode) {
- case AMDGPU::JUMP_COND:
- case AMDGPU::JUMP: return AMDGPU::CONTINUE_LOGICALZ_i32;
+ case R600::JUMP_COND:
+ case R600::JUMP: return R600::CONTINUE_LOGICALZ_i32;
default: llvm_unreachable("internal error");
}
return -1;
@@ -577,9 +573,9 @@ AMDGPUCFGStructurizer::getFalseBranch(MachineBasicBlock *MBB,
bool AMDGPUCFGStructurizer::isCondBranch(MachineInstr *MI) {
switch (MI->getOpcode()) {
- case AMDGPU::JUMP_COND:
- case AMDGPU::BRANCH_COND_i32:
- case AMDGPU::BRANCH_COND_f32: return true;
+ case R600::JUMP_COND:
+ case R600::BRANCH_COND_i32:
+ case R600::BRANCH_COND_f32: return true;
default:
return false;
}
@@ -588,8 +584,8 @@ bool AMDGPUCFGStructurizer::isCondBranch(MachineInstr *MI) {
bool AMDGPUCFGStructurizer::isUncondBranch(MachineInstr *MI) {
switch (MI->getOpcode()) {
- case AMDGPU::JUMP:
- case AMDGPU::BRANCH:
+ case R600::JUMP:
+ case R600::BRANCH:
return true;
default:
return false;
@@ -638,7 +634,7 @@ MachineInstr *AMDGPUCFGStructurizer::getReturnInstr(MachineBasicBlock *MBB) {
MachineBasicBlock::reverse_iterator It = MBB->rbegin();
if (It != MBB->rend()) {
MachineInstr *instr = &(*It);
- if (instr->getOpcode() == AMDGPU::RETURN)
+ if (instr->getOpcode() == R600::RETURN)
return instr;
}
return nullptr;
@@ -650,9 +646,8 @@ bool AMDGPUCFGStructurizer::isReturnBlock(MachineBasicBlock *MBB) {
if (MI)
assert(IsReturn);
else if (IsReturn)
- DEBUG(
- dbgs() << "BB" << MBB->getNumber()
- <<" is return block without RETURN instr\n";);
+ LLVM_DEBUG(dbgs() << "BB" << MBB->getNumber()
+ << " is return block without RETURN instr\n";);
return IsReturn;
}
@@ -692,8 +687,8 @@ void AMDGPUCFGStructurizer::wrapup(MachineBasicBlock *MBB) {
MachineBasicBlock::iterator E = MBB->end();
MachineBasicBlock::iterator It = Pre;
while (It != E) {
- if (Pre->getOpcode() == AMDGPU::CONTINUE
- && It->getOpcode() == AMDGPU::ENDLOOP)
+ if (Pre->getOpcode() == R600::CONTINUE
+ && It->getOpcode() == R600::ENDLOOP)
ContInstr.push_back(&*Pre);
Pre = It;
++It;
@@ -714,7 +709,7 @@ bool AMDGPUCFGStructurizer::prepare() {
//FIXME: if not reducible flow graph, make it so ???
- DEBUG(dbgs() << "AMDGPUCFGStructurizer::prepare\n";);
+ LLVM_DEBUG(dbgs() << "AMDGPUCFGStructurizer::prepare\n";);
orderBlocks(FuncRep);
@@ -757,14 +752,14 @@ bool AMDGPUCFGStructurizer::prepare() {
bool AMDGPUCFGStructurizer::run() {
//Assume reducible CFG...
- DEBUG(dbgs() << "AMDGPUCFGStructurizer::run\n");
+ LLVM_DEBUG(dbgs() << "AMDGPUCFGStructurizer::run\n");
#ifdef STRESSTEST
//Use the worse block ordering to test the algorithm.
ReverseVector(orderedBlks);
#endif
- DEBUG(dbgs() << "Ordered blocks:\n"; printOrderedBlocks(););
+ LLVM_DEBUG(dbgs() << "Ordered blocks:\n"; printOrderedBlocks(););
int NumIter = 0;
bool Finish = false;
MachineBasicBlock *MBB;
@@ -774,10 +769,8 @@ bool AMDGPUCFGStructurizer::run() {
do {
++NumIter;
- DEBUG(
- dbgs() << "numIter = " << NumIter
- << ", numRemaintedBlk = " << NumRemainedBlk << "\n";
- );
+ LLVM_DEBUG(dbgs() << "numIter = " << NumIter
+ << ", numRemaintedBlk = " << NumRemainedBlk << "\n";);
SmallVectorImpl<MachineBasicBlock *>::const_iterator It =
OrderedBlks.begin();
@@ -799,10 +792,8 @@ bool AMDGPUCFGStructurizer::run() {
SccBeginMBB = MBB;
SccNumIter = 0;
SccNumBlk = NumRemainedBlk; // Init to maximum possible number.
- DEBUG(
- dbgs() << "start processing SCC" << getSCCNum(SccBeginMBB);
- dbgs() << "\n";
- );
+ LLVM_DEBUG(dbgs() << "start processing SCC" << getSCCNum(SccBeginMBB);
+ dbgs() << "\n";);
}
if (!isRetiredBlock(MBB))
@@ -817,20 +808,16 @@ bool AMDGPUCFGStructurizer::run() {
++SccNumIter;
int sccRemainedNumBlk = countActiveBlock(SccBeginIter, It);
if (sccRemainedNumBlk != 1 && sccRemainedNumBlk >= SccNumBlk) {
- DEBUG(
- dbgs() << "Can't reduce SCC " << getSCCNum(MBB)
- << ", sccNumIter = " << SccNumIter;
- dbgs() << "doesn't make any progress\n";
- );
+ LLVM_DEBUG(dbgs() << "Can't reduce SCC " << getSCCNum(MBB)
+ << ", sccNumIter = " << SccNumIter;
+ dbgs() << "doesn't make any progress\n";);
ContNextScc = true;
} else if (sccRemainedNumBlk != 1 && sccRemainedNumBlk < SccNumBlk) {
SccNumBlk = sccRemainedNumBlk;
It = SccBeginIter;
ContNextScc = false;
- DEBUG(
- dbgs() << "repeat processing SCC" << getSCCNum(MBB)
- << "sccNumIter = " << SccNumIter << '\n';
- );
+ LLVM_DEBUG(dbgs() << "repeat processing SCC" << getSCCNum(MBB)
+ << "sccNumIter = " << SccNumIter << '\n';);
} else {
// Finish the current scc.
ContNextScc = true;
@@ -848,9 +835,7 @@ bool AMDGPUCFGStructurizer::run() {
*GraphTraits<MachineFunction *>::nodes_begin(FuncRep);
if (EntryMBB->succ_size() == 0) {
Finish = true;
- DEBUG(
- dbgs() << "Reduce to one block\n";
- );
+ LLVM_DEBUG(dbgs() << "Reduce to one block\n";);
} else {
int NewnumRemainedBlk
= countActiveBlock(OrderedBlks.begin(), OrderedBlks.end());
@@ -860,9 +845,7 @@ bool AMDGPUCFGStructurizer::run() {
NumRemainedBlk = NewnumRemainedBlk;
} else {
MakeProgress = false;
- DEBUG(
- dbgs() << "No progress\n";
- );
+ LLVM_DEBUG(dbgs() << "No progress\n";);
}
}
} while (!Finish && MakeProgress);
@@ -875,9 +858,7 @@ bool AMDGPUCFGStructurizer::run() {
It != E; ++It) {
if ((*It).second && (*It).second->IsRetired) {
assert(((*It).first)->getNumber() != -1);
- DEBUG(
- dbgs() << "Erase BB" << ((*It).first)->getNumber() << "\n";
- );
+ LLVM_DEBUG(dbgs() << "Erase BB" << ((*It).first)->getNumber() << "\n";);
(*It).first->eraseFromParent(); //Remove from the parent Function.
}
delete (*It).second;
@@ -886,7 +867,7 @@ bool AMDGPUCFGStructurizer::run() {
LLInfoMap.clear();
if (!Finish) {
- DEBUG(FuncRep->viewCFG());
+ LLVM_DEBUG(FuncRep->viewCFG());
report_fatal_error("IRREDUCIBLE_CFG");
}
@@ -920,17 +901,13 @@ int AMDGPUCFGStructurizer::patternMatch(MachineBasicBlock *MBB) {
int NumMatch = 0;
int CurMatch;
- DEBUG(
- dbgs() << "Begin patternMatch BB" << MBB->getNumber() << "\n";
- );
+ LLVM_DEBUG(dbgs() << "Begin patternMatch BB" << MBB->getNumber() << "\n";);
while ((CurMatch = patternMatchGroup(MBB)) > 0)
NumMatch += CurMatch;
- DEBUG(
- dbgs() << "End patternMatch BB" << MBB->getNumber()
- << ", numMatch = " << NumMatch << "\n";
- );
+ LLVM_DEBUG(dbgs() << "End patternMatch BB" << MBB->getNumber()
+ << ", numMatch = " << NumMatch << "\n";);
return NumMatch;
}
@@ -1050,7 +1027,7 @@ int AMDGPUCFGStructurizer::loopendPatternMatch() {
for (MachineLoop *ExaminedLoop : NestedLoops) {
if (ExaminedLoop->getNumBlocks() == 0 || Visited[ExaminedLoop])
continue;
- DEBUG(dbgs() << "Processing:\n"; ExaminedLoop->dump(););
+ LLVM_DEBUG(dbgs() << "Processing:\n"; ExaminedLoop->dump(););
int NumBreak = mergeLoop(ExaminedLoop);
if (NumBreak == -1)
break;
@@ -1064,7 +1041,8 @@ int AMDGPUCFGStructurizer::mergeLoop(MachineLoop *LoopRep) {
MBBVector ExitingMBBs;
LoopRep->getExitingBlocks(ExitingMBBs);
assert(!ExitingMBBs.empty() && "Infinite Loop not supported");
- DEBUG(dbgs() << "Loop has " << ExitingMBBs.size() << " exiting blocks\n";);
+ LLVM_DEBUG(dbgs() << "Loop has " << ExitingMBBs.size()
+ << " exiting blocks\n";);
// We assume a single ExitBlk
MBBVector ExitBlks;
LoopRep->getExitBlocks(ExitBlks);
@@ -1106,11 +1084,9 @@ bool AMDGPUCFGStructurizer::isSameloopDetachedContbreak(
if (LoopRep&& LoopRep == MLI->getLoopFor(Src2MBB)) {
MachineBasicBlock *&TheEntry = LLInfoMap[LoopRep];
if (TheEntry) {
- DEBUG(
- dbgs() << "isLoopContBreakBlock yes src1 = BB"
- << Src1MBB->getNumber()
- << " src2 = BB" << Src2MBB->getNumber() << "\n";
- );
+ LLVM_DEBUG(dbgs() << "isLoopContBreakBlock yes src1 = BB"
+ << Src1MBB->getNumber() << " src2 = BB"
+ << Src2MBB->getNumber() << "\n";);
return true;
}
}
@@ -1122,9 +1098,8 @@ int AMDGPUCFGStructurizer::handleJumpintoIf(MachineBasicBlock *HeadMBB,
MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB) {
int Num = handleJumpintoIfImp(HeadMBB, TrueMBB, FalseMBB);
if (Num == 0) {
- DEBUG(
- dbgs() << "handleJumpintoIf swap trueBlk and FalseBlk" << "\n";
- );
+ LLVM_DEBUG(dbgs() << "handleJumpintoIf swap trueBlk and FalseBlk"
+ << "\n";);
Num = handleJumpintoIfImp(HeadMBB, FalseMBB, TrueMBB);
}
return Num;
@@ -1138,22 +1113,16 @@ int AMDGPUCFGStructurizer::handleJumpintoIfImp(MachineBasicBlock *HeadMBB,
//trueBlk could be the common post dominator
DownBlk = TrueMBB;
- DEBUG(
- dbgs() << "handleJumpintoIfImp head = BB" << HeadMBB->getNumber()
- << " true = BB" << TrueMBB->getNumber()
- << ", numSucc=" << TrueMBB->succ_size()
- << " false = BB" << FalseMBB->getNumber() << "\n";
- );
+ LLVM_DEBUG(dbgs() << "handleJumpintoIfImp head = BB" << HeadMBB->getNumber()
+ << " true = BB" << TrueMBB->getNumber()
+ << ", numSucc=" << TrueMBB->succ_size() << " false = BB"
+ << FalseMBB->getNumber() << "\n";);
while (DownBlk) {
- DEBUG(
- dbgs() << "check down = BB" << DownBlk->getNumber();
- );
+ LLVM_DEBUG(dbgs() << "check down = BB" << DownBlk->getNumber(););
if (singlePathTo(FalseMBB, DownBlk) == SinglePath_InPath) {
- DEBUG(
- dbgs() << " working\n";
- );
+ LLVM_DEBUG(dbgs() << " working\n";);
Num += cloneOnSideEntryTo(HeadMBB, TrueMBB, DownBlk);
Num += cloneOnSideEntryTo(HeadMBB, FalseMBB, DownBlk);
@@ -1166,9 +1135,7 @@ int AMDGPUCFGStructurizer::handleJumpintoIfImp(MachineBasicBlock *HeadMBB,
break;
}
- DEBUG(
- dbgs() << " not working\n";
- );
+ LLVM_DEBUG(dbgs() << " not working\n";);
DownBlk = (DownBlk->succ_size() == 1) ? (*DownBlk->succ_begin()) : nullptr;
} // walk down the postDomTree
@@ -1247,10 +1214,9 @@ int AMDGPUCFGStructurizer::improveSimpleJumpintoIf(MachineBasicBlock *HeadMBB,
if (!MigrateFalse && FalseMBB && FalseMBB->pred_size() > 1)
MigrateFalse = true;
- DEBUG(
- dbgs() << "before improveSimpleJumpintoIf: ";
- showImproveSimpleJumpintoIf(HeadMBB, TrueMBB, FalseMBB, LandBlk, 0);
- );
+ LLVM_DEBUG(
+ dbgs() << "before improveSimpleJumpintoIf: ";
+ showImproveSimpleJumpintoIf(HeadMBB, TrueMBB, FalseMBB, LandBlk, 0););
// org: headBlk => if () {trueBlk} else {falseBlk} => landBlk
//
@@ -1337,15 +1303,15 @@ int AMDGPUCFGStructurizer::improveSimpleJumpintoIf(MachineBasicBlock *HeadMBB,
bool LandBlkHasOtherPred = (LandBlk->pred_size() > 2);
- //insert AMDGPU::ENDIF to avoid special case "input landBlk == NULL"
- MachineBasicBlock::iterator I = insertInstrBefore(LandBlk, AMDGPU::ENDIF);
+ //insert R600::ENDIF to avoid special case "input landBlk == NULL"
+ MachineBasicBlock::iterator I = insertInstrBefore(LandBlk, R600::ENDIF);
if (LandBlkHasOtherPred) {
report_fatal_error("Extra register needed to handle CFG");
unsigned CmpResReg =
HeadMBB->getParent()->getRegInfo().createVirtualRegister(I32RC);
report_fatal_error("Extra compare instruction needed to handle CFG");
- insertCondBranchBefore(LandBlk, I, AMDGPU::IF_PREDICATE_SET,
+ insertCondBranchBefore(LandBlk, I, R600::IF_PREDICATE_SET,
CmpResReg, DebugLoc());
}
@@ -1353,7 +1319,7 @@ int AMDGPUCFGStructurizer::improveSimpleJumpintoIf(MachineBasicBlock *HeadMBB,
// cause an assertion failure in the PostRA scheduling pass.
unsigned InitReg =
HeadMBB->getParent()->getRegInfo().createVirtualRegister(I32RC);
- insertCondBranchBefore(LandBlk, I, AMDGPU::IF_PREDICATE_SET, InitReg,
+ insertCondBranchBefore(LandBlk, I, R600::IF_PREDICATE_SET, InitReg,
DebugLoc());
if (MigrateTrue) {
@@ -1363,7 +1329,7 @@ int AMDGPUCFGStructurizer::improveSimpleJumpintoIf(MachineBasicBlock *HeadMBB,
// (initVal != 1).
report_fatal_error("Extra register needed to handle CFG");
}
- insertInstrBefore(I, AMDGPU::ELSE);
+ insertInstrBefore(I, R600::ELSE);
if (MigrateFalse) {
migrateInstruction(FalseMBB, LandBlk, I);
@@ -1375,7 +1341,7 @@ int AMDGPUCFGStructurizer::improveSimpleJumpintoIf(MachineBasicBlock *HeadMBB,
if (LandBlkHasOtherPred) {
// add endif
- insertInstrBefore(I, AMDGPU::ENDIF);
+ insertInstrBefore(I, R600::ENDIF);
// put initReg = 2 to other predecessors of landBlk
for (MachineBasicBlock::pred_iterator PI = LandBlk->pred_begin(),
@@ -1385,10 +1351,9 @@ int AMDGPUCFGStructurizer::improveSimpleJumpintoIf(MachineBasicBlock *HeadMBB,
report_fatal_error("Extra register needed to handle CFG");
}
}
- DEBUG(
- dbgs() << "result from improveSimpleJumpintoIf: ";
- showImproveSimpleJumpintoIf(HeadMBB, TrueMBB, FalseMBB, LandBlk, 0);
- );
+ LLVM_DEBUG(
+ dbgs() << "result from improveSimpleJumpintoIf: ";
+ showImproveSimpleJumpintoIf(HeadMBB, TrueMBB, FalseMBB, LandBlk, 0););
// update landBlk
*LandMBBPtr = LandBlk;
@@ -1398,10 +1363,8 @@ int AMDGPUCFGStructurizer::improveSimpleJumpintoIf(MachineBasicBlock *HeadMBB,
void AMDGPUCFGStructurizer::mergeSerialBlock(MachineBasicBlock *DstMBB,
MachineBasicBlock *SrcMBB) {
- DEBUG(
- dbgs() << "serialPattern BB" << DstMBB->getNumber()
- << " <= BB" << SrcMBB->getNumber() << "\n";
- );
+ LLVM_DEBUG(dbgs() << "serialPattern BB" << DstMBB->getNumber() << " <= BB"
+ << SrcMBB->getNumber() << "\n";);
DstMBB->splice(DstMBB->end(), SrcMBB, SrcMBB->begin(), SrcMBB->end());
DstMBB->removeSuccessor(SrcMBB, true);
@@ -1416,26 +1379,15 @@ void AMDGPUCFGStructurizer::mergeIfthenelseBlock(MachineInstr *BranchMI,
MachineBasicBlock *MBB, MachineBasicBlock *TrueMBB,
MachineBasicBlock *FalseMBB, MachineBasicBlock *LandMBB) {
assert (TrueMBB);
- DEBUG(
- dbgs() << "ifPattern BB" << MBB->getNumber();
- dbgs() << "{ ";
- if (TrueMBB) {
- dbgs() << "BB" << TrueMBB->getNumber();
- }
- dbgs() << " } else ";
- dbgs() << "{ ";
- if (FalseMBB) {
- dbgs() << "BB" << FalseMBB->getNumber();
- }
- dbgs() << " }\n ";
- dbgs() << "landBlock: ";
- if (!LandMBB) {
- dbgs() << "NULL";
- } else {
- dbgs() << "BB" << LandMBB->getNumber();
- }
- dbgs() << "\n";
- );
+ LLVM_DEBUG(dbgs() << "ifPattern BB" << MBB->getNumber(); dbgs() << "{ ";
+ if (TrueMBB) { dbgs() << "BB" << TrueMBB->getNumber(); } dbgs()
+ << " } else ";
+ dbgs() << "{ "; if (FalseMBB) {
+ dbgs() << "BB" << FalseMBB->getNumber();
+ } dbgs() << " }\n ";
+ dbgs() << "landBlock: "; if (!LandMBB) { dbgs() << "NULL"; } else {
+ dbgs() << "BB" << LandMBB->getNumber();
+ } dbgs() << "\n";);
int OldOpcode = BranchMI->getOpcode();
DebugLoc BranchDL = BranchMI->getDebugLoc();
@@ -1462,7 +1414,7 @@ void AMDGPUCFGStructurizer::mergeIfthenelseBlock(MachineInstr *BranchMI,
}
if (FalseMBB) {
- insertInstrBefore(I, AMDGPU::ELSE);
+ insertInstrBefore(I, R600::ELSE);
MBB->splice(I, FalseMBB, FalseMBB->begin(),
FalseMBB->end());
MBB->removeSuccessor(FalseMBB, true);
@@ -1471,7 +1423,7 @@ void AMDGPUCFGStructurizer::mergeIfthenelseBlock(MachineInstr *BranchMI,
retireBlock(FalseMBB);
MLI->removeBlock(FalseMBB);
}
- insertInstrBefore(I, AMDGPU::ENDIF);
+ insertInstrBefore(I, R600::ENDIF);
BranchMI->eraseFromParent();
@@ -1481,18 +1433,19 @@ void AMDGPUCFGStructurizer::mergeIfthenelseBlock(MachineInstr *BranchMI,
void AMDGPUCFGStructurizer::mergeLooplandBlock(MachineBasicBlock *DstBlk,
MachineBasicBlock *LandMBB) {
- DEBUG(dbgs() << "loopPattern header = BB" << DstBlk->getNumber()
- << " land = BB" << LandMBB->getNumber() << "\n";);
+ LLVM_DEBUG(dbgs() << "loopPattern header = BB" << DstBlk->getNumber()
+ << " land = BB" << LandMBB->getNumber() << "\n";);
- insertInstrBefore(DstBlk, AMDGPU::WHILELOOP, DebugLoc());
- insertInstrEnd(DstBlk, AMDGPU::ENDLOOP, DebugLoc());
+ insertInstrBefore(DstBlk, R600::WHILELOOP, DebugLoc());
+ insertInstrEnd(DstBlk, R600::ENDLOOP, DebugLoc());
DstBlk->replaceSuccessor(DstBlk, LandMBB);
}
void AMDGPUCFGStructurizer::mergeLoopbreakBlock(MachineBasicBlock *ExitingMBB,
MachineBasicBlock *LandMBB) {
- DEBUG(dbgs() << "loopbreakPattern exiting = BB" << ExitingMBB->getNumber()
- << " land = BB" << LandMBB->getNumber() << "\n";);
+ LLVM_DEBUG(dbgs() << "loopbreakPattern exiting = BB"
+ << ExitingMBB->getNumber() << " land = BB"
+ << LandMBB->getNumber() << "\n";);
MachineInstr *BranchMI = getLoopendBlockBranchInstr(ExitingMBB);
assert(BranchMI && isCondBranch(BranchMI));
DebugLoc DL = BranchMI->getDebugLoc();
@@ -1500,9 +1453,9 @@ void AMDGPUCFGStructurizer::mergeLoopbreakBlock(MachineBasicBlock *ExitingMBB,
MachineBasicBlock::iterator I = BranchMI;
if (TrueBranch != LandMBB)
reversePredicateSetter(I, *I->getParent());
- insertCondBranchBefore(ExitingMBB, I, AMDGPU::IF_PREDICATE_SET, AMDGPU::PREDICATE_BIT, DL);
- insertInstrBefore(I, AMDGPU::BREAK);
- insertInstrBefore(I, AMDGPU::ENDIF);
+ insertCondBranchBefore(ExitingMBB, I, R600::IF_PREDICATE_SET, R600::PREDICATE_BIT, DL);
+ insertInstrBefore(I, R600::BREAK);
+ insertInstrBefore(I, R600::ENDIF);
//now branchInst can be erased safely
BranchMI->eraseFromParent();
//now take care of successors, retire blocks
@@ -1511,9 +1464,9 @@ void AMDGPUCFGStructurizer::mergeLoopbreakBlock(MachineBasicBlock *ExitingMBB,
void AMDGPUCFGStructurizer::settleLoopcontBlock(MachineBasicBlock *ContingMBB,
MachineBasicBlock *ContMBB) {
- DEBUG(dbgs() << "settleLoopcontBlock conting = BB"
- << ContingMBB->getNumber()
- << ", cont = BB" << ContMBB->getNumber() << "\n";);
+ LLVM_DEBUG(dbgs() << "settleLoopcontBlock conting = BB"
+ << ContingMBB->getNumber() << ", cont = BB"
+ << ContMBB->getNumber() << "\n";);
MachineInstr *MI = getLoopendBlockBranchInstr(ContingMBB);
if (MI) {
@@ -1531,8 +1484,8 @@ void AMDGPUCFGStructurizer::settleLoopcontBlock(MachineBasicBlock *ContingMBB,
getBranchZeroOpcode(OldOpcode);
insertCondBranchBefore(I, BranchOpcode, DL);
// insertEnd to ensure phi-moves, if exist, go before the continue-instr.
- insertInstrEnd(ContingMBB, AMDGPU::CONTINUE, DL);
- insertInstrEnd(ContingMBB, AMDGPU::ENDIF, DL);
+ insertInstrEnd(ContingMBB, R600::CONTINUE, DL);
+ insertInstrEnd(ContingMBB, R600::ENDIF, DL);
} else {
int BranchOpcode =
TrueBranch == ContMBB ? getContinueNzeroOpcode(OldOpcode) :
@@ -1547,7 +1500,7 @@ void AMDGPUCFGStructurizer::settleLoopcontBlock(MachineBasicBlock *ContingMBB,
// location we've just inserted that reference here so it should be
// representative insertEnd to ensure phi-moves, if exist, go before the
// continue-instr.
- insertInstrEnd(ContingMBB, AMDGPU::CONTINUE,
+ insertInstrEnd(ContingMBB, R600::CONTINUE,
getLastDebugLocInBB(ContingMBB));
}
}
@@ -1587,10 +1540,9 @@ AMDGPUCFGStructurizer::cloneBlockForPredecessor(MachineBasicBlock *MBB,
numClonedInstr += MBB->size();
- DEBUG(
- dbgs() << "Cloned block: " << "BB"
- << MBB->getNumber() << "size " << MBB->size() << "\n";
- );
+ LLVM_DEBUG(dbgs() << "Cloned block: "
+ << "BB" << MBB->getNumber() << "size " << MBB->size()
+ << "\n";);
SHOWNEWBLK(CloneMBB, "result of Cloned block: ");
@@ -1603,26 +1555,22 @@ void AMDGPUCFGStructurizer::migrateInstruction(MachineBasicBlock *SrcMBB,
//look for the input branchinstr, not the AMDGPU branchinstr
MachineInstr *BranchMI = getNormalBlockBranchInstr(SrcMBB);
if (!BranchMI) {
- DEBUG(
- dbgs() << "migrateInstruction don't see branch instr\n";
- );
+ LLVM_DEBUG(dbgs() << "migrateInstruction don't see branch instr\n";);
SpliceEnd = SrcMBB->end();
} else {
- DEBUG(dbgs() << "migrateInstruction see branch instr: " << *BranchMI);
+ LLVM_DEBUG(dbgs() << "migrateInstruction see branch instr: " << *BranchMI);
SpliceEnd = BranchMI;
}
- DEBUG(
- dbgs() << "migrateInstruction before splice dstSize = " << DstMBB->size()
- << "srcSize = " << SrcMBB->size() << "\n";
- );
+ LLVM_DEBUG(dbgs() << "migrateInstruction before splice dstSize = "
+ << DstMBB->size() << "srcSize = " << SrcMBB->size()
+ << "\n";);
//splice insert before insertPos
DstMBB->splice(I, SrcMBB, SrcMBB->begin(), SpliceEnd);
- DEBUG(
- dbgs() << "migrateInstruction after splice dstSize = " << DstMBB->size()
- << "srcSize = " << SrcMBB->size() << '\n';
- );
+ LLVM_DEBUG(dbgs() << "migrateInstruction after splice dstSize = "
+ << DstMBB->size() << "srcSize = " << SrcMBB->size()
+ << '\n';);
}
MachineBasicBlock *
@@ -1640,7 +1588,7 @@ AMDGPUCFGStructurizer::normalizeInfiniteLoopExit(MachineLoop* LoopRep) {
MachineBasicBlock *DummyExitBlk = FuncRep->CreateMachineBasicBlock();
FuncRep->push_back(DummyExitBlk); //insert to function
SHOWNEWBLK(DummyExitBlk, "DummyExitBlock to normalize infiniteLoop: ");
- DEBUG(dbgs() << "Old branch instr: " << *BranchMI << "\n";);
+ LLVM_DEBUG(dbgs() << "Old branch instr: " << *BranchMI << "\n";);
LLVMContext &Ctx = LoopHeader->getParent()->getFunction().getContext();
Ctx.emitError("Extra register needed to handle CFG");
return nullptr;
@@ -1653,7 +1601,7 @@ void AMDGPUCFGStructurizer::removeUnconditionalBranch(MachineBasicBlock *MBB) {
// test_fc_do_while_or.c need to fix the upstream on this to remove the loop.
while ((BranchMI = getLoopendBlockBranchInstr(MBB))
&& isUncondBranch(BranchMI)) {
- DEBUG(dbgs() << "Removing uncond branch instr: " << *BranchMI);
+ LLVM_DEBUG(dbgs() << "Removing uncond branch instr: " << *BranchMI);
BranchMI->eraseFromParent();
}
}
@@ -1669,7 +1617,7 @@ void AMDGPUCFGStructurizer::removeRedundantConditionalBranch(
MachineInstr *BranchMI = getNormalBlockBranchInstr(MBB);
assert(BranchMI && isCondBranch(BranchMI));
- DEBUG(dbgs() << "Removing unneeded cond branch instr: " << *BranchMI);
+ LLVM_DEBUG(dbgs() << "Removing unneeded cond branch instr: " << *BranchMI);
BranchMI->eraseFromParent();
SHOWNEWBLK(MBB1, "Removing redundant successor");
MBB->removeSuccessor(MBB1, true);
@@ -1679,7 +1627,7 @@ void AMDGPUCFGStructurizer::addDummyExitBlock(
SmallVectorImpl<MachineBasicBlock*> &RetMBB) {
MachineBasicBlock *DummyExitBlk = FuncRep->CreateMachineBasicBlock();
FuncRep->push_back(DummyExitBlk); //insert to function
- insertInstrEnd(DummyExitBlk, AMDGPU::RETURN);
+ insertInstrEnd(DummyExitBlk, R600::RETURN);
for (SmallVectorImpl<MachineBasicBlock *>::iterator It = RetMBB.begin(),
E = RetMBB.end(); It != E; ++It) {
@@ -1688,10 +1636,8 @@ void AMDGPUCFGStructurizer::addDummyExitBlock(
if (MI)
MI->eraseFromParent();
MBB->addSuccessor(DummyExitBlk);
- DEBUG(
- dbgs() << "Add dummyExitBlock to BB" << MBB->getNumber()
- << " successors\n";
- );
+ LLVM_DEBUG(dbgs() << "Add dummyExitBlock to BB" << MBB->getNumber()
+ << " successors\n";);
}
SHOWNEWBLK(DummyExitBlk, "DummyExitBlock: ");
}
@@ -1710,9 +1656,7 @@ void AMDGPUCFGStructurizer::recordSccnum(MachineBasicBlock *MBB,
}
void AMDGPUCFGStructurizer::retireBlock(MachineBasicBlock *MBB) {
- DEBUG(
- dbgs() << "Retiring BB" << MBB->getNumber() << "\n";
- );
+ LLVM_DEBUG(dbgs() << "Retiring BB" << MBB->getNumber() << "\n";);
BlockInformation *&SrcBlkInfo = BlockInfoMap[MBB];
diff --git a/lib/Target/AMDGPU/AMDKernelCodeT.h b/lib/Target/AMDGPU/AMDKernelCodeT.h
index 5d243e949fd3..289642aaa2d0 100644
--- a/lib/Target/AMDGPU/AMDKernelCodeT.h
+++ b/lib/Target/AMDGPU/AMDKernelCodeT.h
@@ -198,7 +198,7 @@ enum amd_code_property_mask_t {
AMD_CODE_PROPERTY_RESERVED2 = ((1 << AMD_CODE_PROPERTY_RESERVED2_WIDTH) - 1) << AMD_CODE_PROPERTY_RESERVED2_SHIFT
};
-/// @brief The hsa_ext_control_directives_t specifies the values for the HSAIL
+/// The hsa_ext_control_directives_t specifies the values for the HSAIL
/// control directives. These control how the finalizer generates code. This
/// struct is used both as an argument to hsaFinalizeKernel to specify values for
/// the control directives, and is used in HsaKernelCode to record the values of
@@ -551,14 +551,8 @@ typedef struct amd_kernel_code_s {
int64_t kernel_code_prefetch_byte_offset;
uint64_t kernel_code_prefetch_byte_size;
- /// Number of bytes of scratch backing memory required for full
- /// occupancy of target chip. This takes into account the number of
- /// bytes of scratch per work-item, the wavefront size, the maximum
- /// number of wavefronts per CU, and the number of CUs. This is an
- /// upper limit on scratch. If the grid being dispatched is small it
- /// may only need less than this. If the kernel uses no scratch, or
- /// the Finalizer has not computed this value, it must be 0.
- uint64_t max_scratch_backing_memory_byte_size;
+ /// Reserved. Must be 0.
+ uint64_t reserved0;
/// Shader program settings for CS. Contains COMPUTE_PGM_RSRC1 and
/// COMPUTE_PGM_RSRC2 registers.
diff --git a/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
index ebf656c549ec..31e2885c833d 100644
--- a/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ b/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -12,6 +12,7 @@
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "MCTargetDesc/AMDGPUTargetStreamer.h"
#include "SIDefines.h"
+#include "SIInstrInfo.h"
#include "Utils/AMDGPUAsmUtils.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "Utils/AMDKernelCodeTUtils.h"
@@ -25,7 +26,6 @@
#include "llvm/ADT/StringSwitch.h"
#include "llvm/ADT/Twine.h"
#include "llvm/BinaryFormat/ELF.h"
-#include "llvm/CodeGen/MachineValueType.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
@@ -42,9 +42,11 @@
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/MC/MCSymbol.h"
#include "llvm/Support/AMDGPUMetadata.h"
+#include "llvm/Support/AMDHSAKernelDescriptor.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MachineValueType.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/SMLoc.h"
#include "llvm/Support/TargetRegistry.h"
@@ -60,6 +62,7 @@
using namespace llvm;
using namespace llvm::AMDGPU;
+using namespace llvm::amdhsa;
namespace {
@@ -128,6 +131,7 @@ public:
enum ImmTy {
ImmTyNone,
ImmTyGDS,
+ ImmTyLDS,
ImmTyOffen,
ImmTyIdxen,
ImmTyAddr64,
@@ -138,6 +142,7 @@ public:
ImmTyGLC,
ImmTySLC,
ImmTyTFE,
+ ImmTyD16,
ImmTyClampSI,
ImmTyOModSI,
ImmTyDppCtrl,
@@ -267,7 +272,11 @@ public:
return isOff() || isRegClass(AMDGPU::VGPR_32RegClassID);
}
- bool isSDWARegKind() const;
+ bool isSDWAOperand(MVT type) const;
+ bool isSDWAFP16Operand() const;
+ bool isSDWAFP32Operand() const;
+ bool isSDWAInt16Operand() const;
+ bool isSDWAInt32Operand() const;
bool isImmTy(ImmTy ImmT) const {
return isImm() && Imm.Type == ImmT;
@@ -282,7 +291,7 @@ public:
bool isDMask() const { return isImmTy(ImmTyDMask); }
bool isUNorm() const { return isImmTy(ImmTyUNorm); }
bool isDA() const { return isImmTy(ImmTyDA); }
- bool isR128() const { return isImmTy(ImmTyUNorm); }
+ bool isR128() const { return isImmTy(ImmTyR128); }
bool isLWE() const { return isImmTy(ImmTyLWE); }
bool isOff() const { return isImmTy(ImmTyOff); }
bool isExpTgt() const { return isImmTy(ImmTyExpTgt); }
@@ -298,9 +307,11 @@ public:
bool isOffsetU12() const { return (isImmTy(ImmTyOffset) || isImmTy(ImmTyInstOffset)) && isUInt<12>(getImm()); }
bool isOffsetS13() const { return (isImmTy(ImmTyOffset) || isImmTy(ImmTyInstOffset)) && isInt<13>(getImm()); }
bool isGDS() const { return isImmTy(ImmTyGDS); }
+ bool isLDS() const { return isImmTy(ImmTyLDS); }
bool isGLC() const { return isImmTy(ImmTyGLC); }
bool isSLC() const { return isImmTy(ImmTySLC); }
bool isTFE() const { return isImmTy(ImmTyTFE); }
+ bool isD16() const { return isImmTy(ImmTyD16); }
bool isDFMT() const { return isImmTy(ImmTyDFMT) && isUInt<8>(getImm()); }
bool isNFMT() const { return isImmTy(ImmTyNFMT) && isUInt<8>(getImm()); }
bool isBankMask() const { return isImmTy(ImmTyDppBankMask); }
@@ -434,7 +445,7 @@ public:
}
bool isVSrcB32() const {
- return isVCSrcF32() || isLiteralImm(MVT::i32);
+ return isVCSrcF32() || isLiteralImm(MVT::i32) || isExpr();
}
bool isVSrcB64() const {
@@ -451,7 +462,7 @@ public:
}
bool isVSrcF32() const {
- return isVCSrcF32() || isLiteralImm(MVT::f32);
+ return isVCSrcF32() || isLiteralImm(MVT::f32) || isExpr();
}
bool isVSrcF64() const {
@@ -643,6 +654,7 @@ public:
switch (Type) {
case ImmTyNone: OS << "None"; break;
case ImmTyGDS: OS << "GDS"; break;
+ case ImmTyLDS: OS << "LDS"; break;
case ImmTyOffen: OS << "Offen"; break;
case ImmTyIdxen: OS << "Idxen"; break;
case ImmTyAddr64: OS << "Addr64"; break;
@@ -653,6 +665,7 @@ public:
case ImmTyGLC: OS << "GLC"; break;
case ImmTySLC: OS << "SLC"; break;
case ImmTyTFE: OS << "TFE"; break;
+ case ImmTyD16: OS << "D16"; break;
case ImmTyDFMT: OS << "DFMT"; break;
case ImmTyNFMT: OS << "NFMT"; break;
case ImmTyClampSI: OS << "ClampSI"; break;
@@ -815,6 +828,10 @@ public:
class AMDGPUAsmParser : public MCTargetAsmParser {
MCAsmParser &Parser;
+ // Number of extra operands parsed after the first optional operand.
+ // This may be necessary to skip hardcoded mandatory operands.
+ static const unsigned MAX_OPR_LOOKAHEAD = 8;
+
unsigned ForcedEncodingSize = 0;
bool ForcedDPP = false;
bool ForcedSDWA = false;
@@ -830,6 +847,27 @@ class AMDGPUAsmParser : public MCTargetAsmParser {
private:
bool ParseAsAbsoluteExpression(uint32_t &Ret);
+ bool OutOfRangeError(SMRange Range);
+ /// Calculate VGPR/SGPR blocks required for given target, reserved
+ /// registers, and user-specified NextFreeXGPR values.
+ ///
+ /// \param Features [in] Target features, used for bug corrections.
+ /// \param VCCUsed [in] Whether VCC special SGPR is reserved.
+ /// \param FlatScrUsed [in] Whether FLAT_SCRATCH special SGPR is reserved.
+ /// \param XNACKUsed [in] Whether XNACK_MASK special SGPR is reserved.
+ /// \param NextFreeVGPR [in] Max VGPR number referenced, plus one.
+ /// \param VGPRRange [in] Token range, used for VGPR diagnostics.
+ /// \param NextFreeSGPR [in] Max SGPR number referenced, plus one.
+ /// \param SGPRRange [in] Token range, used for SGPR diagnostics.
+ /// \param VGPRBlocks [out] Result VGPR block count.
+ /// \param SGPRBlocks [out] Result SGPR block count.
+ bool calculateGPRBlocks(const FeatureBitset &Features, bool VCCUsed,
+ bool FlatScrUsed, bool XNACKUsed,
+ unsigned NextFreeVGPR, SMRange VGPRRange,
+ unsigned NextFreeSGPR, SMRange SGPRRange,
+ unsigned &VGPRBlocks, unsigned &SGPRBlocks);
+ bool ParseDirectiveAMDGCNTarget();
+ bool ParseDirectiveAMDHSAKernel();
bool ParseDirectiveMajorMinor(uint32_t &Major, uint32_t &Minor);
bool ParseDirectiveHSACodeObjectVersion();
bool ParseDirectiveHSACodeObjectISA();
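calculateGPRBlocks, declared above and defined later in this patch, turns the highest referenced register numbers into the block counts encoded in COMPUTE_PGM_RSRC1. The underlying arithmetic is a round-up to the hardware allocation granule followed by a minus-one bias; a standalone sketch, under the assumption of a 4-register VGPR granule and an 8-register SGPR granule (the real granules come from IsaInfo and vary by target):

#include <algorithm>
#include <cstdio>

// Illustrative only: encode a register count as "number of allocation
// blocks minus one", which is how the RSRC1 fields are programmed.
static unsigned getGPRBlocks(unsigned NumGPRs, unsigned Granule) {
  NumGPRs = std::max(NumGPRs, 1u);              // at least one register
  return (NumGPRs + Granule - 1) / Granule - 1; // round up, then bias by -1
}

int main() {
  std::printf("vgpr_blocks=%u sgpr_blocks=%u\n",
              getGPRBlocks(/*NumGPRs=*/37, /*Granule=*/4),   // -> 9
              getGPRBlocks(/*NumGPRs=*/102, /*Granule=*/8)); // -> 12
}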
@@ -848,8 +886,12 @@ private:
bool ParseAMDGPURegister(RegisterKind& RegKind, unsigned& Reg,
unsigned& RegNum, unsigned& RegWidth,
unsigned *DwordRegIndex);
+ Optional<StringRef> getGprCountSymbolName(RegisterKind RegKind);
+ void initializeGprCountSymbol(RegisterKind RegKind);
+ bool updateGprCountSymbols(RegisterKind RegKind, unsigned DwordRegIndex,
+ unsigned RegWidth);
void cvtMubufImpl(MCInst &Inst, const OperandVector &Operands,
- bool IsAtomic, bool IsAtomicReturn);
+ bool IsAtomic, bool IsAtomicReturn, bool IsLds = false);
void cvtDSImpl(MCInst &Inst, const OperandVector &Operands,
bool IsGdsHardcoded);
@@ -881,15 +923,37 @@ public:
AMDGPU::IsaInfo::IsaVersion ISA =
AMDGPU::IsaInfo::getIsaVersion(getFeatureBits());
MCContext &Ctx = getContext();
- MCSymbol *Sym =
- Ctx.getOrCreateSymbol(Twine(".option.machine_version_major"));
- Sym->setVariableValue(MCConstantExpr::create(ISA.Major, Ctx));
- Sym = Ctx.getOrCreateSymbol(Twine(".option.machine_version_minor"));
- Sym->setVariableValue(MCConstantExpr::create(ISA.Minor, Ctx));
- Sym = Ctx.getOrCreateSymbol(Twine(".option.machine_version_stepping"));
- Sym->setVariableValue(MCConstantExpr::create(ISA.Stepping, Ctx));
+ if (ISA.Major >= 6 && AMDGPU::IsaInfo::hasCodeObjectV3(&getSTI())) {
+ MCSymbol *Sym =
+ Ctx.getOrCreateSymbol(Twine(".amdgcn.gfx_generation_number"));
+ Sym->setVariableValue(MCConstantExpr::create(ISA.Major, Ctx));
+ } else {
+ MCSymbol *Sym =
+ Ctx.getOrCreateSymbol(Twine(".option.machine_version_major"));
+ Sym->setVariableValue(MCConstantExpr::create(ISA.Major, Ctx));
+ Sym = Ctx.getOrCreateSymbol(Twine(".option.machine_version_minor"));
+ Sym->setVariableValue(MCConstantExpr::create(ISA.Minor, Ctx));
+ Sym = Ctx.getOrCreateSymbol(Twine(".option.machine_version_stepping"));
+ Sym->setVariableValue(MCConstantExpr::create(ISA.Stepping, Ctx));
+ }
+ if (ISA.Major >= 6 && AMDGPU::IsaInfo::hasCodeObjectV3(&getSTI())) {
+ initializeGprCountSymbol(IS_VGPR);
+ initializeGprCountSymbol(IS_SGPR);
+ } else
+ KernelScope.initialize(getContext());
}
- KernelScope.initialize(getContext());
+ }
+
+ bool hasXNACK() const {
+ return AMDGPU::hasXNACK(getSTI());
+ }
+
+ bool hasMIMG_R128() const {
+ return AMDGPU::hasMIMG_R128(getSTI());
+ }
+
+ bool hasPackedD16() const {
+ return AMDGPU::hasPackedD16(getSTI());
}
bool isSI() const {
@@ -1025,6 +1089,11 @@ private:
bool validateConstantBusLimitations(const MCInst &Inst);
bool validateEarlyClobberLimitations(const MCInst &Inst);
bool validateIntClampSupported(const MCInst &Inst);
+ bool validateMIMGAtomicDMask(const MCInst &Inst);
+ bool validateMIMGGatherDMask(const MCInst &Inst);
+ bool validateMIMGDataSize(const MCInst &Inst);
+ bool validateMIMGR128(const MCInst &Inst);
+ bool validateMIMGD16(const MCInst &Inst);
bool usesConstantBus(const MCInst &Inst, unsigned OpIdx);
bool isInlineConstant(const MCInst &Inst, unsigned OpIdx) const;
unsigned findImplicitSGPRReadInVOP(const MCInst &Inst) const;
@@ -1037,6 +1106,7 @@ private:
public:
OperandMatchResultTy parseOptionalOperand(OperandVector &Operands);
+ OperandMatchResultTy parseOptionalOpr(OperandVector &Operands);
OperandMatchResultTy parseExpTgt(OperandVector &Operands);
OperandMatchResultTy parseSendMsgOp(OperandVector &Operands);
@@ -1060,17 +1130,12 @@ public:
void cvtMubuf(MCInst &Inst, const OperandVector &Operands) { cvtMubufImpl(Inst, Operands, false, false); }
void cvtMubufAtomic(MCInst &Inst, const OperandVector &Operands) { cvtMubufImpl(Inst, Operands, true, false); }
void cvtMubufAtomicReturn(MCInst &Inst, const OperandVector &Operands) { cvtMubufImpl(Inst, Operands, true, true); }
+ void cvtMubufLds(MCInst &Inst, const OperandVector &Operands) { cvtMubufImpl(Inst, Operands, false, false, true); }
void cvtMtbuf(MCInst &Inst, const OperandVector &Operands);
AMDGPUOperand::Ptr defaultGLC() const;
AMDGPUOperand::Ptr defaultSLC() const;
- AMDGPUOperand::Ptr defaultTFE() const;
- AMDGPUOperand::Ptr defaultDMask() const;
- AMDGPUOperand::Ptr defaultUNorm() const;
- AMDGPUOperand::Ptr defaultDA() const;
- AMDGPUOperand::Ptr defaultR128() const;
- AMDGPUOperand::Ptr defaultLWE() const;
AMDGPUOperand::Ptr defaultSMRDOffset8() const;
AMDGPUOperand::Ptr defaultSMRDOffset20() const;
AMDGPUOperand::Ptr defaultSMRDLiteralOffset() const;
@@ -1276,15 +1341,31 @@ bool AMDGPUOperand::isRegClass(unsigned RCID) const {
return isRegKind() && AsmParser->getMRI()->getRegClass(RCID).contains(getReg());
}
-bool AMDGPUOperand::isSDWARegKind() const {
+bool AMDGPUOperand::isSDWAOperand(MVT type) const {
if (AsmParser->isVI())
return isVReg();
else if (AsmParser->isGFX9())
- return isRegKind();
+ return isRegKind() || isInlinableImm(type);
else
return false;
}
+bool AMDGPUOperand::isSDWAFP16Operand() const {
+ return isSDWAOperand(MVT::f16);
+}
+
+bool AMDGPUOperand::isSDWAFP32Operand() const {
+ return isSDWAOperand(MVT::f32);
+}
+
+bool AMDGPUOperand::isSDWAInt16Operand() const {
+ return isSDWAOperand(MVT::i16);
+}
+
+bool AMDGPUOperand::isSDWAInt32Operand() const {
+ return isSDWAOperand(MVT::i32);
+}
+
uint64_t AMDGPUOperand::applyInputFPModifiers(uint64_t Val, unsigned Size) const
{
assert(isImmTy(ImmTyNone) && Imm.Mods.hasFPModifiers());
@@ -1516,12 +1597,15 @@ static unsigned getSpecialRegForName(StringRef RegName) {
.Case("exec", AMDGPU::EXEC)
.Case("vcc", AMDGPU::VCC)
.Case("flat_scratch", AMDGPU::FLAT_SCR)
+ .Case("xnack_mask", AMDGPU::XNACK_MASK)
.Case("m0", AMDGPU::M0)
.Case("scc", AMDGPU::SCC)
.Case("tba", AMDGPU::TBA)
.Case("tma", AMDGPU::TMA)
.Case("flat_scratch_lo", AMDGPU::FLAT_SCR_LO)
.Case("flat_scratch_hi", AMDGPU::FLAT_SCR_HI)
+ .Case("xnack_mask_lo", AMDGPU::XNACK_MASK_LO)
+ .Case("xnack_mask_hi", AMDGPU::XNACK_MASK_HI)
.Case("vcc_lo", AMDGPU::VCC_LO)
.Case("vcc_hi", AMDGPU::VCC_HI)
.Case("exec_lo", AMDGPU::EXEC_LO)
@@ -1559,6 +1643,11 @@ bool AMDGPUAsmParser::AddNextRegisterToList(unsigned &Reg, unsigned &RegWidth,
RegWidth = 2;
return true;
}
+ if (Reg == AMDGPU::XNACK_MASK_LO && Reg1 == AMDGPU::XNACK_MASK_HI) {
+ Reg = AMDGPU::XNACK_MASK;
+ RegWidth = 2;
+ return true;
+ }
if (Reg == AMDGPU::VCC_LO && Reg1 == AMDGPU::VCC_HI) {
Reg = AMDGPU::VCC;
RegWidth = 2;
@@ -1717,6 +1806,54 @@ bool AMDGPUAsmParser::ParseAMDGPURegister(RegisterKind &RegKind, unsigned &Reg,
return true;
}
+Optional<StringRef>
+AMDGPUAsmParser::getGprCountSymbolName(RegisterKind RegKind) {
+ switch (RegKind) {
+ case IS_VGPR:
+ return StringRef(".amdgcn.next_free_vgpr");
+ case IS_SGPR:
+ return StringRef(".amdgcn.next_free_sgpr");
+ default:
+ return None;
+ }
+}
+
+void AMDGPUAsmParser::initializeGprCountSymbol(RegisterKind RegKind) {
+ auto SymbolName = getGprCountSymbolName(RegKind);
+ assert(SymbolName && "initializing invalid register kind");
+ MCSymbol *Sym = getContext().getOrCreateSymbol(*SymbolName);
+ Sym->setVariableValue(MCConstantExpr::create(0, getContext()));
+}
+
+bool AMDGPUAsmParser::updateGprCountSymbols(RegisterKind RegKind,
+ unsigned DwordRegIndex,
+ unsigned RegWidth) {
+ // Symbols are only defined for GCN targets
+ if (AMDGPU::IsaInfo::getIsaVersion(getFeatureBits()).Major < 6)
+ return true;
+
+ auto SymbolName = getGprCountSymbolName(RegKind);
+ if (!SymbolName)
+ return true;
+ MCSymbol *Sym = getContext().getOrCreateSymbol(*SymbolName);
+
+ int64_t NewMax = DwordRegIndex + RegWidth - 1;
+ int64_t OldCount;
+
+ if (!Sym->isVariable())
+ return !Error(getParser().getTok().getLoc(),
+ ".amdgcn.next_free_{v,s}gpr symbols must be variable");
+ if (!Sym->getVariableValue(false)->evaluateAsAbsolute(OldCount))
+ return !Error(
+ getParser().getTok().getLoc(),
+ ".amdgcn.next_free_{v,s}gpr symbols must be absolute expressions");
+
+ if (OldCount <= NewMax)
+ Sym->setVariableValue(MCConstantExpr::create(NewMax + 1, getContext()));
+
+ return true;
+}
+
std::unique_ptr<AMDGPUOperand> AMDGPUAsmParser::parseRegister() {
const auto &Tok = Parser.getTok();
SMLoc StartLoc = Tok.getLoc();
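updateGprCountSymbols, added above, keeps .amdgcn.next_free_vgpr and .amdgcn.next_free_sgpr equal to one past the highest register index the assembler has seen, so a later .amdhsa_kernel block can reference them. The bookkeeping is a running maximum; a small sketch with an ordinary map standing in for the MC symbol table (names and types here are illustrative):

#include <cstdint>
#include <map>
#include <string>

// Illustrative stand-in for the MC symbols: symbol name -> current value.
static std::map<std::string, int64_t> SymbolValues = {
    {".amdgcn.next_free_vgpr", 0}, {".amdgcn.next_free_sgpr", 0}};

// Record that registers [DwordRegIndex, DwordRegIndex + RegWidth) were used.
static void noteRegisterUse(const std::string &Sym, unsigned DwordRegIndex,
                            unsigned RegWidth) {
  int64_t NewMax = DwordRegIndex + RegWidth - 1;
  int64_t &Count = SymbolValues[Sym];
  if (Count <= NewMax)
    Count = NewMax + 1; // "next free" is one past the highest index used
}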
@@ -1727,7 +1864,11 @@ std::unique_ptr<AMDGPUOperand> AMDGPUAsmParser::parseRegister() {
if (!ParseAMDGPURegister(RegKind, Reg, RegNum, RegWidth, &DwordRegIndex)) {
return nullptr;
}
- KernelScope.usesRegister(RegKind, DwordRegIndex, RegWidth);
+ if (AMDGPU::IsaInfo::hasCodeObjectV3(&getSTI())) {
+ if (!updateGprCountSymbols(RegKind, DwordRegIndex, RegWidth))
+ return nullptr;
+ } else
+ KernelScope.usesRegister(RegKind, DwordRegIndex, RegWidth);
return AMDGPUOperand::CreateReg(this, Reg, StartLoc, EndLoc, false);
}
@@ -2234,6 +2375,111 @@ bool AMDGPUAsmParser::validateIntClampSupported(const MCInst &Inst) {
return true;
}
+bool AMDGPUAsmParser::validateMIMGDataSize(const MCInst &Inst) {
+
+ const unsigned Opc = Inst.getOpcode();
+ const MCInstrDesc &Desc = MII.get(Opc);
+
+ if ((Desc.TSFlags & SIInstrFlags::MIMG) == 0)
+ return true;
+
+ int VDataIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata);
+ int DMaskIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::dmask);
+ int TFEIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::tfe);
+
+ assert(VDataIdx != -1);
+ assert(DMaskIdx != -1);
+ assert(TFEIdx != -1);
+
+ unsigned VDataSize = AMDGPU::getRegOperandSize(getMRI(), Desc, VDataIdx);
+ unsigned TFESize = Inst.getOperand(TFEIdx).getImm()? 1 : 0;
+ unsigned DMask = Inst.getOperand(DMaskIdx).getImm() & 0xf;
+ if (DMask == 0)
+ DMask = 1;
+
+ unsigned DataSize =
+ (Desc.TSFlags & SIInstrFlags::Gather4) ? 4 : countPopulation(DMask);
+ if (hasPackedD16()) {
+ int D16Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::d16);
+ if (D16Idx >= 0 && Inst.getOperand(D16Idx).getImm())
+ DataSize = (DataSize + 1) / 2;
+ }
+
+ return (VDataSize / 4) == DataSize + TFESize;
+}
+
+bool AMDGPUAsmParser::validateMIMGAtomicDMask(const MCInst &Inst) {
+
+ const unsigned Opc = Inst.getOpcode();
+ const MCInstrDesc &Desc = MII.get(Opc);
+
+ if ((Desc.TSFlags & SIInstrFlags::MIMG) == 0)
+ return true;
+ if (!Desc.mayLoad() || !Desc.mayStore())
+ return true; // Not atomic
+
+ int DMaskIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::dmask);
+ unsigned DMask = Inst.getOperand(DMaskIdx).getImm() & 0xf;
+
+ // This is an incomplete check because image_atomic_cmpswap
+ // may only use 0x3 and 0xf while other atomic operations
+ // may use 0x1 and 0x3. However these limitations are
+ // verified when we check that dmask matches dst size.
+ return DMask == 0x1 || DMask == 0x3 || DMask == 0xf;
+}
+
+bool AMDGPUAsmParser::validateMIMGGatherDMask(const MCInst &Inst) {
+
+ const unsigned Opc = Inst.getOpcode();
+ const MCInstrDesc &Desc = MII.get(Opc);
+
+ if ((Desc.TSFlags & SIInstrFlags::Gather4) == 0)
+ return true;
+
+ int DMaskIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::dmask);
+ unsigned DMask = Inst.getOperand(DMaskIdx).getImm() & 0xf;
+
+ // GATHER4 instructions use dmask in a different fashion compared to
+ // other MIMG instructions. The only useful DMASK values are
+ // 1=red, 2=green, 4=blue, 8=alpha. (e.g. 1 returns
+ // (red,red,red,red) etc.) The ISA document doesn't mention
+ // this.
+ return DMask == 0x1 || DMask == 0x2 || DMask == 0x4 || DMask == 0x8;
+}
+
+bool AMDGPUAsmParser::validateMIMGR128(const MCInst &Inst) {
+
+ const unsigned Opc = Inst.getOpcode();
+ const MCInstrDesc &Desc = MII.get(Opc);
+
+ if ((Desc.TSFlags & SIInstrFlags::MIMG) == 0)
+ return true;
+
+ int Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::r128);
+ assert(Idx != -1);
+
+ bool R128 = (Inst.getOperand(Idx).getImm() != 0);
+
+ return !R128 || hasMIMG_R128();
+}
+
+bool AMDGPUAsmParser::validateMIMGD16(const MCInst &Inst) {
+
+ const unsigned Opc = Inst.getOpcode();
+ const MCInstrDesc &Desc = MII.get(Opc);
+
+ if ((Desc.TSFlags & SIInstrFlags::MIMG) == 0)
+ return true;
+
+ int D16Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::d16);
+ if (D16Idx >= 0 && Inst.getOperand(D16Idx).getImm()) {
+ if (isCI() || isSI())
+ return false;
+ }
+
+ return true;
+}
+
bool AMDGPUAsmParser::validateInstruction(const MCInst &Inst,
const SMLoc &IDLoc) {
if (!validateConstantBusLimitations(Inst)) {
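The MIMG validators added above all reduce to small arithmetic on the dmask: the expected vdata size is the number of enabled channels (always four for gather4), halved and rounded up when packed d16 is in use, plus one extra dword when tfe is set, and that count must equal the vdata register size in dwords. A standalone sketch of the count, with the popcount done via std::bitset; the flag names are illustrative:

#include <bitset>

// Expected vdata size in dwords for a MIMG instruction, following the
// dmask/tfe/d16 rules checked by the validator above.
static unsigned expectedVDataDwords(unsigned DMask, bool IsGather4,
                                    bool PackedD16, bool TFE) {
  DMask &= 0xf;
  if (DMask == 0)
    DMask = 1; // an empty dmask still returns one channel
  unsigned DataSize =
      IsGather4 ? 4 : static_cast<unsigned>(std::bitset<4>(DMask).count());
  if (PackedD16)
    DataSize = (DataSize + 1) / 2; // two 16-bit channels per dword
  return DataSize + (TFE ? 1 : 0);
}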
@@ -2251,6 +2497,32 @@ bool AMDGPUAsmParser::validateInstruction(const MCInst &Inst,
"integer clamping is not supported on this GPU");
return false;
}
+ if (!validateMIMGR128(Inst)) {
+ Error(IDLoc,
+ "r128 modifier is not supported on this GPU");
+ return false;
+ }
+ // For MUBUF/MTBUF d16 is a part of opcode, so there is nothing to validate.
+ if (!validateMIMGD16(Inst)) {
+ Error(IDLoc,
+ "d16 modifier is not supported on this GPU");
+ return false;
+ }
+ if (!validateMIMGDataSize(Inst)) {
+ Error(IDLoc,
+ "image data size does not match dmask and tfe");
+ return false;
+ }
+ if (!validateMIMGAtomicDMask(Inst)) {
+ Error(IDLoc,
+ "invalid atomic image dmask");
+ return false;
+ }
+ if (!validateMIMGGatherDMask(Inst)) {
+ Error(IDLoc,
+ "invalid image_gather dmask: only one bit must be set");
+ return false;
+ }
return true;
}
@@ -2355,6 +2627,320 @@ bool AMDGPUAsmParser::ParseDirectiveMajorMinor(uint32_t &Major,
return false;
}
+bool AMDGPUAsmParser::ParseDirectiveAMDGCNTarget() {
+ if (getSTI().getTargetTriple().getArch() != Triple::amdgcn)
+ return TokError("directive only supported for amdgcn architecture");
+
+ std::string Target;
+
+ SMLoc TargetStart = getTok().getLoc();
+ if (getParser().parseEscapedString(Target))
+ return true;
+ SMRange TargetRange = SMRange(TargetStart, getTok().getLoc());
+
+ std::string ExpectedTarget;
+ raw_string_ostream ExpectedTargetOS(ExpectedTarget);
+ IsaInfo::streamIsaVersion(&getSTI(), ExpectedTargetOS);
+
+ if (Target != ExpectedTargetOS.str())
+ return getParser().Error(TargetRange.Start, "target must match options",
+ TargetRange);
+
+ getTargetStreamer().EmitDirectiveAMDGCNTarget(Target);
+ return false;
+}
+
+bool AMDGPUAsmParser::OutOfRangeError(SMRange Range) {
+ return getParser().Error(Range.Start, "value out of range", Range);
+}
+
+bool AMDGPUAsmParser::calculateGPRBlocks(
+ const FeatureBitset &Features, bool VCCUsed, bool FlatScrUsed,
+ bool XNACKUsed, unsigned NextFreeVGPR, SMRange VGPRRange,
+ unsigned NextFreeSGPR, SMRange SGPRRange, unsigned &VGPRBlocks,
+ unsigned &SGPRBlocks) {
+ // TODO(scott.linder): These calculations are duplicated from
+ // AMDGPUAsmPrinter::getSIProgramInfo and could be unified.
+ IsaInfo::IsaVersion Version = IsaInfo::getIsaVersion(Features);
+
+ unsigned NumVGPRs = NextFreeVGPR;
+ unsigned NumSGPRs = NextFreeSGPR;
+ unsigned MaxAddressableNumSGPRs = IsaInfo::getAddressableNumSGPRs(Features);
+
+ if (Version.Major >= 8 && !Features.test(FeatureSGPRInitBug) &&
+ NumSGPRs > MaxAddressableNumSGPRs)
+ return OutOfRangeError(SGPRRange);
+
+ NumSGPRs +=
+ IsaInfo::getNumExtraSGPRs(Features, VCCUsed, FlatScrUsed, XNACKUsed);
+
+ if ((Version.Major <= 7 || Features.test(FeatureSGPRInitBug)) &&
+ NumSGPRs > MaxAddressableNumSGPRs)
+ return OutOfRangeError(SGPRRange);
+
+ if (Features.test(FeatureSGPRInitBug))
+ NumSGPRs = IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;
+
+ VGPRBlocks = IsaInfo::getNumVGPRBlocks(Features, NumVGPRs);
+ SGPRBlocks = IsaInfo::getNumSGPRBlocks(Features, NumSGPRs);
+
+ return false;
+}
+
+bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() {
+ if (getSTI().getTargetTriple().getArch() != Triple::amdgcn)
+ return TokError("directive only supported for amdgcn architecture");
+
+ if (getSTI().getTargetTriple().getOS() != Triple::AMDHSA)
+ return TokError("directive only supported for amdhsa OS");
+
+ StringRef KernelName;
+ if (getParser().parseIdentifier(KernelName))
+ return true;
+
+ kernel_descriptor_t KD = getDefaultAmdhsaKernelDescriptor();
+
+ StringSet<> Seen;
+
+ IsaInfo::IsaVersion IVersion =
+ IsaInfo::getIsaVersion(getSTI().getFeatureBits());
+
+ SMRange VGPRRange;
+ uint64_t NextFreeVGPR = 0;
+ SMRange SGPRRange;
+ uint64_t NextFreeSGPR = 0;
+ unsigned UserSGPRCount = 0;
+ bool ReserveVCC = true;
+ bool ReserveFlatScr = true;
+ bool ReserveXNACK = hasXNACK();
+
+ while (true) {
+ while (getLexer().is(AsmToken::EndOfStatement))
+ Lex();
+
+ if (getLexer().isNot(AsmToken::Identifier))
+ return TokError("expected .amdhsa_ directive or .end_amdhsa_kernel");
+
+ StringRef ID = getTok().getIdentifier();
+ SMRange IDRange = getTok().getLocRange();
+ Lex();
+
+ if (ID == ".end_amdhsa_kernel")
+ break;
+
+ if (Seen.find(ID) != Seen.end())
+ return TokError(".amdhsa_ directives cannot be repeated");
+ Seen.insert(ID);
+
+ SMLoc ValStart = getTok().getLoc();
+ int64_t IVal;
+ if (getParser().parseAbsoluteExpression(IVal))
+ return true;
+ SMLoc ValEnd = getTok().getLoc();
+ SMRange ValRange = SMRange(ValStart, ValEnd);
+
+ if (IVal < 0)
+ return OutOfRangeError(ValRange);
+
+ uint64_t Val = IVal;
+
+#define PARSE_BITS_ENTRY(FIELD, ENTRY, VALUE, RANGE) \
+ if (!isUInt<ENTRY##_WIDTH>(VALUE)) \
+ return OutOfRangeError(RANGE); \
+ AMDHSA_BITS_SET(FIELD, ENTRY, VALUE);
+
+ if (ID == ".amdhsa_group_segment_fixed_size") {
+ if (!isUInt<sizeof(KD.group_segment_fixed_size) * CHAR_BIT>(Val))
+ return OutOfRangeError(ValRange);
+ KD.group_segment_fixed_size = Val;
+ } else if (ID == ".amdhsa_private_segment_fixed_size") {
+ if (!isUInt<sizeof(KD.private_segment_fixed_size) * CHAR_BIT>(Val))
+ return OutOfRangeError(ValRange);
+ KD.private_segment_fixed_size = Val;
+ } else if (ID == ".amdhsa_user_sgpr_private_segment_buffer") {
+ PARSE_BITS_ENTRY(KD.kernel_code_properties,
+ KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER,
+ Val, ValRange);
+ UserSGPRCount++;
+ } else if (ID == ".amdhsa_user_sgpr_dispatch_ptr") {
+ PARSE_BITS_ENTRY(KD.kernel_code_properties,
+ KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR, Val,
+ ValRange);
+ UserSGPRCount++;
+ } else if (ID == ".amdhsa_user_sgpr_queue_ptr") {
+ PARSE_BITS_ENTRY(KD.kernel_code_properties,
+ KERNEL_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR, Val,
+ ValRange);
+ UserSGPRCount++;
+ } else if (ID == ".amdhsa_user_sgpr_kernarg_segment_ptr") {
+ PARSE_BITS_ENTRY(KD.kernel_code_properties,
+ KERNEL_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR,
+ Val, ValRange);
+ UserSGPRCount++;
+ } else if (ID == ".amdhsa_user_sgpr_dispatch_id") {
+ PARSE_BITS_ENTRY(KD.kernel_code_properties,
+ KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID, Val,
+ ValRange);
+ UserSGPRCount++;
+ } else if (ID == ".amdhsa_user_sgpr_flat_scratch_init") {
+ PARSE_BITS_ENTRY(KD.kernel_code_properties,
+ KERNEL_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT, Val,
+ ValRange);
+ UserSGPRCount++;
+ } else if (ID == ".amdhsa_user_sgpr_private_segment_size") {
+ PARSE_BITS_ENTRY(KD.kernel_code_properties,
+ KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE,
+ Val, ValRange);
+ UserSGPRCount++;
+ } else if (ID == ".amdhsa_system_sgpr_private_segment_wavefront_offset") {
+ PARSE_BITS_ENTRY(
+ KD.compute_pgm_rsrc2,
+ COMPUTE_PGM_RSRC2_ENABLE_SGPR_PRIVATE_SEGMENT_WAVEFRONT_OFFSET, Val,
+ ValRange);
+ } else if (ID == ".amdhsa_system_sgpr_workgroup_id_x") {
+ PARSE_BITS_ENTRY(KD.compute_pgm_rsrc2,
+ COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_ID_X, Val,
+ ValRange);
+ } else if (ID == ".amdhsa_system_sgpr_workgroup_id_y") {
+ PARSE_BITS_ENTRY(KD.compute_pgm_rsrc2,
+ COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_ID_Y, Val,
+ ValRange);
+ } else if (ID == ".amdhsa_system_sgpr_workgroup_id_z") {
+ PARSE_BITS_ENTRY(KD.compute_pgm_rsrc2,
+ COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_ID_Z, Val,
+ ValRange);
+ } else if (ID == ".amdhsa_system_sgpr_workgroup_info") {
+ PARSE_BITS_ENTRY(KD.compute_pgm_rsrc2,
+ COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_INFO, Val,
+ ValRange);
+ } else if (ID == ".amdhsa_system_vgpr_workitem_id") {
+ PARSE_BITS_ENTRY(KD.compute_pgm_rsrc2,
+ COMPUTE_PGM_RSRC2_ENABLE_VGPR_WORKITEM_ID, Val,
+ ValRange);
+ } else if (ID == ".amdhsa_next_free_vgpr") {
+ VGPRRange = ValRange;
+ NextFreeVGPR = Val;
+ } else if (ID == ".amdhsa_next_free_sgpr") {
+ SGPRRange = ValRange;
+ NextFreeSGPR = Val;
+ } else if (ID == ".amdhsa_reserve_vcc") {
+ if (!isUInt<1>(Val))
+ return OutOfRangeError(ValRange);
+ ReserveVCC = Val;
+ } else if (ID == ".amdhsa_reserve_flat_scratch") {
+ if (IVersion.Major < 7)
+ return getParser().Error(IDRange.Start, "directive requires gfx7+",
+ IDRange);
+ if (!isUInt<1>(Val))
+ return OutOfRangeError(ValRange);
+ ReserveFlatScr = Val;
+ } else if (ID == ".amdhsa_reserve_xnack_mask") {
+ if (IVersion.Major < 8)
+ return getParser().Error(IDRange.Start, "directive requires gfx8+",
+ IDRange);
+ if (!isUInt<1>(Val))
+ return OutOfRangeError(ValRange);
+ ReserveXNACK = Val;
+ } else if (ID == ".amdhsa_float_round_mode_32") {
+ PARSE_BITS_ENTRY(KD.compute_pgm_rsrc1,
+ COMPUTE_PGM_RSRC1_FLOAT_ROUND_MODE_32, Val, ValRange);
+ } else if (ID == ".amdhsa_float_round_mode_16_64") {
+ PARSE_BITS_ENTRY(KD.compute_pgm_rsrc1,
+ COMPUTE_PGM_RSRC1_FLOAT_ROUND_MODE_16_64, Val, ValRange);
+ } else if (ID == ".amdhsa_float_denorm_mode_32") {
+ PARSE_BITS_ENTRY(KD.compute_pgm_rsrc1,
+ COMPUTE_PGM_RSRC1_FLOAT_DENORM_MODE_32, Val, ValRange);
+ } else if (ID == ".amdhsa_float_denorm_mode_16_64") {
+ PARSE_BITS_ENTRY(KD.compute_pgm_rsrc1,
+ COMPUTE_PGM_RSRC1_FLOAT_DENORM_MODE_16_64, Val,
+ ValRange);
+ } else if (ID == ".amdhsa_dx10_clamp") {
+ PARSE_BITS_ENTRY(KD.compute_pgm_rsrc1,
+ COMPUTE_PGM_RSRC1_ENABLE_DX10_CLAMP, Val, ValRange);
+ } else if (ID == ".amdhsa_ieee_mode") {
+ PARSE_BITS_ENTRY(KD.compute_pgm_rsrc1, COMPUTE_PGM_RSRC1_ENABLE_IEEE_MODE,
+ Val, ValRange);
+ } else if (ID == ".amdhsa_fp16_overflow") {
+ if (IVersion.Major < 9)
+ return getParser().Error(IDRange.Start, "directive requires gfx9+",
+ IDRange);
+ PARSE_BITS_ENTRY(KD.compute_pgm_rsrc1, COMPUTE_PGM_RSRC1_FP16_OVFL, Val,
+ ValRange);
+ } else if (ID == ".amdhsa_exception_fp_ieee_invalid_op") {
+ PARSE_BITS_ENTRY(
+ KD.compute_pgm_rsrc2,
+ COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_IEEE_754_FP_INVALID_OPERATION, Val,
+ ValRange);
+ } else if (ID == ".amdhsa_exception_fp_denorm_src") {
+ PARSE_BITS_ENTRY(KD.compute_pgm_rsrc2,
+ COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_FP_DENORMAL_SOURCE,
+ Val, ValRange);
+ } else if (ID == ".amdhsa_exception_fp_ieee_div_zero") {
+ PARSE_BITS_ENTRY(
+ KD.compute_pgm_rsrc2,
+ COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_IEEE_754_FP_DIVISION_BY_ZERO, Val,
+ ValRange);
+ } else if (ID == ".amdhsa_exception_fp_ieee_overflow") {
+ PARSE_BITS_ENTRY(KD.compute_pgm_rsrc2,
+ COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_IEEE_754_FP_OVERFLOW,
+ Val, ValRange);
+ } else if (ID == ".amdhsa_exception_fp_ieee_underflow") {
+ PARSE_BITS_ENTRY(KD.compute_pgm_rsrc2,
+ COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_IEEE_754_FP_UNDERFLOW,
+ Val, ValRange);
+ } else if (ID == ".amdhsa_exception_fp_ieee_inexact") {
+ PARSE_BITS_ENTRY(KD.compute_pgm_rsrc2,
+ COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_IEEE_754_FP_INEXACT,
+ Val, ValRange);
+ } else if (ID == ".amdhsa_exception_int_div_zero") {
+ PARSE_BITS_ENTRY(KD.compute_pgm_rsrc2,
+ COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_INT_DIVIDE_BY_ZERO,
+ Val, ValRange);
+ } else {
+ return getParser().Error(IDRange.Start,
+ "unknown .amdhsa_kernel directive", IDRange);
+ }
+
+#undef PARSE_BITS_ENTRY
+ }
+
+ if (Seen.find(".amdhsa_next_free_vgpr") == Seen.end())
+ return TokError(".amdhsa_next_free_vgpr directive is required");
+
+ if (Seen.find(".amdhsa_next_free_sgpr") == Seen.end())
+ return TokError(".amdhsa_next_free_sgpr directive is required");
+
+ unsigned VGPRBlocks;
+ unsigned SGPRBlocks;
+ if (calculateGPRBlocks(getFeatureBits(), ReserveVCC, ReserveFlatScr,
+ ReserveXNACK, NextFreeVGPR, VGPRRange, NextFreeSGPR,
+ SGPRRange, VGPRBlocks, SGPRBlocks))
+ return true;
+
+ if (!isUInt<COMPUTE_PGM_RSRC1_GRANULATED_WORKITEM_VGPR_COUNT_WIDTH>(
+ VGPRBlocks))
+ return OutOfRangeError(VGPRRange);
+ AMDHSA_BITS_SET(KD.compute_pgm_rsrc1,
+ COMPUTE_PGM_RSRC1_GRANULATED_WORKITEM_VGPR_COUNT, VGPRBlocks);
+
+ if (!isUInt<COMPUTE_PGM_RSRC1_GRANULATED_WAVEFRONT_SGPR_COUNT_WIDTH>(
+ SGPRBlocks))
+ return OutOfRangeError(SGPRRange);
+ AMDHSA_BITS_SET(KD.compute_pgm_rsrc1,
+ COMPUTE_PGM_RSRC1_GRANULATED_WAVEFRONT_SGPR_COUNT,
+ SGPRBlocks);
+
+ if (!isUInt<COMPUTE_PGM_RSRC2_USER_SGPR_COUNT_WIDTH>(UserSGPRCount))
+ return TokError("too many user SGPRs enabled");
+ AMDHSA_BITS_SET(KD.compute_pgm_rsrc2, COMPUTE_PGM_RSRC2_USER_SGPR_COUNT,
+ UserSGPRCount);
+
+ getTargetStreamer().EmitAmdhsaKernelDescriptor(
+ getSTI(), KernelName, KD, NextFreeVGPR, NextFreeSGPR, ReserveVCC,
+ ReserveFlatScr, ReserveXNACK);
+ return false;
+}
+
bool AMDGPUAsmParser::ParseDirectiveHSACodeObjectVersion() {
uint32_t Major;
uint32_t Minor;
@@ -2421,6 +3007,13 @@ bool AMDGPUAsmParser::ParseDirectiveHSACodeObjectISA() {
bool AMDGPUAsmParser::ParseAMDKernelCodeTValue(StringRef ID,
amd_kernel_code_t &Header) {
+ // max_scratch_backing_memory_byte_size is deprecated. Ignore it while parsing
+ // assembly for backwards compatibility.
+ if (ID == "max_scratch_backing_memory_byte_size") {
+ Parser.eatToEndOfStatement();
+ return false;
+ }
+
SmallString<40> ErrStr;
raw_svector_ostream Err(ErrStr);
if (!parseAmdKernelCodeField(ID, getParser(), Header, Err)) {
@@ -2467,7 +3060,8 @@ bool AMDGPUAsmParser::ParseDirectiveAMDGPUHsaKernel() {
getTargetStreamer().EmitAMDGPUSymbolType(KernelName,
ELF::STT_AMDGPU_HSA_KERNEL);
Lex();
- KernelScope.initialize(getContext());
+ if (!AMDGPU::IsaInfo::hasCodeObjectV3(&getSTI()))
+ KernelScope.initialize(getContext());
return false;
}
@@ -2571,20 +3165,28 @@ bool AMDGPUAsmParser::ParseDirectivePALMetadata() {
bool AMDGPUAsmParser::ParseDirective(AsmToken DirectiveID) {
StringRef IDVal = DirectiveID.getString();
- if (IDVal == ".hsa_code_object_version")
- return ParseDirectiveHSACodeObjectVersion();
+ if (AMDGPU::IsaInfo::hasCodeObjectV3(&getSTI())) {
+ if (IDVal == ".amdgcn_target")
+ return ParseDirectiveAMDGCNTarget();
+
+ if (IDVal == ".amdhsa_kernel")
+ return ParseDirectiveAMDHSAKernel();
+ } else {
+ if (IDVal == ".hsa_code_object_version")
+ return ParseDirectiveHSACodeObjectVersion();
- if (IDVal == ".hsa_code_object_isa")
- return ParseDirectiveHSACodeObjectISA();
+ if (IDVal == ".hsa_code_object_isa")
+ return ParseDirectiveHSACodeObjectISA();
- if (IDVal == ".amd_kernel_code_t")
- return ParseDirectiveAMDKernelCodeT();
+ if (IDVal == ".amd_kernel_code_t")
+ return ParseDirectiveAMDKernelCodeT();
- if (IDVal == ".amdgpu_hsa_kernel")
- return ParseDirectiveAMDGPUHsaKernel();
+ if (IDVal == ".amdgpu_hsa_kernel")
+ return ParseDirectiveAMDGPUHsaKernel();
- if (IDVal == ".amd_amdgpu_isa")
- return ParseDirectiveISAVersion();
+ if (IDVal == ".amd_amdgpu_isa")
+ return ParseDirectiveISAVersion();
+ }
if (IDVal == AMDGPU::HSAMD::AssemblerDirectiveBegin)
return ParseDirectiveHSAMetadata();
@@ -2612,6 +3214,10 @@ bool AMDGPUAsmParser::subtargetHasRegister(const MCRegisterInfo &MRI,
case AMDGPU::TMA_LO:
case AMDGPU::TMA_HI:
return !isGFX9();
+ case AMDGPU::XNACK_MASK:
+ case AMDGPU::XNACK_MASK_LO:
+ case AMDGPU::XNACK_MASK_HI:
+ return !isCI() && !isSI() && hasXNACK();
default:
break;
}
@@ -3158,7 +3764,10 @@ bool AMDGPUAsmParser::parseHwregConstruct(OperandInfoTy &HwReg, int64_t &Offset,
HwReg.IsSymbolic = true;
HwReg.Id = ID_UNKNOWN_;
const StringRef tok = Parser.getTok().getString();
- for (int i = ID_SYMBOLIC_FIRST_; i < ID_SYMBOLIC_LAST_; ++i) {
+ int Last = ID_SYMBOLIC_LAST_;
+ if (isSI() || isCI() || isVI())
+ Last = ID_SYMBOLIC_FIRST_GFX9_;
+ for (int i = ID_SYMBOLIC_FIRST_; i < Last; ++i) {
if (tok == IdSymbolic[i]) {
HwReg.Id = i;
break;
@@ -3859,7 +4468,7 @@ AMDGPUAsmParser::parseSwizzleOp(OperandVector &Operands) {
} else {
// Swizzle "offset" operand is optional.
// If it is omitted, try parsing other optional operands.
- return parseOptionalOperand(Operands);
+ return parseOptionalOpr(Operands);
}
}
@@ -3907,13 +4516,13 @@ AMDGPUOperand::Ptr AMDGPUAsmParser::defaultSLC() const {
return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTySLC);
}
-AMDGPUOperand::Ptr AMDGPUAsmParser::defaultTFE() const {
- return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTyTFE);
-}
-
void AMDGPUAsmParser::cvtMubufImpl(MCInst &Inst,
const OperandVector &Operands,
- bool IsAtomic, bool IsAtomicReturn) {
+ bool IsAtomic,
+ bool IsAtomicReturn,
+ bool IsLds) {
+ bool IsLdsOpcode = IsLds;
+ bool HasLdsModifier = false;
OptionalImmIndexMap OptionalIdx;
assert(IsAtomicReturn ? IsAtomic : true);
@@ -3932,6 +4541,8 @@ void AMDGPUAsmParser::cvtMubufImpl(MCInst &Inst,
continue;
}
+ HasLdsModifier = Op.isLDS();
+
// Handle tokens like 'offen' which are sometimes hard-coded into the
// asm string. There are no MCInst operands for these.
if (Op.isToken()) {
@@ -3943,6 +4554,21 @@ void AMDGPUAsmParser::cvtMubufImpl(MCInst &Inst,
OptionalIdx[Op.getImmTy()] = i;
}
+ // This is a workaround for an LLVM quirk that may result in incorrect
+ // instruction selection. The lds and non-lds versions of MUBUF
+ // instructions are identical except that lds versions have a mandatory
+ // 'lds' modifier. However, this modifier follows the optional modifiers,
+ // and the LLVM asm matcher treats it as optional. As a result, the lds
+ // version of an opcode may be selected even though no 'lds' modifier is
+ // present.
+ if (IsLdsOpcode && !HasLdsModifier) {
+ int NoLdsOpcode = AMDGPU::getMUBUFNoLdsInst(Inst.getOpcode());
+ if (NoLdsOpcode != -1) { // Got lds version - correct it.
+ Inst.setOpcode(NoLdsOpcode);
+ IsLdsOpcode = false;
+ }
+ }
+
// Copy $vdata_in operand and insert as $vdata for MUBUF_Atomic RTN insns.
if (IsAtomicReturn) {
MCInst::iterator I = Inst.begin(); // $vdata_in is always at the beginning.
@@ -3954,7 +4580,10 @@ void AMDGPUAsmParser::cvtMubufImpl(MCInst &Inst,
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyGLC);
}
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySLC);
- addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyTFE);
+
+ if (!IsLdsOpcode) { // tfe is not legal with lds opcodes
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyTFE);
+ }
}
void AMDGPUAsmParser::cvtMtbuf(MCInst &Inst, const OperandVector &Operands) {
@@ -4009,7 +4638,8 @@ void AMDGPUAsmParser::cvtMIMG(MCInst &Inst, const OperandVector &Operands,
if (IsAtomic) {
// Add src, same as dst
- ((AMDGPUOperand &)*Operands[I]).addRegOperands(Inst, 1);
+ assert(Desc.getNumDefs() == 1);
+ ((AMDGPUOperand &)*Operands[I - 1]).addRegOperands(Inst, 1);
}
OptionalImmIndexMap OptionalIdx;
@@ -4018,9 +4648,8 @@ void AMDGPUAsmParser::cvtMIMG(MCInst &Inst, const OperandVector &Operands,
AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[I]);
// Add the register arguments
- if (Op.isRegOrImm()) {
- Op.addRegOrImmOperands(Inst, 1);
- continue;
+ if (Op.isReg()) {
+ Op.addRegOperands(Inst, 1);
} else if (Op.isImmModifier()) {
OptionalIdx[Op.getImmTy()] = I;
} else {
@@ -4031,37 +4660,18 @@ void AMDGPUAsmParser::cvtMIMG(MCInst &Inst, const OperandVector &Operands,
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDMask);
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyUNorm);
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyGLC);
- addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDA);
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySLC);
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyR128);
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyTFE);
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyLWE);
- addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySLC);
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDA);
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyD16);
}
void AMDGPUAsmParser::cvtMIMGAtomic(MCInst &Inst, const OperandVector &Operands) {
cvtMIMG(Inst, Operands, true);
}
-AMDGPUOperand::Ptr AMDGPUAsmParser::defaultDMask() const {
- return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTyDMask);
-}
-
-AMDGPUOperand::Ptr AMDGPUAsmParser::defaultUNorm() const {
- return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTyUNorm);
-}
-
-AMDGPUOperand::Ptr AMDGPUAsmParser::defaultDA() const {
- return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTyDA);
-}
-
-AMDGPUOperand::Ptr AMDGPUAsmParser::defaultR128() const {
- return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTyR128);
-}
-
-AMDGPUOperand::Ptr AMDGPUAsmParser::defaultLWE() const {
- return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTyLWE);
-}
-
//===----------------------------------------------------------------------===//
// smrd
//===----------------------------------------------------------------------===//
@@ -4148,6 +4758,7 @@ static const OptionalOperand AMDGPUOptionalOperandTable[] = {
{"offset0", AMDGPUOperand::ImmTyOffset0, false, nullptr},
{"offset1", AMDGPUOperand::ImmTyOffset1, false, nullptr},
{"gds", AMDGPUOperand::ImmTyGDS, true, nullptr},
+ {"lds", AMDGPUOperand::ImmTyLDS, true, nullptr},
{"offset", AMDGPUOperand::ImmTyOffset, false, nullptr},
{"inst_offset", AMDGPUOperand::ImmTyInstOffset, false, nullptr},
{"dfmt", AMDGPUOperand::ImmTyDFMT, false, nullptr},
@@ -4155,6 +4766,7 @@ static const OptionalOperand AMDGPUOptionalOperandTable[] = {
{"glc", AMDGPUOperand::ImmTyGLC, true, nullptr},
{"slc", AMDGPUOperand::ImmTySLC, true, nullptr},
{"tfe", AMDGPUOperand::ImmTyTFE, true, nullptr},
+ {"d16", AMDGPUOperand::ImmTyD16, true, nullptr},
{"high", AMDGPUOperand::ImmTyHigh, true, nullptr},
{"clamp", AMDGPUOperand::ImmTyClampSI, true, nullptr},
{"omod", AMDGPUOperand::ImmTyOModSI, false, ConvertOmodMul},
@@ -4162,6 +4774,7 @@ static const OptionalOperand AMDGPUOptionalOperandTable[] = {
{"da", AMDGPUOperand::ImmTyDA, true, nullptr},
{"r128", AMDGPUOperand::ImmTyR128, true, nullptr},
{"lwe", AMDGPUOperand::ImmTyLWE, true, nullptr},
+ {"d16", AMDGPUOperand::ImmTyD16, true, nullptr},
{"dmask", AMDGPUOperand::ImmTyDMask, false, nullptr},
{"row_mask", AMDGPUOperand::ImmTyDppRowMask, false, nullptr},
{"bank_mask", AMDGPUOperand::ImmTyDppBankMask, false, nullptr},
@@ -4179,6 +4792,39 @@ static const OptionalOperand AMDGPUOptionalOperandTable[] = {
};
OperandMatchResultTy AMDGPUAsmParser::parseOptionalOperand(OperandVector &Operands) {
+ unsigned size = Operands.size();
+ assert(size > 0);
+
+ OperandMatchResultTy res = parseOptionalOpr(Operands);
+
+ // This is a hack to enable hardcoded mandatory operands that follow
+ // optional operands.
+ //
+ // The current design assumes that all operands after the first optional
+ // operand are also optional. However, the implementation of some
+ // instructions violates this rule (see e.g. flat/global atomics, which
+ // have hardcoded 'glc' operands).
+ //
+ // To alleviate this problem, we have to (implicitly) parse extra operands
+ // to make sure the autogenerated parser of custom operands never hits the
+ // hardcoded mandatory operands.
+
+ if (size == 1 || ((AMDGPUOperand &)*Operands[size - 1]).isRegKind()) {
+
+ // We have parsed the first optional operand.
+ // Parse as many operands as necessary to skip all mandatory operands.
+
+ for (unsigned i = 0; i < MAX_OPR_LOOKAHEAD; ++i) {
+ if (res != MatchOperand_Success ||
+ getLexer().is(AsmToken::EndOfStatement)) break;
+ if (getLexer().is(AsmToken::Comma)) Parser.Lex();
+ res = parseOptionalOpr(Operands);
+ }
+ }
+
+ return res;
+}
+
+OperandMatchResultTy AMDGPUAsmParser::parseOptionalOpr(OperandVector &Operands) {
OperandMatchResultTy res;
for (const OptionalOperand &Op : AMDGPUOptionalOperandTable) {
// try to parse any optional operand here
@@ -4341,12 +4987,14 @@ void AMDGPUAsmParser::cvtVOP3(MCInst &Inst, const OperandVector &Operands,
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyOModSI);
}
- // special case v_mac_{f16, f32}:
+ // Special case v_mac_{f16, f32} and v_fmac_f32 (gfx906):
// it has src2 register operand that is tied to dst operand
// we don't allow modifiers for this operand in assembler so src2_modifiers
- // should be 0
- if (Opc == AMDGPU::V_MAC_F32_e64_si || Opc == AMDGPU::V_MAC_F32_e64_vi ||
- Opc == AMDGPU::V_MAC_F16_e64_vi) {
+ // should be 0.
+ if (Opc == AMDGPU::V_MAC_F32_e64_si ||
+ Opc == AMDGPU::V_MAC_F32_e64_vi ||
+ Opc == AMDGPU::V_MAC_F16_e64_vi ||
+ Opc == AMDGPU::V_FMAC_F32_e64_vi) {
auto it = Inst.begin();
std::advance(it, AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2_modifiers));
it = Inst.insert(it, MCOperand::createImm(0)); // no modifiers for src2
@@ -4448,21 +5096,23 @@ void AMDGPUAsmParser::cvtVOP3P(MCInst &Inst,
//===----------------------------------------------------------------------===//
bool AMDGPUOperand::isDPPCtrl() const {
+ using namespace AMDGPU::DPP;
+
bool result = isImm() && getImmTy() == ImmTyDppCtrl && isUInt<9>(getImm());
if (result) {
int64_t Imm = getImm();
- return ((Imm >= 0x000) && (Imm <= 0x0ff)) ||
- ((Imm >= 0x101) && (Imm <= 0x10f)) ||
- ((Imm >= 0x111) && (Imm <= 0x11f)) ||
- ((Imm >= 0x121) && (Imm <= 0x12f)) ||
- (Imm == 0x130) ||
- (Imm == 0x134) ||
- (Imm == 0x138) ||
- (Imm == 0x13c) ||
- (Imm == 0x140) ||
- (Imm == 0x141) ||
- (Imm == 0x142) ||
- (Imm == 0x143);
+ return (Imm >= DppCtrl::QUAD_PERM_FIRST && Imm <= DppCtrl::QUAD_PERM_LAST) ||
+ (Imm >= DppCtrl::ROW_SHL_FIRST && Imm <= DppCtrl::ROW_SHL_LAST) ||
+ (Imm >= DppCtrl::ROW_SHR_FIRST && Imm <= DppCtrl::ROW_SHR_LAST) ||
+ (Imm >= DppCtrl::ROW_ROR_FIRST && Imm <= DppCtrl::ROW_ROR_LAST) ||
+ (Imm == DppCtrl::WAVE_SHL1) ||
+ (Imm == DppCtrl::WAVE_ROL1) ||
+ (Imm == DppCtrl::WAVE_SHR1) ||
+ (Imm == DppCtrl::WAVE_ROR1) ||
+ (Imm == DppCtrl::ROW_MIRROR) ||
+ (Imm == DppCtrl::ROW_HALF_MIRROR) ||
+ (Imm == DppCtrl::BCAST15) ||
+ (Imm == DppCtrl::BCAST31);
}
return false;
}
@@ -4481,6 +5131,8 @@ bool AMDGPUOperand::isU16Imm() const {
OperandMatchResultTy
AMDGPUAsmParser::parseDPPCtrl(OperandVector &Operands) {
+ using namespace AMDGPU::DPP;
+
SMLoc S = Parser.getTok().getLoc();
StringRef Prefix;
int64_t Int;
@@ -4492,10 +5144,10 @@ AMDGPUAsmParser::parseDPPCtrl(OperandVector &Operands) {
}
if (Prefix == "row_mirror") {
- Int = 0x140;
+ Int = DppCtrl::ROW_MIRROR;
Parser.Lex();
} else if (Prefix == "row_half_mirror") {
- Int = 0x141;
+ Int = DppCtrl::ROW_HALF_MIRROR;
Parser.Lex();
} else {
// Check to prevent parseDPPCtrlOps from eating invalid tokens
@@ -4547,24 +5199,24 @@ AMDGPUAsmParser::parseDPPCtrl(OperandVector &Operands) {
return MatchOperand_ParseFail;
if (Prefix == "row_shl" && 1 <= Int && Int <= 15) {
- Int |= 0x100;
+ Int |= DppCtrl::ROW_SHL0;
} else if (Prefix == "row_shr" && 1 <= Int && Int <= 15) {
- Int |= 0x110;
+ Int |= DppCtrl::ROW_SHR0;
} else if (Prefix == "row_ror" && 1 <= Int && Int <= 15) {
- Int |= 0x120;
+ Int |= DppCtrl::ROW_ROR0;
} else if (Prefix == "wave_shl" && 1 == Int) {
- Int = 0x130;
+ Int = DppCtrl::WAVE_SHL1;
} else if (Prefix == "wave_rol" && 1 == Int) {
- Int = 0x134;
+ Int = DppCtrl::WAVE_ROL1;
} else if (Prefix == "wave_shr" && 1 == Int) {
- Int = 0x138;
+ Int = DppCtrl::WAVE_SHR1;
} else if (Prefix == "wave_ror" && 1 == Int) {
- Int = 0x13C;
+ Int = DppCtrl::WAVE_ROR1;
} else if (Prefix == "row_bcast") {
if (Int == 15) {
- Int = 0x142;
+ Int = DppCtrl::BCAST15;
} else if (Int == 31) {
- Int = 0x143;
+ Int = DppCtrl::BCAST31;
} else {
return MatchOperand_ParseFail;
}
@@ -4742,7 +5394,7 @@ void AMDGPUAsmParser::cvtSDWA(MCInst &Inst, const OperandVector &Operands,
}
}
if (isRegOrImmWithInputMods(Desc, Inst.getNumOperands())) {
- Op.addRegWithInputModsOperands(Inst, 2);
+ Op.addRegOrImmWithInputModsOperands(Inst, 2);
} else if (Op.isImm()) {
// Handle optional arguments
OptionalIdx[Op.getImmTy()] = I;
@@ -4824,6 +5476,8 @@ unsigned AMDGPUAsmParser::validateTargetOperandClass(MCParsedAsmOperand &Op,
return Operand.isAddr64() ? Match_Success : Match_InvalidOperand;
case MCK_gds:
return Operand.isGDS() ? Match_Success : Match_InvalidOperand;
+ case MCK_lds:
+ return Operand.isLDS() ? Match_Success : Match_InvalidOperand;
case MCK_glc:
return Operand.isGLC() ? Match_Success : Match_InvalidOperand;
case MCK_idxen:
diff --git a/lib/Target/AMDGPU/BUFInstructions.td b/lib/Target/AMDGPU/BUFInstructions.td
index 2230457b3a9b..b87c47a6b9ee 100644
--- a/lib/Target/AMDGPU/BUFInstructions.td
+++ b/lib/Target/AMDGPU/BUFInstructions.td
@@ -52,14 +52,19 @@ class getAddrName<int addrKind> {
"")))));
}
-class MUBUFAddr64Table <bit is_addr64, string suffix = ""> {
+class MUBUFAddr64Table <bit is_addr64, string Name> {
bit IsAddr64 = is_addr64;
- string OpName = NAME # suffix;
+ string OpName = Name;
}
-class MTBUFAddr64Table <bit is_addr64, string suffix = ""> {
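+// Tags a real MUBUF encoding with an lds flag and an opcode name shared by
+// the lds and non-lds forms of the same instruction.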
+class MUBUFLdsTable <bit is_lds, string Name> {
+ bit IsLds = is_lds;
+ string OpName = Name;
+}
+
+class MTBUFAddr64Table <bit is_addr64, string Name> {
bit IsAddr64 = is_addr64;
- string OpName = NAME # suffix;
+ string OpName = Name;
}
//===----------------------------------------------------------------------===//
@@ -137,17 +142,17 @@ class getMTBUFInsDA<list<RegisterClass> vdataList,
RegisterClass vaddrClass = !if(!empty(vaddrList), ?, !head(vaddrList));
dag InsNoData = !if(!empty(vaddrList),
(ins SReg_128:$srsrc, SCSrc_b32:$soffset,
- offset:$offset, DFMT:$dfmt, NFMT:$nfmt, GLC:$glc, slc:$slc, tfe:$tfe),
+ offset:$offset, DFMT:$dfmt, NFMT:$nfmt, GLC:$glc, SLC:$slc, TFE:$tfe),
(ins vaddrClass:$vaddr, SReg_128:$srsrc, SCSrc_b32:$soffset,
- offset:$offset, DFMT:$dfmt, NFMT:$nfmt, GLC:$glc, slc:$slc, tfe:$tfe)
+ offset:$offset, DFMT:$dfmt, NFMT:$nfmt, GLC:$glc, SLC:$slc, TFE:$tfe)
);
dag InsData = !if(!empty(vaddrList),
(ins vdataClass:$vdata, SReg_128:$srsrc,
SCSrc_b32:$soffset, offset:$offset, DFMT:$dfmt, NFMT:$nfmt, GLC:$glc,
- slc:$slc, tfe:$tfe),
+ SLC:$slc, TFE:$tfe),
(ins vdataClass:$vdata, vaddrClass:$vaddr, SReg_128:$srsrc,
SCSrc_b32:$soffset, offset:$offset, DFMT:$dfmt, NFMT:$nfmt, GLC:$glc,
- slc:$slc, tfe:$tfe)
+ SLC:$slc, TFE:$tfe)
);
dag ret = !if(!empty(vdataList), InsNoData, InsData);
}
@@ -214,13 +219,13 @@ multiclass MTBUF_Pseudo_Loads<string opName, RegisterClass vdataClass,
[(set load_vt:$vdata,
(ld (MUBUFOffset v4i32:$srsrc, i32:$soffset, i16:$offset, i8:$dfmt,
i8:$nfmt, i1:$glc, i1:$slc, i1:$tfe)))]>,
- MTBUFAddr64Table<0>;
+ MTBUFAddr64Table<0, NAME>;
def _ADDR64 : MTBUF_Load_Pseudo <opName, BUFAddrKind.Addr64, vdataClass,
[(set load_vt:$vdata,
(ld (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, i16:$offset,
i8:$dfmt, i8:$nfmt, i1:$glc, i1:$slc, i1:$tfe)))]>,
- MTBUFAddr64Table<1>;
+ MTBUFAddr64Table<1, NAME>;
def _OFFEN : MTBUF_Load_Pseudo <opName, BUFAddrKind.OffEn, vdataClass>;
def _IDXEN : MTBUF_Load_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass>;
@@ -260,13 +265,13 @@ multiclass MTBUF_Pseudo_Stores<string opName, RegisterClass vdataClass,
[(st store_vt:$vdata, (MUBUFOffset v4i32:$srsrc, i32:$soffset,
i16:$offset, i8:$dfmt, i8:$nfmt, i1:$glc,
i1:$slc, i1:$tfe))]>,
- MTBUFAddr64Table<0>;
+ MTBUFAddr64Table<0, NAME>;
def _ADDR64 : MTBUF_Store_Pseudo <opName, BUFAddrKind.Addr64, vdataClass,
[(st store_vt:$vdata, (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset,
i16:$offset, i8:$dfmt, i8:$nfmt, i1:$glc,
i1:$slc, i1:$tfe))]>,
- MTBUFAddr64Table<1>;
+ MTBUFAddr64Table<1, NAME>;
def _OFFEN : MTBUF_Store_Pseudo <opName, BUFAddrKind.OffEn, vdataClass>;
def _IDXEN : MTBUF_Store_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass>;
@@ -310,6 +315,7 @@ class MUBUF_Pseudo <string opName, dag outs, dag ins,
bits<1> offen = 0;
bits<1> idxen = 0;
bits<1> addr64 = 0;
+ bits<1> lds = 0;
bits<1> has_vdata = 1;
bits<1> has_vaddr = 1;
bits<1> has_glc = 1;
@@ -336,7 +342,6 @@ class MUBUF_Real <bits<7> op, MUBUF_Pseudo ps> :
bits<12> offset;
bits<1> glc;
- bits<1> lds = 0;
bits<8> vaddr;
bits<8> vdata;
bits<7> srsrc;
@@ -371,31 +376,35 @@ class MUBUF_Invalidate <string opName, SDPatternOperator node> :
}
class getMUBUFInsDA<list<RegisterClass> vdataList,
- list<RegisterClass> vaddrList=[]> {
+ list<RegisterClass> vaddrList=[],
+ bit isLds = 0> {
RegisterClass vdataClass = !if(!empty(vdataList), ?, !head(vdataList));
RegisterClass vaddrClass = !if(!empty(vaddrList), ?, !head(vaddrList));
dag InsNoData = !if(!empty(vaddrList),
(ins SReg_128:$srsrc, SCSrc_b32:$soffset,
- offset:$offset, GLC:$glc, slc:$slc, tfe:$tfe),
+ offset:$offset, GLC:$glc, SLC:$slc),
(ins vaddrClass:$vaddr, SReg_128:$srsrc, SCSrc_b32:$soffset,
- offset:$offset, GLC:$glc, slc:$slc, tfe:$tfe)
+ offset:$offset, GLC:$glc, SLC:$slc)
);
dag InsData = !if(!empty(vaddrList),
(ins vdataClass:$vdata, SReg_128:$srsrc,
- SCSrc_b32:$soffset, offset:$offset, GLC:$glc, slc:$slc, tfe:$tfe),
+ SCSrc_b32:$soffset, offset:$offset, GLC:$glc, SLC:$slc),
(ins vdataClass:$vdata, vaddrClass:$vaddr, SReg_128:$srsrc,
- SCSrc_b32:$soffset, offset:$offset, GLC:$glc, slc:$slc, tfe:$tfe)
+ SCSrc_b32:$soffset, offset:$offset, GLC:$glc, SLC:$slc)
);
- dag ret = !if(!empty(vdataList), InsNoData, InsData);
+ dag ret = !con(
+ !if(!empty(vdataList), InsNoData, InsData),
+ !if(isLds, (ins), (ins TFE:$tfe))
+ );
}
-class getMUBUFIns<int addrKind, list<RegisterClass> vdataList=[]> {
+class getMUBUFIns<int addrKind, list<RegisterClass> vdataList=[], bit isLds = 0> {
dag ret =
- !if(!eq(addrKind, BUFAddrKind.Offset), getMUBUFInsDA<vdataList>.ret,
- !if(!eq(addrKind, BUFAddrKind.OffEn), getMUBUFInsDA<vdataList, [VGPR_32]>.ret,
- !if(!eq(addrKind, BUFAddrKind.IdxEn), getMUBUFInsDA<vdataList, [VGPR_32]>.ret,
- !if(!eq(addrKind, BUFAddrKind.BothEn), getMUBUFInsDA<vdataList, [VReg_64]>.ret,
- !if(!eq(addrKind, BUFAddrKind.Addr64), getMUBUFInsDA<vdataList, [VReg_64]>.ret,
+ !if(!eq(addrKind, BUFAddrKind.Offset), getMUBUFInsDA<vdataList, [], isLds>.ret,
+ !if(!eq(addrKind, BUFAddrKind.OffEn), getMUBUFInsDA<vdataList, [VGPR_32], isLds>.ret,
+ !if(!eq(addrKind, BUFAddrKind.IdxEn), getMUBUFInsDA<vdataList, [VGPR_32], isLds>.ret,
+ !if(!eq(addrKind, BUFAddrKind.BothEn), getMUBUFInsDA<vdataList, [VReg_64], isLds>.ret,
+ !if(!eq(addrKind, BUFAddrKind.Addr64), getMUBUFInsDA<vdataList, [VReg_64], isLds>.ret,
(ins))))));
}
@@ -426,20 +435,29 @@ class MUBUF_Load_Pseudo <string opName,
int addrKind,
RegisterClass vdataClass,
bit HasTiedDest = 0,
+ bit isLds = 0,
list<dag> pattern=[],
// Workaround bug bz30254
int addrKindCopy = addrKind>
: MUBUF_Pseudo<opName,
(outs vdataClass:$vdata),
- !con(getMUBUFIns<addrKindCopy>.ret, !if(HasTiedDest, (ins vdataClass:$vdata_in), (ins))),
- " $vdata, " # getMUBUFAsmOps<addrKindCopy>.ret # "$glc$slc$tfe",
+ !con(getMUBUFIns<addrKindCopy, [], isLds>.ret,
+ !if(HasTiedDest, (ins vdataClass:$vdata_in), (ins))),
+ " $vdata, " # getMUBUFAsmOps<addrKindCopy>.ret # "$glc$slc" #
+ !if(isLds, " lds", "$tfe"),
pattern>,
MUBUF_SetupAddr<addrKindCopy> {
- let PseudoInstr = opName # "_" # getAddrName<addrKindCopy>.ret;
+ let PseudoInstr = opName # !if(isLds, "_lds", "") #
+ "_" # getAddrName<addrKindCopy>.ret;
+ let AsmMatchConverter = !if(isLds, "cvtMubufLds", "cvtMubuf");
+
let Constraints = !if(HasTiedDest, "$vdata = $vdata_in", "");
let mayLoad = 1;
let mayStore = 0;
let maybeAtomic = 1;
+ let Uses = !if(isLds, [EXEC, M0], [EXEC]);
+ let has_tfe = !if(isLds, 0, 1);
+ let lds = isLds;
}
// FIXME: tfe can't be an operand because it requires a separate
@@ -447,32 +465,45 @@ class MUBUF_Load_Pseudo <string opName,
multiclass MUBUF_Pseudo_Loads<string opName, RegisterClass vdataClass,
ValueType load_vt = i32,
SDPatternOperator ld = null_frag,
- bit TiedDest = 0> {
+ bit TiedDest = 0,
+ bit isLds = 0> {
def _OFFSET : MUBUF_Load_Pseudo <opName, BUFAddrKind.Offset, vdataClass,
- TiedDest,
- [(set load_vt:$vdata,
- (ld (MUBUFOffset v4i32:$srsrc, i32:$soffset, i16:$offset, i1:$glc, i1:$slc, i1:$tfe)))]>,
- MUBUFAddr64Table<0>;
+ TiedDest, isLds,
+ !if(isLds,
+ [],
+ [(set load_vt:$vdata,
+ (ld (MUBUFOffset v4i32:$srsrc, i32:$soffset, i16:$offset, i1:$glc, i1:$slc, i1:$tfe)))])>,
+ MUBUFAddr64Table<0, NAME # !if(isLds, "_LDS", "")>;
def _ADDR64 : MUBUF_Load_Pseudo <opName, BUFAddrKind.Addr64, vdataClass,
- TiedDest,
- [(set load_vt:$vdata,
- (ld (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, i16:$offset, i1:$glc, i1:$slc, i1:$tfe)))]>,
- MUBUFAddr64Table<1>;
+ TiedDest, isLds,
+ !if(isLds,
+ [],
+ [(set load_vt:$vdata,
+ (ld (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, i16:$offset, i1:$glc, i1:$slc, i1:$tfe)))])>,
+ MUBUFAddr64Table<1, NAME # !if(isLds, "_LDS", "")>;
- def _OFFEN : MUBUF_Load_Pseudo <opName, BUFAddrKind.OffEn, vdataClass, TiedDest>;
- def _IDXEN : MUBUF_Load_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass, TiedDest>;
- def _BOTHEN : MUBUF_Load_Pseudo <opName, BUFAddrKind.BothEn, vdataClass, TiedDest>;
+ def _OFFEN : MUBUF_Load_Pseudo <opName, BUFAddrKind.OffEn, vdataClass, TiedDest, isLds>;
+ def _IDXEN : MUBUF_Load_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass, TiedDest, isLds>;
+ def _BOTHEN : MUBUF_Load_Pseudo <opName, BUFAddrKind.BothEn, vdataClass, TiedDest, isLds>;
let DisableWQM = 1 in {
- def _OFFSET_exact : MUBUF_Load_Pseudo <opName, BUFAddrKind.Offset, vdataClass, TiedDest>;
- def _OFFEN_exact : MUBUF_Load_Pseudo <opName, BUFAddrKind.OffEn, vdataClass, TiedDest>;
- def _IDXEN_exact : MUBUF_Load_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass, TiedDest>;
- def _BOTHEN_exact : MUBUF_Load_Pseudo <opName, BUFAddrKind.BothEn, vdataClass, TiedDest>;
+ def _OFFSET_exact : MUBUF_Load_Pseudo <opName, BUFAddrKind.Offset, vdataClass, TiedDest, isLds>;
+ def _OFFEN_exact : MUBUF_Load_Pseudo <opName, BUFAddrKind.OffEn, vdataClass, TiedDest, isLds>;
+ def _IDXEN_exact : MUBUF_Load_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass, TiedDest, isLds>;
+ def _BOTHEN_exact : MUBUF_Load_Pseudo <opName, BUFAddrKind.BothEn, vdataClass, TiedDest, isLds>;
}
}
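+// Defines both the regular and the _LDS-suffixed lds variant of a MUBUF
+// load pseudo.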
+multiclass MUBUF_Pseudo_Loads_Lds<string opName, RegisterClass vdataClass,
+ ValueType load_vt = i32,
+ SDPatternOperator ld_nolds = null_frag,
+ SDPatternOperator ld_lds = null_frag> {
+ defm NAME : MUBUF_Pseudo_Loads<opName, vdataClass, load_vt, ld_nolds>;
+ defm _LDS : MUBUF_Pseudo_Loads<opName, vdataClass, load_vt, ld_lds, 0, 1>;
+}
+
class MUBUF_Store_Pseudo <string opName,
int addrKind,
RegisterClass vdataClass,
@@ -499,12 +530,12 @@ multiclass MUBUF_Pseudo_Stores<string opName, RegisterClass vdataClass,
def _OFFSET : MUBUF_Store_Pseudo <opName, BUFAddrKind.Offset, vdataClass,
[(st store_vt:$vdata, (MUBUFOffset v4i32:$srsrc, i32:$soffset,
i16:$offset, i1:$glc, i1:$slc, i1:$tfe))]>,
- MUBUFAddr64Table<0>;
+ MUBUFAddr64Table<0, NAME>;
def _ADDR64 : MUBUF_Store_Pseudo <opName, BUFAddrKind.Addr64, vdataClass,
[(st store_vt:$vdata, (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset,
i16:$offset, i1:$glc, i1:$slc, i1:$tfe))]>,
- MUBUFAddr64Table<1>;
+ MUBUFAddr64Table<1, NAME>;
def _OFFEN : MUBUF_Store_Pseudo <opName, BUFAddrKind.OffEn, vdataClass>;
def _IDXEN : MUBUF_Store_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass>;
@@ -518,6 +549,23 @@ multiclass MUBUF_Pseudo_Stores<string opName, RegisterClass vdataClass,
}
}
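+// MUBUF store with the lds bit set: it has no vdata or vaddr operands and
+// reads M0, since the store data comes from LDS rather than a VGPR.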
+class MUBUF_Pseudo_Store_Lds<string opName>
+ : MUBUF_Pseudo<opName,
+ (outs),
+ (ins SReg_128:$srsrc, SCSrc_b32:$soffset, offset:$offset, GLC:$glc, SLC:$slc),
+ " $srsrc, $soffset$offset lds$glc$slc"> {
+ let mayLoad = 0;
+ let mayStore = 1;
+ let maybeAtomic = 1;
+
+ let has_vdata = 0;
+ let has_vaddr = 0;
+ let has_tfe = 0;
+ let lds = 1;
+
+ let Uses = [EXEC, M0];
+ let AsmMatchConverter = "cvtMubufLds";
+}
class getMUBUFAtomicInsDA<RegisterClass vdataClass, bit vdata_in,
list<RegisterClass> vaddrList=[]> {
@@ -525,15 +573,15 @@ class getMUBUFAtomicInsDA<RegisterClass vdataClass, bit vdata_in,
dag ret = !if(vdata_in,
!if(!empty(vaddrList),
(ins vdataClass:$vdata_in,
- SReg_128:$srsrc, SCSrc_b32:$soffset, offset:$offset, slc:$slc),
+ SReg_128:$srsrc, SCSrc_b32:$soffset, offset:$offset, SLC:$slc),
(ins vdataClass:$vdata_in, vaddrClass:$vaddr,
- SReg_128:$srsrc, SCSrc_b32:$soffset, offset:$offset, slc:$slc)
+ SReg_128:$srsrc, SCSrc_b32:$soffset, offset:$offset, SLC:$slc)
),
!if(!empty(vaddrList),
(ins vdataClass:$vdata,
- SReg_128:$srsrc, SCSrc_b32:$soffset, offset:$offset, slc:$slc),
+ SReg_128:$srsrc, SCSrc_b32:$soffset, offset:$offset, SLC:$slc),
(ins vdataClass:$vdata, vaddrClass:$vaddr,
- SReg_128:$srsrc, SCSrc_b32:$soffset, offset:$offset, slc:$slc)
+ SReg_128:$srsrc, SCSrc_b32:$soffset, offset:$offset, SLC:$slc)
));
}
@@ -618,9 +666,9 @@ multiclass MUBUF_Pseudo_Atomics <string opName,
SDPatternOperator atomic> {
def _OFFSET : MUBUF_AtomicNoRet_Pseudo <opName, BUFAddrKind.Offset, vdataClass>,
- MUBUFAddr64Table <0>;
+ MUBUFAddr64Table <0, NAME>;
def _ADDR64 : MUBUF_AtomicNoRet_Pseudo <opName, BUFAddrKind.Addr64, vdataClass>,
- MUBUFAddr64Table <1>;
+ MUBUFAddr64Table <1, NAME>;
def _OFFEN : MUBUF_AtomicNoRet_Pseudo <opName, BUFAddrKind.OffEn, vdataClass>;
def _IDXEN : MUBUF_AtomicNoRet_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass>;
def _BOTHEN : MUBUF_AtomicNoRet_Pseudo <opName, BUFAddrKind.BothEn, vdataClass>;
@@ -629,13 +677,13 @@ multiclass MUBUF_Pseudo_Atomics <string opName,
[(set vdataType:$vdata,
(atomic (MUBUFOffsetAtomic v4i32:$srsrc, i32:$soffset, i16:$offset, i1:$slc),
vdataType:$vdata_in))]>,
- MUBUFAddr64Table <0, "_RTN">;
+ MUBUFAddr64Table <0, NAME # "_RTN">;
def _ADDR64_RTN : MUBUF_AtomicRet_Pseudo <opName, BUFAddrKind.Addr64, vdataClass,
[(set vdataType:$vdata,
(atomic (MUBUFAddr64Atomic v4i32:$srsrc, i64:$vaddr, i32:$soffset, i16:$offset, i1:$slc),
vdataType:$vdata_in))]>,
- MUBUFAddr64Table <1, "_RTN">;
+ MUBUFAddr64Table <1, NAME # "_RTN">;
def _OFFEN_RTN : MUBUF_AtomicRet_Pseudo <opName, BUFAddrKind.OffEn, vdataClass>;
def _IDXEN_RTN : MUBUF_AtomicRet_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass>;
@@ -647,7 +695,7 @@ multiclass MUBUF_Pseudo_Atomics <string opName,
// MUBUF Instructions
//===----------------------------------------------------------------------===//
-defm BUFFER_LOAD_FORMAT_X : MUBUF_Pseudo_Loads <
+defm BUFFER_LOAD_FORMAT_X : MUBUF_Pseudo_Loads_Lds <
"buffer_load_format_x", VGPR_32
>;
defm BUFFER_LOAD_FORMAT_XY : MUBUF_Pseudo_Loads <
@@ -671,19 +719,74 @@ defm BUFFER_STORE_FORMAT_XYZ : MUBUF_Pseudo_Stores <
defm BUFFER_STORE_FORMAT_XYZW : MUBUF_Pseudo_Stores <
"buffer_store_format_xyzw", VReg_128
>;
-defm BUFFER_LOAD_UBYTE : MUBUF_Pseudo_Loads <
+
+let SubtargetPredicate = HasUnpackedD16VMem, D16Buf = 1 in {
+ defm BUFFER_LOAD_FORMAT_D16_X_gfx80 : MUBUF_Pseudo_Loads <
+ "buffer_load_format_d16_x", VGPR_32
+ >;
+ defm BUFFER_LOAD_FORMAT_D16_XY_gfx80 : MUBUF_Pseudo_Loads <
+ "buffer_load_format_d16_xy", VReg_64
+ >;
+ defm BUFFER_LOAD_FORMAT_D16_XYZ_gfx80 : MUBUF_Pseudo_Loads <
+ "buffer_load_format_d16_xyz", VReg_96
+ >;
+ defm BUFFER_LOAD_FORMAT_D16_XYZW_gfx80 : MUBUF_Pseudo_Loads <
+ "buffer_load_format_d16_xyzw", VReg_128
+ >;
+ defm BUFFER_STORE_FORMAT_D16_X_gfx80 : MUBUF_Pseudo_Stores <
+ "buffer_store_format_d16_x", VGPR_32
+ >;
+ defm BUFFER_STORE_FORMAT_D16_XY_gfx80 : MUBUF_Pseudo_Stores <
+ "buffer_store_format_d16_xy", VReg_64
+ >;
+ defm BUFFER_STORE_FORMAT_D16_XYZ_gfx80 : MUBUF_Pseudo_Stores <
+ "buffer_store_format_d16_xyz", VReg_96
+ >;
+ defm BUFFER_STORE_FORMAT_D16_XYZW_gfx80 : MUBUF_Pseudo_Stores <
+ "buffer_store_format_d16_xyzw", VReg_128
+ >;
+} // End HasUnpackedD16VMem.
+
+let SubtargetPredicate = HasPackedD16VMem, D16Buf = 1 in {
+ defm BUFFER_LOAD_FORMAT_D16_X : MUBUF_Pseudo_Loads <
+ "buffer_load_format_d16_x", VGPR_32
+ >;
+ defm BUFFER_LOAD_FORMAT_D16_XY : MUBUF_Pseudo_Loads <
+ "buffer_load_format_d16_xy", VGPR_32
+ >;
+ defm BUFFER_LOAD_FORMAT_D16_XYZ : MUBUF_Pseudo_Loads <
+ "buffer_load_format_d16_xyz", VReg_64
+ >;
+ defm BUFFER_LOAD_FORMAT_D16_XYZW : MUBUF_Pseudo_Loads <
+ "buffer_load_format_d16_xyzw", VReg_64
+ >;
+ defm BUFFER_STORE_FORMAT_D16_X : MUBUF_Pseudo_Stores <
+ "buffer_store_format_d16_x", VGPR_32
+ >;
+ defm BUFFER_STORE_FORMAT_D16_XY : MUBUF_Pseudo_Stores <
+ "buffer_store_format_d16_xy", VGPR_32
+ >;
+ defm BUFFER_STORE_FORMAT_D16_XYZ : MUBUF_Pseudo_Stores <
+ "buffer_store_format_d16_xyz", VReg_64
+ >;
+ defm BUFFER_STORE_FORMAT_D16_XYZW : MUBUF_Pseudo_Stores <
+ "buffer_store_format_d16_xyzw", VReg_64
+ >;
+} // End HasPackedD16VMem.
+
+defm BUFFER_LOAD_UBYTE : MUBUF_Pseudo_Loads_Lds <
"buffer_load_ubyte", VGPR_32, i32, mubuf_az_extloadi8
>;
-defm BUFFER_LOAD_SBYTE : MUBUF_Pseudo_Loads <
+defm BUFFER_LOAD_SBYTE : MUBUF_Pseudo_Loads_Lds <
"buffer_load_sbyte", VGPR_32, i32, mubuf_sextloadi8
>;
-defm BUFFER_LOAD_USHORT : MUBUF_Pseudo_Loads <
+defm BUFFER_LOAD_USHORT : MUBUF_Pseudo_Loads_Lds <
"buffer_load_ushort", VGPR_32, i32, mubuf_az_extloadi16
>;
-defm BUFFER_LOAD_SSHORT : MUBUF_Pseudo_Loads <
+defm BUFFER_LOAD_SSHORT : MUBUF_Pseudo_Loads_Lds <
"buffer_load_sshort", VGPR_32, i32, mubuf_sextloadi16
>;
-defm BUFFER_LOAD_DWORD : MUBUF_Pseudo_Loads <
+defm BUFFER_LOAD_DWORD : MUBUF_Pseudo_Loads_Lds <
"buffer_load_dword", VGPR_32, i32, mubuf_load
>;
defm BUFFER_LOAD_DWORDX2 : MUBUF_Pseudo_Loads <
@@ -695,6 +798,22 @@ defm BUFFER_LOAD_DWORDX3 : MUBUF_Pseudo_Loads <
defm BUFFER_LOAD_DWORDX4 : MUBUF_Pseudo_Loads <
"buffer_load_dwordx4", VReg_128, v4i32, mubuf_load
>;
+
+// This is not described in the AMD documentation, but 'lds' versions of
+// these opcodes are available on GFX8 and later chips. See Bug 37653.
+let SubtargetPredicate = isVI in {
+defm BUFFER_LOAD_DWORDX2_LDS : MUBUF_Pseudo_Loads <
+ "buffer_load_dwordx2", VReg_64, v2i32, null_frag, 0, 1
+>;
+defm BUFFER_LOAD_DWORDX3_LDS : MUBUF_Pseudo_Loads <
+ "buffer_load_dwordx3", VReg_96, untyped, null_frag, 0, 1
+>;
+defm BUFFER_LOAD_DWORDX4_LDS : MUBUF_Pseudo_Loads <
+ "buffer_load_dwordx4", VReg_128, v4i32, null_frag, 0, 1
+>;
+}
+
defm BUFFER_STORE_BYTE : MUBUF_Pseudo_Stores <
"buffer_store_byte", VGPR_32, i32, truncstorei8_global
>;
@@ -792,6 +911,10 @@ defm BUFFER_ATOMIC_DEC_X2 : MUBUF_Pseudo_Atomics <
"buffer_atomic_dec_x2", VReg_64, i64, atomic_dec_global
>;
+let SubtargetPredicate = isVI in {
+def BUFFER_STORE_LDS_DWORD : MUBUF_Pseudo_Store_Lds <"buffer_store_lds_dword">;
+}
+
let SubtargetPredicate = isSI in { // isn't on CI & VI
/*
defm BUFFER_ATOMIC_RSUB : MUBUF_Pseudo_Atomics <"buffer_atomic_rsub">;
@@ -842,6 +965,13 @@ defm BUFFER_STORE_SHORT_D16_HI : MUBUF_Pseudo_Stores <
"buffer_store_short_d16_hi", VGPR_32, i32
>;
+defm BUFFER_LOAD_FORMAT_D16_HI_X : MUBUF_Pseudo_Loads <
+ "buffer_load_format_d16_hi_x", VGPR_32
+>;
+defm BUFFER_STORE_FORMAT_D16_HI_X : MUBUF_Pseudo_Stores <
+ "buffer_store_format_d16_hi_x", VGPR_32
+>;
+
} // End HasD16LoadStore
def BUFFER_WBINVL1 : MUBUF_Invalidate <"buffer_wbinvl1",
@@ -860,6 +990,28 @@ defm TBUFFER_STORE_FORMAT_XY : MTBUF_Pseudo_Stores <"tbuffer_store_format_xy",
defm TBUFFER_STORE_FORMAT_XYZ : MTBUF_Pseudo_Stores <"tbuffer_store_format_xyz", VReg_128>;
defm TBUFFER_STORE_FORMAT_XYZW : MTBUF_Pseudo_Stores <"tbuffer_store_format_xyzw", VReg_128>;
+let SubtargetPredicate = HasUnpackedD16VMem, D16Buf = 1 in {
+ defm TBUFFER_LOAD_FORMAT_D16_X_gfx80 : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_x", VGPR_32>;
+ defm TBUFFER_LOAD_FORMAT_D16_XY_gfx80 : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_xy", VReg_64>;
+ defm TBUFFER_LOAD_FORMAT_D16_XYZ_gfx80 : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_xyz", VReg_96>;
+ defm TBUFFER_LOAD_FORMAT_D16_XYZW_gfx80 : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_xyzw", VReg_128>;
+ defm TBUFFER_STORE_FORMAT_D16_X_gfx80 : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_x", VGPR_32>;
+ defm TBUFFER_STORE_FORMAT_D16_XY_gfx80 : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_xy", VReg_64>;
+ defm TBUFFER_STORE_FORMAT_D16_XYZ_gfx80 : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_xyz", VReg_96>;
+ defm TBUFFER_STORE_FORMAT_D16_XYZW_gfx80 : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_xyzw", VReg_128>;
+} // End HasUnpackedD16VMem.
+
+let SubtargetPredicate = HasPackedD16VMem, D16Buf = 1 in {
+ defm TBUFFER_LOAD_FORMAT_D16_X : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_x", VGPR_32>;
+ defm TBUFFER_LOAD_FORMAT_D16_XY : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_xy", VGPR_32>;
+ defm TBUFFER_LOAD_FORMAT_D16_XYZ : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_xyz", VReg_64>;
+ defm TBUFFER_LOAD_FORMAT_D16_XYZW : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_xyzw", VReg_64>;
+ defm TBUFFER_STORE_FORMAT_D16_X : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_x", VGPR_32>;
+ defm TBUFFER_STORE_FORMAT_D16_XY : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_xy", VGPR_32>;
+ defm TBUFFER_STORE_FORMAT_D16_XYZ : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_xyz", VReg_64>;
+ defm TBUFFER_STORE_FORMAT_D16_XYZW : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_xyzw", VReg_64>;
+} // End HasPackedD16VMem.
+
let SubtargetPredicate = isCIVI in {
//===----------------------------------------------------------------------===//
@@ -922,6 +1074,19 @@ multiclass MUBUF_LoadIntrinsicPat<SDPatternOperator name, ValueType vt,
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format, f32, "BUFFER_LOAD_FORMAT_X">;
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format, v2f32, "BUFFER_LOAD_FORMAT_XY">;
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format, v4f32, "BUFFER_LOAD_FORMAT_XYZW">;
+
+let SubtargetPredicate = HasUnpackedD16VMem in {
+ defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_d16, f16, "BUFFER_LOAD_FORMAT_D16_X_gfx80">;
+ defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_d16, v2i32, "BUFFER_LOAD_FORMAT_D16_XY_gfx80">;
+ defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_d16, v4i32, "BUFFER_LOAD_FORMAT_D16_XYZW_gfx80">;
+} // End HasUnpackedD16VMem.
+
+let SubtargetPredicate = HasPackedD16VMem in {
+ defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_d16, f16, "BUFFER_LOAD_FORMAT_D16_X">;
+ defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_d16, v2f16, "BUFFER_LOAD_FORMAT_D16_XY">;
+ defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_d16, v4f16, "BUFFER_LOAD_FORMAT_D16_XYZW">;
+} // End HasPackedD16VMem.
+
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, f32, "BUFFER_LOAD_DWORD">;
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v2f32, "BUFFER_LOAD_DWORDX2">;
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v4f32, "BUFFER_LOAD_DWORDX4">;
@@ -969,6 +1134,19 @@ multiclass MUBUF_StoreIntrinsicPat<SDPatternOperator name, ValueType vt,
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format, f32, "BUFFER_STORE_FORMAT_X">;
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format, v2f32, "BUFFER_STORE_FORMAT_XY">;
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format, v4f32, "BUFFER_STORE_FORMAT_XYZW">;
+
+let SubtargetPredicate = HasUnpackedD16VMem in {
+ defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format_d16, f16, "BUFFER_STORE_FORMAT_D16_X_gfx80">;
+ defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format_d16, v2i32, "BUFFER_STORE_FORMAT_D16_XY_gfx80">;
+ defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format_d16, v4i32, "BUFFER_STORE_FORMAT_D16_XYZW_gfx80">;
+} // End HasUnpackedD16VMem.
+
+let SubtargetPredicate = HasPackedD16VMem in {
+ defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format_d16, f16, "BUFFER_STORE_FORMAT_D16_X">;
+ defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format_d16, v2f16, "BUFFER_STORE_FORMAT_D16_XY">;
+ defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format_d16, v4f16, "BUFFER_STORE_FORMAT_D16_XYZW">;
+} // End HasPackedD16VMem.
+
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, f32, "BUFFER_STORE_DWORD">;
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v2f32, "BUFFER_STORE_DWORDX2">;
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v4f32, "BUFFER_STORE_DWORDX4">;
@@ -1210,7 +1388,7 @@ defm : MUBUFScratchLoadPat <BUFFER_LOAD_DWORD_OFFEN, BUFFER_LOAD_DWORD_OFFSET, i
defm : MUBUFScratchLoadPat <BUFFER_LOAD_DWORDX2_OFFEN, BUFFER_LOAD_DWORDX2_OFFSET, v2i32, load_private>;
defm : MUBUFScratchLoadPat <BUFFER_LOAD_DWORDX4_OFFEN, BUFFER_LOAD_DWORDX4_OFFSET, v4i32, load_private>;
-let OtherPredicates = [HasD16LoadStore] in {
+let OtherPredicates = [D16PreservesUnusedBits] in {
defm : MUBUFScratchLoadPat_Hi16<BUFFER_LOAD_SHORT_D16_HI_OFFEN, BUFFER_LOAD_SHORT_D16_HI_OFFSET, i16, load_private>;
defm : MUBUFScratchLoadPat_Hi16<BUFFER_LOAD_UBYTE_D16_HI_OFFEN, BUFFER_LOAD_UBYTE_D16_HI_OFFSET, i16, az_extloadi8_private>;
defm : MUBUFScratchLoadPat_Hi16<BUFFER_LOAD_SBYTE_D16_HI_OFFEN, BUFFER_LOAD_SBYTE_D16_HI_OFFSET, i16, sextloadi8_private>;
@@ -1325,7 +1503,7 @@ defm : MUBUFScratchStorePat <BUFFER_STORE_DWORDX2_OFFEN, BUFFER_STORE_DWORDX2_OF
defm : MUBUFScratchStorePat <BUFFER_STORE_DWORDX4_OFFEN, BUFFER_STORE_DWORDX4_OFFSET, v4i32, store_private>;
-let OtherPredicates = [HasD16LoadStore] in {
+let OtherPredicates = [D16PreservesUnusedBits] in {
// Hiding the extract high pattern in the PatFrag seems to not
// automatically increase the complexity.
let AddedComplexity = 1 in {
@@ -1382,6 +1560,18 @@ defm : MTBUF_LoadIntrinsicPat<SItbuffer_load, f32, "TBUFFER_LOAD_FORMAT_X">;
defm : MTBUF_LoadIntrinsicPat<SItbuffer_load, v2f32, "TBUFFER_LOAD_FORMAT_XY">;
defm : MTBUF_LoadIntrinsicPat<SItbuffer_load, v4f32, "TBUFFER_LOAD_FORMAT_XYZW">;
+let SubtargetPredicate = HasUnpackedD16VMem in {
+ defm : MTBUF_LoadIntrinsicPat<SItbuffer_load_d16, f16, "TBUFFER_LOAD_FORMAT_D16_X_gfx80">;
+ defm : MTBUF_LoadIntrinsicPat<SItbuffer_load_d16, v2i32, "TBUFFER_LOAD_FORMAT_D16_XY_gfx80">;
+ defm : MTBUF_LoadIntrinsicPat<SItbuffer_load_d16, v4i32, "TBUFFER_LOAD_FORMAT_D16_XYZW_gfx80">;
+} // End HasUnpackedD16VMem.
+
+let SubtargetPredicate = HasPackedD16VMem in {
+ defm : MTBUF_LoadIntrinsicPat<SItbuffer_load_d16, f16, "TBUFFER_LOAD_FORMAT_D16_X">;
+ defm : MTBUF_LoadIntrinsicPat<SItbuffer_load_d16, v2f16, "TBUFFER_LOAD_FORMAT_D16_XY">;
+ defm : MTBUF_LoadIntrinsicPat<SItbuffer_load_d16, v4f16, "TBUFFER_LOAD_FORMAT_D16_XYZW">;
+} // End HasPackedD16VMem.
+
multiclass MTBUF_StoreIntrinsicPat<SDPatternOperator name, ValueType vt,
string opcode> {
def : GCNPat<
@@ -1431,6 +1621,18 @@ defm : MTBUF_StoreIntrinsicPat<SItbuffer_store, v2f32, "TBUFFER_STORE_FORMAT_XY"
defm : MTBUF_StoreIntrinsicPat<SItbuffer_store_x3, v4f32, "TBUFFER_STORE_FORMAT_XYZ">;
defm : MTBUF_StoreIntrinsicPat<SItbuffer_store, v4f32, "TBUFFER_STORE_FORMAT_XYZW">;
+let SubtargetPredicate = HasUnpackedD16VMem in {
+ defm : MTBUF_StoreIntrinsicPat<SItbuffer_store_d16, f16, "TBUFFER_STORE_FORMAT_D16_X_gfx80">;
+ defm : MTBUF_StoreIntrinsicPat<SItbuffer_store_d16, v2i32, "TBUFFER_STORE_FORMAT_D16_XY_gfx80">;
+ defm : MTBUF_StoreIntrinsicPat<SItbuffer_store_d16, v4i32, "TBUFFER_STORE_FORMAT_D16_XYZW_gfx80">;
+} // End HasUnpackedD16VMem.
+
+let SubtargetPredicate = HasPackedD16VMem in {
+ defm : MTBUF_StoreIntrinsicPat<SItbuffer_store_d16, f16, "TBUFFER_STORE_FORMAT_D16_X">;
+ defm : MTBUF_StoreIntrinsicPat<SItbuffer_store_d16, v2f16, "TBUFFER_STORE_FORMAT_D16_XY">;
+ defm : MTBUF_StoreIntrinsicPat<SItbuffer_store_d16, v4f16, "TBUFFER_STORE_FORMAT_D16_XYZW">;
+} // End HasPackedD16VMem.
+
//===----------------------------------------------------------------------===//
// Target instructions, move to the appropriate target TD file
//===----------------------------------------------------------------------===//
@@ -1451,7 +1653,7 @@ class MUBUF_Real_si <bits<7> op, MUBUF_Pseudo ps> :
let Inst{13} = ps.idxen;
let Inst{14} = !if(ps.has_glc, glc, ps.glc_value);
let Inst{15} = ps.addr64;
- let Inst{16} = lds;
+ let Inst{16} = !if(ps.lds, 1, 0);
let Inst{24-18} = op;
let Inst{31-26} = 0x38; //encoding
let Inst{39-32} = !if(ps.has_vaddr, vaddr, ?);
@@ -1470,6 +1672,31 @@ multiclass MUBUF_Real_AllAddr_si<bits<7> op> {
def _BOTHEN_si : MUBUF_Real_si <op, !cast<MUBUF_Pseudo>(NAME#"_BOTHEN")>;
}
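+// Emits the usual set of real SI encodings plus their _LDS counterparts,
+// registering each pair in MUBUFLdsTable under the same OpName.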
+multiclass MUBUF_Real_AllAddr_Lds_si<bits<7> op> {
+
+ def _OFFSET_si : MUBUF_Real_si <op, !cast<MUBUF_Pseudo>(NAME#"_OFFSET")>,
+ MUBUFLdsTable<0, NAME # "_OFFSET_si">;
+ def _ADDR64_si : MUBUF_Real_si <op, !cast<MUBUF_Pseudo>(NAME#"_ADDR64")>,
+ MUBUFLdsTable<0, NAME # "_ADDR64_si">;
+ def _OFFEN_si : MUBUF_Real_si <op, !cast<MUBUF_Pseudo>(NAME#"_OFFEN")>,
+ MUBUFLdsTable<0, NAME # "_OFFEN_si">;
+ def _IDXEN_si : MUBUF_Real_si <op, !cast<MUBUF_Pseudo>(NAME#"_IDXEN")>,
+ MUBUFLdsTable<0, NAME # "_IDXEN_si">;
+ def _BOTHEN_si : MUBUF_Real_si <op, !cast<MUBUF_Pseudo>(NAME#"_BOTHEN")>,
+ MUBUFLdsTable<0, NAME # "_BOTHEN_si">;
+
+ def _LDS_OFFSET_si : MUBUF_Real_si <op, !cast<MUBUF_Pseudo>(NAME#"_LDS_OFFSET")>,
+ MUBUFLdsTable<1, NAME # "_OFFSET_si">;
+ def _LDS_ADDR64_si : MUBUF_Real_si <op, !cast<MUBUF_Pseudo>(NAME#"_LDS_ADDR64")>,
+ MUBUFLdsTable<1, NAME # "_ADDR64_si">;
+ def _LDS_OFFEN_si : MUBUF_Real_si <op, !cast<MUBUF_Pseudo>(NAME#"_LDS_OFFEN")>,
+ MUBUFLdsTable<1, NAME # "_OFFEN_si">;
+ def _LDS_IDXEN_si : MUBUF_Real_si <op, !cast<MUBUF_Pseudo>(NAME#"_LDS_IDXEN")>,
+ MUBUFLdsTable<1, NAME # "_IDXEN_si">;
+ def _LDS_BOTHEN_si : MUBUF_Real_si <op, !cast<MUBUF_Pseudo>(NAME#"_LDS_BOTHEN")>,
+ MUBUFLdsTable<1, NAME # "_BOTHEN_si">;
+}
+
multiclass MUBUF_Real_Atomic_si<bits<7> op> : MUBUF_Real_AllAddr_si<op> {
def _OFFSET_RTN_si : MUBUF_Real_si <op, !cast<MUBUF_Pseudo>(NAME#"_OFFSET_RTN")>;
def _ADDR64_RTN_si : MUBUF_Real_si <op, !cast<MUBUF_Pseudo>(NAME#"_ADDR64_RTN")>;
@@ -1478,7 +1705,7 @@ multiclass MUBUF_Real_Atomic_si<bits<7> op> : MUBUF_Real_AllAddr_si<op> {
def _BOTHEN_RTN_si : MUBUF_Real_si <op, !cast<MUBUF_Pseudo>(NAME#"_BOTHEN_RTN")>;
}
-defm BUFFER_LOAD_FORMAT_X : MUBUF_Real_AllAddr_si <0x00>;
+defm BUFFER_LOAD_FORMAT_X : MUBUF_Real_AllAddr_Lds_si <0x00>;
defm BUFFER_LOAD_FORMAT_XY : MUBUF_Real_AllAddr_si <0x01>;
defm BUFFER_LOAD_FORMAT_XYZ : MUBUF_Real_AllAddr_si <0x02>;
defm BUFFER_LOAD_FORMAT_XYZW : MUBUF_Real_AllAddr_si <0x03>;
@@ -1486,11 +1713,11 @@ defm BUFFER_STORE_FORMAT_X : MUBUF_Real_AllAddr_si <0x04>;
defm BUFFER_STORE_FORMAT_XY : MUBUF_Real_AllAddr_si <0x05>;
defm BUFFER_STORE_FORMAT_XYZ : MUBUF_Real_AllAddr_si <0x06>;
defm BUFFER_STORE_FORMAT_XYZW : MUBUF_Real_AllAddr_si <0x07>;
-defm BUFFER_LOAD_UBYTE : MUBUF_Real_AllAddr_si <0x08>;
-defm BUFFER_LOAD_SBYTE : MUBUF_Real_AllAddr_si <0x09>;
-defm BUFFER_LOAD_USHORT : MUBUF_Real_AllAddr_si <0x0a>;
-defm BUFFER_LOAD_SSHORT : MUBUF_Real_AllAddr_si <0x0b>;
-defm BUFFER_LOAD_DWORD : MUBUF_Real_AllAddr_si <0x0c>;
+defm BUFFER_LOAD_UBYTE : MUBUF_Real_AllAddr_Lds_si <0x08>;
+defm BUFFER_LOAD_SBYTE : MUBUF_Real_AllAddr_Lds_si <0x09>;
+defm BUFFER_LOAD_USHORT : MUBUF_Real_AllAddr_Lds_si <0x0a>;
+defm BUFFER_LOAD_SSHORT : MUBUF_Real_AllAddr_Lds_si <0x0b>;
+defm BUFFER_LOAD_DWORD : MUBUF_Real_AllAddr_Lds_si <0x0c>;
defm BUFFER_LOAD_DWORDX2 : MUBUF_Real_AllAddr_si <0x0d>;
defm BUFFER_LOAD_DWORDX4 : MUBUF_Real_AllAddr_si <0x0e>;
defm BUFFER_LOAD_DWORDX3 : MUBUF_Real_AllAddr_si <0x0f>;
@@ -1575,7 +1802,7 @@ multiclass MTBUF_Real_AllAddr_si<bits<3> op> {
defm TBUFFER_LOAD_FORMAT_X : MTBUF_Real_AllAddr_si <0>;
defm TBUFFER_LOAD_FORMAT_XY : MTBUF_Real_AllAddr_si <1>;
-//defm TBUFFER_LOAD_FORMAT_XYZ : MTBUF_Real_AllAddr_si <2>;
+defm TBUFFER_LOAD_FORMAT_XYZ : MTBUF_Real_AllAddr_si <2>;
defm TBUFFER_LOAD_FORMAT_XYZW : MTBUF_Real_AllAddr_si <3>;
defm TBUFFER_STORE_FORMAT_X : MTBUF_Real_AllAddr_si <4>;
defm TBUFFER_STORE_FORMAT_XY : MTBUF_Real_AllAddr_si <5>;
@@ -1610,7 +1837,7 @@ class MUBUF_Real_vi <bits<7> op, MUBUF_Pseudo ps> :
let Inst{12} = ps.offen;
let Inst{13} = ps.idxen;
let Inst{14} = !if(ps.has_glc, glc, ps.glc_value);
- let Inst{16} = lds;
+ let Inst{16} = !if(ps.lds, 1, 0);
let Inst{17} = !if(ps.has_slc, slc, ?);
let Inst{24-18} = op;
let Inst{31-26} = 0x38; //encoding
@@ -1628,6 +1855,56 @@ multiclass MUBUF_Real_AllAddr_vi<bits<7> op> {
def _BOTHEN_vi : MUBUF_Real_vi <op, !cast<MUBUF_Pseudo>(NAME#"_BOTHEN")>;
}
+multiclass MUBUF_Real_AllAddr_Lds_vi<bits<7> op> {
+
+ def _OFFSET_vi : MUBUF_Real_vi <op, !cast<MUBUF_Pseudo>(NAME#"_OFFSET")>,
+ MUBUFLdsTable<0, NAME # "_OFFSET_vi">;
+ def _OFFEN_vi : MUBUF_Real_vi <op, !cast<MUBUF_Pseudo>(NAME#"_OFFEN")>,
+ MUBUFLdsTable<0, NAME # "_OFFEN_vi">;
+ def _IDXEN_vi : MUBUF_Real_vi <op, !cast<MUBUF_Pseudo>(NAME#"_IDXEN")>,
+ MUBUFLdsTable<0, NAME # "_IDXEN_vi">;
+ def _BOTHEN_vi : MUBUF_Real_vi <op, !cast<MUBUF_Pseudo>(NAME#"_BOTHEN")>,
+ MUBUFLdsTable<0, NAME # "_BOTHEN_vi">;
+
+ def _LDS_OFFSET_vi : MUBUF_Real_vi <op, !cast<MUBUF_Pseudo>(NAME#"_LDS_OFFSET")>,
+ MUBUFLdsTable<1, NAME # "_OFFSET_vi">;
+ def _LDS_OFFEN_vi : MUBUF_Real_vi <op, !cast<MUBUF_Pseudo>(NAME#"_LDS_OFFEN")>,
+ MUBUFLdsTable<1, NAME # "_OFFEN_vi">;
+ def _LDS_IDXEN_vi : MUBUF_Real_vi <op, !cast<MUBUF_Pseudo>(NAME#"_LDS_IDXEN")>,
+ MUBUFLdsTable<1, NAME # "_IDXEN_vi">;
+ def _LDS_BOTHEN_vi : MUBUF_Real_vi <op, !cast<MUBUF_Pseudo>(NAME#"_LDS_BOTHEN")>,
+ MUBUFLdsTable<1, NAME # "_BOTHEN_vi">;
+}
+
+class MUBUF_Real_gfx80 <bits<7> op, MUBUF_Pseudo ps> :
+ MUBUF_Real<op, ps>,
+ Enc64,
+ SIMCInstr<ps.PseudoInstr, SIEncodingFamily.GFX80> {
+ let AssemblerPredicate=HasUnpackedD16VMem;
+ let DecoderNamespace="GFX80_UNPACKED";
+
+ let Inst{11-0} = !if(ps.has_offset, offset, ?);
+ let Inst{12} = ps.offen;
+ let Inst{13} = ps.idxen;
+ let Inst{14} = !if(ps.has_glc, glc, ps.glc_value);
+ let Inst{16} = !if(ps.lds, 1, 0);
+ let Inst{17} = !if(ps.has_slc, slc, ?);
+ let Inst{24-18} = op;
+ let Inst{31-26} = 0x38; //encoding
+ let Inst{39-32} = !if(ps.has_vaddr, vaddr, ?);
+ let Inst{47-40} = !if(ps.has_vdata, vdata, ?);
+ let Inst{52-48} = !if(ps.has_srsrc, srsrc{6-2}, ?);
+ let Inst{55} = !if(ps.has_tfe, tfe, ?);
+ let Inst{63-56} = !if(ps.has_soffset, soffset, ?);
+}
+
+multiclass MUBUF_Real_AllAddr_gfx80<bits<7> op> {
+ def _OFFSET_gfx80 : MUBUF_Real_gfx80 <op, !cast<MUBUF_Pseudo>(NAME#"_OFFSET")>;
+ def _OFFEN_gfx80 : MUBUF_Real_gfx80 <op, !cast<MUBUF_Pseudo>(NAME#"_OFFEN")>;
+ def _IDXEN_gfx80 : MUBUF_Real_gfx80 <op, !cast<MUBUF_Pseudo>(NAME#"_IDXEN")>;
+ def _BOTHEN_gfx80 : MUBUF_Real_gfx80 <op, !cast<MUBUF_Pseudo>(NAME#"_BOTHEN")>;
+}
+
multiclass MUBUF_Real_Atomic_vi<bits<7> op> :
MUBUF_Real_AllAddr_vi<op> {
def _OFFSET_RTN_vi : MUBUF_Real_vi <op, !cast<MUBUF_Pseudo>(NAME#"_OFFSET_RTN")>;
@@ -1636,7 +1913,7 @@ multiclass MUBUF_Real_Atomic_vi<bits<7> op> :
def _BOTHEN_RTN_vi : MUBUF_Real_vi <op, !cast<MUBUF_Pseudo>(NAME#"_BOTHEN_RTN")>;
}
-defm BUFFER_LOAD_FORMAT_X : MUBUF_Real_AllAddr_vi <0x00>;
+defm BUFFER_LOAD_FORMAT_X : MUBUF_Real_AllAddr_Lds_vi <0x00>;
defm BUFFER_LOAD_FORMAT_XY : MUBUF_Real_AllAddr_vi <0x01>;
defm BUFFER_LOAD_FORMAT_XYZ : MUBUF_Real_AllAddr_vi <0x02>;
defm BUFFER_LOAD_FORMAT_XYZW : MUBUF_Real_AllAddr_vi <0x03>;
@@ -1644,14 +1921,34 @@ defm BUFFER_STORE_FORMAT_X : MUBUF_Real_AllAddr_vi <0x04>;
defm BUFFER_STORE_FORMAT_XY : MUBUF_Real_AllAddr_vi <0x05>;
defm BUFFER_STORE_FORMAT_XYZ : MUBUF_Real_AllAddr_vi <0x06>;
defm BUFFER_STORE_FORMAT_XYZW : MUBUF_Real_AllAddr_vi <0x07>;
-defm BUFFER_LOAD_UBYTE : MUBUF_Real_AllAddr_vi <0x10>;
-defm BUFFER_LOAD_SBYTE : MUBUF_Real_AllAddr_vi <0x11>;
-defm BUFFER_LOAD_USHORT : MUBUF_Real_AllAddr_vi <0x12>;
-defm BUFFER_LOAD_SSHORT : MUBUF_Real_AllAddr_vi <0x13>;
-defm BUFFER_LOAD_DWORD : MUBUF_Real_AllAddr_vi <0x14>;
-defm BUFFER_LOAD_DWORDX2 : MUBUF_Real_AllAddr_vi <0x15>;
-defm BUFFER_LOAD_DWORDX3 : MUBUF_Real_AllAddr_vi <0x16>;
-defm BUFFER_LOAD_DWORDX4 : MUBUF_Real_AllAddr_vi <0x17>;
+let SubtargetPredicate = HasUnpackedD16VMem in {
+ defm BUFFER_LOAD_FORMAT_D16_X_gfx80 : MUBUF_Real_AllAddr_gfx80 <0x08>;
+ defm BUFFER_LOAD_FORMAT_D16_XY_gfx80 : MUBUF_Real_AllAddr_gfx80 <0x09>;
+ defm BUFFER_LOAD_FORMAT_D16_XYZ_gfx80 : MUBUF_Real_AllAddr_gfx80 <0x0a>;
+ defm BUFFER_LOAD_FORMAT_D16_XYZW_gfx80 : MUBUF_Real_AllAddr_gfx80 <0x0b>;
+ defm BUFFER_STORE_FORMAT_D16_X_gfx80 : MUBUF_Real_AllAddr_gfx80 <0x0c>;
+ defm BUFFER_STORE_FORMAT_D16_XY_gfx80 : MUBUF_Real_AllAddr_gfx80 <0x0d>;
+ defm BUFFER_STORE_FORMAT_D16_XYZ_gfx80 : MUBUF_Real_AllAddr_gfx80 <0x0e>;
+ defm BUFFER_STORE_FORMAT_D16_XYZW_gfx80 : MUBUF_Real_AllAddr_gfx80 <0x0f>;
+} // End HasUnpackedD16VMem.
+let SubtargetPredicate = HasPackedD16VMem in {
+ defm BUFFER_LOAD_FORMAT_D16_X : MUBUF_Real_AllAddr_vi <0x08>;
+ defm BUFFER_LOAD_FORMAT_D16_XY : MUBUF_Real_AllAddr_vi <0x09>;
+ defm BUFFER_LOAD_FORMAT_D16_XYZ : MUBUF_Real_AllAddr_vi <0x0a>;
+ defm BUFFER_LOAD_FORMAT_D16_XYZW : MUBUF_Real_AllAddr_vi <0x0b>;
+ defm BUFFER_STORE_FORMAT_D16_X : MUBUF_Real_AllAddr_vi <0x0c>;
+ defm BUFFER_STORE_FORMAT_D16_XY : MUBUF_Real_AllAddr_vi <0x0d>;
+ defm BUFFER_STORE_FORMAT_D16_XYZ : MUBUF_Real_AllAddr_vi <0x0e>;
+ defm BUFFER_STORE_FORMAT_D16_XYZW : MUBUF_Real_AllAddr_vi <0x0f>;
+} // End HasPackedD16VMem.
+defm BUFFER_LOAD_UBYTE : MUBUF_Real_AllAddr_Lds_vi <0x10>;
+defm BUFFER_LOAD_SBYTE : MUBUF_Real_AllAddr_Lds_vi <0x11>;
+defm BUFFER_LOAD_USHORT : MUBUF_Real_AllAddr_Lds_vi <0x12>;
+defm BUFFER_LOAD_SSHORT : MUBUF_Real_AllAddr_Lds_vi <0x13>;
+defm BUFFER_LOAD_DWORD : MUBUF_Real_AllAddr_Lds_vi <0x14>;
+defm BUFFER_LOAD_DWORDX2 : MUBUF_Real_AllAddr_Lds_vi <0x15>;
+defm BUFFER_LOAD_DWORDX3 : MUBUF_Real_AllAddr_Lds_vi <0x16>;
+defm BUFFER_LOAD_DWORDX4 : MUBUF_Real_AllAddr_Lds_vi <0x17>;
defm BUFFER_STORE_BYTE : MUBUF_Real_AllAddr_vi <0x18>;
defm BUFFER_STORE_BYTE_D16_HI : MUBUF_Real_AllAddr_vi <0x19>;
defm BUFFER_STORE_SHORT : MUBUF_Real_AllAddr_vi <0x1a>;
@@ -1668,6 +1965,9 @@ defm BUFFER_LOAD_SBYTE_D16_HI : MUBUF_Real_AllAddr_vi <0x23>;
defm BUFFER_LOAD_SHORT_D16 : MUBUF_Real_AllAddr_vi <0x24>;
defm BUFFER_LOAD_SHORT_D16_HI : MUBUF_Real_AllAddr_vi <0x25>;
+defm BUFFER_LOAD_FORMAT_D16_HI_X : MUBUF_Real_AllAddr_vi <0x26>;
+defm BUFFER_STORE_FORMAT_D16_HI_X : MUBUF_Real_AllAddr_vi <0x27>;
+
defm BUFFER_ATOMIC_SWAP : MUBUF_Real_Atomic_vi <0x40>;
defm BUFFER_ATOMIC_CMPSWAP : MUBUF_Real_Atomic_vi <0x41>;
defm BUFFER_ATOMIC_ADD : MUBUF_Real_Atomic_vi <0x42>;
@@ -1696,6 +1996,8 @@ defm BUFFER_ATOMIC_XOR_X2 : MUBUF_Real_Atomic_vi <0x6a>;
defm BUFFER_ATOMIC_INC_X2 : MUBUF_Real_Atomic_vi <0x6b>;
defm BUFFER_ATOMIC_DEC_X2 : MUBUF_Real_Atomic_vi <0x6c>;
+def BUFFER_STORE_LDS_DWORD_vi : MUBUF_Real_vi <0x3d, BUFFER_STORE_LDS_DWORD>;
+
def BUFFER_WBINVL1_vi : MUBUF_Real_vi <0x3e, BUFFER_WBINVL1>;
def BUFFER_WBINVL1_VOL_vi : MUBUF_Real_vi <0x3f, BUFFER_WBINVL1_VOL>;
@@ -1729,11 +2031,61 @@ multiclass MTBUF_Real_AllAddr_vi<bits<4> op> {
def _BOTHEN_vi : MTBUF_Real_vi <op, !cast<MTBUF_Pseudo>(NAME#"_BOTHEN")>;
}
-defm TBUFFER_LOAD_FORMAT_X : MTBUF_Real_AllAddr_vi <0>;
-defm TBUFFER_LOAD_FORMAT_XY : MTBUF_Real_AllAddr_vi <1>;
-//defm TBUFFER_LOAD_FORMAT_XYZ : MTBUF_Real_AllAddr_vi <2>;
-defm TBUFFER_LOAD_FORMAT_XYZW : MTBUF_Real_AllAddr_vi <3>;
-defm TBUFFER_STORE_FORMAT_X : MTBUF_Real_AllAddr_vi <4>;
-defm TBUFFER_STORE_FORMAT_XY : MTBUF_Real_AllAddr_vi <5>;
-defm TBUFFER_STORE_FORMAT_XYZ : MTBUF_Real_AllAddr_vi <6>;
-defm TBUFFER_STORE_FORMAT_XYZW : MTBUF_Real_AllAddr_vi <7>;
+class MTBUF_Real_gfx80 <bits<4> op, MTBUF_Pseudo ps> :
+ MTBUF_Real<ps>,
+ Enc64,
+ SIMCInstr<ps.PseudoInstr, SIEncodingFamily.GFX80> {
+ let AssemblerPredicate=HasUnpackedD16VMem;
+ let DecoderNamespace="GFX80_UNPACKED";
+
+ let Inst{11-0} = !if(ps.has_offset, offset, ?);
+ let Inst{12} = ps.offen;
+ let Inst{13} = ps.idxen;
+ let Inst{14} = !if(ps.has_glc, glc, ps.glc_value);
+ let Inst{18-15} = op;
+ let Inst{22-19} = !if(ps.has_dfmt, dfmt, ps.dfmt_value);
+ let Inst{25-23} = !if(ps.has_nfmt, nfmt, ps.nfmt_value);
+ let Inst{31-26} = 0x3a; //encoding
+ let Inst{39-32} = !if(ps.has_vaddr, vaddr, ?);
+ let Inst{47-40} = !if(ps.has_vdata, vdata, ?);
+ let Inst{52-48} = !if(ps.has_srsrc, srsrc{6-2}, ?);
+ let Inst{54} = !if(ps.has_slc, slc, ?);
+ let Inst{55} = !if(ps.has_tfe, tfe, ?);
+ let Inst{63-56} = !if(ps.has_soffset, soffset, ?);
+}
+
+multiclass MTBUF_Real_AllAddr_gfx80<bits<4> op> {
+ def _OFFSET_gfx80 : MTBUF_Real_gfx80 <op, !cast<MTBUF_Pseudo>(NAME#"_OFFSET")>;
+ def _OFFEN_gfx80 : MTBUF_Real_gfx80 <op, !cast<MTBUF_Pseudo>(NAME#"_OFFEN")>;
+ def _IDXEN_gfx80 : MTBUF_Real_gfx80 <op, !cast<MTBUF_Pseudo>(NAME#"_IDXEN")>;
+ def _BOTHEN_gfx80 : MTBUF_Real_gfx80 <op, !cast<MTBUF_Pseudo>(NAME#"_BOTHEN")>;
+}
+
+defm TBUFFER_LOAD_FORMAT_X : MTBUF_Real_AllAddr_vi <0x00>;
+defm TBUFFER_LOAD_FORMAT_XY : MTBUF_Real_AllAddr_vi <0x01>;
+defm TBUFFER_LOAD_FORMAT_XYZ : MTBUF_Real_AllAddr_vi <0x02>;
+defm TBUFFER_LOAD_FORMAT_XYZW : MTBUF_Real_AllAddr_vi <0x03>;
+defm TBUFFER_STORE_FORMAT_X : MTBUF_Real_AllAddr_vi <0x04>;
+defm TBUFFER_STORE_FORMAT_XY : MTBUF_Real_AllAddr_vi <0x05>;
+defm TBUFFER_STORE_FORMAT_XYZ : MTBUF_Real_AllAddr_vi <0x06>;
+defm TBUFFER_STORE_FORMAT_XYZW : MTBUF_Real_AllAddr_vi <0x07>;
+let SubtargetPredicate = HasUnpackedD16VMem in {
+ defm TBUFFER_LOAD_FORMAT_D16_X_gfx80 : MTBUF_Real_AllAddr_gfx80 <0x08>;
+ defm TBUFFER_LOAD_FORMAT_D16_XY_gfx80 : MTBUF_Real_AllAddr_gfx80 <0x09>;
+ defm TBUFFER_LOAD_FORMAT_D16_XYZ_gfx80 : MTBUF_Real_AllAddr_gfx80 <0x0a>;
+ defm TBUFFER_LOAD_FORMAT_D16_XYZW_gfx80 : MTBUF_Real_AllAddr_gfx80 <0x0b>;
+ defm TBUFFER_STORE_FORMAT_D16_X_gfx80 : MTBUF_Real_AllAddr_gfx80 <0x0c>;
+ defm TBUFFER_STORE_FORMAT_D16_XY_gfx80 : MTBUF_Real_AllAddr_gfx80 <0x0d>;
+ defm TBUFFER_STORE_FORMAT_D16_XYZ_gfx80 : MTBUF_Real_AllAddr_gfx80 <0x0e>;
+ defm TBUFFER_STORE_FORMAT_D16_XYZW_gfx80 : MTBUF_Real_AllAddr_gfx80 <0x0f>;
+} // End HasUnpackedD16VMem.
+let SubtargetPredicate = HasPackedD16VMem in {
+ defm TBUFFER_LOAD_FORMAT_D16_X : MTBUF_Real_AllAddr_vi <0x08>;
+ defm TBUFFER_LOAD_FORMAT_D16_XY : MTBUF_Real_AllAddr_vi <0x09>;
+ defm TBUFFER_LOAD_FORMAT_D16_XYZ : MTBUF_Real_AllAddr_vi <0x0a>;
+ defm TBUFFER_LOAD_FORMAT_D16_XYZW : MTBUF_Real_AllAddr_vi <0x0b>;
+ defm TBUFFER_STORE_FORMAT_D16_X : MTBUF_Real_AllAddr_vi <0x0c>;
+ defm TBUFFER_STORE_FORMAT_D16_XY : MTBUF_Real_AllAddr_vi <0x0d>;
+ defm TBUFFER_STORE_FORMAT_D16_XYZ : MTBUF_Real_AllAddr_vi <0x0e>;
+ defm TBUFFER_STORE_FORMAT_D16_XYZW : MTBUF_Real_AllAddr_vi <0x0f>;
+} // End HasPackedD16VMem.
diff --git a/lib/Target/AMDGPU/CMakeLists.txt b/lib/Target/AMDGPU/CMakeLists.txt
index 3a8503030414..174b2df15300 100644
--- a/lib/Target/AMDGPU/CMakeLists.txt
+++ b/lib/Target/AMDGPU/CMakeLists.txt
@@ -1,18 +1,33 @@
set(LLVM_TARGET_DEFINITIONS AMDGPU.td)
-tablegen(LLVM AMDGPUGenRegisterInfo.inc -gen-register-info)
-tablegen(LLVM AMDGPUGenInstrInfo.inc -gen-instr-info)
-tablegen(LLVM AMDGPUGenDAGISel.inc -gen-dag-isel)
-tablegen(LLVM AMDGPUGenCallingConv.inc -gen-callingconv)
-tablegen(LLVM AMDGPUGenSubtargetInfo.inc -gen-subtarget)
-tablegen(LLVM AMDGPUGenIntrinsics.inc -gen-tgt-intrinsic)
-tablegen(LLVM AMDGPUGenMCCodeEmitter.inc -gen-emitter)
-tablegen(LLVM AMDGPUGenDFAPacketizer.inc -gen-dfa-packetizer)
-tablegen(LLVM AMDGPUGenAsmWriter.inc -gen-asm-writer)
tablegen(LLVM AMDGPUGenAsmMatcher.inc -gen-asm-matcher)
+tablegen(LLVM AMDGPUGenAsmWriter.inc -gen-asm-writer)
+tablegen(LLVM AMDGPUGenCallingConv.inc -gen-callingconv)
+tablegen(LLVM AMDGPUGenDAGISel.inc -gen-dag-isel)
tablegen(LLVM AMDGPUGenDisassemblerTables.inc -gen-disassembler)
+tablegen(LLVM AMDGPUGenInstrInfo.inc -gen-instr-info)
+tablegen(LLVM AMDGPUGenIntrinsicEnums.inc -gen-tgt-intrinsic-enums)
+tablegen(LLVM AMDGPUGenIntrinsicImpl.inc -gen-tgt-intrinsic-impl)
+tablegen(LLVM AMDGPUGenMCCodeEmitter.inc -gen-emitter)
tablegen(LLVM AMDGPUGenMCPseudoLowering.inc -gen-pseudo-lowering)
tablegen(LLVM AMDGPUGenRegisterBank.inc -gen-register-bank)
+tablegen(LLVM AMDGPUGenRegisterInfo.inc -gen-register-info)
+tablegen(LLVM AMDGPUGenSearchableTables.inc -gen-searchable-tables)
+tablegen(LLVM AMDGPUGenSubtargetInfo.inc -gen-subtarget)
+
+set(LLVM_TARGET_DEFINITIONS AMDGPUGISel.td)
+tablegen(LLVM AMDGPUGenGlobalISel.inc -gen-global-isel)
+
+set(LLVM_TARGET_DEFINITIONS R600.td)
+tablegen(LLVM R600GenAsmWriter.inc -gen-asm-writer)
+tablegen(LLVM R600GenCallingConv.inc -gen-callingconv)
+tablegen(LLVM R600GenDAGISel.inc -gen-dag-isel)
+tablegen(LLVM R600GenDFAPacketizer.inc -gen-dfa-packetizer)
+tablegen(LLVM R600GenInstrInfo.inc -gen-instr-info)
+tablegen(LLVM R600GenMCCodeEmitter.inc -gen-emitter)
+tablegen(LLVM R600GenRegisterInfo.inc -gen-register-info)
+tablegen(LLVM R600GenSubtargetInfo.inc -gen-subtarget)
+
add_public_tablegen_target(AMDGPUCommonTableGen)
add_llvm_target(AMDGPUCodeGen
@@ -25,6 +40,7 @@ add_llvm_target(AMDGPUCodeGen
AMDGPUCallLowering.cpp
AMDGPUCodeGenPrepare.cpp
AMDGPUFrameLowering.cpp
+ AMDGPUHSAMetadataStreamer.cpp
AMDGPUInstrInfo.cpp
AMDGPUInstructionSelector.cpp
AMDGPUIntrinsicInfo.cpp
@@ -34,13 +50,14 @@ add_llvm_target(AMDGPUCodeGen
AMDGPULibCalls.cpp
AMDGPULibFunc.cpp
AMDGPULowerIntrinsics.cpp
+ AMDGPULowerKernelArguments.cpp
+ AMDGPULowerKernelAttributes.cpp
AMDGPUMachineCFGStructurizer.cpp
AMDGPUMachineFunction.cpp
AMDGPUMachineModuleInfo.cpp
AMDGPUMacroFusion.cpp
AMDGPUMCInstLower.cpp
AMDGPUOpenCLEnqueuedBlockLowering.cpp
- AMDGPUOpenCLImageTypeLoweringPass.cpp
AMDGPUPromoteAlloca.cpp
AMDGPURegAsmNames.inc.cpp
AMDGPURegisterBankInfo.cpp
@@ -53,12 +70,14 @@ add_llvm_target(AMDGPUCodeGen
AMDGPUUnifyDivergentExitNodes.cpp
AMDGPUUnifyMetadata.cpp
AMDGPUInline.cpp
+ AMDGPUPerfHintAnalysis.cpp
AMDILCFGStructurizer.cpp
GCNHazardRecognizer.cpp
GCNIterativeScheduler.cpp
GCNMinRegStrategy.cpp
GCNRegPressure.cpp
GCNSchedStrategy.cpp
+ R600AsmPrinter.cpp
R600ClauseMergePass.cpp
R600ControlFlowFinalizer.cpp
R600EmitClauseMarkers.cpp
@@ -68,6 +87,7 @@ add_llvm_target(AMDGPUCodeGen
R600ISelLowering.cpp
R600MachineFunctionInfo.cpp
R600MachineScheduler.cpp
+ R600OpenCLImageTypeLoweringPass.cpp
R600OptimizeVectorRegisters.cpp
R600Packetizer.cpp
R600RegisterInfo.cpp
@@ -77,10 +97,10 @@ add_llvm_target(AMDGPUCodeGen
SIFixVGPRCopies.cpp
SIFixWWMLiveness.cpp
SIFoldOperands.cpp
+ SIFormMemoryClauses.cpp
SIFrameLowering.cpp
SIInsertSkips.cpp
SIInsertWaitcnts.cpp
- SIInsertWaits.cpp
SIInstrInfo.cpp
SIISelLowering.cpp
SILoadStoreOptimizer.cpp
@@ -99,8 +119,8 @@ add_llvm_target(AMDGPUCodeGen
)
add_subdirectory(AsmParser)
-add_subdirectory(InstPrinter)
add_subdirectory(Disassembler)
-add_subdirectory(TargetInfo)
+add_subdirectory(InstPrinter)
add_subdirectory(MCTargetDesc)
+add_subdirectory(TargetInfo)
add_subdirectory(Utils)
diff --git a/lib/Target/AMDGPU/DSInstructions.td b/lib/Target/AMDGPU/DSInstructions.td
index f898fd7948cc..cdc6ab9412e6 100644
--- a/lib/Target/AMDGPU/DSInstructions.td
+++ b/lib/Target/AMDGPU/DSInstructions.td
@@ -440,7 +440,7 @@ defm DS_XOR_RTN_B32 : DS_1A1D_RET_mc<"ds_xor_rtn_b32", VGPR_32, "ds_xor_b32">;
defm DS_MSKOR_RTN_B32 : DS_1A2D_RET_mc<"ds_mskor_rtn_b32", VGPR_32, "ds_mskor_b32">;
defm DS_CMPST_RTN_B32 : DS_1A2D_RET_mc<"ds_cmpst_rtn_b32", VGPR_32, "ds_cmpst_b32">;
defm DS_CMPST_RTN_F32 : DS_1A2D_RET_mc<"ds_cmpst_rtn_f32", VGPR_32, "ds_cmpst_f32">;
-defm DS_MIN_RTN_F32 : DS_1A1D_RET_mc <"ds_min_rtn_f32", VGPR_32, "ds_min_f32">;
+defm DS_MIN_RTN_F32 : DS_1A1D_RET_mc<"ds_min_rtn_f32", VGPR_32, "ds_min_f32">;
defm DS_MAX_RTN_F32 : DS_1A1D_RET_mc<"ds_max_rtn_f32", VGPR_32, "ds_max_f32">;
defm DS_WRXCHG_RTN_B32 : DS_1A1D_RET_mc<"ds_wrxchg_rtn_b32">;
@@ -584,6 +584,8 @@ def DS_BPERMUTE_B32 : DS_1A1D_PERMUTE <"ds_bpermute_b32",
int_amdgcn_ds_bpermute>;
}
+def DS_ADD_SRC2_F32 : DS_1A<"ds_add_src2_f32">;
+
} // let SubtargetPredicate = isVI
//===----------------------------------------------------------------------===//
@@ -600,8 +602,6 @@ class DSReadPat <DS_Pseudo inst, ValueType vt, PatFrag frag> : GCNPat <
(inst $ptr, (as_i16imm $offset), (i1 0))
>;
-// FIXME: Passing name of PatFrag in workaround. Why doesn't
-// !cast<PatFrag>(frag.NAME#"_m0") work!?
multiclass DSReadPat_mc<DS_Pseudo inst, ValueType vt, string frag> {
let OtherPredicates = [LDSRequiresM0Init] in {
@@ -609,7 +609,7 @@ multiclass DSReadPat_mc<DS_Pseudo inst, ValueType vt, string frag> {
}
let OtherPredicates = [NotLDSRequiresM0Init] in {
- def : DSReadPat<!cast<DS_Pseudo>(inst.NAME#"_gfx9"), vt, !cast<PatFrag>(frag)>;
+ def : DSReadPat<!cast<DS_Pseudo>(!cast<string>(inst)#"_gfx9"), vt, !cast<PatFrag>(frag)>;
}
}
@@ -647,14 +647,17 @@ defm : DSReadPat_mc <DS_READ_I16, i32, "sextloadi16_local">;
defm : DSReadPat_mc <DS_READ_U16, i32, "az_extloadi16_local">;
defm : DSReadPat_mc <DS_READ_U16, i16, "load_local">;
defm : DSReadPat_mc <DS_READ_B32, i32, "load_local">;
+defm : DSReadPat_mc <DS_READ_B32, i32, "atomic_load_32_local">;
+defm : DSReadPat_mc <DS_READ_B64, i64, "atomic_load_64_local">;
let AddedComplexity = 100 in {
defm : DSReadPat_mc <DS_READ_B64, v2i32, "load_align8_local">;
+defm : DSReadPat_mc <DS_READ_B128, v4i32, "load_align16_local">;
} // End AddedComplexity = 100
-let OtherPredicates = [HasD16LoadStore] in {
+let OtherPredicates = [D16PreservesUnusedBits] in {
let AddedComplexity = 100 in {
defm : DSReadPat_Hi16<DS_READ_U16_D16_HI, load_local>;
defm : DSReadPat_Hi16<DS_READ_U8_D16_HI, az_extloadi8_local>;
@@ -678,7 +681,24 @@ multiclass DSWritePat_mc <DS_Pseudo inst, ValueType vt, string frag> {
}
let OtherPredicates = [NotLDSRequiresM0Init] in {
- def : DSWritePat<!cast<DS_Pseudo>(inst.NAME#"_gfx9"), vt, !cast<PatFrag>(frag)>;
+ def : DSWritePat<!cast<DS_Pseudo>(!cast<string>(inst)#"_gfx9"), vt, !cast<PatFrag>(frag)>;
+ }
+}
+
+// Irritatingly, atomic_store reverses the order of operands from a
+// normal store.
+class DSAtomicWritePat <DS_Pseudo inst, ValueType vt, PatFrag frag> : GCNPat <
+ (frag (DS1Addr1Offset i32:$ptr, i32:$offset), vt:$value),
+ (inst $ptr, $value, (as_i16imm $offset), (i1 0))
+>;
+
+multiclass DSAtomicWritePat_mc <DS_Pseudo inst, ValueType vt, string frag> {
+ let OtherPredicates = [LDSRequiresM0Init] in {
+ def : DSAtomicWritePat<inst, vt, !cast<PatFrag>(frag#"_m0")>;
+ }
+
+ let OtherPredicates = [NotLDSRequiresM0Init] in {
+ def : DSAtomicWritePat<!cast<DS_Pseudo>(!cast<string>(inst)#"_gfx9"), vt, !cast<PatFrag>(frag)>;
}
}
@@ -687,8 +707,10 @@ defm : DSWritePat_mc <DS_WRITE_B16, i32, "truncstorei16_local">;
defm : DSWritePat_mc <DS_WRITE_B8, i16, "truncstorei8_local">;
defm : DSWritePat_mc <DS_WRITE_B16, i16, "store_local">;
defm : DSWritePat_mc <DS_WRITE_B32, i32, "store_local">;
+defm : DSAtomicWritePat_mc <DS_WRITE_B32, i32, "atomic_store_local">;
+defm : DSAtomicWritePat_mc <DS_WRITE_B64, i64, "atomic_store_local">;
-let OtherPredicates = [HasD16LoadStore] in {
+let OtherPredicates = [D16PreservesUnusedBits] in {
def : DSWritePat <DS_WRITE_B16_D16_HI, i32, store_local_hi16>;
def : DSWritePat <DS_WRITE_B8_D16_HI, i32, truncstorei8_local_hi16>;
}
@@ -720,6 +742,8 @@ def : DS64Bit4ByteAlignedWritePat<DS_WRITE2_B32_gfx9, store_local>;
let AddedComplexity = 100 in {
defm : DSWritePat_mc <DS_WRITE_B64, v2i32, "store_align8_local">;
+defm : DSWritePat_mc <DS_WRITE_B128, v4i32, "store_align16_local">;
+
} // End AddedComplexity = 100
class DSAtomicRetPat<DS_Pseudo inst, ValueType vt, PatFrag frag> : GCNPat <
(frag (DS1Addr1Offset i32:$ptr, i32:$offset), vt:$value),
@@ -732,7 +756,8 @@ multiclass DSAtomicRetPat_mc<DS_Pseudo inst, ValueType vt, string frag> {
}
let OtherPredicates = [NotLDSRequiresM0Init] in {
- def : DSAtomicRetPat<!cast<DS_Pseudo>(inst.NAME#"_gfx9"), vt, !cast<PatFrag>(frag)>;
+ def : DSAtomicRetPat<!cast<DS_Pseudo>(!cast<string>(inst)#"_gfx9"), vt,
+ !cast<PatFrag>(frag)>;
}
}
@@ -749,7 +774,8 @@ multiclass DSAtomicCmpXChg_mc<DS_Pseudo inst, ValueType vt, string frag> {
}
let OtherPredicates = [NotLDSRequiresM0Init] in {
- def : DSAtomicCmpXChg<!cast<DS_Pseudo>(inst.NAME#"_gfx9"), vt, !cast<PatFrag>(frag)>;
+ def : DSAtomicCmpXChg<!cast<DS_Pseudo>(!cast<string>(inst)#"_gfx9"), vt,
+ !cast<PatFrag>(frag)>;
}
}
@@ -769,6 +795,9 @@ defm : DSAtomicRetPat_mc<DS_MAX_RTN_I32, i32, "atomic_load_max_local">;
defm : DSAtomicRetPat_mc<DS_MIN_RTN_U32, i32, "atomic_load_umin_local">;
defm : DSAtomicRetPat_mc<DS_MAX_RTN_U32, i32, "atomic_load_umax_local">;
defm : DSAtomicCmpXChg_mc<DS_CMPST_RTN_B32, i32, "atomic_cmp_swap_local">;
+defm : DSAtomicRetPat_mc<DS_MIN_RTN_F32, f32, "atomic_load_fmin_local">;
+defm : DSAtomicRetPat_mc<DS_MAX_RTN_F32, f32, "atomic_load_fmax_local">;
+defm : DSAtomicRetPat_mc<DS_ADD_RTN_F32, f32, "atomic_load_fadd_local">;
// 64-bit atomics.
defm : DSAtomicRetPat_mc<DS_WRXCHG_RTN_B64, i64, "atomic_swap_local">;
@@ -1123,6 +1152,7 @@ def DS_XOR_SRC2_B32_vi : DS_Real_vi<0x8b, DS_XOR_SRC2_B32>;
def DS_WRITE_SRC2_B32_vi : DS_Real_vi<0x8d, DS_WRITE_SRC2_B32>;
def DS_MIN_SRC2_F32_vi : DS_Real_vi<0x92, DS_MIN_SRC2_F32>;
def DS_MAX_SRC2_F32_vi : DS_Real_vi<0x93, DS_MAX_SRC2_F32>;
+def DS_ADD_SRC2_F32_vi : DS_Real_vi<0x95, DS_ADD_SRC2_F32>;
def DS_ADD_SRC2_U64_vi : DS_Real_vi<0xc0, DS_ADD_SRC2_U64>;
def DS_SUB_SRC2_U64_vi : DS_Real_vi<0xc1, DS_SUB_SRC2_U64>;
def DS_RSUB_SRC2_U64_vi : DS_Real_vi<0xc2, DS_RSUB_SRC2_U64>;
diff --git a/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
index 47a2d3f2fdc5..f3de903f21b2 100644
--- a/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
+++ b/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
@@ -20,7 +20,9 @@
#include "Disassembler/AMDGPUDisassembler.h"
#include "AMDGPU.h"
#include "AMDGPURegisterInfo.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIDefines.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm-c/Disassembler.h"
#include "llvm/ADT/APInt.h"
@@ -198,6 +200,21 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
Res = tryDecodeInst(DecoderTableSDWA964, MI, QW, Address);
if (Res) { IsSDWA = true; break; }
+
+ if (STI.getFeatureBits()[AMDGPU::FeatureUnpackedD16VMem]) {
+ Res = tryDecodeInst(DecoderTableGFX80_UNPACKED64, MI, QW, Address);
+ if (Res)
+ break;
+ }
+
+ // Some GFX9 subtargets repurposed the v_mad_mix_f32, v_mad_mixlo_f16 and
+ // v_mad_mixhi_f16 for FMA variants. Try to decode using this special
+ // table first so we print the correct name.
+ if (STI.getFeatureBits()[AMDGPU::FeatureFmaMixInsts]) {
+ Res = tryDecodeInst(DecoderTableGFX9_DL64, MI, QW, Address);
+ if (Res)
+ break;
+ }
}
// Reinitialize Bytes as DPP64 could have eaten too much
@@ -228,7 +245,8 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
if (Res && (MI.getOpcode() == AMDGPU::V_MAC_F32_e64_vi ||
MI.getOpcode() == AMDGPU::V_MAC_F32_e64_si ||
- MI.getOpcode() == AMDGPU::V_MAC_F16_e64_vi)) {
+ MI.getOpcode() == AMDGPU::V_MAC_F16_e64_vi ||
+ MI.getOpcode() == AMDGPU::V_FMAC_F32_e64_vi)) {
// Insert dummy unused src2_modifiers.
insertNamedMCOperand(MI, MCOperand::createImm(0),
AMDGPU::OpName::src2_modifiers);
@@ -241,7 +259,10 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
if (Res && IsSDWA)
Res = convertSDWAInst(MI);
- Size = Res ? (MaxInstBytesNum - Bytes.size()) : 0;
+ // If the opcode was not recognized, assume a Size of 4 bytes
+ // (unless fewer bytes are left).
+ Size = Res ? (MaxInstBytesNum - Bytes.size())
+ : std::min((size_t)4, Bytes_.size());
return Res;
}
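The change above makes decode failures non-fatal for the byte stream: instead of returning a Size of 0, the disassembler reports a plausible skip distance. A minimal standalone C++ sketch of that arithmetic; instructionSize, Decoded, ConsumedBytes and RemainingBytes are illustrative names, not the LLVM API.

#include <algorithm>
#include <cstddef>
#include <cstdio>

// When a decoder table matched, report the bytes actually consumed; otherwise
// report 4 bytes -- or whatever is left -- so the caller can skip ahead.
static size_t instructionSize(bool Decoded, size_t ConsumedBytes,
                              size_t RemainingBytes) {
  return Decoded ? ConsumedBytes : std::min<size_t>(4, RemainingBytes);
}

int main() {
  std::printf("%zu\n", instructionSize(true, 8, 24)); // decoded 64-bit word: 8
  std::printf("%zu\n", instructionSize(false, 0, 3)); // undecodable 3-byte tail: 3
}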
@@ -264,26 +285,70 @@ DecodeStatus AMDGPUDisassembler::convertSDWAInst(MCInst &MI) const {
return MCDisassembler::Success;
}
+// Note that the MIMG format provides no information about the VADDR size.
+// Consequently, decoded instructions always show the address as a single
+// dword, which may not actually be the case.
DecodeStatus AMDGPUDisassembler::convertMIMGInst(MCInst &MI) const {
+
+ int VDstIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
+ AMDGPU::OpName::vdst);
+
int VDataIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
AMDGPU::OpName::vdata);
int DMaskIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
AMDGPU::OpName::dmask);
+
+ int TFEIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
+ AMDGPU::OpName::tfe);
+ int D16Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
+ AMDGPU::OpName::d16);
+
+ assert(VDataIdx != -1);
+ assert(DMaskIdx != -1);
+ assert(TFEIdx != -1);
+
+ bool IsAtomic = (VDstIdx != -1);
+ bool IsGather4 = MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::Gather4;
+
unsigned DMask = MI.getOperand(DMaskIdx).getImm() & 0xf;
if (DMask == 0)
return MCDisassembler::Success;
- unsigned ChannelCount = countPopulation(DMask);
- if (ChannelCount == 1)
+ unsigned DstSize = IsGather4 ? 4 : countPopulation(DMask);
+ if (DstSize == 1)
return MCDisassembler::Success;
- int NewOpcode = AMDGPU::getMaskedMIMGOp(*MCII, MI.getOpcode(), ChannelCount);
- assert(NewOpcode != -1 && "could not find matching mimg channel instruction");
+ bool D16 = D16Idx >= 0 && MI.getOperand(D16Idx).getImm();
+ if (D16 && AMDGPU::hasPackedD16(STI)) {
+ DstSize = (DstSize + 1) / 2;
+ }
+
+ // FIXME: Add tfe support
+ if (MI.getOperand(TFEIdx).getImm())
+ return MCDisassembler::Success;
+
+ int NewOpcode = -1;
+
+ if (IsGather4) {
+ if (D16 && AMDGPU::hasPackedD16(STI))
+ NewOpcode = AMDGPU::getMaskedMIMGOp(MI.getOpcode(), 2);
+ else
+ return MCDisassembler::Success;
+ } else {
+ NewOpcode = AMDGPU::getMaskedMIMGOp(MI.getOpcode(), DstSize);
+ if (NewOpcode == -1)
+ return MCDisassembler::Success;
+ }
+
auto RCID = MCII->get(NewOpcode).OpInfo[VDataIdx].RegClass;
- // Widen the register to the correct number of enabled channels.
+ // Get first subregister of VData
unsigned Vdata0 = MI.getOperand(VDataIdx).getReg();
+ unsigned VdataSub0 = MRI.getSubReg(Vdata0, AMDGPU::sub0);
+ Vdata0 = (VdataSub0 != 0)? VdataSub0 : Vdata0;
+
+ // Widen the register to the correct number of enabled channels.
auto NewVdata = MRI.getMatchingSuperReg(Vdata0, AMDGPU::sub0,
&MRI.getRegClass(RCID));
if (NewVdata == AMDGPU::NoRegister) {
@@ -297,6 +362,12 @@ DecodeStatus AMDGPUDisassembler::convertMIMGInst(MCInst &MI) const {
// how it is usually emitted because the number of register components is not
// in the instruction encoding.
MI.getOperand(VDataIdx) = MCOperand::createReg(NewVdata);
+
+ if (IsAtomic) {
+ // Atomic operations have an additional operand (a copy of data)
+ MI.getOperand(VDstIdx) = MCOperand::createReg(NewVdata);
+ }
+
return MCDisassembler::Success;
}
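As a worked example of the dmask/d16 arithmetic in convertMIMGInst: dmask = 0b0111 enables three channels, so the result needs three dwords, or two once packed d16 folds two 16-bit channels into each dword. A standalone C++ sketch of the same computation; mimgResultDwords and PackedD16 are illustrative stand-ins, not LLVM interfaces.

#include <bitset>
#include <cstdio>

static unsigned mimgResultDwords(unsigned DMask, bool IsGather4, bool D16,
                                 bool PackedD16) {
  // Gather4 always returns four channels; otherwise count the dmask bits.
  unsigned DstSize =
      IsGather4 ? 4 : static_cast<unsigned>(std::bitset<4>(DMask & 0xf).count());
  // With packed d16, two 16-bit channels share one 32-bit register.
  if (D16 && PackedD16)
    DstSize = (DstSize + 1) / 2;
  return DstSize;
}

int main() {
  std::printf("%u %u\n", mimgResultDwords(0x7, false, false, false),  // 3
              mimgResultDwords(0x7, false, true, true));              // 2
}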
@@ -690,9 +761,8 @@ MCOperand AMDGPUDisassembler::decodeSpecialReg32(unsigned Val) const {
switch (Val) {
case 102: return createRegOperand(FLAT_SCR_LO);
case 103: return createRegOperand(FLAT_SCR_HI);
- // ToDo: no support for xnack_mask_lo/_hi register
- case 104:
- case 105: break;
+ case 104: return createRegOperand(XNACK_MASK_LO);
+ case 105: return createRegOperand(XNACK_MASK_HI);
case 106: return createRegOperand(VCC_LO);
case 107: return createRegOperand(VCC_HI);
case 108: assert(!isGFX9()); return createRegOperand(TBA_LO);
@@ -722,6 +792,7 @@ MCOperand AMDGPUDisassembler::decodeSpecialReg64(unsigned Val) const {
switch (Val) {
case 102: return createRegOperand(FLAT_SCR);
+ case 104: return createRegOperand(XNACK_MASK);
case 106: return createRegOperand(VCC);
case 108: assert(!isGFX9()); return createRegOperand(TBA);
case 110: assert(!isGFX9()); return createRegOperand(TMA);
@@ -732,8 +803,9 @@ MCOperand AMDGPUDisassembler::decodeSpecialReg64(unsigned Val) const {
}
MCOperand AMDGPUDisassembler::decodeSDWASrc(const OpWidthTy Width,
- unsigned Val) const {
+ const unsigned Val) const {
using namespace AMDGPU::SDWA;
+ using namespace AMDGPU::EncValues;
if (STI.getFeatureBits()[AMDGPU::FeatureGFX9]) {
// XXX: static_cast<int> is needed to avoid stupid warning:
@@ -754,7 +826,15 @@ MCOperand AMDGPUDisassembler::decodeSDWASrc(const OpWidthTy Width,
Val - SDWA9EncValues::SRC_TTMP_MIN);
}
- return decodeSpecialReg32(Val - SDWA9EncValues::SRC_SGPR_MIN);
+ const unsigned SVal = Val - SDWA9EncValues::SRC_SGPR_MIN;
+
+ if (INLINE_INTEGER_C_MIN <= SVal && SVal <= INLINE_INTEGER_C_MAX)
+ return decodeIntImmed(SVal);
+
+ if (INLINE_FLOATING_C_MIN <= SVal && SVal <= INLINE_FLOATING_C_MAX)
+ return decodeFPImmed(Width, SVal);
+
+ return decodeSpecialReg32(SVal);
} else if (STI.getFeatureBits()[AMDGPU::FeatureVolcanicIslands]) {
return createRegOperand(getVgprClassId(Width), Val);
}
@@ -815,6 +895,9 @@ bool AMDGPUSymbolizer::tryAddingSymbolicOperand(MCInst &Inst,
}
auto *Symbols = static_cast<SectionSymbolsTy *>(DisInfo);
+ if (!Symbols)
+ return false;
+
auto Result = std::find_if(Symbols->begin(), Symbols->end(),
[Value](const SymbolInfoTy& Val) {
return std::get<0>(Val) == static_cast<uint64_t>(Value)
diff --git a/lib/Target/AMDGPU/EvergreenInstructions.td b/lib/Target/AMDGPU/EvergreenInstructions.td
index 5e26f97b0c86..944f4ffe598d 100644
--- a/lib/Target/AMDGPU/EvergreenInstructions.td
+++ b/lib/Target/AMDGPU/EvergreenInstructions.td
@@ -15,7 +15,6 @@
def isEG : Predicate<
"Subtarget->getGeneration() >= AMDGPUSubtarget::EVERGREEN && "
- "Subtarget->getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS && "
"!Subtarget->hasCaymanISA()"
>;
@@ -693,7 +692,7 @@ def : EGOrCaymanPat<(fp_to_sint f32:$src0), (FLT_TO_INT_eg (TRUNC $src0))>;
def : EGOrCaymanPat<(fp_to_uint f32:$src0), (FLT_TO_UINT_eg (TRUNC $src0))>;
// SHA-256 Patterns
-def : SHA256MaPattern <BFI_INT_eg, XOR_INT>;
+defm : SHA256MaPattern <BFI_INT_eg, XOR_INT, R600_Reg64>;
def EG_ExportSwz : ExportSwzInst {
let Word1{19-16} = 0; // BURST_COUNT
diff --git a/lib/Target/AMDGPU/FLATInstructions.td b/lib/Target/AMDGPU/FLATInstructions.td
index 693869128081..3ef473b7fd96 100644
--- a/lib/Target/AMDGPU/FLATInstructions.td
+++ b/lib/Target/AMDGPU/FLATInstructions.td
@@ -135,7 +135,7 @@ class FLAT_Load_Pseudo <string opName, RegisterClass regClass,
!con((ins VReg_64:$vaddr),
!if(EnableSaddr, (ins SReg_64:$saddr), (ins))),
(ins !if(HasSignedOffset,offset_s13,offset_u12):$offset)),
- (ins GLC:$glc, slc:$slc)),
+ (ins GLC:$glc, SLC:$slc)),
!if(HasTiedOutput, (ins regClass:$vdst_in), (ins))),
" $vdst, $vaddr"#!if(HasSaddr, !if(EnableSaddr, ", $saddr", ", off"), "")#"$offset$glc$slc"> {
let has_data = 0;
@@ -158,7 +158,7 @@ class FLAT_Store_Pseudo <string opName, RegisterClass vdataClass,
!con((ins VReg_64:$vaddr, vdataClass:$vdata),
!if(EnableSaddr, (ins SReg_64:$saddr), (ins))),
(ins !if(HasSignedOffset,offset_s13,offset_u12):$offset)),
- (ins GLC:$glc, slc:$slc)),
+ (ins GLC:$glc, SLC:$slc)),
" $vaddr, $vdata"#!if(HasSaddr, !if(EnableSaddr, ", $saddr", ", off"), "")#"$offset$glc$slc"> {
let mayLoad = 0;
let mayStore = 1;
@@ -188,8 +188,8 @@ class FLAT_Scratch_Load_Pseudo <string opName, RegisterClass regClass,
opName,
(outs regClass:$vdst),
!if(EnableSaddr,
- (ins SReg_32_XEXEC_HI:$saddr, offset_s13:$offset, GLC:$glc, slc:$slc),
- (ins VGPR_32:$vaddr, offset_s13:$offset, GLC:$glc, slc:$slc)),
+ (ins SReg_32_XEXEC_HI:$saddr, offset_s13:$offset, GLC:$glc, SLC:$slc),
+ (ins VGPR_32:$vaddr, offset_s13:$offset, GLC:$glc, SLC:$slc)),
" $vdst, "#!if(EnableSaddr, "off", "$vaddr")#!if(EnableSaddr, ", $saddr", ", off")#"$offset$glc$slc"> {
let has_data = 0;
let mayLoad = 1;
@@ -204,8 +204,8 @@ class FLAT_Scratch_Store_Pseudo <string opName, RegisterClass vdataClass, bit En
opName,
(outs),
!if(EnableSaddr,
- (ins vdataClass:$vdata, SReg_32_XEXEC_HI:$saddr, offset_s13:$offset, GLC:$glc, slc:$slc),
- (ins vdataClass:$vdata, VGPR_32:$vaddr, offset_s13:$offset, GLC:$glc, slc:$slc)),
+ (ins vdataClass:$vdata, SReg_32_XEXEC_HI:$saddr, offset_s13:$offset, GLC:$glc, SLC:$slc),
+ (ins vdataClass:$vdata, VGPR_32:$vaddr, offset_s13:$offset, GLC:$glc, SLC:$slc)),
" "#!if(EnableSaddr, "off", "$vaddr")#", $vdata, "#!if(EnableSaddr, "$saddr", "off")#"$offset$glc$slc"> {
let mayLoad = 0;
let mayStore = 1;
@@ -260,7 +260,7 @@ multiclass FLAT_Atomic_Pseudo<
RegisterClass data_rc = vdst_rc> {
def "" : FLAT_AtomicNoRet_Pseudo <opName,
(outs),
- (ins VReg_64:$vaddr, data_rc:$vdata, offset_u12:$offset, slc:$slc),
+ (ins VReg_64:$vaddr, data_rc:$vdata, offset_u12:$offset, SLC:$slc),
" $vaddr, $vdata$offset$slc">,
AtomicNoRet <opName, 0> {
let PseudoInstr = NAME;
@@ -268,7 +268,7 @@ multiclass FLAT_Atomic_Pseudo<
def _RTN : FLAT_AtomicRet_Pseudo <opName,
(outs vdst_rc:$vdst),
- (ins VReg_64:$vaddr, data_rc:$vdata, offset_u12:$offset, slc:$slc),
+ (ins VReg_64:$vaddr, data_rc:$vdata, offset_u12:$offset, SLC:$slc),
" $vdst, $vaddr, $vdata$offset glc$slc",
[(set vt:$vdst,
(atomic (FLATAtomic i64:$vaddr, i16:$offset, i1:$slc), data_vt:$vdata))]>,
@@ -285,7 +285,7 @@ multiclass FLAT_Global_Atomic_Pseudo<
def "" : FLAT_AtomicNoRet_Pseudo <opName,
(outs),
- (ins VReg_64:$vaddr, data_rc:$vdata, offset_s13:$offset, slc:$slc),
+ (ins VReg_64:$vaddr, data_rc:$vdata, offset_s13:$offset, SLC:$slc),
" $vaddr, $vdata, off$offset$slc">,
AtomicNoRet <opName, 0> {
let has_saddr = 1;
@@ -294,7 +294,7 @@ multiclass FLAT_Global_Atomic_Pseudo<
def _RTN : FLAT_AtomicRet_Pseudo <opName,
(outs vdst_rc:$vdst),
- (ins VReg_64:$vaddr, data_rc:$vdata, offset_s13:$offset, slc:$slc),
+ (ins VReg_64:$vaddr, data_rc:$vdata, offset_s13:$offset, SLC:$slc),
" $vdst, $vaddr, $vdata, off$offset glc$slc",
[(set vt:$vdst,
(atomic (FLATSignedAtomic i64:$vaddr, i16:$offset, i1:$slc), data_vt:$vdata))]>,
@@ -304,7 +304,7 @@ multiclass FLAT_Global_Atomic_Pseudo<
def _SADDR : FLAT_AtomicNoRet_Pseudo <opName,
(outs),
- (ins VReg_64:$vaddr, data_rc:$vdata, SReg_64:$saddr, offset_s13:$offset, slc:$slc),
+ (ins VReg_64:$vaddr, data_rc:$vdata, SReg_64:$saddr, offset_s13:$offset, SLC:$slc),
" $vaddr, $vdata, $saddr$offset$slc">,
AtomicNoRet <opName#"_saddr", 0> {
let has_saddr = 1;
@@ -314,7 +314,7 @@ multiclass FLAT_Global_Atomic_Pseudo<
def _SADDR_RTN : FLAT_AtomicRet_Pseudo <opName,
(outs vdst_rc:$vdst),
- (ins VReg_64:$vaddr, data_rc:$vdata, SReg_64:$saddr, offset_s13:$offset, slc:$slc),
+ (ins VReg_64:$vaddr, data_rc:$vdata, SReg_64:$saddr, offset_s13:$offset, SLC:$slc),
" $vdst, $vaddr, $vdata, $saddr$offset glc$slc">,
AtomicNoRet <opName#"_saddr", 1> {
let has_saddr = 1;
@@ -780,7 +780,7 @@ def : FlatAtomicPat <FLAT_ATOMIC_XOR_X2_RTN, atomic_xor_global, i64>;
def : FlatStorePat <FLAT_STORE_BYTE, truncstorei8_flat, i16>;
def : FlatStorePat <FLAT_STORE_SHORT, store_flat, i16>;
-let OtherPredicates = [HasD16LoadStore] in {
+let OtherPredicates = [D16PreservesUnusedBits] in {
def : FlatStorePat <FLAT_STORE_SHORT_D16_HI, truncstorei16_hi16_flat, i32>;
def : FlatStorePat <FLAT_STORE_BYTE_D16_HI, truncstorei8_hi16_flat, i32>;
@@ -824,7 +824,7 @@ def : FlatStoreSignedPat <GLOBAL_STORE_DWORD, store_global, i32>;
def : FlatStoreSignedPat <GLOBAL_STORE_DWORDX2, store_global, v2i32>;
def : FlatStoreSignedPat <GLOBAL_STORE_DWORDX4, store_global, v4i32>;
-let OtherPredicates = [HasD16LoadStore] in {
+let OtherPredicates = [D16PreservesUnusedBits] in {
def : FlatStoreSignedPat <GLOBAL_STORE_SHORT_D16_HI, truncstorei16_hi16_global, i32>;
def : FlatStoreSignedPat <GLOBAL_STORE_BYTE_D16_HI, truncstorei8_hi16_global, i32>;
diff --git a/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
index dd515b0bf2f1..f236f10ba75a 100644
--- a/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
+++ b/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
@@ -16,6 +16,7 @@
#include "SIDefines.h"
#include "SIInstrInfo.h"
#include "SIRegisterInfo.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/CodeGen/MachineFunction.h"
@@ -39,7 +40,7 @@ using namespace llvm;
GCNHazardRecognizer::GCNHazardRecognizer(const MachineFunction &MF) :
CurrCycleInstr(nullptr),
MF(MF),
- ST(MF.getSubtarget<SISubtarget>()),
+ ST(MF.getSubtarget<GCNSubtarget>()),
TII(*ST.getInstrInfo()),
TRI(TII.getRegisterInfo()),
ClauseUses(TRI.getNumRegUnits()),
@@ -355,13 +356,13 @@ int GCNHazardRecognizer::checkSoftClauseHazards(MachineInstr *MEM) {
}
int GCNHazardRecognizer::checkSMRDHazards(MachineInstr *SMRD) {
- const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
int WaitStatesNeeded = 0;
WaitStatesNeeded = checkSoftClauseHazards(SMRD);
// This SMRD hazard only affects SI.
- if (ST.getGeneration() != SISubtarget::SOUTHERN_ISLANDS)
+ if (ST.getGeneration() != AMDGPUSubtarget::SOUTHERN_ISLANDS)
return WaitStatesNeeded;
// A read of an SGPR by SMRD instruction requires 4 wait states when the
@@ -398,7 +399,7 @@ int GCNHazardRecognizer::checkSMRDHazards(MachineInstr *SMRD) {
}
int GCNHazardRecognizer::checkVMEMHazards(MachineInstr* VMEM) {
- if (ST.getGeneration() < SISubtarget::VOLCANIC_ISLANDS)
+ if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
return 0;
int WaitStatesNeeded = checkSoftClauseHazards(VMEM);
@@ -634,7 +635,7 @@ int GCNHazardRecognizer::checkRFEHazards(MachineInstr *RFE) {
}
int GCNHazardRecognizer::checkAnyInstHazards(MachineInstr *MI) {
- if (MI->isDebugValue())
+ if (MI->isDebugInstr())
return 0;
const SIRegisterInfo *TRI = ST.getRegisterInfo();
diff --git a/lib/Target/AMDGPU/GCNHazardRecognizer.h b/lib/Target/AMDGPU/GCNHazardRecognizer.h
index f9a6e395a454..ca17e7cb6018 100644
--- a/lib/Target/AMDGPU/GCNHazardRecognizer.h
+++ b/lib/Target/AMDGPU/GCNHazardRecognizer.h
@@ -28,7 +28,7 @@ class MachineRegisterInfo;
class ScheduleDAG;
class SIInstrInfo;
class SIRegisterInfo;
-class SISubtarget;
+class GCNSubtarget;
class GCNHazardRecognizer final : public ScheduleHazardRecognizer {
// This variable stores the instruction that has been emitted this cycle. It
@@ -37,7 +37,7 @@ class GCNHazardRecognizer final : public ScheduleHazardRecognizer {
MachineInstr *CurrCycleInstr;
std::list<MachineInstr*> EmittedInstrs;
const MachineFunction &MF;
- const SISubtarget &ST;
+ const GCNSubtarget &ST;
const SIInstrInfo &TII;
const SIRegisterInfo &TRI;
diff --git a/lib/Target/AMDGPU/GCNILPSched.cpp b/lib/Target/AMDGPU/GCNILPSched.cpp
index ba8211b189cf..651091d44136 100644
--- a/lib/Target/AMDGPU/GCNILPSched.cpp
+++ b/lib/Target/AMDGPU/GCNILPSched.cpp
@@ -149,9 +149,9 @@ static int BUCompareLatency(const SUnit *left, const SUnit *right) {
int LDepth = left->getDepth();
int RDepth = right->getDepth();
if (LDepth != RDepth) {
- DEBUG(dbgs() << " Comparing latency of SU (" << left->NodeNum
- << ") depth " << LDepth << " vs SU (" << right->NodeNum
- << ") depth " << RDepth << "\n");
+ LLVM_DEBUG(dbgs() << " Comparing latency of SU (" << left->NodeNum
+ << ") depth " << LDepth << " vs SU (" << right->NodeNum
+ << ") depth " << RDepth << "\n");
return LDepth < RDepth ? 1 : -1;
}
if (left->Latency != right->Latency)
@@ -169,9 +169,9 @@ const SUnit *GCNILPScheduler::pickBest(const SUnit *left, const SUnit *right)
if (!DisableSchedCriticalPath) {
int spread = (int)left->getDepth() - (int)right->getDepth();
if (std::abs(spread) > MaxReorderWindow) {
- DEBUG(dbgs() << "Depth of SU(" << left->NodeNum << "): "
- << left->getDepth() << " != SU(" << right->NodeNum << "): "
- << right->getDepth() << "\n");
+ LLVM_DEBUG(dbgs() << "Depth of SU(" << left->NodeNum << "): "
+ << left->getDepth() << " != SU(" << right->NodeNum
+ << "): " << right->getDepth() << "\n");
return left->getDepth() < right->getDepth() ? right : left;
}
}
@@ -324,19 +324,18 @@ GCNILPScheduler::schedule(ArrayRef<const SUnit*> BotRoots,
if (AvailQueue.empty())
break;
- DEBUG(
- dbgs() << "\n=== Picking candidate\n"
- "Ready queue:";
- for (auto &C : AvailQueue)
- dbgs() << ' ' << C.SU->NodeNum;
- dbgs() << '\n';
- );
+ LLVM_DEBUG(dbgs() << "\n=== Picking candidate\n"
+ "Ready queue:";
+ for (auto &C
+ : AvailQueue) dbgs()
+ << ' ' << C.SU->NodeNum;
+ dbgs() << '\n';);
auto C = pickCandidate();
assert(C);
AvailQueue.remove(*C);
auto SU = C->SU;
- DEBUG(dbgs() << "Selected "; SU->dump(&DAG));
+ LLVM_DEBUG(dbgs() << "Selected "; SU->dump(&DAG));
advanceToCycle(SU->getHeight());
diff --git a/lib/Target/AMDGPU/GCNIterativeScheduler.cpp b/lib/Target/AMDGPU/GCNIterativeScheduler.cpp
index a0e4f7ff24cb..15366d66bd85 100644
--- a/lib/Target/AMDGPU/GCNIterativeScheduler.cpp
+++ b/lib/Target/AMDGPU/GCNIterativeScheduler.cpp
@@ -11,6 +11,7 @@
#include "AMDGPUSubtarget.h"
#include "GCNRegPressure.h"
#include "GCNSchedStrategy.h"
+#include "SIMachineFunctionInfo.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
@@ -19,6 +20,7 @@
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/RegisterPressure.h"
#include "llvm/CodeGen/ScheduleDAG.h"
+#include "llvm/Config/llvm-config.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
@@ -68,14 +70,14 @@ static void printRegion(raw_ostream &OS,
auto I = Begin;
MaxInstNum = std::max(MaxInstNum, 1u);
for (; I != End && MaxInstNum; ++I, --MaxInstNum) {
- if (!I->isDebugValue() && LIS)
+ if (!I->isDebugInstr() && LIS)
OS << LIS->getInstructionIndex(*I);
OS << '\t' << *I;
}
if (I != End) {
OS << "\t...\n";
I = std::prev(End);
- if (!I->isDebugValue() && LIS)
+ if (!I->isDebugInstr() && LIS)
OS << LIS->getInstructionIndex(*I);
OS << '\t' << *I;
}
@@ -106,7 +108,7 @@ static void printLivenessInfo(raw_ostream &OS,
LLVM_DUMP_METHOD
void GCNIterativeScheduler::printRegions(raw_ostream &OS) const {
- const auto &ST = MF.getSubtarget<SISubtarget>();
+ const auto &ST = MF.getSubtarget<GCNSubtarget>();
for (const auto R : Regions) {
OS << "Region to schedule ";
printRegion(OS, R->Begin, R->End, LIS, 1);
@@ -130,7 +132,7 @@ LLVM_DUMP_METHOD
void GCNIterativeScheduler::printSchedRP(raw_ostream &OS,
const GCNRegPressure &Before,
const GCNRegPressure &After) const {
- const auto &ST = MF.getSubtarget<SISubtarget>();
+ const auto &ST = MF.getSubtarget<GCNSubtarget>();
OS << "RP before: ";
Before.print(OS, &ST);
OS << "RP after: ";
@@ -199,8 +201,8 @@ public:
void schedule() {
assert(Sch.RegionBegin == Rgn.Begin && Sch.RegionEnd == Rgn.End);
- DEBUG(dbgs() << "\nScheduling ";
- printRegion(dbgs(), Rgn.Begin, Rgn.End, Sch.LIS, 2));
+ LLVM_DEBUG(dbgs() << "\nScheduling ";
+ printRegion(dbgs(), Rgn.Begin, Rgn.End, Sch.LIS, 2));
Sch.BaseClass::schedule();
// Unfortunately placeDebugValues incorrectly modifies RegionEnd, restore
@@ -310,14 +312,13 @@ void GCNIterativeScheduler::enterRegion(MachineBasicBlock *BB, // overriden
void GCNIterativeScheduler::schedule() { // overriden
// do nothing
- DEBUG(
- printLivenessInfo(dbgs(), RegionBegin, RegionEnd, LIS);
- if (!Regions.empty() && Regions.back()->Begin == RegionBegin) {
- dbgs() << "Max RP: ";
- Regions.back()->MaxPressure.print(dbgs(), &MF.getSubtarget<SISubtarget>());
- }
- dbgs() << '\n';
- );
+ LLVM_DEBUG(printLivenessInfo(dbgs(), RegionBegin, RegionEnd, LIS);
+ if (!Regions.empty() && Regions.back()->Begin == RegionBegin) {
+ dbgs() << "Max RP: ";
+ Regions.back()->MaxPressure.print(
+ dbgs(), &MF.getSubtarget<GCNSubtarget>());
+ } dbgs()
+ << '\n';);
}
void GCNIterativeScheduler::finalizeSchedule() { // overriden
@@ -383,10 +384,10 @@ void GCNIterativeScheduler::scheduleRegion(Region &R, Range &&Schedule,
if (MI != &*Top) {
BB->remove(MI);
BB->insert(Top, MI);
- if (!MI->isDebugValue())
+ if (!MI->isDebugInstr())
LIS->handleMove(*MI, true);
}
- if (!MI->isDebugValue()) {
+ if (!MI->isDebugInstr()) {
// Reset read - undef flags and update them later.
for (auto &Op : MI->operands())
if (Op.isReg() && Op.isDef())
@@ -417,7 +418,7 @@ void GCNIterativeScheduler::scheduleRegion(Region &R, Range &&Schedule,
#ifndef NDEBUG
const auto RegionMaxRP = getRegionPressure(R);
- const auto &ST = MF.getSubtarget<SISubtarget>();
+ const auto &ST = MF.getSubtarget<GCNSubtarget>();
#endif
assert((SchedMaxRP == RegionMaxRP && (MaxRP.empty() || SchedMaxRP == MaxRP))
|| (dbgs() << "Max RP mismatch!!!\n"
@@ -432,8 +433,8 @@ void GCNIterativeScheduler::scheduleRegion(Region &R, Range &&Schedule,
// Sort recorded regions by pressure - highest at the front
void GCNIterativeScheduler::sortRegionsByPressure(unsigned TargetOcc) {
- const auto &ST = MF.getSubtarget<SISubtarget>();
- std::sort(Regions.begin(), Regions.end(),
+ const auto &ST = MF.getSubtarget<GCNSubtarget>();
+ llvm::sort(Regions.begin(), Regions.end(),
[&ST, TargetOcc](const Region *R1, const Region *R2) {
return R2->MaxPressure.less(ST, R1->MaxPressure, TargetOcc);
});
@@ -450,24 +451,24 @@ void GCNIterativeScheduler::sortRegionsByPressure(unsigned TargetOcc) {
// BestSchedules aren't deleted on fail.
unsigned GCNIterativeScheduler::tryMaximizeOccupancy(unsigned TargetOcc) {
// TODO: assert Regions are sorted descending by pressure
- const auto &ST = MF.getSubtarget<SISubtarget>();
+ const auto &ST = MF.getSubtarget<GCNSubtarget>();
const auto Occ = Regions.front()->MaxPressure.getOccupancy(ST);
- DEBUG(dbgs() << "Trying to to improve occupancy, target = " << TargetOcc
- << ", current = " << Occ << '\n');
+ LLVM_DEBUG(dbgs() << "Trying to improve occupancy, target = " << TargetOcc
+ << ", current = " << Occ << '\n');
auto NewOcc = TargetOcc;
for (auto R : Regions) {
if (R->MaxPressure.getOccupancy(ST) >= NewOcc)
break;
- DEBUG(printRegion(dbgs(), R->Begin, R->End, LIS, 3);
- printLivenessInfo(dbgs(), R->Begin, R->End, LIS));
+ LLVM_DEBUG(printRegion(dbgs(), R->Begin, R->End, LIS, 3);
+ printLivenessInfo(dbgs(), R->Begin, R->End, LIS));
BuildDAG DAG(*R, *this);
const auto MinSchedule = makeMinRegSchedule(DAG.getTopRoots(), *this);
const auto MaxRP = getSchedulePressure(*R, MinSchedule);
- DEBUG(dbgs() << "Occupancy improvement attempt:\n";
- printSchedRP(dbgs(), R->MaxPressure, MaxRP));
+ LLVM_DEBUG(dbgs() << "Occupancy improvement attempt:\n";
+ printSchedRP(dbgs(), R->MaxPressure, MaxRP));
NewOcc = std::min(NewOcc, MaxRP.getOccupancy(ST));
if (NewOcc <= Occ)
@@ -475,15 +476,21 @@ unsigned GCNIterativeScheduler::tryMaximizeOccupancy(unsigned TargetOcc) {
setBestSchedule(*R, MinSchedule, MaxRP);
}
- DEBUG(dbgs() << "New occupancy = " << NewOcc
- << ", prev occupancy = " << Occ << '\n');
+ LLVM_DEBUG(dbgs() << "New occupancy = " << NewOcc
+ << ", prev occupancy = " << Occ << '\n');
+ if (NewOcc > Occ) {
+ SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+ MFI->increaseOccupancy(MF, NewOcc);
+ }
+
return std::max(NewOcc, Occ);
}
void GCNIterativeScheduler::scheduleLegacyMaxOccupancy(
bool TryMaximizeOccupancy) {
- const auto &ST = MF.getSubtarget<SISubtarget>();
- auto TgtOcc = ST.getOccupancyWithLocalMemSize(MF);
+ const auto &ST = MF.getSubtarget<GCNSubtarget>();
+ SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+ auto TgtOcc = MFI->getMinAllowedOccupancy();
sortRegionsByPressure(TgtOcc);
auto Occ = Regions.front()->MaxPressure.getOccupancy(ST);
@@ -496,9 +503,11 @@ void GCNIterativeScheduler::scheduleLegacyMaxOccupancy(
const int NumPasses = Occ < TgtOcc ? 2 : 1;
TgtOcc = std::min(Occ, TgtOcc);
- DEBUG(dbgs() << "Scheduling using default scheduler, "
- "target occupancy = " << TgtOcc << '\n');
+ LLVM_DEBUG(dbgs() << "Scheduling using default scheduler, "
+ "target occupancy = "
+ << TgtOcc << '\n');
GCNMaxOccupancySchedStrategy LStrgy(Context);
+ unsigned FinalOccupancy = std::min(Occ, MFI->getOccupancy());
for (int I = 0; I < NumPasses; ++I) {
// running first pass with TargetOccupancy = 0 mimics previous scheduling
@@ -509,30 +518,33 @@ void GCNIterativeScheduler::scheduleLegacyMaxOccupancy(
Ovr.schedule();
const auto RP = getRegionPressure(*R);
- DEBUG(printSchedRP(dbgs(), R->MaxPressure, RP));
+ LLVM_DEBUG(printSchedRP(dbgs(), R->MaxPressure, RP));
if (RP.getOccupancy(ST) < TgtOcc) {
- DEBUG(dbgs() << "Didn't fit into target occupancy O" << TgtOcc);
+ LLVM_DEBUG(dbgs() << "Didn't fit into target occupancy O" << TgtOcc);
if (R->BestSchedule.get() &&
R->BestSchedule->MaxPressure.getOccupancy(ST) >= TgtOcc) {
- DEBUG(dbgs() << ", scheduling minimal register\n");
+ LLVM_DEBUG(dbgs() << ", scheduling minimal register\n");
scheduleBest(*R);
} else {
- DEBUG(dbgs() << ", restoring\n");
+ LLVM_DEBUG(dbgs() << ", restoring\n");
Ovr.restoreOrder();
assert(R->MaxPressure.getOccupancy(ST) >= TgtOcc);
}
}
+ FinalOccupancy = std::min(FinalOccupancy, RP.getOccupancy(ST));
}
}
+ MFI->limitOccupancy(FinalOccupancy);
}
///////////////////////////////////////////////////////////////////////////////
// Minimal Register Strategy
void GCNIterativeScheduler::scheduleMinReg(bool force) {
- const auto &ST = MF.getSubtarget<SISubtarget>();
- const auto TgtOcc = ST.getOccupancyWithLocalMemSize(MF);
+ const auto &ST = MF.getSubtarget<GCNSubtarget>();
+ const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+ const auto TgtOcc = MFI->getOccupancy();
sortRegionsByPressure(TgtOcc);
auto MaxPressure = Regions.front()->MaxPressure;
@@ -544,7 +556,7 @@ void GCNIterativeScheduler::scheduleMinReg(bool force) {
const auto MinSchedule = makeMinRegSchedule(DAG.getTopRoots(), *this);
const auto RP = getSchedulePressure(*R, MinSchedule);
- DEBUG(if (R->MaxPressure.less(ST, RP, TgtOcc)) {
+ LLVM_DEBUG(if (R->MaxPressure.less(ST, RP, TgtOcc)) {
dbgs() << "\nWarning: Pressure becomes worse after minreg!";
printSchedRP(dbgs(), R->MaxPressure, RP);
});
@@ -553,7 +565,7 @@ void GCNIterativeScheduler::scheduleMinReg(bool force) {
break;
scheduleRegion(*R, MinSchedule, RP);
- DEBUG(printSchedResult(dbgs(), R, RP));
+ LLVM_DEBUG(printSchedResult(dbgs(), R, RP));
MaxPressure = RP;
}
@@ -564,9 +576,9 @@ void GCNIterativeScheduler::scheduleMinReg(bool force) {
void GCNIterativeScheduler::scheduleILP(
bool TryMaximizeOccupancy) {
- const auto &ST = MF.getSubtarget<SISubtarget>();
- auto TgtOcc = std::min(ST.getOccupancyWithLocalMemSize(MF),
- ST.getWavesPerEU(MF.getFunction()).second);
+ const auto &ST = MF.getSubtarget<GCNSubtarget>();
+ SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+ auto TgtOcc = MFI->getMinAllowedOccupancy();
sortRegionsByPressure(TgtOcc);
auto Occ = Regions.front()->MaxPressure.getOccupancy(ST);
@@ -575,26 +587,30 @@ void GCNIterativeScheduler::scheduleILP(
Occ = tryMaximizeOccupancy(TgtOcc);
TgtOcc = std::min(Occ, TgtOcc);
- DEBUG(dbgs() << "Scheduling using default scheduler, "
- "target occupancy = " << TgtOcc << '\n');
+ LLVM_DEBUG(dbgs() << "Scheduling using default scheduler, "
+ "target occupancy = "
+ << TgtOcc << '\n');
+ unsigned FinalOccupancy = std::min(Occ, MFI->getOccupancy());
for (auto R : Regions) {
BuildDAG DAG(*R, *this);
const auto ILPSchedule = makeGCNILPScheduler(DAG.getBottomRoots(), *this);
const auto RP = getSchedulePressure(*R, ILPSchedule);
- DEBUG(printSchedRP(dbgs(), R->MaxPressure, RP));
+ LLVM_DEBUG(printSchedRP(dbgs(), R->MaxPressure, RP));
if (RP.getOccupancy(ST) < TgtOcc) {
- DEBUG(dbgs() << "Didn't fit into target occupancy O" << TgtOcc);
+ LLVM_DEBUG(dbgs() << "Didn't fit into target occupancy O" << TgtOcc);
if (R->BestSchedule.get() &&
R->BestSchedule->MaxPressure.getOccupancy(ST) >= TgtOcc) {
- DEBUG(dbgs() << ", scheduling minimal register\n");
+ LLVM_DEBUG(dbgs() << ", scheduling minimal register\n");
scheduleBest(*R);
}
} else {
scheduleRegion(*R, ILPSchedule, RP);
- DEBUG(printSchedResult(dbgs(), R, RP));
+ LLVM_DEBUG(printSchedResult(dbgs(), R, RP));
+ FinalOccupancy = std::min(FinalOccupancy, RP.getOccupancy(ST));
}
}
+ MFI->limitOccupancy(FinalOccupancy);
}
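The occupancy bookkeeping added to scheduleLegacyMaxOccupancy and scheduleILP follows one pattern: start from the occupancy already recorded for the function, take the minimum over the regions actually scheduled, and write that back. A plain-data C++ sketch with made-up numbers standing in for MFI->getOccupancy(), RP.getOccupancy(ST) and MFI->limitOccupancy():

#include <algorithm>
#include <cstdio>
#include <vector>

int main() {
  unsigned Recorded = 8;                       // previously recorded occupancy
  std::vector<unsigned> PerRegion = {7, 5, 6}; // occupancy achieved per region
  unsigned Final = Recorded;
  for (unsigned Occ : PerRegion)
    Final = std::min(Final, Occ); // the worst region limits the whole function
  std::printf("limit occupancy to %u\n", Final); // prints 5
}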
diff --git a/lib/Target/AMDGPU/GCNMinRegStrategy.cpp b/lib/Target/AMDGPU/GCNMinRegStrategy.cpp
index 9904b5f0f4ba..192d534bb9cf 100644
--- a/lib/Target/AMDGPU/GCNMinRegStrategy.cpp
+++ b/lib/Target/AMDGPU/GCNMinRegStrategy.cpp
@@ -142,35 +142,38 @@ GCNMinRegScheduler::Candidate* GCNMinRegScheduler::pickCandidate() {
unsigned Num = RQ.size();
if (Num == 1) break;
- DEBUG(dbgs() << "\nSelecting max priority candidates among " << Num << '\n');
+ LLVM_DEBUG(dbgs() << "\nSelecting max priority candidates among " << Num
+ << '\n');
Num = findMax(Num, [=](const Candidate &C) { return C.Priority; });
if (Num == 1) break;
- DEBUG(dbgs() << "\nSelecting min non-ready producing candidate among "
- << Num << '\n');
+ LLVM_DEBUG(dbgs() << "\nSelecting min non-ready producing candidate among "
+ << Num << '\n');
Num = findMax(Num, [=](const Candidate &C) {
auto SU = C.SU;
int Res = getNotReadySuccessors(SU);
- DEBUG(dbgs() << "SU(" << SU->NodeNum << ") would left non-ready "
- << Res << " successors, metric = " << -Res << '\n');
+ LLVM_DEBUG(dbgs() << "SU(" << SU->NodeNum << ") would leave non-ready "
+ << Res << " successors, metric = " << -Res << '\n');
return -Res;
});
if (Num == 1) break;
- DEBUG(dbgs() << "\nSelecting most producing candidate among "
- << Num << '\n');
+ LLVM_DEBUG(dbgs() << "\nSelecting most producing candidate among " << Num
+ << '\n');
Num = findMax(Num, [=](const Candidate &C) {
auto SU = C.SU;
auto Res = getReadySuccessors(SU);
- DEBUG(dbgs() << "SU(" << SU->NodeNum << ") would make ready "
- << Res << " successors, metric = " << Res << '\n');
+ LLVM_DEBUG(dbgs() << "SU(" << SU->NodeNum << ") would make ready " << Res
+ << " successors, metric = " << Res << '\n');
return Res;
});
if (Num == 1) break;
Num = Num ? Num : RQ.size();
- DEBUG(dbgs() << "\nCan't find best candidate, selecting in program order among "
- << Num << '\n');
+ LLVM_DEBUG(
+ dbgs()
+ << "\nCan't find best candidate, selecting in program order among "
+ << Num << '\n');
Num = findMax(Num, [=](const Candidate &C) { return -(int64_t)C.SU->NodeNum; });
assert(Num == 1);
} while (false);
@@ -202,17 +205,17 @@ void GCNMinRegScheduler::bumpPredsPriority(const SUnit *SchedSU, int Priority) {
Worklist.push_back(P.getSUnit());
}
}
- DEBUG(dbgs() << "Make the predecessors of SU(" << SchedSU->NodeNum
- << ")'s non-ready successors of " << Priority
- << " priority in ready queue: ");
+ LLVM_DEBUG(dbgs() << "Make the predecessors of SU(" << SchedSU->NodeNum
+ << ")'s non-ready successors of " << Priority
+ << " priority in ready queue: ");
const auto SetEnd = Set.end();
for (auto &C : RQ) {
if (Set.find(C.SU) != SetEnd) {
C.Priority = Priority;
- DEBUG(dbgs() << " SU(" << C.SU->NodeNum << ')');
+ LLVM_DEBUG(dbgs() << " SU(" << C.SU->NodeNum << ')');
}
}
- DEBUG(dbgs() << '\n');
+ LLVM_DEBUG(dbgs() << '\n');
}
void GCNMinRegScheduler::releaseSuccessors(const SUnit* SU, int Priority) {
@@ -243,19 +246,19 @@ GCNMinRegScheduler::schedule(ArrayRef<const SUnit*> TopRoots,
releaseSuccessors(&DAG.EntrySU, StepNo);
while (!RQ.empty()) {
- DEBUG(
- dbgs() << "\n=== Picking candidate, Step = " << StepNo << "\n"
- "Ready queue:";
- for (auto &C : RQ)
- dbgs() << ' ' << C.SU->NodeNum << "(P" << C.Priority << ')';
- dbgs() << '\n';
- );
+ LLVM_DEBUG(dbgs() << "\n=== Picking candidate, Step = " << StepNo
+ << "\n"
+ "Ready queue:";
+ for (auto &C
+ : RQ) dbgs()
+ << ' ' << C.SU->NodeNum << "(P" << C.Priority << ')';
+ dbgs() << '\n';);
auto C = pickCandidate();
assert(C);
RQ.remove(*C);
auto SU = C->SU;
- DEBUG(dbgs() << "Selected "; SU->dump(&DAG));
+ LLVM_DEBUG(dbgs() << "Selected "; SU->dump(&DAG));
releaseSuccessors(SU, StepNo);
Schedule.push_back(SU);
diff --git a/lib/Target/AMDGPU/GCNProcessors.td b/lib/Target/AMDGPU/GCNProcessors.td
index b2a3f652abd8..d76acfa24f90 100644
--- a/lib/Target/AMDGPU/GCNProcessors.td
+++ b/lib/Target/AMDGPU/GCNProcessors.td
@@ -93,14 +93,6 @@ def : ProcessorModel<"bonaire", SIQuarterSpeedModel,
// GCN GFX8 (Volcanic Islands (VI)).
//===----------------------------------------------------------------------===//
-def : ProcessorModel<"gfx800", SIQuarterSpeedModel,
- [FeatureISAVersion8_0_0]
->;
-
-def : ProcessorModel<"iceland", SIQuarterSpeedModel,
- [FeatureISAVersion8_0_0]
->;
-
def : ProcessorModel<"gfx801", SIQuarterSpeedModel,
[FeatureISAVersion8_0_1]
>;
@@ -113,6 +105,10 @@ def : ProcessorModel<"gfx802", SIQuarterSpeedModel,
[FeatureISAVersion8_0_2]
>;
+def : ProcessorModel<"iceland", SIQuarterSpeedModel,
+ [FeatureISAVersion8_0_2]
+>;
+
def : ProcessorModel<"tonga", SIQuarterSpeedModel,
[FeatureISAVersion8_0_2]
>;
@@ -152,3 +148,11 @@ def : ProcessorModel<"gfx900", SIQuarterSpeedModel,
def : ProcessorModel<"gfx902", SIQuarterSpeedModel,
[FeatureISAVersion9_0_2]
>;
+
+def : ProcessorModel<"gfx904", SIQuarterSpeedModel,
+ [FeatureISAVersion9_0_4]
+>;
+
+def : ProcessorModel<"gfx906", SIQuarterSpeedModel,
+ [FeatureISAVersion9_0_6]
+>;
diff --git a/lib/Target/AMDGPU/GCNRegPressure.cpp b/lib/Target/AMDGPU/GCNRegPressure.cpp
index 992bb7cceb6f..3d8cacc4f02c 100644
--- a/lib/Target/AMDGPU/GCNRegPressure.cpp
+++ b/lib/Target/AMDGPU/GCNRegPressure.cpp
@@ -19,6 +19,7 @@
#include "llvm/CodeGen/RegisterPressure.h"
#include "llvm/CodeGen/SlotIndexes.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/Config/llvm-config.h"
#include "llvm/MC/LaneBitmask.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
@@ -131,7 +132,7 @@ void GCNRegPressure::inc(unsigned Reg,
}
}
-bool GCNRegPressure::less(const SISubtarget &ST,
+bool GCNRegPressure::less(const GCNSubtarget &ST,
const GCNRegPressure& O,
unsigned MaxOccupancy) const {
const auto SGPROcc = std::min(MaxOccupancy,
@@ -177,7 +178,7 @@ bool GCNRegPressure::less(const SISubtarget &ST,
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
LLVM_DUMP_METHOD
-void GCNRegPressure::print(raw_ostream &OS, const SISubtarget *ST) const {
+void GCNRegPressure::print(raw_ostream &OS, const GCNSubtarget *ST) const {
OS << "VGPRs: " << getVGPRNum();
if (ST) OS << "(O" << ST->getOccupancyWithNumVGPRs(getVGPRNum()) << ')';
OS << ", SGPRs: " << getSGPRNum();
@@ -283,24 +284,33 @@ GCNRPTracker::LiveRegSet llvm::getLiveRegs(SlotIndex SI,
return LiveRegs;
}
-void GCNUpwardRPTracker::reset(const MachineInstr &MI,
- const LiveRegSet *LiveRegsCopy) {
- MRI = &MI.getParent()->getParent()->getRegInfo();
+void GCNRPTracker::reset(const MachineInstr &MI,
+ const LiveRegSet *LiveRegsCopy,
+ bool After) {
+ const MachineFunction &MF = *MI.getMF();
+ MRI = &MF.getRegInfo();
if (LiveRegsCopy) {
if (&LiveRegs != LiveRegsCopy)
LiveRegs = *LiveRegsCopy;
} else {
- LiveRegs = getLiveRegsAfter(MI, LIS);
+ LiveRegs = After ? getLiveRegsAfter(MI, LIS)
+ : getLiveRegsBefore(MI, LIS);
}
+
MaxPressure = CurPressure = getRegPressure(*MRI, LiveRegs);
}
+void GCNUpwardRPTracker::reset(const MachineInstr &MI,
+ const LiveRegSet *LiveRegsCopy) {
+ GCNRPTracker::reset(MI, LiveRegsCopy, true);
+}
+
void GCNUpwardRPTracker::recede(const MachineInstr &MI) {
assert(MRI && "call reset first");
LastTrackedMI = &MI;
- if (MI.isDebugValue())
+ if (MI.isDebugInstr())
return;
auto const RegUses = collectVirtualRegUses(MI, LIS, *MRI);
@@ -348,13 +358,7 @@ bool GCNDownwardRPTracker::reset(const MachineInstr &MI,
NextMI = skipDebugInstructionsForward(NextMI, MBBEnd);
if (NextMI == MBBEnd)
return false;
- if (LiveRegsCopy) {
- if (&LiveRegs != LiveRegsCopy)
- LiveRegs = *LiveRegsCopy;
- } else {
- LiveRegs = getLiveRegsBefore(*NextMI, LIS);
- }
- MaxPressure = CurPressure = getRegPressure(*MRI, LiveRegs);
+ GCNRPTracker::reset(*NextMI, LiveRegsCopy, false);
return true;
}
diff --git a/lib/Target/AMDGPU/GCNRegPressure.h b/lib/Target/AMDGPU/GCNRegPressure.h
index e418aa0fe911..357d3b7b2334 100644
--- a/lib/Target/AMDGPU/GCNRegPressure.h
+++ b/lib/Target/AMDGPU/GCNRegPressure.h
@@ -49,7 +49,7 @@ struct GCNRegPressure {
unsigned getVGPRTuplesWeight() const { return Value[VGPR_TUPLE]; }
unsigned getSGPRTuplesWeight() const { return Value[SGPR_TUPLE]; }
- unsigned getOccupancy(const SISubtarget &ST) const {
+ unsigned getOccupancy(const GCNSubtarget &ST) const {
return std::min(ST.getOccupancyWithNumSGPRs(getSGPRNum()),
ST.getOccupancyWithNumVGPRs(getVGPRNum()));
}
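getOccupancy() simply takes the tighter of the two per-register-file limits, so a kernel that could run 8 waves by SGPR count but only 5 by VGPR count gets an occupancy of 5. A trivial C++ illustration with hypothetical numbers; the real limits come from the GCNSubtarget tables.

#include <algorithm>
#include <cstdio>

int main() {
  unsigned OccBySGPRs = 8; // hypothetical: SGPR usage would still allow 8 waves
  unsigned OccByVGPRs = 5; // hypothetical: VGPR usage allows only 5
  std::printf("occupancy = %u\n", std::min(OccBySGPRs, OccByVGPRs)); // prints 5
}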
@@ -59,11 +59,11 @@ struct GCNRegPressure {
LaneBitmask NewMask,
const MachineRegisterInfo &MRI);
- bool higherOccupancy(const SISubtarget &ST, const GCNRegPressure& O) const {
+ bool higherOccupancy(const GCNSubtarget &ST, const GCNRegPressure& O) const {
return getOccupancy(ST) > O.getOccupancy(ST);
}
- bool less(const SISubtarget &ST, const GCNRegPressure& O,
+ bool less(const GCNSubtarget &ST, const GCNRegPressure& O,
unsigned MaxOccupancy = std::numeric_limits<unsigned>::max()) const;
bool operator==(const GCNRegPressure &O) const {
@@ -74,7 +74,7 @@ struct GCNRegPressure {
return !(*this == O);
}
- void print(raw_ostream &OS, const SISubtarget *ST = nullptr) const;
+ void print(raw_ostream &OS, const GCNSubtarget *ST = nullptr) const;
void dump() const { print(dbgs()); }
private:
@@ -106,6 +106,9 @@ protected:
GCNRPTracker(const LiveIntervals &LIS_) : LIS(LIS_) {}
+ void reset(const MachineInstr &MI, const LiveRegSet *LiveRegsCopy,
+ bool After);
+
public:
// live regs for the current state
const decltype(LiveRegs) &getLiveRegs() const { return LiveRegs; }
diff --git a/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index d414b899050a..f09b7f6cff22 100644
--- a/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -28,18 +28,6 @@ GCNMaxOccupancySchedStrategy::GCNMaxOccupancySchedStrategy(
const MachineSchedContext *C) :
GenericScheduler(C), TargetOccupancy(0), MF(nullptr) { }
-static unsigned getMaxWaves(unsigned SGPRs, unsigned VGPRs,
- const MachineFunction &MF) {
-
- const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
- const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
- unsigned MinRegOccupancy = std::min(ST.getOccupancyWithNumSGPRs(SGPRs),
- ST.getOccupancyWithNumVGPRs(VGPRs));
- return std::min(MinRegOccupancy,
- ST.getOccupancyWithLocalMemSize(MFI->getLDSSize(),
- MF.getFunction()));
-}
-
void GCNMaxOccupancySchedStrategy::initialize(ScheduleDAGMI *DAG) {
GenericScheduler::initialize(DAG);
@@ -47,7 +35,7 @@ void GCNMaxOccupancySchedStrategy::initialize(ScheduleDAGMI *DAG) {
MF = &DAG->MF;
- const SISubtarget &ST = MF->getSubtarget<SISubtarget>();
+ const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
// FIXME: This is also necessary, because some passes that run after
// scheduling and before regalloc increase register pressure.
@@ -81,7 +69,7 @@ void GCNMaxOccupancySchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU
Cand.AtTop = AtTop;
// getDownwardPressure() and getUpwardPressure() make temporary changes to
- // the the tracker, so we need to pass those function a non-const copy.
+ // the tracker, so we need to pass those function a non-const copy.
RegPressureTracker &TempTracker = const_cast<RegPressureTracker&>(RPTracker);
std::vector<unsigned> Pressure;
@@ -200,34 +188,30 @@ SUnit *GCNMaxOccupancySchedStrategy::pickNodeBidirectional(bool &IsTopNode) {
setPolicy(TopPolicy, /*IsPostRA=*/false, Top, &Bot);
// See if BotCand is still valid (because we previously scheduled from Top).
- DEBUG(dbgs() << "Picking from Bot:\n");
+ LLVM_DEBUG(dbgs() << "Picking from Bot:\n");
if (!BotCand.isValid() || BotCand.SU->isScheduled ||
BotCand.Policy != BotPolicy) {
BotCand.reset(CandPolicy());
pickNodeFromQueue(Bot, BotPolicy, DAG->getBotRPTracker(), BotCand);
assert(BotCand.Reason != NoCand && "failed to find the first candidate");
} else {
- DEBUG(traceCandidate(BotCand));
+ LLVM_DEBUG(traceCandidate(BotCand));
}
// Check if the top Q has a better candidate.
- DEBUG(dbgs() << "Picking from Top:\n");
+ LLVM_DEBUG(dbgs() << "Picking from Top:\n");
if (!TopCand.isValid() || TopCand.SU->isScheduled ||
TopCand.Policy != TopPolicy) {
TopCand.reset(CandPolicy());
pickNodeFromQueue(Top, TopPolicy, DAG->getTopRPTracker(), TopCand);
assert(TopCand.Reason != NoCand && "failed to find the first candidate");
} else {
- DEBUG(traceCandidate(TopCand));
+ LLVM_DEBUG(traceCandidate(TopCand));
}
// Pick best from BotCand and TopCand.
- DEBUG(
- dbgs() << "Top Cand: ";
- traceCandidate(TopCand);
- dbgs() << "Bot Cand: ";
- traceCandidate(BotCand);
- );
+ LLVM_DEBUG(dbgs() << "Top Cand: "; traceCandidate(TopCand);
+ dbgs() << "Bot Cand: "; traceCandidate(BotCand););
SchedCandidate Cand;
if (TopCand.Reason == BotCand.Reason) {
Cand = BotCand;
@@ -256,10 +240,7 @@ SUnit *GCNMaxOccupancySchedStrategy::pickNodeBidirectional(bool &IsTopNode) {
}
}
}
- DEBUG(
- dbgs() << "Picking: ";
- traceCandidate(Cand);
- );
+ LLVM_DEBUG(dbgs() << "Picking: "; traceCandidate(Cand););
IsTopNode = Cand.AtTop;
return Cand.SU;
@@ -305,20 +286,20 @@ SUnit *GCNMaxOccupancySchedStrategy::pickNode(bool &IsTopNode) {
if (SU->isBottomReady())
Bot.removeReady(SU);
- DEBUG(dbgs() << "Scheduling SU(" << SU->NodeNum << ") " << *SU->getInstr());
+ LLVM_DEBUG(dbgs() << "Scheduling SU(" << SU->NodeNum << ") "
+ << *SU->getInstr());
return SU;
}
GCNScheduleDAGMILive::GCNScheduleDAGMILive(MachineSchedContext *C,
std::unique_ptr<MachineSchedStrategy> S) :
ScheduleDAGMILive(C, std::move(S)),
- ST(MF.getSubtarget<SISubtarget>()),
+ ST(MF.getSubtarget<GCNSubtarget>()),
MFI(*MF.getInfo<SIMachineFunctionInfo>()),
- StartingOccupancy(ST.getOccupancyWithLocalMemSize(MFI.getLDSSize(),
- MF.getFunction())),
+ StartingOccupancy(MFI.getOccupancy()),
MinOccupancy(StartingOccupancy), Stage(0), RegionIdx(0) {
- DEBUG(dbgs() << "Starting occupancy is " << StartingOccupancy << ".\n");
+ LLVM_DEBUG(dbgs() << "Starting occupancy is " << StartingOccupancy << ".\n");
}
void GCNScheduleDAGMILive::schedule() {
@@ -338,12 +319,12 @@ void GCNScheduleDAGMILive::schedule() {
if (LIS) {
PressureBefore = Pressure[RegionIdx];
- DEBUG(dbgs() << "Pressure before scheduling:\nRegion live-ins:";
- GCNRPTracker::printLiveRegs(dbgs(), LiveIns[RegionIdx], MRI);
- dbgs() << "Region live-in pressure: ";
- llvm::getRegPressure(MRI, LiveIns[RegionIdx]).print(dbgs());
- dbgs() << "Region register pressure: ";
- PressureBefore.print(dbgs()));
+ LLVM_DEBUG(dbgs() << "Pressure before scheduling:\nRegion live-ins:";
+ GCNRPTracker::printLiveRegs(dbgs(), LiveIns[RegionIdx], MRI);
+ dbgs() << "Region live-in pressure: ";
+ llvm::getRegPressure(MRI, LiveIns[RegionIdx]).print(dbgs());
+ dbgs() << "Region register pressure: ";
+ PressureBefore.print(dbgs()));
}
ScheduleDAGMILive::schedule();
@@ -356,45 +337,54 @@ void GCNScheduleDAGMILive::schedule() {
GCNMaxOccupancySchedStrategy &S = (GCNMaxOccupancySchedStrategy&)*SchedImpl;
auto PressureAfter = getRealRegPressure();
- DEBUG(dbgs() << "Pressure after scheduling: "; PressureAfter.print(dbgs()));
+ LLVM_DEBUG(dbgs() << "Pressure after scheduling: ";
+ PressureAfter.print(dbgs()));
if (PressureAfter.getSGPRNum() <= S.SGPRCriticalLimit &&
PressureAfter.getVGPRNum() <= S.VGPRCriticalLimit) {
Pressure[RegionIdx] = PressureAfter;
- DEBUG(dbgs() << "Pressure in desired limits, done.\n");
+ LLVM_DEBUG(dbgs() << "Pressure in desired limits, done.\n");
return;
}
- unsigned WavesAfter = getMaxWaves(PressureAfter.getSGPRNum(),
- PressureAfter.getVGPRNum(), MF);
- unsigned WavesBefore = getMaxWaves(PressureBefore.getSGPRNum(),
- PressureBefore.getVGPRNum(), MF);
- DEBUG(dbgs() << "Occupancy before scheduling: " << WavesBefore <<
- ", after " << WavesAfter << ".\n");
+ unsigned Occ = MFI.getOccupancy();
+ unsigned WavesAfter = std::min(Occ, PressureAfter.getOccupancy(ST));
+ unsigned WavesBefore = std::min(Occ, PressureBefore.getOccupancy(ST));
+ LLVM_DEBUG(dbgs() << "Occupancy before scheduling: " << WavesBefore
+ << ", after " << WavesAfter << ".\n");
// We could not keep current target occupancy because of the just scheduled
// region. Record new occupancy for next scheduling cycle.
unsigned NewOccupancy = std::max(WavesAfter, WavesBefore);
+ // Allow memory bound functions to drop to 4 waves if not limited by an
+ // attribute.
+ if (WavesAfter < WavesBefore && WavesAfter < MinOccupancy &&
+ WavesAfter >= MFI.getMinAllowedOccupancy()) {
+ LLVM_DEBUG(dbgs() << "Function is memory bound, allow occupancy drop up to "
+ << MFI.getMinAllowedOccupancy() << " waves\n");
+ NewOccupancy = WavesAfter;
+ }
if (NewOccupancy < MinOccupancy) {
MinOccupancy = NewOccupancy;
- DEBUG(dbgs() << "Occupancy lowered for the function to "
- << MinOccupancy << ".\n");
+ MFI.limitOccupancy(MinOccupancy);
+ LLVM_DEBUG(dbgs() << "Occupancy lowered for the function to "
+ << MinOccupancy << ".\n");
}
- if (WavesAfter >= WavesBefore) {
+ if (WavesAfter >= MinOccupancy) {
Pressure[RegionIdx] = PressureAfter;
return;
}
- DEBUG(dbgs() << "Attempting to revert scheduling.\n");
+ LLVM_DEBUG(dbgs() << "Attempting to revert scheduling.\n");
RegionEnd = RegionBegin;
for (MachineInstr *MI : Unsched) {
- if (MI->isDebugValue())
+ if (MI->isDebugInstr())
continue;
if (MI->getIterator() != RegionEnd) {
BB->remove(MI);
BB->insert(RegionEnd, MI);
- if (!MI->isDebugValue())
+ if (!MI->isDebugInstr())
LIS->handleMove(*MI, true);
}
// Reset read-undef flags and update them later.
@@ -403,7 +393,7 @@ void GCNScheduleDAGMILive::schedule() {
Op.setIsUndef(false);
RegisterOperands RegOpers;
RegOpers.collect(*MI, *TRI, MRI, ShouldTrackLaneMasks, false);
- if (!MI->isDebugValue()) {
+ if (!MI->isDebugInstr()) {
if (ShouldTrackLaneMasks) {
// Adjust liveness and add missing dead+read-undef flags.
SlotIndex SlotIdx = LIS->getInstructionIndex(*MI).getRegSlot();
@@ -415,7 +405,7 @@ void GCNScheduleDAGMILive::schedule() {
}
RegionEnd = MI->getIterator();
++RegionEnd;
- DEBUG(dbgs() << "Scheduling " << *MI);
+ LLVM_DEBUG(dbgs() << "Scheduling " << *MI);
}
RegionBegin = Unsched.front()->getIterator();
Regions[RegionIdx] = std::make_pair(RegionBegin, RegionEnd);
@@ -490,7 +480,7 @@ void GCNScheduleDAGMILive::computeBlockPressure(const MachineBasicBlock *MBB) {
void GCNScheduleDAGMILive::finalizeSchedule() {
GCNMaxOccupancySchedStrategy &S = (GCNMaxOccupancySchedStrategy&)*SchedImpl;
- DEBUG(dbgs() << "All regions recorded, starting actual scheduling.\n");
+ LLVM_DEBUG(dbgs() << "All regions recorded, starting actual scheduling.\n");
LiveIns.resize(Regions.size());
Pressure.resize(Regions.size());
@@ -509,9 +499,10 @@ void GCNScheduleDAGMILive::finalizeSchedule() {
if (!LIS || StartingOccupancy <= MinOccupancy)
break;
- DEBUG(dbgs()
- << "Retrying function scheduling with lowest recorded occupancy "
- << MinOccupancy << ".\n");
+ LLVM_DEBUG(
+ dbgs()
+ << "Retrying function scheduling with lowest recorded occupancy "
+ << MinOccupancy << ".\n");
S.setTargetOccupancy(MinOccupancy);
}
@@ -537,12 +528,13 @@ void GCNScheduleDAGMILive::finalizeSchedule() {
continue;
}
- DEBUG(dbgs() << "********** MI Scheduling **********\n");
- DEBUG(dbgs() << MF.getName() << ":" << printMBBReference(*MBB) << " "
- << MBB->getName() << "\n From: " << *begin() << " To: ";
- if (RegionEnd != MBB->end()) dbgs() << *RegionEnd;
- else dbgs() << "End";
- dbgs() << " RegionInstrs: " << NumRegionInstrs << '\n');
+ LLVM_DEBUG(dbgs() << "********** MI Scheduling **********\n");
+ LLVM_DEBUG(dbgs() << MF.getName() << ":" << printMBBReference(*MBB) << " "
+ << MBB->getName() << "\n From: " << *begin()
+ << " To: ";
+ if (RegionEnd != MBB->end()) dbgs() << *RegionEnd;
+ else dbgs() << "End";
+ dbgs() << " RegionInstrs: " << NumRegionInstrs << '\n');
schedule();
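The scheduling changes above replace the old getMaxWaves() recomputation with a feedback loop on SIMachineFunctionInfo occupancy: a region may lower the function-wide minimum (down to the memory-bound floor from getMinAllowedOccupancy()), and a region whose post-schedule occupancy still falls below that minimum is reverted. A stand-alone sketch of that decision, with invented names and wave counts, follows.

// Stand-alone sketch of the post-scheduling occupancy decision; the names
// and numbers are illustrative, not the LLVM classes used above.
#include <algorithm>
#include <cstdio>

struct RegionResult {
  unsigned wavesBefore; // occupancy implied by pre-schedule pressure
  unsigned wavesAfter;  // occupancy implied by post-schedule pressure
};

// Returns true if the freshly scheduled region should be kept.
bool keepSchedule(RegionResult R, unsigned &minOccupancy,
                  unsigned minAllowedOccupancy /* e.g. 4 if memory bound */) {
  unsigned newOcc = std::max(R.wavesAfter, R.wavesBefore);
  // A memory-bound function may trade occupancy for a better schedule,
  // but never below the attribute-imposed floor.
  if (R.wavesAfter < R.wavesBefore && R.wavesAfter < minOccupancy &&
      R.wavesAfter >= minAllowedOccupancy)
    newOcc = R.wavesAfter;
  if (newOcc < minOccupancy)
    minOccupancy = newOcc;             // lower the target for later regions
  return R.wavesAfter >= minOccupancy; // otherwise revert to the old order
}

int main() {
  unsigned minOcc = 10;
  bool Keep1 = keepSchedule({8, 5}, minOcc, 4);
  std::printf("region 1: keep=%d minOcc=%u\n", Keep1, minOcc);
  bool Keep2 = keepSchedule({5, 3}, minOcc, 4);
  std::printf("region 2: keep=%d minOcc=%u\n", Keep2, minOcc);
}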
diff --git a/lib/Target/AMDGPU/GCNSchedStrategy.h b/lib/Target/AMDGPU/GCNSchedStrategy.h
index 060d2ca72d93..3ac6af89cb9b 100644
--- a/lib/Target/AMDGPU/GCNSchedStrategy.h
+++ b/lib/Target/AMDGPU/GCNSchedStrategy.h
@@ -21,7 +21,7 @@ namespace llvm {
class SIMachineFunctionInfo;
class SIRegisterInfo;
-class SISubtarget;
+class GCNSubtarget;
/// This is a minimal scheduler strategy. The main difference between this
/// and the GenericScheduler is that GCNSchedStrategy uses different
@@ -62,9 +62,9 @@ public:
class GCNScheduleDAGMILive : public ScheduleDAGMILive {
- const SISubtarget &ST;
+ const GCNSubtarget &ST;
- const SIMachineFunctionInfo &MFI;
+ SIMachineFunctionInfo &MFI;
// Occupancy target at the beginning of function scheduling cycle.
unsigned StartingOccupancy;
diff --git a/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp b/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp
index bf57f88bef91..db908368a179 100644
--- a/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp
+++ b/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp
@@ -217,6 +217,11 @@ void AMDGPUInstPrinter::printLWE(const MCInst *MI, unsigned OpNo,
printNamedBit(MI, OpNo, O, "lwe");
}
+void AMDGPUInstPrinter::printD16(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI, raw_ostream &O) {
+ printNamedBit(MI, OpNo, O, "d16");
+}
+
void AMDGPUInstPrinter::printExpCompr(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI,
raw_ostream &O) {
@@ -267,6 +272,9 @@ void AMDGPUInstPrinter::printRegOperand(unsigned RegNo, raw_ostream &O,
case AMDGPU::FLAT_SCR:
O << "flat_scratch";
return;
+ case AMDGPU::XNACK_MASK:
+ O << "xnack_mask";
+ return;
case AMDGPU::VCC_LO:
O << "vcc_lo";
return;
@@ -297,6 +305,12 @@ void AMDGPUInstPrinter::printRegOperand(unsigned RegNo, raw_ostream &O,
case AMDGPU::FLAT_SCR_HI:
O << "flat_scratch_hi";
return;
+ case AMDGPU::XNACK_MASK_LO:
+ O << "xnack_mask_lo";
+ return;
+ case AMDGPU::XNACK_MASK_HI:
+ O << "xnack_mask_hi";
+ return;
case AMDGPU::FP_REG:
case AMDGPU::SP_REG:
case AMDGPU::SCRATCH_WAVE_OFFSET_REG:
@@ -371,6 +385,16 @@ void AMDGPUInstPrinter::printVOPDst(const MCInst *MI, unsigned OpNo,
printOperand(MI, OpNo, STI, O);
}
+void AMDGPUInstPrinter::printVINTRPDst(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI, raw_ostream &O) {
+ if (AMDGPU::isSI(STI) || AMDGPU::isCI(STI))
+ O << " ";
+ else
+ O << "_e32 ";
+
+ printOperand(MI, OpNo, STI, O);
+}
+
void AMDGPUInstPrinter::printImmediate16(uint32_t Imm,
const MCSubtargetInfo &STI,
raw_ostream &O) {
@@ -486,11 +510,6 @@ void AMDGPUInstPrinter::printImmediate64(uint64_t Imm,
void AMDGPUInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI,
raw_ostream &O) {
- if (!STI.getFeatureBits()[AMDGPU::FeatureGCN]) {
- static_cast<R600InstPrinter*>(this)->printOperand(MI, OpNo, O);
- return;
- }
-
if (OpNo >= MI->getNumOperands()) {
O << "/*Missing OP" << OpNo << "*/";
return;
@@ -612,40 +631,45 @@ void AMDGPUInstPrinter::printOperandAndIntInputMods(const MCInst *MI,
void AMDGPUInstPrinter::printDPPCtrl(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI,
raw_ostream &O) {
+ using namespace AMDGPU::DPP;
+
unsigned Imm = MI->getOperand(OpNo).getImm();
- if (Imm <= 0x0ff) {
+ if (Imm <= DppCtrl::QUAD_PERM_LAST) {
O << " quad_perm:[";
O << formatDec(Imm & 0x3) << ',';
O << formatDec((Imm & 0xc) >> 2) << ',';
O << formatDec((Imm & 0x30) >> 4) << ',';
O << formatDec((Imm & 0xc0) >> 6) << ']';
- } else if ((Imm >= 0x101) && (Imm <= 0x10f)) {
+ } else if ((Imm >= DppCtrl::ROW_SHL_FIRST) &&
+ (Imm <= DppCtrl::ROW_SHL_LAST)) {
O << " row_shl:";
printU4ImmDecOperand(MI, OpNo, O);
- } else if ((Imm >= 0x111) && (Imm <= 0x11f)) {
+ } else if ((Imm >= DppCtrl::ROW_SHR_FIRST) &&
+ (Imm <= DppCtrl::ROW_SHR_LAST)) {
O << " row_shr:";
printU4ImmDecOperand(MI, OpNo, O);
- } else if ((Imm >= 0x121) && (Imm <= 0x12f)) {
+ } else if ((Imm >= DppCtrl::ROW_ROR_FIRST) &&
+ (Imm <= DppCtrl::ROW_ROR_LAST)) {
O << " row_ror:";
printU4ImmDecOperand(MI, OpNo, O);
- } else if (Imm == 0x130) {
+ } else if (Imm == DppCtrl::WAVE_SHL1) {
O << " wave_shl:1";
- } else if (Imm == 0x134) {
+ } else if (Imm == DppCtrl::WAVE_ROL1) {
O << " wave_rol:1";
- } else if (Imm == 0x138) {
+ } else if (Imm == DppCtrl::WAVE_SHR1) {
O << " wave_shr:1";
- } else if (Imm == 0x13c) {
+ } else if (Imm == DppCtrl::WAVE_ROR1) {
O << " wave_ror:1";
- } else if (Imm == 0x140) {
+ } else if (Imm == DppCtrl::ROW_MIRROR) {
O << " row_mirror";
- } else if (Imm == 0x141) {
+ } else if (Imm == DppCtrl::ROW_HALF_MIRROR) {
O << " row_half_mirror";
- } else if (Imm == 0x142) {
+ } else if (Imm == DppCtrl::BCAST15) {
O << " row_bcast:15";
- } else if (Imm == 0x143) {
+ } else if (Imm == DppCtrl::BCAST31) {
O << " row_bcast:31";
} else {
- llvm_unreachable("Invalid dpp_ctrl value");
+ O << " /* Invalid dpp_ctrl value */";
}
}
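The rewritten printDPPCtrl() keeps the same numeric ranges as the old magic constants but names them through AMDGPU::DPP::DppCtrl, and it now prints a comment instead of asserting on an invalid value. The sketch below decodes just the quad_perm and row_shl ranges; the enumerator values are copied from the ranges the removed literals covered and are otherwise assumptions.

// Illustrative decoder for part of a dpp_ctrl immediate: values up to 0xFF
// pack four 2-bit lane selects (quad_perm), 0x101-0x10F encode row_shl.
#include <cstdio>
#include <string>

enum : unsigned { QUAD_PERM_LAST = 0xFF, ROW_SHL_FIRST = 0x101,
                  ROW_SHL_LAST = 0x10F };

std::string decodeDppCtrl(unsigned Imm) {
  if (Imm <= QUAD_PERM_LAST) {
    char Buf[64];
    std::snprintf(Buf, sizeof(Buf), "quad_perm:[%u,%u,%u,%u]", Imm & 0x3,
                  (Imm >> 2) & 0x3, (Imm >> 4) & 0x3, (Imm >> 6) & 0x3);
    return Buf;
  }
  if (Imm >= ROW_SHL_FIRST && Imm <= ROW_SHL_LAST)
    return "row_shl:" + std::to_string(Imm & 0xF);
  return "/* value not handled in this sketch */";
}

int main() {
  std::printf("%s\n", decodeDppCtrl(0xE4).c_str()); // identity permute [0,1,2,3]
  std::printf("%s\n", decodeDppCtrl(0x105).c_str());
}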
@@ -936,11 +960,6 @@ void AMDGPUInstPrinter::printVGPRIndexMode(const MCInst *MI, unsigned OpNo,
void AMDGPUInstPrinter::printMemOperand(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI,
raw_ostream &O) {
- if (!STI.getFeatureBits()[AMDGPU::FeatureGCN]) {
- static_cast<R600InstPrinter*>(this)->printMemOperand(MI, OpNo, O);
- return;
- }
-
printOperand(MI, OpNo, STI, O);
O << ", ";
printOperand(MI, OpNo + 1, STI, O);
@@ -966,16 +985,6 @@ void AMDGPUInstPrinter::printIfSet(const MCInst *MI, unsigned OpNo,
O << Asm;
}
-void AMDGPUInstPrinter::printAbs(const MCInst *MI, unsigned OpNo,
- const MCSubtargetInfo &STI, raw_ostream &O) {
- static_cast<R600InstPrinter*>(this)->printAbs(MI, OpNo, O);
-}
-
-void AMDGPUInstPrinter::printClamp(const MCInst *MI, unsigned OpNo,
- const MCSubtargetInfo &STI, raw_ostream &O) {
- static_cast<R600InstPrinter*>(this)->printClamp(MI, OpNo, O);
-}
-
void AMDGPUInstPrinter::printHigh(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI,
raw_ostream &O) {
@@ -1002,70 +1011,6 @@ void AMDGPUInstPrinter::printOModSI(const MCInst *MI, unsigned OpNo,
O << " div:2";
}
-void AMDGPUInstPrinter::printLiteral(const MCInst *MI, unsigned OpNo,
- const MCSubtargetInfo &STI,
- raw_ostream &O) {
- static_cast<R600InstPrinter*>(this)->printLiteral(MI, OpNo, O);
-}
-
-void AMDGPUInstPrinter::printLast(const MCInst *MI, unsigned OpNo,
- const MCSubtargetInfo &STI, raw_ostream &O) {
- static_cast<R600InstPrinter*>(this)->printLast(MI, OpNo, O);
-}
-
-void AMDGPUInstPrinter::printNeg(const MCInst *MI, unsigned OpNo,
- const MCSubtargetInfo &STI, raw_ostream &O) {
- static_cast<R600InstPrinter*>(this)->printNeg(MI, OpNo, O);
-}
-
-void AMDGPUInstPrinter::printOMOD(const MCInst *MI, unsigned OpNo,
- const MCSubtargetInfo &STI, raw_ostream &O) {
- static_cast<R600InstPrinter*>(this)->printOMOD(MI, OpNo, O);
-}
-
-void AMDGPUInstPrinter::printRel(const MCInst *MI, unsigned OpNo,
- const MCSubtargetInfo &STI, raw_ostream &O) {
- static_cast<R600InstPrinter*>(this)->printRel(MI, OpNo, O);
-}
-
-void AMDGPUInstPrinter::printUpdateExecMask(const MCInst *MI, unsigned OpNo,
- const MCSubtargetInfo &STI,
- raw_ostream &O) {
- static_cast<R600InstPrinter*>(this)->printUpdateExecMask(MI, OpNo, O);
-}
-
-void AMDGPUInstPrinter::printUpdatePred(const MCInst *MI, unsigned OpNo,
- const MCSubtargetInfo &STI,
- raw_ostream &O) {
- static_cast<R600InstPrinter*>(this)->printUpdatePred(MI, OpNo, O);
-}
-
-void AMDGPUInstPrinter::printWrite(const MCInst *MI, unsigned OpNo,
- const MCSubtargetInfo &STI, raw_ostream &O) {
- static_cast<R600InstPrinter*>(this)->printWrite(MI, OpNo, O);
-}
-
-void AMDGPUInstPrinter::printBankSwizzle(const MCInst *MI, unsigned OpNo,
- const MCSubtargetInfo &STI,
- raw_ostream &O) {
- static_cast<R600InstPrinter*>(this)->printBankSwizzle(MI, OpNo, O);
-}
-
-void AMDGPUInstPrinter::printRSel(const MCInst *MI, unsigned OpNo,
- const MCSubtargetInfo &STI, raw_ostream &O) {
- static_cast<R600InstPrinter*>(this)->printRSel(MI, OpNo, O);
-}
-
-void AMDGPUInstPrinter::printCT(const MCInst *MI, unsigned OpNo,
- const MCSubtargetInfo &STI, raw_ostream &O) {
- static_cast<R600InstPrinter*>(this)->printCT(MI, OpNo, O);
-}
-
-void AMDGPUInstPrinter::printKCache(const MCInst *MI, unsigned OpNo,
- const MCSubtargetInfo &STI, raw_ostream &O) {
- static_cast<R600InstPrinter*>(this)->printKCache(MI, OpNo, O);
-}
-
void AMDGPUInstPrinter::printSendMsg(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI,
raw_ostream &O) {
@@ -1254,7 +1199,10 @@ void AMDGPUInstPrinter::printHwreg(const MCInst *MI, unsigned OpNo,
const unsigned Width = ((SImm16 & WIDTH_M1_MASK_) >> WIDTH_M1_SHIFT_) + 1;
O << "hwreg(";
- if (ID_SYMBOLIC_FIRST_ <= Id && Id < ID_SYMBOLIC_LAST_) {
+ unsigned Last = ID_SYMBOLIC_LAST_;
+ if (AMDGPU::isSI(STI) || AMDGPU::isCI(STI) || AMDGPU::isVI(STI))
+ Last = ID_SYMBOLIC_FIRST_GFX9_;
+ if (ID_SYMBOLIC_FIRST_ <= Id && Id < Last && IdSymbolic[Id]) {
O << IdSymbolic[Id];
} else {
O << Id;
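printHwreg() now prints a symbolic name only when the ID is below the subtarget's cut-off (pre-GFX9 parts stop at ID_SYMBOLIC_FIRST_GFX9_) and a name exists in IdSymbolic; everything else falls back to the raw number. A small illustrative model of that fallback, with invented table contents and cut-off values:

// Names and the GFX9 cut-off below are illustrative; only the lookup and
// fallback pattern matches the code above.
#include <cstdio>

static const char *IdSymbolic[] = {nullptr, "HW_REG_MODE", "HW_REG_STATUS",
                                   "HW_REG_TRAPSTS", "HW_REG_SH_MEM_BASES",
                                   nullptr};
constexpr unsigned IdFirst = 1, IdFirstGfx9 = 4, IdLast = 6;

void printHwregId(unsigned Id, bool IsPreGfx9) {
  unsigned Last = IsPreGfx9 ? IdFirstGfx9 : IdLast;
  if (IdFirst <= Id && Id < Last && IdSymbolic[Id])
    std::printf("hwreg(%s)\n", IdSymbolic[Id]);
  else
    std::printf("hwreg(%u)\n", Id); // unknown on this subtarget: numeric
}

int main() {
  printHwregId(4, /*IsPreGfx9=*/true);  // numeric, ID only named on GFX9
  printHwregId(4, /*IsPreGfx9=*/false); // symbolic
}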
@@ -1267,6 +1215,13 @@ void AMDGPUInstPrinter::printHwreg(const MCInst *MI, unsigned OpNo,
#include "AMDGPUGenAsmWriter.inc"
+void R600InstPrinter::printInst(const MCInst *MI, raw_ostream &O,
+ StringRef Annot, const MCSubtargetInfo &STI) {
+ O.flush();
+ printInstruction(MI, O);
+ printAnnotation(O, Annot);
+}
+
void R600InstPrinter::printAbs(const MCInst *MI, unsigned OpNo,
raw_ostream &O) {
AMDGPUInstPrinter::printIfSet(MI, OpNo, O, '|');
@@ -1385,7 +1340,7 @@ void R600InstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
if (Op.isReg()) {
switch (Op.getReg()) {
// This is the default predicate state, so we don't need to print it.
- case AMDGPU::PRED_SEL_OFF:
+ case R600::PRED_SEL_OFF:
break;
default:
@@ -1461,3 +1416,5 @@ void R600InstPrinter::printWrite(const MCInst *MI, unsigned OpNo,
O << " (MASKED)";
}
}
+
+#include "R600GenAsmWriter.inc"
diff --git a/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h b/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h
index d97f04689e18..11a496a38b2c 100644
--- a/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h
+++ b/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h
@@ -84,6 +84,8 @@ private:
raw_ostream &O);
void printLWE(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI, raw_ostream &O);
+ void printD16(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI, raw_ostream &O);
void printExpCompr(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI, raw_ostream &O);
void printExpVM(const MCInst *MI, unsigned OpNo,
@@ -96,6 +98,8 @@ private:
void printRegOperand(unsigned RegNo, raw_ostream &O);
void printVOPDst(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
raw_ostream &O);
+ void printVINTRPDst(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
+ raw_ostream &O);
void printImmediate16(uint32_t Imm, const MCSubtargetInfo &STI,
raw_ostream &O);
void printImmediateV216(uint32_t Imm, const MCSubtargetInfo &STI,
@@ -214,13 +218,16 @@ protected:
raw_ostream &O);
};
-// FIXME: R600 specific parts of AMDGPUInstrPrinter should be moved here, and
-// MCTargetDesc should be using R600InstPrinter for the R600 target.
-class R600InstPrinter : public AMDGPUInstPrinter {
+class R600InstPrinter : public MCInstPrinter {
public:
R600InstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII,
const MCRegisterInfo &MRI)
- : AMDGPUInstPrinter(MAI, MII, MRI) {}
+ : MCInstPrinter(MAI, MII, MRI) {}
+
+ void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot,
+ const MCSubtargetInfo &STI) override;
+ void printInstruction(const MCInst *MI, raw_ostream &O);
+ static const char *getRegisterName(unsigned RegNo);
void printAbs(const MCInst *MI, unsigned OpNo, raw_ostream &O);
void printBankSwizzle(const MCInst *MI, unsigned OpNo, raw_ostream &O);
diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp
index 778d4a7ba9d0..abc88c02adca 100644
--- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp
+++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp
@@ -26,14 +26,14 @@ namespace {
class AMDGPUAsmBackend : public MCAsmBackend {
public:
- AMDGPUAsmBackend(const Target &T)
- : MCAsmBackend() {}
+ AMDGPUAsmBackend(const Target &T) : MCAsmBackend(support::little) {}
unsigned getNumFixupKinds() const override { return AMDGPU::NumTargetFixupKinds; };
void applyFixup(const MCAssembler &Asm, const MCFixup &Fixup,
const MCValue &Target, MutableArrayRef<char> Data,
- uint64_t Value, bool IsResolved) const override;
+ uint64_t Value, bool IsResolved,
+ const MCSubtargetInfo *STI) const override;
bool fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value,
const MCRelaxableFragment *DF,
const MCAsmLayout &Layout) const override {
@@ -43,10 +43,13 @@ public:
MCInst &Res) const override {
llvm_unreachable("Not implemented");
}
- bool mayNeedRelaxation(const MCInst &Inst) const override { return false; }
+ bool mayNeedRelaxation(const MCInst &Inst,
+ const MCSubtargetInfo &STI) const override {
+ return false;
+ }
unsigned getMinimumNopSize() const override;
- bool writeNopData(uint64_t Count, MCObjectWriter *OW) const override;
+ bool writeNopData(raw_ostream &OS, uint64_t Count) const override;
const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const override;
};
@@ -103,7 +106,8 @@ static uint64_t adjustFixupValue(const MCFixup &Fixup, uint64_t Value,
void AMDGPUAsmBackend::applyFixup(const MCAssembler &Asm, const MCFixup &Fixup,
const MCValue &Target,
MutableArrayRef<char> Data, uint64_t Value,
- bool IsResolved) const {
+ bool IsResolved,
+ const MCSubtargetInfo *STI) const {
Value = adjustFixupValue(Fixup, Value, &Asm.getContext());
if (!Value)
return; // Doesn't change encoding.
@@ -140,11 +144,11 @@ unsigned AMDGPUAsmBackend::getMinimumNopSize() const {
return 4;
}
-bool AMDGPUAsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const {
+bool AMDGPUAsmBackend::writeNopData(raw_ostream &OS, uint64_t Count) const {
// If the count is not 4-byte aligned, we must be writing data into the text
// section (otherwise we have unaligned instructions, and thus have far
// bigger problems), so just write zeros instead.
- OW->WriteZeros(Count % 4);
+ OS.write_zeros(Count % 4);
// We are properly aligned, so write NOPs as requested.
Count /= 4;
@@ -154,7 +158,7 @@ bool AMDGPUAsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const {
const uint32_t Encoded_S_NOP_0 = 0xbf800000;
for (uint64_t I = 0; I != Count; ++I)
- OW->write32(Encoded_S_NOP_0);
+ support::endian::write<uint32_t>(OS, Encoded_S_NOP_0, Endian);
return true;
}
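writeNopData() now writes straight to a raw_ostream: any remainder that is not a multiple of four bytes is zero-padded, and the rest is filled with little-endian encodings of s_nop 0. A stand-alone sketch of the same padding scheme, using a byte vector in place of the stream:

// The S_NOP encoding is taken from the patch; the buffer stands in for the
// output stream.
#include <cstdint>
#include <cstdio>
#include <vector>

void writeNops(std::vector<uint8_t> &Out, uint64_t Count) {
  const uint32_t S_NOP_0 = 0xbf800000;
  for (uint64_t I = 0; I != Count % 4; ++I) // unaligned tail: zero bytes
    Out.push_back(0);
  for (uint64_t I = 0, N = Count / 4; I != N; ++I)
    for (int B = 0; B < 4; ++B)             // little-endian byte order
      Out.push_back(uint8_t(S_NOP_0 >> (8 * B)));
}

int main() {
  std::vector<uint8_t> Buf;
  writeNops(Buf, 10); // 2 zero bytes followed by two s_nop words
  std::printf("%zu bytes emitted\n", Buf.size());
}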
@@ -189,18 +193,18 @@ public:
}
}
- std::unique_ptr<MCObjectWriter>
- createObjectWriter(raw_pwrite_stream &OS) const override {
- return createAMDGPUELFObjectWriter(Is64Bit, OSABI, HasRelocationAddend, OS);
+ std::unique_ptr<MCObjectTargetWriter>
+ createObjectTargetWriter() const override {
+ return createAMDGPUELFObjectWriter(Is64Bit, OSABI, HasRelocationAddend);
}
};
} // end anonymous namespace
MCAsmBackend *llvm::createAMDGPUAsmBackend(const Target &T,
+ const MCSubtargetInfo &STI,
const MCRegisterInfo &MRI,
- const Triple &TT, StringRef CPU,
const MCTargetOptions &Options) {
// Use 64-bit ELF for amdgcn
- return new ELFAMDGPUAsmBackend(T, TT);
+ return new ELFAMDGPUAsmBackend(T, STI.getTargetTriple());
}
diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp
index e443b0729606..07bef9103c0d 100644
--- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp
+++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp
@@ -66,6 +66,8 @@ unsigned AMDGPUELFObjectWriter::getRelocType(MCContext &Ctx,
return ELF::R_AMDGPU_REL32_LO;
case MCSymbolRefExpr::VK_AMDGPU_REL32_HI:
return ELF::R_AMDGPU_REL32_HI;
+ case MCSymbolRefExpr::VK_AMDGPU_REL64:
+ return ELF::R_AMDGPU_REL64;
}
switch (Fixup.getKind()) {
@@ -82,11 +84,9 @@ unsigned AMDGPUELFObjectWriter::getRelocType(MCContext &Ctx,
llvm_unreachable("unhandled relocation type");
}
-std::unique_ptr<MCObjectWriter>
+std::unique_ptr<MCObjectTargetWriter>
llvm::createAMDGPUELFObjectWriter(bool Is64Bit, uint8_t OSABI,
- bool HasRelocationAddend,
- raw_pwrite_stream &OS) {
- auto MOTW = llvm::make_unique<AMDGPUELFObjectWriter>(Is64Bit, OSABI,
- HasRelocationAddend);
- return createELFObjectWriter(std::move(MOTW), OS, true);
+ bool HasRelocationAddend) {
+ return llvm::make_unique<AMDGPUELFObjectWriter>(Is64Bit, OSABI,
+ HasRelocationAddend);
}
diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.cpp b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.cpp
index 1497edc7a054..c627a08e7463 100644
--- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.cpp
+++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.cpp
@@ -12,37 +12,28 @@
#include "llvm/BinaryFormat/ELF.h"
#include "llvm/MC/MCAsmBackend.h"
#include "llvm/MC/MCCodeEmitter.h"
+#include "llvm/MC/MCObjectWriter.h"
using namespace llvm;
-AMDGPUELFStreamer::AMDGPUELFStreamer(const Triple &T, MCContext &Context,
- std::unique_ptr<MCAsmBackend> MAB,
- raw_pwrite_stream &OS,
- std::unique_ptr<MCCodeEmitter> Emitter)
- : MCELFStreamer(Context, std::move(MAB), OS, std::move(Emitter)) {
- unsigned Arch = ELF::EF_AMDGPU_ARCH_NONE;
- switch (T.getArch()) {
- case Triple::r600:
- Arch = ELF::EF_AMDGPU_ARCH_R600;
- break;
- case Triple::amdgcn:
- Arch = ELF::EF_AMDGPU_ARCH_GCN;
- break;
- default:
- break;
- }
+namespace {
+
+class AMDGPUELFStreamer : public MCELFStreamer {
+public:
+ AMDGPUELFStreamer(const Triple &T, MCContext &Context,
+ std::unique_ptr<MCAsmBackend> MAB,
+ std::unique_ptr<MCObjectWriter> OW,
+ std::unique_ptr<MCCodeEmitter> Emitter)
+ : MCELFStreamer(Context, std::move(MAB), std::move(OW),
+ std::move(Emitter)) {}
+};
- MCAssembler &MCA = getAssembler();
- unsigned EFlags = MCA.getELFHeaderEFlags();
- EFlags &= ~ELF::EF_AMDGPU_ARCH;
- EFlags |= Arch;
- MCA.setELFHeaderEFlags(EFlags);
}
MCELFStreamer *llvm::createAMDGPUELFStreamer(
const Triple &T, MCContext &Context, std::unique_ptr<MCAsmBackend> MAB,
- raw_pwrite_stream &OS, std::unique_ptr<MCCodeEmitter> Emitter,
+ std::unique_ptr<MCObjectWriter> OW, std::unique_ptr<MCCodeEmitter> Emitter,
bool RelaxAll) {
- return new AMDGPUELFStreamer(T, Context, std::move(MAB), OS,
+ return new AMDGPUELFStreamer(T, Context, std::move(MAB), std::move(OW),
std::move(Emitter));
}
diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.h b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.h
index 0cc0a4c5cd5d..41e9063a759e 100644
--- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.h
+++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.h
@@ -23,16 +23,9 @@ class MCCodeEmitter;
class MCContext;
class MCSubtargetInfo;
-class AMDGPUELFStreamer : public MCELFStreamer {
-public:
- AMDGPUELFStreamer(const Triple &T, MCContext &Context,
- std::unique_ptr<MCAsmBackend> MAB, raw_pwrite_stream &OS,
- std::unique_ptr<MCCodeEmitter> Emitter);
-};
-
MCELFStreamer *createAMDGPUELFStreamer(const Triple &T, MCContext &Context,
std::unique_ptr<MCAsmBackend> MAB,
- raw_pwrite_stream &OS,
+ std::unique_ptr<MCObjectWriter> OW,
std::unique_ptr<MCCodeEmitter> Emitter,
bool RelaxAll);
} // namespace llvm.
diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp
index 521b3b39bba2..cae7a7a6c7e7 100644
--- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp
+++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
//
/// \file
-/// \brief CodeEmitter interface for R600 and SI codegen.
+/// CodeEmitter interface for R600 and SI codegen.
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h
index 1b062064ace1..dcc10a032afe 100644
--- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h
+++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
//
/// \file
-/// \brief CodeEmitter interface for R600 and SI codegen.
+/// CodeEmitter interface for R600 and SI codegen.
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp
index 2b321c04fb30..c579c7d60e16 100644
--- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp
+++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
//
/// \file
-/// \brief This file provides AMDGPU specific target descriptions.
+/// This file provides AMDGPU specific target descriptions.
//
//===----------------------------------------------------------------------===//
@@ -22,6 +22,7 @@
#include "llvm/MC/MCCodeEmitter.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCObjectWriter.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSubtargetInfo.h"
@@ -37,9 +38,17 @@ using namespace llvm;
#define GET_SUBTARGETINFO_MC_DESC
#include "AMDGPUGenSubtargetInfo.inc"
+#define NoSchedModel NoSchedModelR600
+#define GET_SUBTARGETINFO_MC_DESC
+#include "R600GenSubtargetInfo.inc"
+#undef NoSchedModelR600
+
#define GET_REGINFO_MC_DESC
#include "AMDGPUGenRegisterInfo.inc"
+#define GET_REGINFO_MC_DESC
+#include "R600GenRegisterInfo.inc"
+
static MCInstrInfo *createAMDGPUMCInstrInfo() {
MCInstrInfo *X = new MCInstrInfo();
InitAMDGPUMCInstrInfo(X);
@@ -48,12 +57,17 @@ static MCInstrInfo *createAMDGPUMCInstrInfo() {
static MCRegisterInfo *createAMDGPUMCRegisterInfo(const Triple &TT) {
MCRegisterInfo *X = new MCRegisterInfo();
- InitAMDGPUMCRegisterInfo(X, 0);
+ if (TT.getArch() == Triple::r600)
+ InitR600MCRegisterInfo(X, 0);
+ else
+ InitAMDGPUMCRegisterInfo(X, 0);
return X;
}
static MCSubtargetInfo *
createAMDGPUMCSubtargetInfo(const Triple &TT, StringRef CPU, StringRef FS) {
+ if (TT.getArch() == Triple::r600)
+ return createR600MCSubtargetInfoImpl(TT, CPU, FS);
return createAMDGPUMCSubtargetInfoImpl(TT, CPU, FS);
}
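MC register and subtarget info are now chosen per triple: r600 uses the R600GenRegisterInfo/R600GenSubtargetInfo tables, while amdgcn keeps the AMDGPU ones. A minimal stand-alone illustration of that factory dispatch; the string flavors stand in for the TableGen-generated initializers:

// Illustrative only: the two branches model InitR600MCRegisterInfo vs.
// InitAMDGPUMCRegisterInfo selection on the triple's architecture.
#include <cstdio>
#include <memory>
#include <string>

struct MCRegInfo { std::string Flavor; };

std::unique_ptr<MCRegInfo> createRegInfo(const std::string &Arch) {
  auto X = std::make_unique<MCRegInfo>();
  if (Arch == "r600")
    X->Flavor = "R600";   // would call the R600-generated initializer
  else
    X->Flavor = "AMDGPU"; // would call the AMDGPU-generated initializer
  return X;
}

int main() {
  std::printf("%s / %s\n", createRegInfo("r600")->Flavor.c_str(),
              createRegInfo("amdgcn")->Flavor.c_str());
}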
@@ -62,8 +76,10 @@ static MCInstPrinter *createAMDGPUMCInstPrinter(const Triple &T,
const MCAsmInfo &MAI,
const MCInstrInfo &MII,
const MCRegisterInfo &MRI) {
- return T.getArch() == Triple::r600 ? new R600InstPrinter(MAI, MII, MRI) :
- new AMDGPUInstPrinter(MAI, MII, MRI);
+ if (T.getArch() == Triple::r600)
+ return new R600InstPrinter(MAI, MII, MRI);
+ else
+ return new AMDGPUInstPrinter(MAI, MII, MRI);
}
static MCTargetStreamer *createAMDGPUAsmTargetStreamer(MCStreamer &S,
@@ -76,23 +92,25 @@ static MCTargetStreamer *createAMDGPUAsmTargetStreamer(MCStreamer &S,
static MCTargetStreamer * createAMDGPUObjectTargetStreamer(
MCStreamer &S,
const MCSubtargetInfo &STI) {
- return new AMDGPUTargetELFStreamer(S);
+ return new AMDGPUTargetELFStreamer(S, STI);
}
static MCStreamer *createMCStreamer(const Triple &T, MCContext &Context,
std::unique_ptr<MCAsmBackend> &&MAB,
- raw_pwrite_stream &OS,
+ std::unique_ptr<MCObjectWriter> &&OW,
std::unique_ptr<MCCodeEmitter> &&Emitter,
bool RelaxAll) {
- return createAMDGPUELFStreamer(T, Context, std::move(MAB), OS,
+ return createAMDGPUELFStreamer(T, Context, std::move(MAB), std::move(OW),
std::move(Emitter), RelaxAll);
}
extern "C" void LLVMInitializeAMDGPUTargetMC() {
+
+ TargetRegistry::RegisterMCInstrInfo(getTheGCNTarget(), createAMDGPUMCInstrInfo);
+ TargetRegistry::RegisterMCInstrInfo(getTheAMDGPUTarget(), createR600MCInstrInfo);
for (Target *T : {&getTheAMDGPUTarget(), &getTheGCNTarget()}) {
RegisterMCAsmInfo<AMDGPUMCAsmInfo> X(*T);
- TargetRegistry::RegisterMCInstrInfo(*T, createAMDGPUMCInstrInfo);
TargetRegistry::RegisterMCRegInfo(*T, createAMDGPUMCRegisterInfo);
TargetRegistry::RegisterMCSubtargetInfo(*T, createAMDGPUMCSubtargetInfo);
TargetRegistry::RegisterMCInstPrinter(*T, createAMDGPUMCInstPrinter);
@@ -103,6 +121,8 @@ extern "C" void LLVMInitializeAMDGPUTargetMC() {
// R600 specific registration
TargetRegistry::RegisterMCCodeEmitter(getTheAMDGPUTarget(),
createR600MCCodeEmitter);
+ TargetRegistry::RegisterObjectTargetStreamer(
+ getTheAMDGPUTarget(), createAMDGPUObjectTargetStreamer);
// GCN specific registration
TargetRegistry::RegisterMCCodeEmitter(getTheGCNTarget(),
diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h
index 0b3563303ad0..f3628d96d6e9 100644
--- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h
+++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
//
/// \file
-/// \brief Provides AMDGPU specific target descriptions.
+/// Provides AMDGPU specific target descriptions.
//
//===----------------------------------------------------------------------===//
//
@@ -25,7 +25,7 @@ class MCAsmBackend;
class MCCodeEmitter;
class MCContext;
class MCInstrInfo;
-class MCObjectWriter;
+class MCObjectTargetWriter;
class MCRegisterInfo;
class MCSubtargetInfo;
class MCTargetOptions;
@@ -40,24 +40,30 @@ Target &getTheGCNTarget();
MCCodeEmitter *createR600MCCodeEmitter(const MCInstrInfo &MCII,
const MCRegisterInfo &MRI,
MCContext &Ctx);
+MCInstrInfo *createR600MCInstrInfo();
MCCodeEmitter *createSIMCCodeEmitter(const MCInstrInfo &MCII,
const MCRegisterInfo &MRI,
MCContext &Ctx);
-MCAsmBackend *createAMDGPUAsmBackend(const Target &T, const MCRegisterInfo &MRI,
- const Triple &TT, StringRef CPU,
+MCAsmBackend *createAMDGPUAsmBackend(const Target &T,
+ const MCSubtargetInfo &STI,
+ const MCRegisterInfo &MRI,
const MCTargetOptions &Options);
-std::unique_ptr<MCObjectWriter>
+std::unique_ptr<MCObjectTargetWriter>
createAMDGPUELFObjectWriter(bool Is64Bit, uint8_t OSABI,
- bool HasRelocationAddend, raw_pwrite_stream &OS);
+ bool HasRelocationAddend);
} // End llvm namespace
#define GET_REGINFO_ENUM
#include "AMDGPUGenRegisterInfo.inc"
#undef GET_REGINFO_ENUM
+#define GET_REGINFO_ENUM
+#include "R600GenRegisterInfo.inc"
+#undef GET_REGINFO_ENUM
+
#define GET_INSTRINFO_ENUM
#define GET_INSTRINFO_OPERAND_ENUM
#define GET_INSTRINFO_SCHED_ENUM
@@ -66,9 +72,20 @@ createAMDGPUELFObjectWriter(bool Is64Bit, uint8_t OSABI,
#undef GET_INSTRINFO_OPERAND_ENUM
#undef GET_INSTRINFO_ENUM
+#define GET_INSTRINFO_ENUM
+#define GET_INSTRINFO_OPERAND_ENUM
+#define GET_INSTRINFO_SCHED_ENUM
+#include "R600GenInstrInfo.inc"
+#undef GET_INSTRINFO_SCHED_ENUM
+#undef GET_INSTRINFO_OPERAND_ENUM
+#undef GET_INSTRINFO_ENUM
#define GET_SUBTARGETINFO_ENUM
#include "AMDGPUGenSubtargetInfo.inc"
#undef GET_SUBTARGETINFO_ENUM
+#define GET_SUBTARGETINFO_ENUM
+#include "R600GenSubtargetInfo.inc"
+#undef GET_SUBTARGETINFO_ENUM
+
#endif
diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
index d897956daccf..6a41e3f650bc 100644
--- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
+++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
@@ -39,6 +39,84 @@ using namespace llvm::AMDGPU;
// AMDGPUTargetStreamer
//===----------------------------------------------------------------------===//
+static const struct {
+ const char *Name;
+ unsigned Mach;
+} MachTable[] = {
+ // Radeon HD 2000/3000 Series (R600).
+ { "r600", ELF::EF_AMDGPU_MACH_R600_R600 },
+ { "r630", ELF::EF_AMDGPU_MACH_R600_R630 },
+ { "rs880", ELF::EF_AMDGPU_MACH_R600_RS880 },
+ { "rv670", ELF::EF_AMDGPU_MACH_R600_RV670 },
+ // Radeon HD 4000 Series (R700).
+ { "rv710", ELF::EF_AMDGPU_MACH_R600_RV710 },
+ { "rv730", ELF::EF_AMDGPU_MACH_R600_RV730 },
+ { "rv770", ELF::EF_AMDGPU_MACH_R600_RV770 },
+ // Radeon HD 5000 Series (Evergreen).
+ { "cedar", ELF::EF_AMDGPU_MACH_R600_CEDAR },
+ { "cypress", ELF::EF_AMDGPU_MACH_R600_CYPRESS },
+ { "juniper", ELF::EF_AMDGPU_MACH_R600_JUNIPER },
+ { "redwood", ELF::EF_AMDGPU_MACH_R600_REDWOOD },
+ { "sumo", ELF::EF_AMDGPU_MACH_R600_SUMO },
+ // Radeon HD 6000 Series (Northern Islands).
+ { "barts", ELF::EF_AMDGPU_MACH_R600_BARTS },
+ { "caicos", ELF::EF_AMDGPU_MACH_R600_CAICOS },
+ { "cayman", ELF::EF_AMDGPU_MACH_R600_CAYMAN },
+ { "turks", ELF::EF_AMDGPU_MACH_R600_TURKS },
+ // AMDGCN GFX6.
+ { "gfx600", ELF::EF_AMDGPU_MACH_AMDGCN_GFX600 },
+ { "tahiti", ELF::EF_AMDGPU_MACH_AMDGCN_GFX600 },
+ { "gfx601", ELF::EF_AMDGPU_MACH_AMDGCN_GFX601 },
+ { "hainan", ELF::EF_AMDGPU_MACH_AMDGCN_GFX601 },
+ { "oland", ELF::EF_AMDGPU_MACH_AMDGCN_GFX601 },
+ { "pitcairn", ELF::EF_AMDGPU_MACH_AMDGCN_GFX601 },
+ { "verde", ELF::EF_AMDGPU_MACH_AMDGCN_GFX601 },
+ // AMDGCN GFX7.
+ { "gfx700", ELF::EF_AMDGPU_MACH_AMDGCN_GFX700 },
+ { "kaveri", ELF::EF_AMDGPU_MACH_AMDGCN_GFX700 },
+ { "gfx701", ELF::EF_AMDGPU_MACH_AMDGCN_GFX701 },
+ { "hawaii", ELF::EF_AMDGPU_MACH_AMDGCN_GFX701 },
+ { "gfx702", ELF::EF_AMDGPU_MACH_AMDGCN_GFX702 },
+ { "gfx703", ELF::EF_AMDGPU_MACH_AMDGCN_GFX703 },
+ { "kabini", ELF::EF_AMDGPU_MACH_AMDGCN_GFX703 },
+ { "mullins", ELF::EF_AMDGPU_MACH_AMDGCN_GFX703 },
+ { "gfx704", ELF::EF_AMDGPU_MACH_AMDGCN_GFX704 },
+ { "bonaire", ELF::EF_AMDGPU_MACH_AMDGCN_GFX704 },
+ // AMDGCN GFX8.
+ { "gfx801", ELF::EF_AMDGPU_MACH_AMDGCN_GFX801 },
+ { "carrizo", ELF::EF_AMDGPU_MACH_AMDGCN_GFX801 },
+ { "gfx802", ELF::EF_AMDGPU_MACH_AMDGCN_GFX802 },
+ { "iceland", ELF::EF_AMDGPU_MACH_AMDGCN_GFX802 },
+ { "tonga", ELF::EF_AMDGPU_MACH_AMDGCN_GFX802 },
+ { "gfx803", ELF::EF_AMDGPU_MACH_AMDGCN_GFX803 },
+ { "fiji", ELF::EF_AMDGPU_MACH_AMDGCN_GFX803 },
+ { "polaris10", ELF::EF_AMDGPU_MACH_AMDGCN_GFX803 },
+ { "polaris11", ELF::EF_AMDGPU_MACH_AMDGCN_GFX803 },
+ { "gfx810", ELF::EF_AMDGPU_MACH_AMDGCN_GFX810 },
+ { "stoney", ELF::EF_AMDGPU_MACH_AMDGCN_GFX810 },
+ // AMDGCN GFX9.
+ { "gfx900", ELF::EF_AMDGPU_MACH_AMDGCN_GFX900 },
+ { "gfx902", ELF::EF_AMDGPU_MACH_AMDGCN_GFX902 },
+ { "gfx904", ELF::EF_AMDGPU_MACH_AMDGCN_GFX904 },
+ { "gfx906", ELF::EF_AMDGPU_MACH_AMDGCN_GFX906 },
+ // Unspecified processor.
+ { nullptr, ELF::EF_AMDGPU_MACH_NONE }
+};
+
+unsigned AMDGPUTargetStreamer::getMACH(StringRef GPU) const {
+ auto Entry = MachTable;
+ for (; Entry->Name && GPU != Entry->Name; ++Entry)
+ ;
+ return Entry->Mach;
+}
+
+const char *AMDGPUTargetStreamer::getMachName(unsigned Mach) {
+ auto Entry = MachTable;
+ for (; Entry->Name && Mach != Entry->Mach; ++Entry)
+ ;
+ return Entry->Name;
+}
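Both helpers walk the sentinel-terminated MachTable linearly; an unknown GPU name or mach value stops on the final {nullptr, EF_AMDGPU_MACH_NONE} row. A compilable miniature of the same lookup, with only a two-entry subset of the table and placeholder numeric values for the ELF::EF_AMDGPU_MACH_* constants:

#include <cstdio>
#include <cstring>

struct MachEntry { const char *Name; unsigned Mach; };

// Subset of the table; numbers are placeholders for EF_AMDGPU_MACH_* values.
static const MachEntry Table[] = {
    {"gfx900", 0x2c}, {"gfx906", 0x2f}, {nullptr, 0 /* MACH_NONE */}};

unsigned getMach(const char *GPU) {
  const MachEntry *E = Table;
  for (; E->Name && std::strcmp(GPU, E->Name) != 0; ++E)
    ;               // linear scan; the sentinel row stops the loop
  return E->Mach;   // unknown names fall through to MACH_NONE
}

int main() {
  std::printf("gfx906 -> 0x%x, bogus -> 0x%x\n", getMach("gfx906"),
              getMach("bogus"));
}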
+
bool AMDGPUTargetStreamer::EmitHSAMetadata(StringRef HSAMetadataString) {
HSAMD::Metadata HSAMetadata;
if (HSAMD::fromString(HSAMetadataString, HSAMetadata))
@@ -55,9 +133,12 @@ AMDGPUTargetAsmStreamer::AMDGPUTargetAsmStreamer(MCStreamer &S,
formatted_raw_ostream &OS)
: AMDGPUTargetStreamer(S), OS(OS) { }
-void
-AMDGPUTargetAsmStreamer::EmitDirectiveHSACodeObjectVersion(uint32_t Major,
- uint32_t Minor) {
+void AMDGPUTargetAsmStreamer::EmitDirectiveAMDGCNTarget(StringRef Target) {
+ OS << "\t.amdgcn_target \"" << Target << "\"\n";
+}
+
+void AMDGPUTargetAsmStreamer::EmitDirectiveHSACodeObjectVersion(
+ uint32_t Major, uint32_t Minor) {
OS << "\t.hsa_code_object_version " <<
Twine(Major) << "," << Twine(Minor) << '\n';
}
@@ -118,12 +199,157 @@ bool AMDGPUTargetAsmStreamer::EmitPALMetadata(
return true;
}
+void AMDGPUTargetAsmStreamer::EmitAmdhsaKernelDescriptor(
+ const MCSubtargetInfo &STI, StringRef KernelName,
+ const amdhsa::kernel_descriptor_t &KD, uint64_t NextVGPR, uint64_t NextSGPR,
+ bool ReserveVCC, bool ReserveFlatScr, bool ReserveXNACK) {
+ amdhsa::kernel_descriptor_t DefaultKD = getDefaultAmdhsaKernelDescriptor();
+
+ IsaInfo::IsaVersion IVersion = IsaInfo::getIsaVersion(STI.getFeatureBits());
+
+ OS << "\t.amdhsa_kernel " << KernelName << '\n';
+
+#define PRINT_IF_NOT_DEFAULT(STREAM, DIRECTIVE, KERNEL_DESC, \
+ DEFAULT_KERNEL_DESC, MEMBER_NAME, FIELD_NAME) \
+ if (AMDHSA_BITS_GET(KERNEL_DESC.MEMBER_NAME, FIELD_NAME) != \
+ AMDHSA_BITS_GET(DEFAULT_KERNEL_DESC.MEMBER_NAME, FIELD_NAME)) \
+ STREAM << "\t\t" << DIRECTIVE << " " \
+ << AMDHSA_BITS_GET(KERNEL_DESC.MEMBER_NAME, FIELD_NAME) << '\n';
+
+ if (KD.group_segment_fixed_size != DefaultKD.group_segment_fixed_size)
+ OS << "\t\t.amdhsa_group_segment_fixed_size " << KD.group_segment_fixed_size
+ << '\n';
+ if (KD.private_segment_fixed_size != DefaultKD.private_segment_fixed_size)
+ OS << "\t\t.amdhsa_private_segment_fixed_size "
+ << KD.private_segment_fixed_size << '\n';
+
+ PRINT_IF_NOT_DEFAULT(
+ OS, ".amdhsa_user_sgpr_private_segment_buffer", KD, DefaultKD,
+ kernel_code_properties,
+ amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER);
+ PRINT_IF_NOT_DEFAULT(OS, ".amdhsa_user_sgpr_dispatch_ptr", KD, DefaultKD,
+ kernel_code_properties,
+ amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR);
+ PRINT_IF_NOT_DEFAULT(OS, ".amdhsa_user_sgpr_queue_ptr", KD, DefaultKD,
+ kernel_code_properties,
+ amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR);
+ PRINT_IF_NOT_DEFAULT(
+ OS, ".amdhsa_user_sgpr_kernarg_segment_ptr", KD, DefaultKD,
+ kernel_code_properties,
+ amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR);
+ PRINT_IF_NOT_DEFAULT(OS, ".amdhsa_user_sgpr_dispatch_id", KD, DefaultKD,
+ kernel_code_properties,
+ amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID);
+ PRINT_IF_NOT_DEFAULT(
+ OS, ".amdhsa_user_sgpr_flat_scratch_init", KD, DefaultKD,
+ kernel_code_properties,
+ amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT);
+ PRINT_IF_NOT_DEFAULT(
+ OS, ".amdhsa_user_sgpr_private_segment_size", KD, DefaultKD,
+ kernel_code_properties,
+ amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE);
+ PRINT_IF_NOT_DEFAULT(
+ OS, ".amdhsa_system_sgpr_private_segment_wavefront_offset", KD, DefaultKD,
+ compute_pgm_rsrc2,
+ amdhsa::COMPUTE_PGM_RSRC2_ENABLE_SGPR_PRIVATE_SEGMENT_WAVEFRONT_OFFSET);
+ PRINT_IF_NOT_DEFAULT(OS, ".amdhsa_system_sgpr_workgroup_id_x", KD, DefaultKD,
+ compute_pgm_rsrc2,
+ amdhsa::COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_ID_X);
+ PRINT_IF_NOT_DEFAULT(OS, ".amdhsa_system_sgpr_workgroup_id_y", KD, DefaultKD,
+ compute_pgm_rsrc2,
+ amdhsa::COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_ID_Y);
+ PRINT_IF_NOT_DEFAULT(OS, ".amdhsa_system_sgpr_workgroup_id_z", KD, DefaultKD,
+ compute_pgm_rsrc2,
+ amdhsa::COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_ID_Z);
+ PRINT_IF_NOT_DEFAULT(OS, ".amdhsa_system_sgpr_workgroup_info", KD, DefaultKD,
+ compute_pgm_rsrc2,
+ amdhsa::COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_INFO);
+ PRINT_IF_NOT_DEFAULT(OS, ".amdhsa_system_vgpr_workitem_id", KD, DefaultKD,
+ compute_pgm_rsrc2,
+ amdhsa::COMPUTE_PGM_RSRC2_ENABLE_VGPR_WORKITEM_ID);
+
+ // These directives are required.
+ OS << "\t\t.amdhsa_next_free_vgpr " << NextVGPR << '\n';
+ OS << "\t\t.amdhsa_next_free_sgpr " << NextSGPR << '\n';
+
+ if (!ReserveVCC)
+ OS << "\t\t.amdhsa_reserve_vcc " << ReserveVCC << '\n';
+ if (IVersion.Major >= 7 && !ReserveFlatScr)
+ OS << "\t\t.amdhsa_reserve_flat_scratch " << ReserveFlatScr << '\n';
+ if (IVersion.Major >= 8 && ReserveXNACK != hasXNACK(STI))
+ OS << "\t\t.amdhsa_reserve_xnack_mask " << ReserveXNACK << '\n';
+
+ PRINT_IF_NOT_DEFAULT(OS, ".amdhsa_float_round_mode_32", KD, DefaultKD,
+ compute_pgm_rsrc1,
+ amdhsa::COMPUTE_PGM_RSRC1_FLOAT_ROUND_MODE_32);
+ PRINT_IF_NOT_DEFAULT(OS, ".amdhsa_float_round_mode_16_64", KD, DefaultKD,
+ compute_pgm_rsrc1,
+ amdhsa::COMPUTE_PGM_RSRC1_FLOAT_ROUND_MODE_16_64);
+ PRINT_IF_NOT_DEFAULT(OS, ".amdhsa_float_denorm_mode_32", KD, DefaultKD,
+ compute_pgm_rsrc1,
+ amdhsa::COMPUTE_PGM_RSRC1_FLOAT_DENORM_MODE_32);
+ PRINT_IF_NOT_DEFAULT(OS, ".amdhsa_float_denorm_mode_16_64", KD, DefaultKD,
+ compute_pgm_rsrc1,
+ amdhsa::COMPUTE_PGM_RSRC1_FLOAT_DENORM_MODE_16_64);
+ PRINT_IF_NOT_DEFAULT(OS, ".amdhsa_dx10_clamp", KD, DefaultKD,
+ compute_pgm_rsrc1,
+ amdhsa::COMPUTE_PGM_RSRC1_ENABLE_DX10_CLAMP);
+ PRINT_IF_NOT_DEFAULT(OS, ".amdhsa_ieee_mode", KD, DefaultKD,
+ compute_pgm_rsrc1,
+ amdhsa::COMPUTE_PGM_RSRC1_ENABLE_IEEE_MODE);
+ if (IVersion.Major >= 9)
+ PRINT_IF_NOT_DEFAULT(OS, ".amdhsa_fp16_overflow", KD, DefaultKD,
+ compute_pgm_rsrc1,
+ amdhsa::COMPUTE_PGM_RSRC1_FP16_OVFL);
+ PRINT_IF_NOT_DEFAULT(
+ OS, ".amdhsa_exception_fp_ieee_invalid_op", KD, DefaultKD,
+ compute_pgm_rsrc2,
+ amdhsa::COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_IEEE_754_FP_INVALID_OPERATION);
+ PRINT_IF_NOT_DEFAULT(
+ OS, ".amdhsa_exception_fp_denorm_src", KD, DefaultKD, compute_pgm_rsrc2,
+ amdhsa::COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_FP_DENORMAL_SOURCE);
+ PRINT_IF_NOT_DEFAULT(
+ OS, ".amdhsa_exception_fp_ieee_div_zero", KD, DefaultKD,
+ compute_pgm_rsrc2,
+ amdhsa::COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_IEEE_754_FP_DIVISION_BY_ZERO);
+ PRINT_IF_NOT_DEFAULT(
+ OS, ".amdhsa_exception_fp_ieee_overflow", KD, DefaultKD,
+ compute_pgm_rsrc2,
+ amdhsa::COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_IEEE_754_FP_OVERFLOW);
+ PRINT_IF_NOT_DEFAULT(
+ OS, ".amdhsa_exception_fp_ieee_underflow", KD, DefaultKD,
+ compute_pgm_rsrc2,
+ amdhsa::COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_IEEE_754_FP_UNDERFLOW);
+ PRINT_IF_NOT_DEFAULT(
+ OS, ".amdhsa_exception_fp_ieee_inexact", KD, DefaultKD, compute_pgm_rsrc2,
+ amdhsa::COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_IEEE_754_FP_INEXACT);
+ PRINT_IF_NOT_DEFAULT(
+ OS, ".amdhsa_exception_int_div_zero", KD, DefaultKD, compute_pgm_rsrc2,
+ amdhsa::COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_INT_DIVIDE_BY_ZERO);
+#undef PRINT_IF_NOT_DEFAULT
+
+ OS << "\t.end_amdhsa_kernel\n";
+}
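The assembly streamer prints a .amdhsa_* directive only when the descriptor field differs from getDefaultAmdhsaKernelDescriptor(); PRINT_IF_NOT_DEFAULT extracts the bit-field from both descriptors and compares. A simplified stand-alone version of that compare-then-print pattern; the bit positions and the second directive name here are invented:

#include <cstdint>
#include <cstdio>

// Simplified stand-in for AMDHSA_BITS_GET.
static unsigned getBits(uint32_t Word, unsigned Shift, uint32_t Mask) {
  return (Word >> Shift) & Mask;
}

void printIfNotDefault(const char *Directive, uint32_t KD, uint32_t DefaultKD,
                       unsigned Shift, uint32_t Mask) {
  if (getBits(KD, Shift, Mask) != getBits(DefaultKD, Shift, Mask))
    std::printf("\t\t%s %u\n", Directive, getBits(KD, Shift, Mask));
}

int main() {
  uint32_t Default = 0, Rsrc2 = 1u << 7; // pretend bit 7 enables workgroup_id_x
  printIfNotDefault(".amdhsa_system_sgpr_workgroup_id_x", Rsrc2, Default, 7, 1);
  printIfNotDefault(".amdhsa_system_sgpr_workgroup_id_y", Rsrc2, Default, 8, 1);
}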
+
//===----------------------------------------------------------------------===//
// AMDGPUTargetELFStreamer
//===----------------------------------------------------------------------===//
-AMDGPUTargetELFStreamer::AMDGPUTargetELFStreamer(MCStreamer &S)
- : AMDGPUTargetStreamer(S), Streamer(S) {}
+AMDGPUTargetELFStreamer::AMDGPUTargetELFStreamer(
+ MCStreamer &S, const MCSubtargetInfo &STI)
+ : AMDGPUTargetStreamer(S), Streamer(S) {
+ MCAssembler &MCA = getStreamer().getAssembler();
+ unsigned EFlags = MCA.getELFHeaderEFlags();
+
+ EFlags &= ~ELF::EF_AMDGPU_MACH;
+ EFlags |= getMACH(STI.getCPU());
+
+ EFlags &= ~ELF::EF_AMDGPU_XNACK;
+ if (AMDGPU::hasXNACK(STI))
+ EFlags |= ELF::EF_AMDGPU_XNACK;
+
+ MCA.setELFHeaderEFlags(EFlags);
+}
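With this change the ELF e_flags are computed in the target streamer constructor rather than in the ELF streamer: the MACH field is rewritten from the CPU name and the XNACK bit is set from the subtarget feature. A stand-alone sketch of that bit manipulation, with placeholder masks for the EF_AMDGPU_* constants:

#include <cstdint>
#include <cstdio>

constexpr uint32_t MACH_MASK = 0x0ff; // placeholder for EF_AMDGPU_MACH
constexpr uint32_t XNACK_BIT = 0x100; // placeholder for EF_AMDGPU_XNACK

uint32_t updateEFlags(uint32_t EFlags, uint32_t Mach, bool HasXNACK) {
  EFlags &= ~MACH_MASK;          // clear the old MACH field
  EFlags |= (Mach & MACH_MASK);  // insert the value derived from the CPU name
  EFlags &= ~XNACK_BIT;
  if (HasXNACK)
    EFlags |= XNACK_BIT;
  return EFlags;
}

int main() {
  std::printf("0x%x\n", updateEFlags(/*EFlags=*/0, /*Mach=*/0x2c,
                                     /*HasXNACK=*/true));
}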
MCELFStreamer &AMDGPUTargetELFStreamer::getStreamer() {
return static_cast<MCELFStreamer &>(Streamer);
@@ -150,9 +376,10 @@ void AMDGPUTargetELFStreamer::EmitAMDGPUNote(
S.PopSection();
}
-void
-AMDGPUTargetELFStreamer::EmitDirectiveHSACodeObjectVersion(uint32_t Major,
- uint32_t Minor) {
+void AMDGPUTargetELFStreamer::EmitDirectiveAMDGCNTarget(StringRef Target) {}
+
+void AMDGPUTargetELFStreamer::EmitDirectiveHSACodeObjectVersion(
+ uint32_t Major, uint32_t Minor) {
EmitAMDGPUNote(
MCConstantExpr::create(8, getContext()),
@@ -207,7 +434,7 @@ void AMDGPUTargetELFStreamer::EmitAMDGPUSymbolType(StringRef SymbolName,
unsigned Type) {
MCSymbolELF *Symbol = cast<MCSymbolELF>(
getStreamer().getContext().getOrCreateSymbol(SymbolName));
- Symbol->setType(ELF::STT_AMDGPU_HSA_KERNEL);
+ Symbol->setType(Type);
}
bool AMDGPUTargetELFStreamer::EmitISAVersion(StringRef IsaVersionString) {
@@ -271,3 +498,46 @@ bool AMDGPUTargetELFStreamer::EmitPALMetadata(
);
return true;
}
+
+void AMDGPUTargetELFStreamer::EmitAmdhsaKernelDescriptor(
+ const MCSubtargetInfo &STI, StringRef KernelName,
+ const amdhsa::kernel_descriptor_t &KernelDescriptor, uint64_t NextVGPR,
+ uint64_t NextSGPR, bool ReserveVCC, bool ReserveFlatScr,
+ bool ReserveXNACK) {
+ auto &Streamer = getStreamer();
+ auto &Context = Streamer.getContext();
+
+ MCSymbolELF *KernelDescriptorSymbol = cast<MCSymbolELF>(
+ Context.getOrCreateSymbol(Twine(KernelName) + Twine(".kd")));
+ KernelDescriptorSymbol->setBinding(ELF::STB_GLOBAL);
+ KernelDescriptorSymbol->setType(ELF::STT_OBJECT);
+ KernelDescriptorSymbol->setSize(
+ MCConstantExpr::create(sizeof(KernelDescriptor), Context));
+
+ MCSymbolELF *KernelCodeSymbol = cast<MCSymbolELF>(
+ Context.getOrCreateSymbol(Twine(KernelName)));
+ KernelCodeSymbol->setBinding(ELF::STB_LOCAL);
+
+ Streamer.EmitLabel(KernelDescriptorSymbol);
+ Streamer.EmitBytes(StringRef(
+ (const char*)&(KernelDescriptor),
+ offsetof(amdhsa::kernel_descriptor_t, kernel_code_entry_byte_offset)));
+ // FIXME: Remove the use of VK_AMDGPU_REL64 in the expression below. The
+ // expression being created is:
+ // (start of kernel code) - (start of kernel descriptor)
+ // It implies R_AMDGPU_REL64, but ends up being R_AMDGPU_ABS64.
+ Streamer.EmitValue(MCBinaryExpr::createSub(
+ MCSymbolRefExpr::create(
+ KernelCodeSymbol, MCSymbolRefExpr::VK_AMDGPU_REL64, Context),
+ MCSymbolRefExpr::create(
+ KernelDescriptorSymbol, MCSymbolRefExpr::VK_None, Context),
+ Context),
+ sizeof(KernelDescriptor.kernel_code_entry_byte_offset));
+ Streamer.EmitBytes(StringRef(
+ (const char*)&(KernelDescriptor) +
+ offsetof(amdhsa::kernel_descriptor_t, kernel_code_entry_byte_offset) +
+ sizeof(KernelDescriptor.kernel_code_entry_byte_offset),
+ sizeof(KernelDescriptor) -
+ offsetof(amdhsa::kernel_descriptor_t, kernel_code_entry_byte_offset) -
+ sizeof(KernelDescriptor.kernel_code_entry_byte_offset)));
+}
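The ELF path emits the descriptor in three pieces so that kernel_code_entry_byte_offset can be written as a (kernel code symbol) minus (descriptor symbol) expression: the bytes before the field, the 8-byte expression, then the remainder. The sketch below only computes the split sizes; the struct layout is a simplified stand-in for amdhsa::kernel_descriptor_t.

#include <cstddef>
#include <cstdint>
#include <cstdio>

// Simplified 64-byte descriptor with the entry-offset field in the middle.
struct MockKernelDescriptor {
  uint32_t group_segment_fixed_size;
  uint32_t private_segment_fixed_size;
  uint8_t  reserved0[8];
  int64_t  kernel_code_entry_byte_offset; // emitted via a symbol difference
  uint8_t  reserved1[40];
};

int main() {
  constexpr size_t Off =
      offsetof(MockKernelDescriptor, kernel_code_entry_byte_offset);
  constexpr size_t Size = sizeof(MockKernelDescriptor);
  std::printf("emit %zu raw bytes, 8-byte reloc expression, then %zu raw bytes\n",
              Off, Size - Off - sizeof(int64_t));
}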
diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h
index 0919b754480d..472da1b73593 100644
--- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h
+++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h
@@ -14,6 +14,7 @@
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/Support/AMDGPUMetadata.h"
+#include "llvm/Support/AMDHSAKernelDescriptor.h"
namespace llvm {
#include "AMDGPUPTNote.h"
@@ -30,9 +31,17 @@ class AMDGPUTargetStreamer : public MCTargetStreamer {
protected:
MCContext &getContext() const { return Streamer.getContext(); }
+ /// \returns Equivalent EF_AMDGPU_MACH_* value for given \p GPU name.
+ unsigned getMACH(StringRef GPU) const;
+
public:
+ /// \returns Equivalent GPU name for an EF_AMDGPU_MACH_* value.
+ static const char *getMachName(unsigned Mach);
+
AMDGPUTargetStreamer(MCStreamer &S) : MCTargetStreamer(S) {}
+ virtual void EmitDirectiveAMDGCNTarget(StringRef Target) = 0;
+
virtual void EmitDirectiveHSACodeObjectVersion(uint32_t Major,
uint32_t Minor) = 0;
@@ -56,12 +65,21 @@ public:
/// \returns True on success, false on failure.
virtual bool EmitPALMetadata(const AMDGPU::PALMD::Metadata &PALMetadata) = 0;
+
+ virtual void EmitAmdhsaKernelDescriptor(
+ const MCSubtargetInfo &STI, StringRef KernelName,
+ const amdhsa::kernel_descriptor_t &KernelDescriptor, uint64_t NextVGPR,
+ uint64_t NextSGPR, bool ReserveVCC, bool ReserveFlatScr,
+ bool ReserveXNACK) = 0;
};
class AMDGPUTargetAsmStreamer final : public AMDGPUTargetStreamer {
formatted_raw_ostream &OS;
public:
AMDGPUTargetAsmStreamer(MCStreamer &S, formatted_raw_ostream &OS);
+
+ void EmitDirectiveAMDGCNTarget(StringRef Target) override;
+
void EmitDirectiveHSACodeObjectVersion(uint32_t Major,
uint32_t Minor) override;
@@ -81,6 +99,12 @@ public:
/// \returns True on success, false on failure.
bool EmitPALMetadata(const AMDGPU::PALMD::Metadata &PALMetadata) override;
+
+ void EmitAmdhsaKernelDescriptor(
+ const MCSubtargetInfo &STI, StringRef KernelName,
+ const amdhsa::kernel_descriptor_t &KernelDescriptor, uint64_t NextVGPR,
+ uint64_t NextSGPR, bool ReserveVCC, bool ReserveFlatScr,
+ bool ReserveXNACK) override;
};
class AMDGPUTargetELFStreamer final : public AMDGPUTargetStreamer {
@@ -90,10 +114,12 @@ class AMDGPUTargetELFStreamer final : public AMDGPUTargetStreamer {
function_ref<void(MCELFStreamer &)> EmitDesc);
public:
- AMDGPUTargetELFStreamer(MCStreamer &S);
+ AMDGPUTargetELFStreamer(MCStreamer &S, const MCSubtargetInfo &STI);
MCELFStreamer &getStreamer();
+ void EmitDirectiveAMDGCNTarget(StringRef Target) override;
+
void EmitDirectiveHSACodeObjectVersion(uint32_t Major,
uint32_t Minor) override;
@@ -113,6 +139,12 @@ public:
/// \returns True on success, false on failure.
bool EmitPALMetadata(const AMDGPU::PALMD::Metadata &PALMetadata) override;
+
+ void EmitAmdhsaKernelDescriptor(
+ const MCSubtargetInfo &STI, StringRef KernelName,
+ const amdhsa::kernel_descriptor_t &KernelDescriptor, uint64_t NextVGPR,
+ uint64_t NextSGPR, bool ReserveVCC, bool ReserveFlatScr,
+ bool ReserveXNACK) override;
};
}
diff --git a/lib/Target/AMDGPU/MCTargetDesc/CMakeLists.txt b/lib/Target/AMDGPU/MCTargetDesc/CMakeLists.txt
index f9cb4678dc51..2d201bbbd7b8 100644
--- a/lib/Target/AMDGPU/MCTargetDesc/CMakeLists.txt
+++ b/lib/Target/AMDGPU/MCTargetDesc/CMakeLists.txt
@@ -2,11 +2,11 @@ add_llvm_library(LLVMAMDGPUDesc
AMDGPUAsmBackend.cpp
AMDGPUELFObjectWriter.cpp
AMDGPUELFStreamer.cpp
- AMDGPUHSAMetadataStreamer.cpp
AMDGPUMCAsmInfo.cpp
AMDGPUMCCodeEmitter.cpp
AMDGPUMCTargetDesc.cpp
AMDGPUTargetStreamer.cpp
R600MCCodeEmitter.cpp
+ R600MCTargetDesc.cpp
SIMCCodeEmitter.cpp
)
diff --git a/lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp b/lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp
index eab90e1d344c..28d4bc1829e2 100644
--- a/lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp
+++ b/lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp
@@ -9,13 +9,12 @@
//
/// \file
///
-/// \brief The R600 code emitter produces machine code that can be executed
+/// The R600 code emitter produces machine code that can be executed
/// directly on the GPU device.
//
//===----------------------------------------------------------------------===//
#include "MCTargetDesc/AMDGPUFixupKinds.h"
-#include "MCTargetDesc/AMDGPUMCCodeEmitter.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "R600Defines.h"
#include "llvm/MC/MCCodeEmitter.h"
@@ -36,30 +35,40 @@ using namespace llvm;
namespace {
-class R600MCCodeEmitter : public AMDGPUMCCodeEmitter {
+class R600MCCodeEmitter : public MCCodeEmitter {
const MCRegisterInfo &MRI;
+ const MCInstrInfo &MCII;
public:
R600MCCodeEmitter(const MCInstrInfo &mcii, const MCRegisterInfo &mri)
- : AMDGPUMCCodeEmitter(mcii), MRI(mri) {}
+ : MRI(mri), MCII(mcii) {}
R600MCCodeEmitter(const R600MCCodeEmitter &) = delete;
R600MCCodeEmitter &operator=(const R600MCCodeEmitter &) = delete;
- /// \brief Encode the instruction and write it to the OS.
+ /// Encode the instruction and write it to the OS.
void encodeInstruction(const MCInst &MI, raw_ostream &OS,
SmallVectorImpl<MCFixup> &Fixups,
- const MCSubtargetInfo &STI) const override;
+ const MCSubtargetInfo &STI) const;
/// \returns the encoding for an MCOperand.
uint64_t getMachineOpValue(const MCInst &MI, const MCOperand &MO,
SmallVectorImpl<MCFixup> &Fixups,
- const MCSubtargetInfo &STI) const override;
+ const MCSubtargetInfo &STI) const;
private:
+
void Emit(uint32_t value, raw_ostream &OS) const;
void Emit(uint64_t value, raw_ostream &OS) const;
unsigned getHWReg(unsigned regNo) const;
+
+ uint64_t getBinaryCodeForInstr(const MCInst &MI,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+ uint64_t computeAvailableFeatures(const FeatureBitset &FB) const;
+ void verifyInstructionPredicates(const MCInst &MI,
+ uint64_t AvailableFeatures) const;
+
};
} // end anonymous namespace
@@ -94,16 +103,16 @@ void R600MCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS,
computeAvailableFeatures(STI.getFeatureBits()));
const MCInstrDesc &Desc = MCII.get(MI.getOpcode());
- if (MI.getOpcode() == AMDGPU::RETURN ||
- MI.getOpcode() == AMDGPU::FETCH_CLAUSE ||
- MI.getOpcode() == AMDGPU::ALU_CLAUSE ||
- MI.getOpcode() == AMDGPU::BUNDLE ||
- MI.getOpcode() == AMDGPU::KILL) {
+ if (MI.getOpcode() == R600::RETURN ||
+ MI.getOpcode() == R600::FETCH_CLAUSE ||
+ MI.getOpcode() == R600::ALU_CLAUSE ||
+ MI.getOpcode() == R600::BUNDLE ||
+ MI.getOpcode() == R600::KILL) {
return;
} else if (IS_VTX(Desc)) {
uint64_t InstWord01 = getBinaryCodeForInstr(MI, Fixups, STI);
uint32_t InstWord2 = MI.getOperand(2).getImm(); // Offset
- if (!(STI.getFeatureBits()[AMDGPU::FeatureCaymanISA])) {
+ if (!(STI.getFeatureBits()[R600::FeatureCaymanISA])) {
InstWord2 |= 1 << 19; // Mega-Fetch bit
}
@@ -136,7 +145,7 @@ void R600MCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS,
Emit((uint32_t) 0, OS);
} else {
uint64_t Inst = getBinaryCodeForInstr(MI, Fixups, STI);
- if ((STI.getFeatureBits()[AMDGPU::FeatureR600ALUInst]) &&
+ if ((STI.getFeatureBits()[R600::FeatureR600ALUInst]) &&
((Desc.TSFlags & R600_InstFlag::OP1) ||
Desc.TSFlags & R600_InstFlag::OP2)) {
uint64_t ISAOpCode = Inst & (0x3FFULL << 39);
@@ -148,11 +157,11 @@ void R600MCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS,
}
void R600MCCodeEmitter::Emit(uint32_t Value, raw_ostream &OS) const {
- support::endian::Writer<support::little>(OS).write(Value);
+ support::endian::write(OS, Value, support::little);
}
void R600MCCodeEmitter::Emit(uint64_t Value, raw_ostream &OS) const {
- support::endian::Writer<support::little>(OS).write(Value);
+ support::endian::write(OS, Value, support::little);
}
unsigned R600MCCodeEmitter::getHWReg(unsigned RegNo) const {
@@ -186,4 +195,4 @@ uint64_t R600MCCodeEmitter::getMachineOpValue(const MCInst &MI,
}
#define ENABLE_INSTR_PREDICATE_VERIFIER
-#include "AMDGPUGenMCCodeEmitter.inc"
+#include "R600GenMCCodeEmitter.inc"
diff --git a/lib/Target/AMDGPU/MCTargetDesc/R600MCTargetDesc.cpp b/lib/Target/AMDGPU/MCTargetDesc/R600MCTargetDesc.cpp
new file mode 100644
index 000000000000..1c99a708e5ac
--- /dev/null
+++ b/lib/Target/AMDGPU/MCTargetDesc/R600MCTargetDesc.cpp
@@ -0,0 +1,27 @@
+//===-- R600MCTargetDesc.cpp - R600 Target Descriptions -------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief This file provides R600 specific target descriptions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPUMCTargetDesc.h"
+#include "llvm/MC/MCInstrInfo.h"
+
+using namespace llvm;
+
+#define GET_INSTRINFO_MC_DESC
+#include "R600GenInstrInfo.inc"
+
+MCInstrInfo *llvm::createR600MCInstrInfo() {
+ MCInstrInfo *X = new MCInstrInfo();
+ InitR600MCInstrInfo(X);
+ return X;
+}
diff --git a/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp b/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp
index 94c0157edeb5..36913bd04274 100644
--- a/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp
+++ b/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
//
/// \file
-/// \brief The SI code emitter produces machine code that can be executed
+/// The SI code emitter produces machine code that can be executed
/// directly on the GPU device.
//
//===----------------------------------------------------------------------===//
@@ -43,7 +43,7 @@ namespace {
class SIMCCodeEmitter : public AMDGPUMCCodeEmitter {
const MCRegisterInfo &MRI;
- /// \brief Encode an fp or int literal
+ /// Encode an fp or int literal
uint32_t getLitEncoding(const MCOperand &MO, const MCOperandInfo &OpInfo,
const MCSubtargetInfo &STI) const;
@@ -54,7 +54,7 @@ public:
SIMCCodeEmitter(const SIMCCodeEmitter &) = delete;
SIMCCodeEmitter &operator=(const SIMCCodeEmitter &) = delete;
- /// \brief Encode the instruction and write it to the OS.
+ /// Encode the instruction and write it to the OS.
void encodeInstruction(const MCInst &MI, raw_ostream &OS,
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const override;
@@ -64,7 +64,7 @@ public:
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const override;
- /// \brief Use a fixup to encode the simm16 field for SOPP branch
+ /// Use a fixup to encode the simm16 field for SOPP branch
/// instructions.
unsigned getSOPPBrEncoding(const MCInst &MI, unsigned OpNo,
SmallVectorImpl<MCFixup> &Fixups,
@@ -335,13 +335,24 @@ SIMCCodeEmitter::getSDWASrcEncoding(const MCInst &MI, unsigned OpNo,
const MCOperand &MO = MI.getOperand(OpNo);
- unsigned Reg = MO.getReg();
- RegEnc |= MRI.getEncodingValue(Reg);
- RegEnc &= SDWA9EncValues::SRC_VGPR_MASK;
- if (AMDGPU::isSGPR(AMDGPU::mc2PseudoReg(Reg), &MRI)) {
- RegEnc |= SDWA9EncValues::SRC_SGPR_MASK;
+ if (MO.isReg()) {
+ unsigned Reg = MO.getReg();
+ RegEnc |= MRI.getEncodingValue(Reg);
+ RegEnc &= SDWA9EncValues::SRC_VGPR_MASK;
+ if (AMDGPU::isSGPR(AMDGPU::mc2PseudoReg(Reg), &MRI)) {
+ RegEnc |= SDWA9EncValues::SRC_SGPR_MASK;
+ }
+ return RegEnc;
+ } else {
+ const MCInstrDesc &Desc = MCII.get(MI.getOpcode());
+ uint32_t Enc = getLitEncoding(MO, Desc.OpInfo[OpNo], STI);
+ if (Enc != ~0U && Enc != 255) {
+ return Enc | SDWA9EncValues::SRC_SGPR_MASK;
+ }
}
- return RegEnc;
+
+ llvm_unreachable("Unsupported operand kind");
+ return 0;
}
unsigned
@@ -427,3 +438,6 @@ uint64_t SIMCCodeEmitter::getMachineOpValue(const MCInst &MI,
llvm_unreachable("Encoding of this operand type is not supported yet.");
return 0;
}
+
+#define ENABLE_INSTR_PREDICATE_VERIFIER
+#include "AMDGPUGenMCCodeEmitter.inc"
diff --git a/lib/Target/AMDGPU/MIMGInstructions.td b/lib/Target/AMDGPU/MIMGInstructions.td
index 30a2df510386..1e0bc62c45a6 100644
--- a/lib/Target/AMDGPU/MIMGInstructions.td
+++ b/lib/Target/AMDGPU/MIMGInstructions.td
@@ -7,9 +7,63 @@
//
//===----------------------------------------------------------------------===//
-class MIMG_Mask <string op, int channels> {
- string Op = op;
- int Channels = channels;
+// MIMG-specific encoding families to distinguish between semantically
+// equivalent machine instructions with different encodings.
+//
+// - MIMGEncGfx6: encoding introduced with gfx6 (obsoleted for atomics in gfx8)
+// - MIMGEncGfx8: encoding introduced with gfx8 for atomics
+class MIMGEncoding;
+
+def MIMGEncGfx6 : MIMGEncoding;
+def MIMGEncGfx8 : MIMGEncoding;
+
+def MIMGEncoding : GenericEnum {
+ let FilterClass = "MIMGEncoding";
+}
+
+// Represent an ISA-level opcode, independent of the encoding and the
+// vdata/vaddr size.
+class MIMGBaseOpcode {
+ MIMGBaseOpcode BaseOpcode = !cast<MIMGBaseOpcode>(NAME);
+ bit Store = 0;
+ bit Atomic = 0;
+ bit AtomicX2 = 0; // (f)cmpswap
+ bit Sampler = 0;
+ bits<8> NumExtraArgs = 0;
+ bit Gradients = 0;
+ bit Coordinates = 1;
+ bit LodOrClampOrMip = 0;
+ bit HasD16 = 0;
+}
+
+def MIMGBaseOpcode : GenericEnum {
+ let FilterClass = "MIMGBaseOpcode";
+}
+
+def MIMGBaseOpcodesTable : GenericTable {
+ let FilterClass = "MIMGBaseOpcode";
+ let CppTypeName = "MIMGBaseOpcodeInfo";
+ let Fields = ["BaseOpcode", "Store", "Atomic", "AtomicX2", "Sampler",
+ "NumExtraArgs", "Gradients", "Coordinates", "LodOrClampOrMip",
+ "HasD16"];
+ GenericEnum TypeOf_BaseOpcode = MIMGBaseOpcode;
+
+ let PrimaryKey = ["BaseOpcode"];
+ let PrimaryKeyName = "getMIMGBaseOpcodeInfo";
+}
+
+def MIMGDim : GenericEnum {
+ let FilterClass = "AMDGPUDimProps";
+}
+
+def MIMGDimInfoTable : GenericTable {
+ let FilterClass = "AMDGPUDimProps";
+ let CppTypeName = "MIMGDimInfo";
+ let Fields = ["Dim", "NumCoords", "NumGradients", "DA"];
+ GenericEnum TypeOf_Dim = MIMGDim;
+
+ let PrimaryKey = ["Dim"];
+ let PrimaryKeyName = "getMIMGDimInfo";
}
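+// A rough sketch (not part of this patch) of how the tables defined above are
+// typically consumed from C++; the exact namespace and generated signature are
+// assumptions:
+//
+//   if (const MIMGBaseOpcodeInfo *Info =
+//           AMDGPU::getMIMGBaseOpcodeInfo(BaseOpcodeValue)) {
+//     bool NeedsSampler = Info->Sampler; // query per-opcode properties
+//   }
+//
+// CppTypeName names the generated row struct, PrimaryKey the lookup key, and
+// PrimaryKeyName the generated accessor.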
class mimg <bits<7> si, bits<7> vi = si> {
@@ -17,254 +71,372 @@ class mimg <bits<7> si, bits<7> vi = si> {
field bits<7> VI = vi;
}
-class MIMG_Helper <dag outs, dag ins, string asm,
- string dns=""> : MIMG<outs, ins, asm,[]> {
+class MIMG <dag outs, string dns = "">
+ : InstSI <outs, (ins), "", []> {
+
+ let VM_CNT = 1;
+ let EXP_CNT = 1;
+ let MIMG = 1;
+ let Uses = [EXEC];
let mayLoad = 1;
let mayStore = 0;
let hasPostISelHook = 1;
+ let SchedRW = [WriteVMEM];
+ let UseNamedOperandTable = 1;
+ let hasSideEffects = 0; // XXX ????
+
+ let SubtargetPredicate = isGCN;
let DecoderNamespace = dns;
let isAsmParserOnly = !if(!eq(dns,""), 1, 0);
let AsmMatchConverter = "cvtMIMG";
let usesCustomInserter = 1;
- let SchedRW = [WriteVMEM];
+
+ Instruction Opcode = !cast<Instruction>(NAME);
+ MIMGBaseOpcode BaseOpcode;
+ MIMGEncoding MIMGEncoding = MIMGEncGfx6;
+ bits<8> VDataDwords;
+ bits<8> VAddrDwords;
+}
+
+def MIMGInfoTable : GenericTable {
+ let FilterClass = "MIMG";
+ let CppTypeName = "MIMGInfo";
+ let Fields = ["Opcode", "BaseOpcode", "MIMGEncoding", "VDataDwords", "VAddrDwords"];
+ GenericEnum TypeOf_BaseOpcode = MIMGBaseOpcode;
+ GenericEnum TypeOf_MIMGEncoding = MIMGEncoding;
+
+ let PrimaryKey = ["BaseOpcode", "MIMGEncoding", "VDataDwords", "VAddrDwords"];
+ let PrimaryKeyName = "getMIMGOpcodeHelper";
+}
+
+def getMIMGInfo : SearchIndex {
+ let Table = MIMGInfoTable;
+ let Key = ["Opcode"];
}
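+// Together these provide two lookup directions: getMIMGOpcodeHelper (the
+// primary key of MIMGInfoTable) finds the row for a given base opcode,
+// encoding family and vdata/vaddr dword counts, while the getMIMGInfo search
+// index maps an existing machine opcode back to its row.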
class MIMG_NoSampler_Helper <bits<7> op, string asm,
RegisterClass dst_rc,
RegisterClass addr_rc,
- string dns=""> : MIMG_Helper <
- (outs dst_rc:$vdata),
- (ins addr_rc:$vaddr, SReg_256:$srsrc,
- dmask:$dmask, unorm:$unorm, GLC:$glc, slc:$slc,
- r128:$r128, tfe:$tfe, lwe:$lwe, da:$da),
- asm#" $vdata, $vaddr, $srsrc$dmask$unorm$glc$slc$r128$tfe$lwe$da",
- dns>, MIMGe<op> {
+ string dns="">
+ : MIMG <(outs dst_rc:$vdata), dns>,
+ MIMGe<op> {
let ssamp = 0;
+ let d16 = !if(BaseOpcode.HasD16, ?, 0);
+
+ let InOperandList = !con((ins addr_rc:$vaddr, SReg_256:$srsrc,
+ DMask:$dmask, UNorm:$unorm, GLC:$glc, SLC:$slc,
+ R128:$r128, TFE:$tfe, LWE:$lwe, DA:$da),
+ !if(BaseOpcode.HasD16, (ins D16:$d16), (ins)));
+ let AsmString = asm#" $vdata, $vaddr, $srsrc$dmask$unorm$glc$slc$r128$tfe$lwe$da"
+ #!if(BaseOpcode.HasD16, "$d16", "");
}
multiclass MIMG_NoSampler_Src_Helper <bits<7> op, string asm,
- RegisterClass dst_rc,
- int channels> {
- def _V1 : MIMG_NoSampler_Helper <op, asm, dst_rc, VGPR_32,
- !if(!eq(channels, 1), "AMDGPU", "")>,
- MIMG_Mask<asm#"_V1", channels>;
- def _V2 : MIMG_NoSampler_Helper <op, asm, dst_rc, VReg_64>,
- MIMG_Mask<asm#"_V2", channels>;
- def _V4 : MIMG_NoSampler_Helper <op, asm, dst_rc, VReg_128>,
- MIMG_Mask<asm#"_V4", channels>;
-}
+ RegisterClass dst_rc,
+ bit enableDisasm> {
+ let VAddrDwords = 1 in
+ def NAME # _V1 : MIMG_NoSampler_Helper <op, asm, dst_rc, VGPR_32,
+ !if(enableDisasm, "AMDGPU", "")>;
+ let VAddrDwords = 2 in
+ def NAME # _V2 : MIMG_NoSampler_Helper <op, asm, dst_rc, VReg_64>;
+ let VAddrDwords = 3 in
+ def NAME # _V3 : MIMG_NoSampler_Helper <op, asm, dst_rc, VReg_96>;
+ let VAddrDwords = 4 in
+ def NAME # _V4 : MIMG_NoSampler_Helper <op, asm, dst_rc, VReg_128>;
+}
+
+multiclass MIMG_NoSampler <bits<7> op, string asm, bit has_d16, bit mip = 0,
+ bit isResInfo = 0> {
+ def "" : MIMGBaseOpcode {
+ let Coordinates = !if(isResInfo, 0, 1);
+ let LodOrClampOrMip = mip;
+ let HasD16 = has_d16;
+ }
-multiclass MIMG_NoSampler <bits<7> op, string asm> {
- defm _V1 : MIMG_NoSampler_Src_Helper <op, asm, VGPR_32, 1>;
- defm _V2 : MIMG_NoSampler_Src_Helper <op, asm, VReg_64, 2>;
- defm _V3 : MIMG_NoSampler_Src_Helper <op, asm, VReg_96, 3>;
- defm _V4 : MIMG_NoSampler_Src_Helper <op, asm, VReg_128, 4>;
+ let BaseOpcode = !cast<MIMGBaseOpcode>(NAME),
+ mayLoad = !if(isResInfo, 0, 1) in {
+ let VDataDwords = 1 in
+ defm _V1 : MIMG_NoSampler_Src_Helper <op, asm, VGPR_32, 1>;
+ let VDataDwords = 2 in
+ defm _V2 : MIMG_NoSampler_Src_Helper <op, asm, VReg_64, 0>;
+ let VDataDwords = 3 in
+ defm _V3 : MIMG_NoSampler_Src_Helper <op, asm, VReg_96, 0>;
+ let VDataDwords = 4 in
+ defm _V4 : MIMG_NoSampler_Src_Helper <op, asm, VReg_128, 0>;
+ }
}
class MIMG_Store_Helper <bits<7> op, string asm,
RegisterClass data_rc,
RegisterClass addr_rc,
- string dns = ""> : MIMG_Helper <
- (outs),
- (ins data_rc:$vdata, addr_rc:$vaddr, SReg_256:$srsrc,
- dmask:$dmask, unorm:$unorm, GLC:$glc, slc:$slc,
- r128:$r128, tfe:$tfe, lwe:$lwe, da:$da),
- asm#" $vdata, $vaddr, $srsrc$dmask$unorm$glc$slc$r128$tfe$lwe$da", dns>, MIMGe<op> {
+ string dns = "">
+ : MIMG <(outs), dns>,
+ MIMGe<op> {
let ssamp = 0;
- let mayLoad = 1; // TableGen requires this for matching with the intrinsics
+ let d16 = !if(BaseOpcode.HasD16, ?, 0);
+
+ let mayLoad = 0;
let mayStore = 1;
- let hasSideEffects = 1;
+ let hasSideEffects = 0;
let hasPostISelHook = 0;
let DisableWQM = 1;
+
+ let InOperandList = !con((ins data_rc:$vdata, addr_rc:$vaddr, SReg_256:$srsrc,
+ DMask:$dmask, UNorm:$unorm, GLC:$glc, SLC:$slc,
+ R128:$r128, TFE:$tfe, LWE:$lwe, DA:$da),
+ !if(BaseOpcode.HasD16, (ins D16:$d16), (ins)));
+ let AsmString = asm#" $vdata, $vaddr, $srsrc$dmask$unorm$glc$slc$r128$tfe$lwe$da"
+ #!if(BaseOpcode.HasD16, "$d16", "");
}
multiclass MIMG_Store_Addr_Helper <bits<7> op, string asm,
RegisterClass data_rc,
- int channels> {
- def _V1 : MIMG_Store_Helper <op, asm, data_rc, VGPR_32,
- !if(!eq(channels, 1), "AMDGPU", "")>,
- MIMG_Mask<asm#"_V1", channels>;
- def _V2 : MIMG_Store_Helper <op, asm, data_rc, VReg_64>,
- MIMG_Mask<asm#"_V2", channels>;
- def _V4 : MIMG_Store_Helper <op, asm, data_rc, VReg_128>,
- MIMG_Mask<asm#"_V4", channels>;
-}
+ bit enableDisasm> {
+ let VAddrDwords = 1 in
+ def NAME # _V1 : MIMG_Store_Helper <op, asm, data_rc, VGPR_32,
+ !if(enableDisasm, "AMDGPU", "")>;
+ let VAddrDwords = 2 in
+ def NAME # _V2 : MIMG_Store_Helper <op, asm, data_rc, VReg_64>;
+ let VAddrDwords = 3 in
+ def NAME # _V3 : MIMG_Store_Helper <op, asm, data_rc, VReg_96>;
+ let VAddrDwords = 4 in
+ def NAME # _V4 : MIMG_Store_Helper <op, asm, data_rc, VReg_128>;
+}
+
+multiclass MIMG_Store <bits<7> op, string asm, bit has_d16, bit mip = 0> {
+ def "" : MIMGBaseOpcode {
+ let Store = 1;
+ let LodOrClampOrMip = mip;
+ let HasD16 = has_d16;
+ }
-multiclass MIMG_Store <bits<7> op, string asm> {
- defm _V1 : MIMG_Store_Addr_Helper <op, asm, VGPR_32, 1>;
- defm _V2 : MIMG_Store_Addr_Helper <op, asm, VReg_64, 2>;
- defm _V3 : MIMG_Store_Addr_Helper <op, asm, VReg_96, 3>;
- defm _V4 : MIMG_Store_Addr_Helper <op, asm, VReg_128, 4>;
+ let BaseOpcode = !cast<MIMGBaseOpcode>(NAME) in {
+ let VDataDwords = 1 in
+ defm _V1 : MIMG_Store_Addr_Helper <op, asm, VGPR_32, 1>;
+ let VDataDwords = 2 in
+ defm _V2 : MIMG_Store_Addr_Helper <op, asm, VReg_64, 0>;
+ let VDataDwords = 3 in
+ defm _V3 : MIMG_Store_Addr_Helper <op, asm, VReg_96, 0>;
+ let VDataDwords = 4 in
+ defm _V4 : MIMG_Store_Addr_Helper <op, asm, VReg_128, 0>;
+ }
}
class MIMG_Atomic_Helper <string asm, RegisterClass data_rc,
- RegisterClass addr_rc> : MIMG_Helper <
- (outs data_rc:$vdst),
- (ins data_rc:$vdata, addr_rc:$vaddr, SReg_256:$srsrc,
- dmask:$dmask, unorm:$unorm, GLC:$glc, slc:$slc,
- r128:$r128, tfe:$tfe, lwe:$lwe, da:$da),
- asm#" $vdst, $vaddr, $srsrc$dmask$unorm$glc$slc$r128$tfe$lwe$da"
- > {
+ RegisterClass addr_rc, string dns="",
+ bit enableDasm = 0>
+ : MIMG <(outs data_rc:$vdst), !if(enableDasm, dns, "")> {
+ let mayLoad = 1;
let mayStore = 1;
- let hasSideEffects = 1;
+ let hasSideEffects = 1; // FIXME: Remove this
let hasPostISelHook = 0;
let DisableWQM = 1;
let Constraints = "$vdst = $vdata";
let AsmMatchConverter = "cvtMIMGAtomic";
-}
-class MIMG_Atomic_Real_si<mimg op, string name, string asm,
- RegisterClass data_rc, RegisterClass addr_rc> :
- MIMG_Atomic_Helper<asm, data_rc, addr_rc>,
- SIMCInstr<name, SIEncodingFamily.SI>,
- MIMGe<op.SI> {
- let isCodeGenOnly = 0;
- let AssemblerPredicates = [isSICI];
- let DecoderNamespace = "SICI";
- let DisableDecoder = DisableSIDecoder;
-}
-
-class MIMG_Atomic_Real_vi<mimg op, string name, string asm,
- RegisterClass data_rc, RegisterClass addr_rc> :
- MIMG_Atomic_Helper<asm, data_rc, addr_rc>,
- SIMCInstr<name, SIEncodingFamily.VI>,
- MIMGe<op.VI> {
- let isCodeGenOnly = 0;
- let AssemblerPredicates = [isVI];
- let DecoderNamespace = "VI";
- let DisableDecoder = DisableVIDecoder;
-}
-
-multiclass MIMG_Atomic_Helper_m <mimg op, string name, string asm,
- RegisterClass data_rc, RegisterClass addr_rc> {
- let isPseudo = 1, isCodeGenOnly = 1 in {
- def "" : MIMG_Atomic_Helper<asm, data_rc, addr_rc>,
- SIMCInstr<name, SIEncodingFamily.NONE>;
+ let InOperandList = (ins data_rc:$vdata, addr_rc:$vaddr, SReg_256:$srsrc,
+ DMask:$dmask, UNorm:$unorm, GLC:$glc, SLC:$slc,
+ R128:$r128, TFE:$tfe, LWE:$lwe, DA:$da);
+ let AsmString = asm#" $vdst, $vaddr, $srsrc$dmask$unorm$glc$slc$r128$tfe$lwe$da";
+}
+
+multiclass MIMG_Atomic_Helper_m <mimg op, string asm, RegisterClass data_rc,
+ RegisterClass addr_rc, bit enableDasm = 0> {
+ let ssamp = 0, d16 = 0 in {
+ def _si : MIMG_Atomic_Helper<asm, data_rc, addr_rc, "SICI", enableDasm>,
+ SIMCInstr<NAME, SIEncodingFamily.SI>,
+ MIMGe<op.SI> {
+ let AssemblerPredicates = [isSICI];
+ let DisableDecoder = DisableSIDecoder;
+ }
+
+ def _vi : MIMG_Atomic_Helper<asm, data_rc, addr_rc, "VI", enableDasm>,
+ SIMCInstr<NAME, SIEncodingFamily.VI>,
+ MIMGe<op.VI> {
+ let AssemblerPredicates = [isVI];
+ let DisableDecoder = DisableVIDecoder;
+ let MIMGEncoding = MIMGEncGfx8;
+ }
}
+}
- let ssamp = 0 in {
- def _si : MIMG_Atomic_Real_si<op, name, asm, data_rc, addr_rc>;
+multiclass MIMG_Atomic_Addr_Helper_m <mimg op, string asm,
+ RegisterClass data_rc,
+ bit enableDasm = 0> {
+  // _V* variants have different address sizes, but the size is not encoded.
+ // So only one variant can be disassembled. V1 looks the safest to decode.
+ let VAddrDwords = 1 in
+ defm _V1 : MIMG_Atomic_Helper_m <op, asm, data_rc, VGPR_32, enableDasm>;
+ let VAddrDwords = 2 in
+ defm _V2 : MIMG_Atomic_Helper_m <op, asm, data_rc, VReg_64>;
+ let VAddrDwords = 3 in
+ defm _V3 : MIMG_Atomic_Helper_m <op, asm, data_rc, VReg_96>;
+ let VAddrDwords = 4 in
+ defm _V4 : MIMG_Atomic_Helper_m <op, asm, data_rc, VReg_128>;
+}
+
+multiclass MIMG_Atomic <mimg op, string asm, bit isCmpSwap = 0> { // 64-bit atomics
+ def "" : MIMGBaseOpcode {
+ let Atomic = 1;
+ let AtomicX2 = isCmpSwap;
+ }
- def _vi : MIMG_Atomic_Real_vi<op, name, asm, data_rc, addr_rc>;
+ let BaseOpcode = !cast<MIMGBaseOpcode>(NAME) in {
+    // _V* variants have different dst sizes, but the size is encoded implicitly
+    // using dmask and tfe. Only the 32-bit variant is registered with the
+    // disassembler; the other variants are reconstructed by the disassembler
+    // from dmask and tfe.
+ let VDataDwords = !if(isCmpSwap, 2, 1) in
+ defm _V1 : MIMG_Atomic_Addr_Helper_m <op, asm, !if(isCmpSwap, VReg_64, VGPR_32), 1>;
+ let VDataDwords = !if(isCmpSwap, 4, 2) in
+ defm _V2 : MIMG_Atomic_Addr_Helper_m <op, asm, !if(isCmpSwap, VReg_128, VReg_64)>;
}
}
-multiclass MIMG_Atomic <mimg op, string asm, RegisterClass data_rc = VGPR_32> {
- defm _V1 : MIMG_Atomic_Helper_m <op, asm # "_V1", asm, data_rc, VGPR_32>;
- defm _V2 : MIMG_Atomic_Helper_m <op, asm # "_V2", asm, data_rc, VReg_64>;
- defm _V4 : MIMG_Atomic_Helper_m <op, asm # "_V3", asm, data_rc, VReg_128>;
+class MIMG_Sampler_Helper <bits<7> op, string asm, RegisterClass dst_rc,
+ RegisterClass src_rc, string dns="">
+ : MIMG <(outs dst_rc:$vdata), dns>,
+ MIMGe<op> {
+ let d16 = !if(BaseOpcode.HasD16, ?, 0);
+
+ let InOperandList = !con((ins src_rc:$vaddr, SReg_256:$srsrc, SReg_128:$ssamp,
+ DMask:$dmask, UNorm:$unorm, GLC:$glc, SLC:$slc,
+ R128:$r128, TFE:$tfe, LWE:$lwe, DA:$da),
+ !if(BaseOpcode.HasD16, (ins D16:$d16), (ins)));
+ let AsmString = asm#" $vdata, $vaddr, $srsrc, $ssamp$dmask$unorm$glc$slc$r128$tfe$lwe$da"
+ #!if(BaseOpcode.HasD16, "$d16", "");
+}
+
+class MIMGAddrSize<int dw, bit enable_disasm> {
+ int NumWords = dw;
+
+ RegisterClass RegClass = !if(!le(NumWords, 0), ?,
+ !if(!eq(NumWords, 1), VGPR_32,
+ !if(!eq(NumWords, 2), VReg_64,
+ !if(!eq(NumWords, 3), VReg_96,
+ !if(!eq(NumWords, 4), VReg_128,
+ !if(!le(NumWords, 8), VReg_256,
+ !if(!le(NumWords, 16), VReg_512, ?)))))));
+
+ // Whether the instruction variant with this vaddr size should be enabled for
+ // the auto-generated disassembler.
+ bit Disassemble = enable_disasm;
+}
+
+// Return whether the given list contains a value inside the range [min, max]
+// (endpoints inclusive).
+class isRangeInList<int min, int max, list<int> lst> {
+ bit ret = !foldl(0, lst, lhs, y, !or(lhs, !and(!le(min, y), !le(y, max))));
+}
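+// For example, isRangeInList<2, 4, [1, 5]>.ret is 0, while
+// isRangeInList<2, 4, [3, 7]>.ret is 1, because 3 lies inside [2, 4].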
+
+class MIMGAddrSizes_tmp<list<MIMGAddrSize> lst, int min> {
+ list<MIMGAddrSize> List = lst;
+ int Min = min;
+}
+
+class MIMG_Sampler_AddrSizes<AMDGPUSampleVariant sample> {
+ // List of all possible numbers of address words, taking all combinations of
+ // A16 and image dimension into account (note: no MSAA, since this is for
+ // sample/gather ops).
+ list<int> AllNumAddrWords =
+ !foreach(dw, !if(sample.Gradients,
+ !if(!eq(sample.LodOrClamp, ""),
+ [2, 3, 4, 5, 6, 7, 9],
+ [2, 3, 4, 5, 7, 8, 10]),
+ !if(!eq(sample.LodOrClamp, ""),
+ [1, 2, 3],
+ [1, 2, 3, 4])),
+ !add(dw, !size(sample.ExtraAddrArgs)));
+
+ // Generate machine instructions based on possible register classes for the
+ // required numbers of address words. The disassembler defaults to the
+ // smallest register class.
+ list<MIMGAddrSize> MachineInstrs =
+ !foldl(MIMGAddrSizes_tmp<[], 0>, [1, 2, 3, 4, 8, 16], lhs, dw,
+ !if(isRangeInList<lhs.Min, dw, AllNumAddrWords>.ret,
+ MIMGAddrSizes_tmp<
+ !listconcat(lhs.List, [MIMGAddrSize<dw, !empty(lhs.List)>]),
+ !if(!eq(dw, 3), 3, !add(dw, 1))>, // we still need _V4 for codegen w/ 3 dwords
+ lhs)).List;
}
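+// Worked example with a hypothetical input (not tied to a particular sample
+// variant): if AllNumAddrWords were {2, 3, 4, 5, 7}, the fold above would keep
+// the dword counts 2, 3, 4 and 8, i.e. _V2 (VReg_64, the only entry with its
+// Disassemble bit set), _V3 (VReg_96), _V4 (VReg_128) and _V8 (VReg_256);
+// _V1 and _V16 would be skipped because no required count falls in their range.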
-class MIMG_Sampler_Helper <bits<7> op, string asm,
- RegisterClass dst_rc,
- RegisterClass src_rc,
- bit wqm,
- string dns=""> : MIMG_Helper <
- (outs dst_rc:$vdata),
- (ins src_rc:$vaddr, SReg_256:$srsrc, SReg_128:$ssamp,
- dmask:$dmask, unorm:$unorm, GLC:$glc, slc:$slc,
- r128:$r128, tfe:$tfe, lwe:$lwe, da:$da),
- asm#" $vdata, $vaddr, $srsrc, $ssamp$dmask$unorm$glc$slc$r128$tfe$lwe$da",
- dns>, MIMGe<op> {
- let WQM = wqm;
+multiclass MIMG_Sampler_Src_Helper <bits<7> op, string asm,
+ AMDGPUSampleVariant sample, RegisterClass dst_rc,
+ bit enableDisasm = 0> {
+ foreach addr = MIMG_Sampler_AddrSizes<sample>.MachineInstrs in {
+ let VAddrDwords = addr.NumWords in
+ def _V # addr.NumWords
+ : MIMG_Sampler_Helper <op, asm, dst_rc, addr.RegClass,
+ !if(!and(enableDisasm, addr.Disassemble), "AMDGPU", "")>;
+ }
}
-multiclass MIMG_Sampler_Src_Helper <bits<7> op, string asm,
- RegisterClass dst_rc,
- int channels, bit wqm> {
- def _V1 : MIMG_Sampler_Helper <op, asm, dst_rc, VGPR_32, wqm,
- !if(!eq(channels, 1), "AMDGPU", "")>,
- MIMG_Mask<asm#"_V1", channels>;
- def _V2 : MIMG_Sampler_Helper <op, asm, dst_rc, VReg_64, wqm>,
- MIMG_Mask<asm#"_V2", channels>;
- def _V4 : MIMG_Sampler_Helper <op, asm, dst_rc, VReg_128, wqm>,
- MIMG_Mask<asm#"_V4", channels>;
- def _V8 : MIMG_Sampler_Helper <op, asm, dst_rc, VReg_256, wqm>,
- MIMG_Mask<asm#"_V8", channels>;
- def _V16 : MIMG_Sampler_Helper <op, asm, dst_rc, VReg_512, wqm>,
- MIMG_Mask<asm#"_V16", channels>;
-}
-
-multiclass MIMG_Sampler <bits<7> op, string asm, bit wqm=0> {
- defm _V1 : MIMG_Sampler_Src_Helper<op, asm, VGPR_32, 1, wqm>;
- defm _V2 : MIMG_Sampler_Src_Helper<op, asm, VReg_64, 2, wqm>;
- defm _V3 : MIMG_Sampler_Src_Helper<op, asm, VReg_96, 3, wqm>;
- defm _V4 : MIMG_Sampler_Src_Helper<op, asm, VReg_128, 4, wqm>;
-}
-
-multiclass MIMG_Sampler_WQM <bits<7> op, string asm> : MIMG_Sampler<op, asm, 1>;
-
-class MIMG_Gather_Helper <bits<7> op, string asm,
- RegisterClass dst_rc,
- RegisterClass src_rc, bit wqm> : MIMG <
- (outs dst_rc:$vdata),
- (ins src_rc:$vaddr, SReg_256:$srsrc, SReg_128:$ssamp,
- dmask:$dmask, unorm:$unorm, GLC:$glc, slc:$slc,
- r128:$r128, tfe:$tfe, lwe:$lwe, da:$da),
- asm#" $vdata, $vaddr, $srsrc, $ssamp$dmask$unorm$glc$slc$r128$tfe$lwe$da",
- []>, MIMGe<op> {
- let mayLoad = 1;
- let mayStore = 0;
+class MIMG_Sampler_BaseOpcode<AMDGPUSampleVariant sample>
+ : MIMGBaseOpcode {
+ let Sampler = 1;
+ let NumExtraArgs = !size(sample.ExtraAddrArgs);
+ let Gradients = sample.Gradients;
+ let LodOrClampOrMip = !ne(sample.LodOrClamp, "");
+}
- // DMASK was repurposed for GATHER4. 4 components are always
- // returned and DMASK works like a swizzle - it selects
- // the component to fetch. The only useful DMASK values are
- // 1=red, 2=green, 4=blue, 8=alpha. (e.g. 1 returns
- // (red,red,red,red) etc.) The ISA document doesn't mention
- // this.
- // Therefore, disable all code which updates DMASK by setting this:
- let Gather4 = 1;
- let hasPostISelHook = 0;
- let WQM = wqm;
+multiclass MIMG_Sampler <bits<7> op, AMDGPUSampleVariant sample, bit wqm = 0,
+ bit isGetLod = 0,
+ string asm = "image_sample"#sample.LowerCaseMod> {
+ def "" : MIMG_Sampler_BaseOpcode<sample> {
+ let HasD16 = !if(isGetLod, 0, 1);
+ }
- let isAsmParserOnly = 1; // TBD: fix it later
+ let BaseOpcode = !cast<MIMGBaseOpcode>(NAME), WQM = wqm,
+ mayLoad = !if(isGetLod, 0, 1) in {
+ let VDataDwords = 1 in
+ defm _V1 : MIMG_Sampler_Src_Helper<op, asm, sample, VGPR_32, 1>;
+ let VDataDwords = 2 in
+ defm _V2 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_64>;
+ let VDataDwords = 3 in
+ defm _V3 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_96>;
+ let VDataDwords = 4 in
+ defm _V4 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_128>;
+ }
}
-multiclass MIMG_Gather_Src_Helper <bits<7> op, string asm,
- RegisterClass dst_rc,
- int channels, bit wqm> {
- def _V1 : MIMG_Gather_Helper <op, asm, dst_rc, VGPR_32, wqm>,
- MIMG_Mask<asm#"_V1", channels>;
- def _V2 : MIMG_Gather_Helper <op, asm, dst_rc, VReg_64, wqm>,
- MIMG_Mask<asm#"_V2", channels>;
- def _V4 : MIMG_Gather_Helper <op, asm, dst_rc, VReg_128, wqm>,
- MIMG_Mask<asm#"_V4", channels>;
- def _V8 : MIMG_Gather_Helper <op, asm, dst_rc, VReg_256, wqm>,
- MIMG_Mask<asm#"_V8", channels>;
- def _V16 : MIMG_Gather_Helper <op, asm, dst_rc, VReg_512, wqm>,
- MIMG_Mask<asm#"_V16", channels>;
-}
+multiclass MIMG_Sampler_WQM <bits<7> op, AMDGPUSampleVariant sample>
+ : MIMG_Sampler<op, sample, 1>;
-multiclass MIMG_Gather <bits<7> op, string asm, bit wqm=0> {
- defm _V1 : MIMG_Gather_Src_Helper<op, asm, VGPR_32, 1, wqm>;
- defm _V2 : MIMG_Gather_Src_Helper<op, asm, VReg_64, 2, wqm>;
- defm _V3 : MIMG_Gather_Src_Helper<op, asm, VReg_96, 3, wqm>;
- defm _V4 : MIMG_Gather_Src_Helper<op, asm, VReg_128, 4, wqm>;
+multiclass MIMG_Gather <bits<7> op, AMDGPUSampleVariant sample, bit wqm = 0,
+ string asm = "image_gather4"#sample.LowerCaseMod> {
+ def "" : MIMG_Sampler_BaseOpcode<sample> {
+ let HasD16 = 1;
+ }
+
+ let BaseOpcode = !cast<MIMGBaseOpcode>(NAME), WQM = wqm,
+ Gather4 = 1, hasPostISelHook = 0 in {
+ let VDataDwords = 2 in
+ defm _V2 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_64>; /* for packed D16 only */
+ let VDataDwords = 4 in
+ defm _V4 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_128, 1>;
+ }
}
-multiclass MIMG_Gather_WQM <bits<7> op, string asm> : MIMG_Gather<op, asm, 1>;
+multiclass MIMG_Gather_WQM <bits<7> op, AMDGPUSampleVariant sample>
+ : MIMG_Gather<op, sample, 1>;
//===----------------------------------------------------------------------===//
// MIMG Instructions
//===----------------------------------------------------------------------===//
-let SubtargetPredicate = isGCN in {
-defm IMAGE_LOAD : MIMG_NoSampler <0x00000000, "image_load">;
-defm IMAGE_LOAD_MIP : MIMG_NoSampler <0x00000001, "image_load_mip">;
-//def IMAGE_LOAD_PCK : MIMG_NoPattern_ <"image_load_pck", 0x00000002>;
-//def IMAGE_LOAD_PCK_SGN : MIMG_NoPattern_ <"image_load_pck_sgn", 0x00000003>;
-//def IMAGE_LOAD_MIP_PCK : MIMG_NoPattern_ <"image_load_mip_pck", 0x00000004>;
-//def IMAGE_LOAD_MIP_PCK_SGN : MIMG_NoPattern_ <"image_load_mip_pck_sgn", 0x00000005>;
-defm IMAGE_STORE : MIMG_Store <0x00000008, "image_store">;
-defm IMAGE_STORE_MIP : MIMG_Store <0x00000009, "image_store_mip">;
-//def IMAGE_STORE_PCK : MIMG_NoPattern_ <"image_store_pck", 0x0000000a>;
-//def IMAGE_STORE_MIP_PCK : MIMG_NoPattern_ <"image_store_mip_pck", 0x0000000b>;
-
-let mayLoad = 0, mayStore = 0 in {
-defm IMAGE_GET_RESINFO : MIMG_NoSampler <0x0000000e, "image_get_resinfo">;
-}
+defm IMAGE_LOAD : MIMG_NoSampler <0x00000000, "image_load", 1>;
+defm IMAGE_LOAD_MIP : MIMG_NoSampler <0x00000001, "image_load_mip", 1, 1>;
+defm IMAGE_LOAD_PCK : MIMG_NoSampler <0x00000002, "image_load_pck", 0>;
+defm IMAGE_LOAD_PCK_SGN : MIMG_NoSampler <0x00000003, "image_load_pck_sgn", 0>;
+defm IMAGE_LOAD_MIP_PCK : MIMG_NoSampler <0x00000004, "image_load_mip_pck", 0, 1>;
+defm IMAGE_LOAD_MIP_PCK_SGN : MIMG_NoSampler <0x00000005, "image_load_mip_pck_sgn", 0, 1>;
+defm IMAGE_STORE : MIMG_Store <0x00000008, "image_store", 1>;
+defm IMAGE_STORE_MIP : MIMG_Store <0x00000009, "image_store_mip", 1, 1>;
+defm IMAGE_STORE_PCK : MIMG_Store <0x0000000a, "image_store_pck", 0>;
+defm IMAGE_STORE_MIP_PCK : MIMG_Store <0x0000000b, "image_store_mip_pck", 0, 1>;
+
+defm IMAGE_GET_RESINFO : MIMG_NoSampler <0x0000000e, "image_get_resinfo", 0, 1, 1>;
defm IMAGE_ATOMIC_SWAP : MIMG_Atomic <mimg<0x0f, 0x10>, "image_atomic_swap">;
-defm IMAGE_ATOMIC_CMPSWAP : MIMG_Atomic <mimg<0x10, 0x11>, "image_atomic_cmpswap", VReg_64>;
+defm IMAGE_ATOMIC_CMPSWAP : MIMG_Atomic <mimg<0x10, 0x11>, "image_atomic_cmpswap", 1>;
defm IMAGE_ATOMIC_ADD : MIMG_Atomic <mimg<0x11, 0x12>, "image_atomic_add">;
defm IMAGE_ATOMIC_SUB : MIMG_Atomic <mimg<0x12, 0x13>, "image_atomic_sub">;
//def IMAGE_ATOMIC_RSUB : MIMG_NoPattern_ <"image_atomic_rsub", 0x00000013>; -- not on VI
@@ -277,397 +449,101 @@ defm IMAGE_ATOMIC_OR : MIMG_Atomic <mimg<0x19>, "image_atomic_or">;
defm IMAGE_ATOMIC_XOR : MIMG_Atomic <mimg<0x1a>, "image_atomic_xor">;
defm IMAGE_ATOMIC_INC : MIMG_Atomic <mimg<0x1b>, "image_atomic_inc">;
defm IMAGE_ATOMIC_DEC : MIMG_Atomic <mimg<0x1c>, "image_atomic_dec">;
-//def IMAGE_ATOMIC_FCMPSWAP : MIMG_NoPattern_ <"image_atomic_fcmpswap", 0x0000001d>; -- not on VI
+//def IMAGE_ATOMIC_FCMPSWAP : MIMG_NoPattern_ <"image_atomic_fcmpswap", 0x0000001d, 1>; -- not on VI
//def IMAGE_ATOMIC_FMIN : MIMG_NoPattern_ <"image_atomic_fmin", 0x0000001e>; -- not on VI
//def IMAGE_ATOMIC_FMAX : MIMG_NoPattern_ <"image_atomic_fmax", 0x0000001f>; -- not on VI
-defm IMAGE_SAMPLE : MIMG_Sampler_WQM <0x00000020, "image_sample">;
-defm IMAGE_SAMPLE_CL : MIMG_Sampler_WQM <0x00000021, "image_sample_cl">;
-defm IMAGE_SAMPLE_D : MIMG_Sampler <0x00000022, "image_sample_d">;
-defm IMAGE_SAMPLE_D_CL : MIMG_Sampler <0x00000023, "image_sample_d_cl">;
-defm IMAGE_SAMPLE_L : MIMG_Sampler <0x00000024, "image_sample_l">;
-defm IMAGE_SAMPLE_B : MIMG_Sampler_WQM <0x00000025, "image_sample_b">;
-defm IMAGE_SAMPLE_B_CL : MIMG_Sampler_WQM <0x00000026, "image_sample_b_cl">;
-defm IMAGE_SAMPLE_LZ : MIMG_Sampler <0x00000027, "image_sample_lz">;
-defm IMAGE_SAMPLE_C : MIMG_Sampler_WQM <0x00000028, "image_sample_c">;
-defm IMAGE_SAMPLE_C_CL : MIMG_Sampler_WQM <0x00000029, "image_sample_c_cl">;
-defm IMAGE_SAMPLE_C_D : MIMG_Sampler <0x0000002a, "image_sample_c_d">;
-defm IMAGE_SAMPLE_C_D_CL : MIMG_Sampler <0x0000002b, "image_sample_c_d_cl">;
-defm IMAGE_SAMPLE_C_L : MIMG_Sampler <0x0000002c, "image_sample_c_l">;
-defm IMAGE_SAMPLE_C_B : MIMG_Sampler_WQM <0x0000002d, "image_sample_c_b">;
-defm IMAGE_SAMPLE_C_B_CL : MIMG_Sampler_WQM <0x0000002e, "image_sample_c_b_cl">;
-defm IMAGE_SAMPLE_C_LZ : MIMG_Sampler <0x0000002f, "image_sample_c_lz">;
-defm IMAGE_SAMPLE_O : MIMG_Sampler_WQM <0x00000030, "image_sample_o">;
-defm IMAGE_SAMPLE_CL_O : MIMG_Sampler_WQM <0x00000031, "image_sample_cl_o">;
-defm IMAGE_SAMPLE_D_O : MIMG_Sampler <0x00000032, "image_sample_d_o">;
-defm IMAGE_SAMPLE_D_CL_O : MIMG_Sampler <0x00000033, "image_sample_d_cl_o">;
-defm IMAGE_SAMPLE_L_O : MIMG_Sampler <0x00000034, "image_sample_l_o">;
-defm IMAGE_SAMPLE_B_O : MIMG_Sampler_WQM <0x00000035, "image_sample_b_o">;
-defm IMAGE_SAMPLE_B_CL_O : MIMG_Sampler_WQM <0x00000036, "image_sample_b_cl_o">;
-defm IMAGE_SAMPLE_LZ_O : MIMG_Sampler <0x00000037, "image_sample_lz_o">;
-defm IMAGE_SAMPLE_C_O : MIMG_Sampler_WQM <0x00000038, "image_sample_c_o">;
-defm IMAGE_SAMPLE_C_CL_O : MIMG_Sampler_WQM <0x00000039, "image_sample_c_cl_o">;
-defm IMAGE_SAMPLE_C_D_O : MIMG_Sampler <0x0000003a, "image_sample_c_d_o">;
-defm IMAGE_SAMPLE_C_D_CL_O : MIMG_Sampler <0x0000003b, "image_sample_c_d_cl_o">;
-defm IMAGE_SAMPLE_C_L_O : MIMG_Sampler <0x0000003c, "image_sample_c_l_o">;
-defm IMAGE_SAMPLE_C_B_O : MIMG_Sampler_WQM <0x0000003d, "image_sample_c_b_o">;
-defm IMAGE_SAMPLE_C_B_CL_O : MIMG_Sampler_WQM <0x0000003e, "image_sample_c_b_cl_o">;
-defm IMAGE_SAMPLE_C_LZ_O : MIMG_Sampler <0x0000003f, "image_sample_c_lz_o">;
-defm IMAGE_GATHER4 : MIMG_Gather_WQM <0x00000040, "image_gather4">;
-defm IMAGE_GATHER4_CL : MIMG_Gather_WQM <0x00000041, "image_gather4_cl">;
-defm IMAGE_GATHER4_L : MIMG_Gather <0x00000044, "image_gather4_l">;
-defm IMAGE_GATHER4_B : MIMG_Gather_WQM <0x00000045, "image_gather4_b">;
-defm IMAGE_GATHER4_B_CL : MIMG_Gather_WQM <0x00000046, "image_gather4_b_cl">;
-defm IMAGE_GATHER4_LZ : MIMG_Gather <0x00000047, "image_gather4_lz">;
-defm IMAGE_GATHER4_C : MIMG_Gather_WQM <0x00000048, "image_gather4_c">;
-defm IMAGE_GATHER4_C_CL : MIMG_Gather_WQM <0x00000049, "image_gather4_c_cl">;
-defm IMAGE_GATHER4_C_L : MIMG_Gather <0x0000004c, "image_gather4_c_l">;
-defm IMAGE_GATHER4_C_B : MIMG_Gather_WQM <0x0000004d, "image_gather4_c_b">;
-defm IMAGE_GATHER4_C_B_CL : MIMG_Gather_WQM <0x0000004e, "image_gather4_c_b_cl">;
-defm IMAGE_GATHER4_C_LZ : MIMG_Gather <0x0000004f, "image_gather4_c_lz">;
-defm IMAGE_GATHER4_O : MIMG_Gather_WQM <0x00000050, "image_gather4_o">;
-defm IMAGE_GATHER4_CL_O : MIMG_Gather_WQM <0x00000051, "image_gather4_cl_o">;
-defm IMAGE_GATHER4_L_O : MIMG_Gather <0x00000054, "image_gather4_l_o">;
-defm IMAGE_GATHER4_B_O : MIMG_Gather_WQM <0x00000055, "image_gather4_b_o">;
-defm IMAGE_GATHER4_B_CL_O : MIMG_Gather <0x00000056, "image_gather4_b_cl_o">;
-defm IMAGE_GATHER4_LZ_O : MIMG_Gather <0x00000057, "image_gather4_lz_o">;
-defm IMAGE_GATHER4_C_O : MIMG_Gather_WQM <0x00000058, "image_gather4_c_o">;
-defm IMAGE_GATHER4_C_CL_O : MIMG_Gather_WQM <0x00000059, "image_gather4_c_cl_o">;
-defm IMAGE_GATHER4_C_L_O : MIMG_Gather <0x0000005c, "image_gather4_c_l_o">;
-defm IMAGE_GATHER4_C_B_O : MIMG_Gather_WQM <0x0000005d, "image_gather4_c_b_o">;
-defm IMAGE_GATHER4_C_B_CL_O : MIMG_Gather_WQM <0x0000005e, "image_gather4_c_b_cl_o">;
-defm IMAGE_GATHER4_C_LZ_O : MIMG_Gather <0x0000005f, "image_gather4_c_lz_o">;
-
-let mayLoad = 0, mayStore = 0 in {
-defm IMAGE_GET_LOD : MIMG_Sampler_WQM <0x00000060, "image_get_lod">;
-}
-
-defm IMAGE_SAMPLE_CD : MIMG_Sampler <0x00000068, "image_sample_cd">;
-defm IMAGE_SAMPLE_CD_CL : MIMG_Sampler <0x00000069, "image_sample_cd_cl">;
-defm IMAGE_SAMPLE_C_CD : MIMG_Sampler <0x0000006a, "image_sample_c_cd">;
-defm IMAGE_SAMPLE_C_CD_CL : MIMG_Sampler <0x0000006b, "image_sample_c_cd_cl">;
-defm IMAGE_SAMPLE_CD_O : MIMG_Sampler <0x0000006c, "image_sample_cd_o">;
-defm IMAGE_SAMPLE_CD_CL_O : MIMG_Sampler <0x0000006d, "image_sample_cd_cl_o">;
-defm IMAGE_SAMPLE_C_CD_O : MIMG_Sampler <0x0000006e, "image_sample_c_cd_o">;
-defm IMAGE_SAMPLE_C_CD_CL_O : MIMG_Sampler <0x0000006f, "image_sample_c_cd_cl_o">;
+defm IMAGE_SAMPLE : MIMG_Sampler_WQM <0x00000020, AMDGPUSample>;
+defm IMAGE_SAMPLE_CL : MIMG_Sampler_WQM <0x00000021, AMDGPUSample_cl>;
+defm IMAGE_SAMPLE_D : MIMG_Sampler <0x00000022, AMDGPUSample_d>;
+defm IMAGE_SAMPLE_D_CL : MIMG_Sampler <0x00000023, AMDGPUSample_d_cl>;
+defm IMAGE_SAMPLE_L : MIMG_Sampler <0x00000024, AMDGPUSample_l>;
+defm IMAGE_SAMPLE_B : MIMG_Sampler_WQM <0x00000025, AMDGPUSample_b>;
+defm IMAGE_SAMPLE_B_CL : MIMG_Sampler_WQM <0x00000026, AMDGPUSample_b_cl>;
+defm IMAGE_SAMPLE_LZ : MIMG_Sampler <0x00000027, AMDGPUSample_lz>;
+defm IMAGE_SAMPLE_C : MIMG_Sampler_WQM <0x00000028, AMDGPUSample_c>;
+defm IMAGE_SAMPLE_C_CL : MIMG_Sampler_WQM <0x00000029, AMDGPUSample_c_cl>;
+defm IMAGE_SAMPLE_C_D : MIMG_Sampler <0x0000002a, AMDGPUSample_c_d>;
+defm IMAGE_SAMPLE_C_D_CL : MIMG_Sampler <0x0000002b, AMDGPUSample_c_d_cl>;
+defm IMAGE_SAMPLE_C_L : MIMG_Sampler <0x0000002c, AMDGPUSample_c_l>;
+defm IMAGE_SAMPLE_C_B : MIMG_Sampler_WQM <0x0000002d, AMDGPUSample_c_b>;
+defm IMAGE_SAMPLE_C_B_CL : MIMG_Sampler_WQM <0x0000002e, AMDGPUSample_c_b_cl>;
+defm IMAGE_SAMPLE_C_LZ : MIMG_Sampler <0x0000002f, AMDGPUSample_c_lz>;
+defm IMAGE_SAMPLE_O : MIMG_Sampler_WQM <0x00000030, AMDGPUSample_o>;
+defm IMAGE_SAMPLE_CL_O : MIMG_Sampler_WQM <0x00000031, AMDGPUSample_cl_o>;
+defm IMAGE_SAMPLE_D_O : MIMG_Sampler <0x00000032, AMDGPUSample_d_o>;
+defm IMAGE_SAMPLE_D_CL_O : MIMG_Sampler <0x00000033, AMDGPUSample_d_cl_o>;
+defm IMAGE_SAMPLE_L_O : MIMG_Sampler <0x00000034, AMDGPUSample_l_o>;
+defm IMAGE_SAMPLE_B_O : MIMG_Sampler_WQM <0x00000035, AMDGPUSample_b_o>;
+defm IMAGE_SAMPLE_B_CL_O : MIMG_Sampler_WQM <0x00000036, AMDGPUSample_b_cl_o>;
+defm IMAGE_SAMPLE_LZ_O : MIMG_Sampler <0x00000037, AMDGPUSample_lz_o>;
+defm IMAGE_SAMPLE_C_O : MIMG_Sampler_WQM <0x00000038, AMDGPUSample_c_o>;
+defm IMAGE_SAMPLE_C_CL_O : MIMG_Sampler_WQM <0x00000039, AMDGPUSample_c_cl_o>;
+defm IMAGE_SAMPLE_C_D_O : MIMG_Sampler <0x0000003a, AMDGPUSample_c_d_o>;
+defm IMAGE_SAMPLE_C_D_CL_O : MIMG_Sampler <0x0000003b, AMDGPUSample_c_d_cl_o>;
+defm IMAGE_SAMPLE_C_L_O : MIMG_Sampler <0x0000003c, AMDGPUSample_c_l_o>;
+defm IMAGE_SAMPLE_C_B_CL_O : MIMG_Sampler_WQM <0x0000003e, AMDGPUSample_c_b_cl_o>;
+defm IMAGE_SAMPLE_C_B_O : MIMG_Sampler_WQM <0x0000003d, AMDGPUSample_c_b_o>;
+defm IMAGE_SAMPLE_C_LZ_O : MIMG_Sampler <0x0000003f, AMDGPUSample_c_lz_o>;
+defm IMAGE_GATHER4 : MIMG_Gather_WQM <0x00000040, AMDGPUSample>;
+defm IMAGE_GATHER4_CL : MIMG_Gather_WQM <0x00000041, AMDGPUSample_cl>;
+defm IMAGE_GATHER4_L : MIMG_Gather <0x00000044, AMDGPUSample_l>;
+defm IMAGE_GATHER4_B : MIMG_Gather_WQM <0x00000045, AMDGPUSample_b>;
+defm IMAGE_GATHER4_B_CL : MIMG_Gather_WQM <0x00000046, AMDGPUSample_b_cl>;
+defm IMAGE_GATHER4_LZ : MIMG_Gather <0x00000047, AMDGPUSample_lz>;
+defm IMAGE_GATHER4_C : MIMG_Gather_WQM <0x00000048, AMDGPUSample_c>;
+defm IMAGE_GATHER4_C_CL : MIMG_Gather_WQM <0x00000049, AMDGPUSample_c_cl>;
+defm IMAGE_GATHER4_C_L : MIMG_Gather <0x0000004c, AMDGPUSample_c_l>;
+defm IMAGE_GATHER4_C_B : MIMG_Gather_WQM <0x0000004d, AMDGPUSample_c_b>;
+defm IMAGE_GATHER4_C_B_CL : MIMG_Gather_WQM <0x0000004e, AMDGPUSample_c_b_cl>;
+defm IMAGE_GATHER4_C_LZ : MIMG_Gather <0x0000004f, AMDGPUSample_c_lz>;
+defm IMAGE_GATHER4_O : MIMG_Gather_WQM <0x00000050, AMDGPUSample_o>;
+defm IMAGE_GATHER4_CL_O : MIMG_Gather_WQM <0x00000051, AMDGPUSample_cl_o>;
+defm IMAGE_GATHER4_L_O : MIMG_Gather <0x00000054, AMDGPUSample_l_o>;
+defm IMAGE_GATHER4_B_O : MIMG_Gather_WQM <0x00000055, AMDGPUSample_b_o>;
+defm IMAGE_GATHER4_B_CL_O : MIMG_Gather <0x00000056, AMDGPUSample_b_cl_o>;
+defm IMAGE_GATHER4_LZ_O : MIMG_Gather <0x00000057, AMDGPUSample_lz_o>;
+defm IMAGE_GATHER4_C_O : MIMG_Gather_WQM <0x00000058, AMDGPUSample_c_o>;
+defm IMAGE_GATHER4_C_CL_O : MIMG_Gather_WQM <0x00000059, AMDGPUSample_c_cl_o>;
+defm IMAGE_GATHER4_C_L_O : MIMG_Gather <0x0000005c, AMDGPUSample_c_l_o>;
+defm IMAGE_GATHER4_C_B_O : MIMG_Gather_WQM <0x0000005d, AMDGPUSample_c_b_o>;
+defm IMAGE_GATHER4_C_B_CL_O : MIMG_Gather_WQM <0x0000005e, AMDGPUSample_c_b_cl_o>;
+defm IMAGE_GATHER4_C_LZ_O : MIMG_Gather <0x0000005f, AMDGPUSample_c_lz_o>;
+
+defm IMAGE_GET_LOD : MIMG_Sampler <0x00000060, AMDGPUSample, 1, 1, "image_get_lod">;
+
+defm IMAGE_SAMPLE_CD : MIMG_Sampler <0x00000068, AMDGPUSample_cd>;
+defm IMAGE_SAMPLE_CD_CL : MIMG_Sampler <0x00000069, AMDGPUSample_cd_cl>;
+defm IMAGE_SAMPLE_C_CD : MIMG_Sampler <0x0000006a, AMDGPUSample_c_cd>;
+defm IMAGE_SAMPLE_C_CD_CL : MIMG_Sampler <0x0000006b, AMDGPUSample_c_cd_cl>;
+defm IMAGE_SAMPLE_CD_O : MIMG_Sampler <0x0000006c, AMDGPUSample_cd_o>;
+defm IMAGE_SAMPLE_CD_CL_O : MIMG_Sampler <0x0000006d, AMDGPUSample_cd_cl_o>;
+defm IMAGE_SAMPLE_C_CD_O : MIMG_Sampler <0x0000006e, AMDGPUSample_c_cd_o>;
+defm IMAGE_SAMPLE_C_CD_CL_O : MIMG_Sampler <0x0000006f, AMDGPUSample_c_cd_cl_o>;
//def IMAGE_RSRC256 : MIMG_NoPattern_RSRC256 <"image_rsrc256", 0x0000007e>;
//def IMAGE_SAMPLER : MIMG_NoPattern_ <"image_sampler", 0x0000007f>;
+
+/********** ========================================= **********/
+/********** Table of dimension-aware image intrinsics **********/
+/********** ========================================= **********/
+
+class ImageDimIntrinsicInfo<AMDGPUImageDimIntrinsic I> {
+ Intrinsic Intr = I;
+ MIMGBaseOpcode BaseOpcode = !cast<MIMGBaseOpcode>(!strconcat("IMAGE_", I.P.OpMod));
+ AMDGPUDimProps Dim = I.P.Dim;
}
-/********** ======================= **********/
-/********** Image sampling patterns **********/
-/********** ======================= **********/
-
-// Image + sampler
-class SampleRawPattern<SDPatternOperator name, MIMG opcode, ValueType vt> : GCNPat <
- (name vt:$addr, v8i32:$rsrc, v4i32:$sampler, i32:$dmask, i32:$unorm,
- i32:$r128, i32:$da, i32:$glc, i32:$slc, i32:$tfe, i32:$lwe),
- (opcode $addr, $rsrc, $sampler,
- (as_i32imm $dmask), (as_i1imm $unorm), (as_i1imm $glc), (as_i1imm $slc),
- (as_i1imm $r128), (as_i1imm $tfe), (as_i1imm $lwe), (as_i1imm $da))
->;
-
-multiclass SampleRawPatterns<SDPatternOperator name, string opcode> {
- def : SampleRawPattern<name, !cast<MIMG>(opcode # _V4_V1), i32>;
- def : SampleRawPattern<name, !cast<MIMG>(opcode # _V4_V2), v2i32>;
- def : SampleRawPattern<name, !cast<MIMG>(opcode # _V4_V4), v4i32>;
- def : SampleRawPattern<name, !cast<MIMG>(opcode # _V4_V8), v8i32>;
- def : SampleRawPattern<name, !cast<MIMG>(opcode # _V4_V16), v16i32>;
-}
-
-// Image + sampler for amdgcn
-// TODO:
-// 1. Handle half data type like v4f16, and add D16 bit support;
-// 2. Handle v4i32 rsrc type (Register Class for the instruction to be SReg_128).
-// 3. Add A16 support when we pass address of half type.
-multiclass AMDGCNSamplePattern<SDPatternOperator name, MIMG opcode, ValueType dt, ValueType vt> {
- def : GCNPat<
- (dt (name vt:$addr, v8i32:$rsrc, v4i32:$sampler, i32:$dmask, i1:$unorm, i1:$glc,
- i1:$slc, i1:$lwe, i1:$da)),
- (opcode $addr, $rsrc, $sampler,
- (as_i32imm $dmask), (as_i1imm $unorm), (as_i1imm $glc), (as_i1imm $slc),
- 0, 0, (as_i1imm $lwe), (as_i1imm $da))
- >;
-}
-
-multiclass AMDGCNSampleDataPatterns<SDPatternOperator name, string opcode, ValueType dt> {
- defm : AMDGCNSamplePattern<name, !cast<MIMG>(opcode # _V1), dt, f32>;
- defm : AMDGCNSamplePattern<name, !cast<MIMG>(opcode # _V2), dt, v2f32>;
- defm : AMDGCNSamplePattern<name, !cast<MIMG>(opcode # _V4), dt, v4f32>;
- defm : AMDGCNSamplePattern<name, !cast<MIMG>(opcode # _V8), dt, v8f32>;
- defm : AMDGCNSamplePattern<name, !cast<MIMG>(opcode # _V16), dt, v16f32>;
-}
-
-// TODO: support v3f32.
-multiclass AMDGCNSamplePatterns<SDPatternOperator name, string opcode> {
- defm : AMDGCNSampleDataPatterns<name, !cast<string>(opcode # _V1), f32>;
- defm : AMDGCNSampleDataPatterns<name, !cast<string>(opcode # _V2), v2f32>;
- defm : AMDGCNSampleDataPatterns<name, !cast<string>(opcode # _V4), v4f32>;
-}
-
-// Image only
-class ImagePattern<SDPatternOperator name, MIMG opcode, ValueType vt> : GCNPat <
- (name vt:$addr, v8i32:$rsrc, imm:$dmask, imm:$unorm,
- imm:$r128, imm:$da, imm:$glc, imm:$slc, imm:$tfe, imm:$lwe),
- (opcode $addr, $rsrc,
- (as_i32imm $dmask), (as_i1imm $unorm), (as_i1imm $glc), (as_i1imm $slc),
- (as_i1imm $r128), (as_i1imm $tfe), (as_i1imm $lwe), (as_i1imm $da))
->;
-
-multiclass ImagePatterns<SDPatternOperator name, string opcode> {
- def : ImagePattern<name, !cast<MIMG>(opcode # _V4_V1), i32>;
- def : ImagePattern<name, !cast<MIMG>(opcode # _V4_V2), v2i32>;
- def : ImagePattern<name, !cast<MIMG>(opcode # _V4_V4), v4i32>;
-}
-
-multiclass ImageLoadPattern<SDPatternOperator name, MIMG opcode, ValueType dt, ValueType vt> {
- def : GCNPat <
- (dt (name vt:$addr, v8i32:$rsrc, i32:$dmask, i1:$glc, i1:$slc, i1:$lwe,
- i1:$da)),
- (opcode $addr, $rsrc,
- (as_i32imm $dmask), 1, (as_i1imm $glc), (as_i1imm $slc),
- 0, 0, (as_i1imm $lwe), (as_i1imm $da))
- >;
-}
-
-multiclass ImageLoadDataPatterns<SDPatternOperator name, string opcode, ValueType dt> {
- defm : ImageLoadPattern<name, !cast<MIMG>(opcode # _V1), dt, i32>;
- defm : ImageLoadPattern<name, !cast<MIMG>(opcode # _V2), dt, v2i32>;
- defm : ImageLoadPattern<name, !cast<MIMG>(opcode # _V4), dt, v4i32>;
-}
-
-// TODO: support v3f32.
-multiclass ImageLoadPatterns<SDPatternOperator name, string opcode> {
- defm : ImageLoadDataPatterns<name, !cast<string>(opcode # _V1), f32>;
- defm : ImageLoadDataPatterns<name, !cast<string>(opcode # _V2), v2f32>;
- defm : ImageLoadDataPatterns<name, !cast<string>(opcode # _V4), v4f32>;
-}
-
-multiclass ImageStorePattern<SDPatternOperator name, MIMG opcode, ValueType dt, ValueType vt> {
- def : GCNPat <
- (name dt:$data, vt:$addr, v8i32:$rsrc, i32:$dmask, i1:$glc, i1:$slc,
- i1:$lwe, i1:$da),
- (opcode $data, $addr, $rsrc,
- (as_i32imm $dmask), 1, (as_i1imm $glc), (as_i1imm $slc),
- 0, 0, (as_i1imm $lwe), (as_i1imm $da))
- >;
-}
-
-multiclass ImageStoreDataPatterns<SDPatternOperator name, string opcode, ValueType dt> {
- defm : ImageStorePattern<name, !cast<MIMG>(opcode # _V1), dt, i32>;
- defm : ImageStorePattern<name, !cast<MIMG>(opcode # _V2), dt, v2i32>;
- defm : ImageStorePattern<name, !cast<MIMG>(opcode # _V4), dt, v4i32>;
-}
-
-// TODO: support v3f32.
-multiclass ImageStorePatterns<SDPatternOperator name, string opcode> {
- defm : ImageStoreDataPatterns<name, !cast<string>(opcode # _V1), f32>;
- defm : ImageStoreDataPatterns<name, !cast<string>(opcode # _V2), v2f32>;
- defm : ImageStoreDataPatterns<name, !cast<string>(opcode # _V4), v4f32>;
-}
-
-class ImageAtomicPattern<SDPatternOperator name, MIMG opcode, ValueType vt> : GCNPat <
- (name i32:$vdata, vt:$addr, v8i32:$rsrc, imm:$r128, imm:$da, imm:$slc),
- (opcode $vdata, $addr, $rsrc, 1, 1, 1, (as_i1imm $slc), (as_i1imm $r128), 0, 0, (as_i1imm $da))
->;
-
-multiclass ImageAtomicPatterns<SDPatternOperator name, string opcode> {
- def : ImageAtomicPattern<name, !cast<MIMG>(opcode # _V1), i32>;
- def : ImageAtomicPattern<name, !cast<MIMG>(opcode # _V2), v2i32>;
- def : ImageAtomicPattern<name, !cast<MIMG>(opcode # _V4), v4i32>;
-}
-
-class ImageAtomicCmpSwapPattern<MIMG opcode, ValueType vt> : GCNPat <
- (int_amdgcn_image_atomic_cmpswap i32:$vsrc, i32:$vcmp, vt:$addr, v8i32:$rsrc,
- imm:$r128, imm:$da, imm:$slc),
- (EXTRACT_SUBREG
- (opcode (REG_SEQUENCE VReg_64, $vsrc, sub0, $vcmp, sub1),
- $addr, $rsrc, 3, 1, 1, (as_i1imm $slc), (as_i1imm $r128), 0, 0, (as_i1imm $da)),
- sub0)
->;
-
-// ======= amdgcn Image Intrinsics ==============
-
-// Image load
-defm : ImageLoadPatterns<int_amdgcn_image_load, "IMAGE_LOAD">;
-defm : ImageLoadPatterns<int_amdgcn_image_load_mip, "IMAGE_LOAD_MIP">;
-defm : ImageLoadPatterns<int_amdgcn_image_getresinfo, "IMAGE_GET_RESINFO">;
-
-// Image store
-defm : ImageStorePatterns<int_amdgcn_image_store, "IMAGE_STORE">;
-defm : ImageStorePatterns<int_amdgcn_image_store_mip, "IMAGE_STORE_MIP">;
-
-// Basic sample
-defm : AMDGCNSamplePatterns<int_amdgcn_image_sample, "IMAGE_SAMPLE">;
-defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_cl, "IMAGE_SAMPLE_CL">;
-defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_d, "IMAGE_SAMPLE_D">;
-defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_d_cl, "IMAGE_SAMPLE_D_CL">;
-defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_l, "IMAGE_SAMPLE_L">;
-defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_b, "IMAGE_SAMPLE_B">;
-defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_b_cl, "IMAGE_SAMPLE_B_CL">;
-defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_lz, "IMAGE_SAMPLE_LZ">;
-defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_cd, "IMAGE_SAMPLE_CD">;
-defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_cd_cl, "IMAGE_SAMPLE_CD_CL">;
-
-// Sample with comparison
-defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_c, "IMAGE_SAMPLE_C">;
-defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_c_cl, "IMAGE_SAMPLE_C_CL">;
-defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_c_d, "IMAGE_SAMPLE_C_D">;
-defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_c_d_cl, "IMAGE_SAMPLE_C_D_CL">;
-defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_c_l, "IMAGE_SAMPLE_C_L">;
-defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_c_b, "IMAGE_SAMPLE_C_B">;
-defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_c_b_cl, "IMAGE_SAMPLE_C_B_CL">;
-defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_c_lz, "IMAGE_SAMPLE_C_LZ">;
-defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_c_cd, "IMAGE_SAMPLE_C_CD">;
-defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_c_cd_cl, "IMAGE_SAMPLE_C_CD_CL">;
-
-// Sample with offsets
-defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_o, "IMAGE_SAMPLE_O">;
-defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_cl_o, "IMAGE_SAMPLE_CL_O">;
-defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_d_o, "IMAGE_SAMPLE_D_O">;
-defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_d_cl_o, "IMAGE_SAMPLE_D_CL_O">;
-defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_l_o, "IMAGE_SAMPLE_L_O">;
-defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_b_o, "IMAGE_SAMPLE_B_O">;
-defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_b_cl_o, "IMAGE_SAMPLE_B_CL_O">;
-defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_lz_o, "IMAGE_SAMPLE_LZ_O">;
-defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_cd_o, "IMAGE_SAMPLE_CD_O">;
-defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_cd_cl_o, "IMAGE_SAMPLE_CD_CL_O">;
-
-// Sample with comparison and offsets
-defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_c_o, "IMAGE_SAMPLE_C_O">;
-defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_c_cl_o, "IMAGE_SAMPLE_C_CL_O">;
-defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_c_d_o, "IMAGE_SAMPLE_C_D_O">;
-defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_c_d_cl_o, "IMAGE_SAMPLE_C_D_CL_O">;
-defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_c_l_o, "IMAGE_SAMPLE_C_L_O">;
-defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_c_b_o, "IMAGE_SAMPLE_C_B_O">;
-defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_c_b_cl_o, "IMAGE_SAMPLE_C_B_CL_O">;
-defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_c_lz_o, "IMAGE_SAMPLE_C_LZ_O">;
-defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_c_cd_o, "IMAGE_SAMPLE_C_CD_O">;
-defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_c_cd_cl_o, "IMAGE_SAMPLE_C_CD_CL_O">;
-
-// Gather opcodes
-defm : AMDGCNSamplePatterns<int_amdgcn_image_gather4, "IMAGE_GATHER4">;
-defm : AMDGCNSamplePatterns<int_amdgcn_image_gather4_cl, "IMAGE_GATHER4_CL">;
-defm : AMDGCNSamplePatterns<int_amdgcn_image_gather4_l, "IMAGE_GATHER4_L">;
-defm : AMDGCNSamplePatterns<int_amdgcn_image_gather4_b, "IMAGE_GATHER4_B">;
-defm : AMDGCNSamplePatterns<int_amdgcn_image_gather4_b_cl, "IMAGE_GATHER4_B_CL">;
-defm : AMDGCNSamplePatterns<int_amdgcn_image_gather4_lz, "IMAGE_GATHER4_LZ">;
-
-defm : AMDGCNSamplePatterns<int_amdgcn_image_gather4_c, "IMAGE_GATHER4_C">;
-defm : AMDGCNSamplePatterns<int_amdgcn_image_gather4_c_cl, "IMAGE_GATHER4_C_CL">;
-defm : AMDGCNSamplePatterns<int_amdgcn_image_gather4_c_l, "IMAGE_GATHER4_C_L">;
-defm : AMDGCNSamplePatterns<int_amdgcn_image_gather4_c_b, "IMAGE_GATHER4_C_B">;
-defm : AMDGCNSamplePatterns<int_amdgcn_image_gather4_c_b_cl, "IMAGE_GATHER4_C_B_CL">;
-defm : AMDGCNSamplePatterns<int_amdgcn_image_gather4_c_lz, "IMAGE_GATHER4_C_LZ">;
-
-defm : AMDGCNSamplePatterns<int_amdgcn_image_gather4_o, "IMAGE_GATHER4_O">;
-defm : AMDGCNSamplePatterns<int_amdgcn_image_gather4_cl_o, "IMAGE_GATHER4_CL_O">;
-defm : AMDGCNSamplePatterns<int_amdgcn_image_gather4_l_o, "IMAGE_GATHER4_L_O">;
-defm : AMDGCNSamplePatterns<int_amdgcn_image_gather4_b_o, "IMAGE_GATHER4_B_O">;
-defm : AMDGCNSamplePatterns<int_amdgcn_image_gather4_b_cl_o, "IMAGE_GATHER4_B_CL_O">;
-defm : AMDGCNSamplePatterns<int_amdgcn_image_gather4_lz_o, "IMAGE_GATHER4_LZ_O">;
-
-defm : AMDGCNSamplePatterns<int_amdgcn_image_gather4_c_o, "IMAGE_GATHER4_C_O">;
-defm : AMDGCNSamplePatterns<int_amdgcn_image_gather4_c_cl_o, "IMAGE_GATHER4_C_CL_O">;
-defm : AMDGCNSamplePatterns<int_amdgcn_image_gather4_c_l_o, "IMAGE_GATHER4_C_L_O">;
-defm : AMDGCNSamplePatterns<int_amdgcn_image_gather4_c_b_o, "IMAGE_GATHER4_C_B_O">;
-defm : AMDGCNSamplePatterns<int_amdgcn_image_gather4_c_b_cl_o, "IMAGE_GATHER4_C_B_CL_O">;
-defm : AMDGCNSamplePatterns<int_amdgcn_image_gather4_c_lz_o, "IMAGE_GATHER4_C_LZ_O">;
-
-defm : AMDGCNSamplePatterns<int_amdgcn_image_getlod, "IMAGE_GET_LOD">;
-
-// Image atomics
-defm : ImageAtomicPatterns<int_amdgcn_image_atomic_swap, "IMAGE_ATOMIC_SWAP">;
-def : ImageAtomicCmpSwapPattern<IMAGE_ATOMIC_CMPSWAP_V1, i32>;
-def : ImageAtomicCmpSwapPattern<IMAGE_ATOMIC_CMPSWAP_V2, v2i32>;
-def : ImageAtomicCmpSwapPattern<IMAGE_ATOMIC_CMPSWAP_V4, v4i32>;
-defm : ImageAtomicPatterns<int_amdgcn_image_atomic_add, "IMAGE_ATOMIC_ADD">;
-defm : ImageAtomicPatterns<int_amdgcn_image_atomic_sub, "IMAGE_ATOMIC_SUB">;
-defm : ImageAtomicPatterns<int_amdgcn_image_atomic_smin, "IMAGE_ATOMIC_SMIN">;
-defm : ImageAtomicPatterns<int_amdgcn_image_atomic_umin, "IMAGE_ATOMIC_UMIN">;
-defm : ImageAtomicPatterns<int_amdgcn_image_atomic_smax, "IMAGE_ATOMIC_SMAX">;
-defm : ImageAtomicPatterns<int_amdgcn_image_atomic_umax, "IMAGE_ATOMIC_UMAX">;
-defm : ImageAtomicPatterns<int_amdgcn_image_atomic_and, "IMAGE_ATOMIC_AND">;
-defm : ImageAtomicPatterns<int_amdgcn_image_atomic_or, "IMAGE_ATOMIC_OR">;
-defm : ImageAtomicPatterns<int_amdgcn_image_atomic_xor, "IMAGE_ATOMIC_XOR">;
-defm : ImageAtomicPatterns<int_amdgcn_image_atomic_inc, "IMAGE_ATOMIC_INC">;
-defm : ImageAtomicPatterns<int_amdgcn_image_atomic_dec, "IMAGE_ATOMIC_DEC">;
-
-/* SIsample for simple 1D texture lookup */
-def : GCNPat <
- (SIsample i32:$addr, v8i32:$rsrc, v4i32:$sampler, imm),
- (IMAGE_SAMPLE_V4_V1 $addr, $rsrc, $sampler, 0xf, 0, 0, 0, 0, 0, 0, 0)
->;
-
-class SamplePattern<SDNode name, MIMG opcode, ValueType vt> : GCNPat <
- (name vt:$addr, v8i32:$rsrc, v4i32:$sampler, imm),
- (opcode $addr, $rsrc, $sampler, 0xf, 0, 0, 0, 0, 0, 0, 0)
->;
-
-class SampleRectPattern<SDNode name, MIMG opcode, ValueType vt> : GCNPat <
- (name vt:$addr, v8i32:$rsrc, v4i32:$sampler, TEX_RECT),
- (opcode $addr, $rsrc, $sampler, 0xf, 1, 0, 0, 0, 0, 0, 0)
->;
-
-class SampleArrayPattern<SDNode name, MIMG opcode, ValueType vt> : GCNPat <
- (name vt:$addr, v8i32:$rsrc, v4i32:$sampler, TEX_ARRAY),
- (opcode $addr, $rsrc, $sampler, 0xf, 0, 0, 0, 0, 0, 0, 1)
->;
-
-class SampleShadowPattern<SDNode name, MIMG opcode,
- ValueType vt> : GCNPat <
- (name vt:$addr, v8i32:$rsrc, v4i32:$sampler, TEX_SHADOW),
- (opcode $addr, $rsrc, $sampler, 0xf, 0, 0, 0, 0, 0, 0, 0)
->;
-
-class SampleShadowArrayPattern<SDNode name, MIMG opcode,
- ValueType vt> : GCNPat <
- (name vt:$addr, v8i32:$rsrc, v4i32:$sampler, TEX_SHADOW_ARRAY),
- (opcode $addr, $rsrc, $sampler, 0xf, 0, 0, 0, 0, 0, 0, 1)
->;
-
-/* SIsample* for texture lookups consuming more address parameters */
-multiclass SamplePatterns<MIMG sample, MIMG sample_c, MIMG sample_l,
- MIMG sample_c_l, MIMG sample_b, MIMG sample_c_b,
-MIMG sample_d, MIMG sample_c_d, ValueType addr_type> {
- def : SamplePattern <SIsample, sample, addr_type>;
- def : SampleRectPattern <SIsample, sample, addr_type>;
- def : SampleArrayPattern <SIsample, sample, addr_type>;
- def : SampleShadowPattern <SIsample, sample_c, addr_type>;
- def : SampleShadowArrayPattern <SIsample, sample_c, addr_type>;
-
- def : SamplePattern <SIsamplel, sample_l, addr_type>;
- def : SampleArrayPattern <SIsamplel, sample_l, addr_type>;
- def : SampleShadowPattern <SIsamplel, sample_c_l, addr_type>;
- def : SampleShadowArrayPattern <SIsamplel, sample_c_l, addr_type>;
-
- def : SamplePattern <SIsampleb, sample_b, addr_type>;
- def : SampleArrayPattern <SIsampleb, sample_b, addr_type>;
- def : SampleShadowPattern <SIsampleb, sample_c_b, addr_type>;
- def : SampleShadowArrayPattern <SIsampleb, sample_c_b, addr_type>;
-
- def : SamplePattern <SIsampled, sample_d, addr_type>;
- def : SampleArrayPattern <SIsampled, sample_d, addr_type>;
- def : SampleShadowPattern <SIsampled, sample_c_d, addr_type>;
- def : SampleShadowArrayPattern <SIsampled, sample_c_d, addr_type>;
-}
-
-defm : SamplePatterns<IMAGE_SAMPLE_V4_V2, IMAGE_SAMPLE_C_V4_V2,
- IMAGE_SAMPLE_L_V4_V2, IMAGE_SAMPLE_C_L_V4_V2,
- IMAGE_SAMPLE_B_V4_V2, IMAGE_SAMPLE_C_B_V4_V2,
- IMAGE_SAMPLE_D_V4_V2, IMAGE_SAMPLE_C_D_V4_V2,
- v2i32>;
-defm : SamplePatterns<IMAGE_SAMPLE_V4_V4, IMAGE_SAMPLE_C_V4_V4,
- IMAGE_SAMPLE_L_V4_V4, IMAGE_SAMPLE_C_L_V4_V4,
- IMAGE_SAMPLE_B_V4_V4, IMAGE_SAMPLE_C_B_V4_V4,
- IMAGE_SAMPLE_D_V4_V4, IMAGE_SAMPLE_C_D_V4_V4,
- v4i32>;
-defm : SamplePatterns<IMAGE_SAMPLE_V4_V8, IMAGE_SAMPLE_C_V4_V8,
- IMAGE_SAMPLE_L_V4_V8, IMAGE_SAMPLE_C_L_V4_V8,
- IMAGE_SAMPLE_B_V4_V8, IMAGE_SAMPLE_C_B_V4_V8,
- IMAGE_SAMPLE_D_V4_V8, IMAGE_SAMPLE_C_D_V4_V8,
- v8i32>;
-defm : SamplePatterns<IMAGE_SAMPLE_V4_V16, IMAGE_SAMPLE_C_V4_V16,
- IMAGE_SAMPLE_L_V4_V16, IMAGE_SAMPLE_C_L_V4_V16,
- IMAGE_SAMPLE_B_V4_V16, IMAGE_SAMPLE_C_B_V4_V16,
- IMAGE_SAMPLE_D_V4_V16, IMAGE_SAMPLE_C_D_V4_V16,
- v16i32>;
+def ImageDimIntrinsicTable : GenericTable {
+ let FilterClass = "ImageDimIntrinsicInfo";
+ let Fields = ["Intr", "BaseOpcode", "Dim"];
+ GenericEnum TypeOf_BaseOpcode = MIMGBaseOpcode;
+ GenericEnum TypeOf_Dim = MIMGDim;
+
+ let PrimaryKey = ["Intr"];
+ let PrimaryKeyName = "getImageDimIntrinsicInfo";
+ let PrimaryKeyEarlyOut = 1;
+}
+
+foreach intr = !listconcat(AMDGPUImageDimIntrinsics,
+ AMDGPUImageDimAtomicIntrinsics) in {
+ def : ImageDimIntrinsicInfo<intr>;
+}
diff --git a/lib/Target/AMDGPU/Processors.td b/lib/Target/AMDGPU/Processors.td
deleted file mode 100644
index d50dae78e247..000000000000
--- a/lib/Target/AMDGPU/Processors.td
+++ /dev/null
@@ -1,12 +0,0 @@
-//===-- Processors.td - AMDGPU Processor definitions ----------------------===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-FIXME: Deleting this file broke buildbots that don't do full rebuilds. This
-file is no longer used by the backend, so it can be deleted once all
-the buildbots update there dependencies.
diff --git a/lib/Target/AMDGPU/R600.td b/lib/Target/AMDGPU/R600.td
new file mode 100644
index 000000000000..5c9c1c1ed504
--- /dev/null
+++ b/lib/Target/AMDGPU/R600.td
@@ -0,0 +1,54 @@
+//===-- R600.td - R600 Tablegen files ----------------------*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+include "llvm/Target/Target.td"
+
+def R600InstrInfo : InstrInfo {
+ let guessInstructionProperties = 1;
+ let noNamedPositionallyEncodedOperands = 1;
+}
+
+def R600 : Target {
+ let InstructionSet = R600InstrInfo;
+ let AllowRegisterRenaming = 1;
+}
+
+let Namespace = "R600" in {
+
+foreach Index = 0-15 in {
+ def sub#Index : SubRegIndex<32, !shl(Index, 5)>;
+}
+
+include "R600RegisterInfo.td"
+
+}
+
+def NullALU : InstrItinClass;
+def ALU_NULL : FuncUnit;
+
+include "AMDGPUFeatures.td"
+include "R600Schedule.td"
+include "R600Processors.td"
+include "AMDGPUInstrInfo.td"
+include "AMDGPUInstructions.td"
+include "R600Instructions.td"
+include "R700Instructions.td"
+include "EvergreenInstructions.td"
+include "CaymanInstructions.td"
+
+// Calling convention for R600
+def CC_R600 : CallingConv<[
+ CCIfInReg<CCIfType<[v4f32, v4i32] , CCAssignToReg<[
+ T0_XYZW, T1_XYZW, T2_XYZW, T3_XYZW, T4_XYZW, T5_XYZW, T6_XYZW, T7_XYZW,
+ T8_XYZW, T9_XYZW, T10_XYZW, T11_XYZW, T12_XYZW, T13_XYZW, T14_XYZW, T15_XYZW,
+ T16_XYZW, T17_XYZW, T18_XYZW, T19_XYZW, T20_XYZW, T21_XYZW, T22_XYZW,
+ T23_XYZW, T24_XYZW, T25_XYZW, T26_XYZW, T27_XYZW, T28_XYZW, T29_XYZW,
+ T30_XYZW, T31_XYZW, T32_XYZW
+ ]>>>
+]>;
diff --git a/lib/Target/AMDGPU/R600AsmPrinter.cpp b/lib/Target/AMDGPU/R600AsmPrinter.cpp
new file mode 100644
index 000000000000..68f8c30775b8
--- /dev/null
+++ b/lib/Target/AMDGPU/R600AsmPrinter.cpp
@@ -0,0 +1,133 @@
+//===-- R600AsmPrinter.cpp - R600 Assembly printer -----------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+///
+/// The R600AsmPrinter is used to print both an assembly string and binary
+/// code. When passed an MCAsmStreamer it prints assembly, and when passed
+/// an MCObjectStreamer it outputs binary code.
+//
+//===----------------------------------------------------------------------===//
+
+#include "R600AsmPrinter.h"
+#include "AMDGPUSubtarget.h"
+#include "R600Defines.h"
+#include "R600MachineFunctionInfo.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "llvm/BinaryFormat/ELF.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCSectionELF.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
+
+using namespace llvm;
+
+AsmPrinter *
+llvm::createR600AsmPrinterPass(TargetMachine &TM,
+ std::unique_ptr<MCStreamer> &&Streamer) {
+ return new R600AsmPrinter(TM, std::move(Streamer));
+}
+
+R600AsmPrinter::R600AsmPrinter(TargetMachine &TM,
+ std::unique_ptr<MCStreamer> Streamer)
+ : AsmPrinter(TM, std::move(Streamer)) { }
+
+StringRef R600AsmPrinter::getPassName() const {
+ return "R600 Assembly Printer";
+}
+
+void R600AsmPrinter::EmitProgramInfoR600(const MachineFunction &MF) {
+ unsigned MaxGPR = 0;
+ bool killPixel = false;
+ const R600Subtarget &STM = MF.getSubtarget<R600Subtarget>();
+ const R600RegisterInfo *RI = STM.getRegisterInfo();
+ const R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
+
+ for (const MachineBasicBlock &MBB : MF) {
+ for (const MachineInstr &MI : MBB) {
+ if (MI.getOpcode() == R600::KILLGT)
+ killPixel = true;
+ unsigned numOperands = MI.getNumOperands();
+ for (unsigned op_idx = 0; op_idx < numOperands; op_idx++) {
+ const MachineOperand &MO = MI.getOperand(op_idx);
+ if (!MO.isReg())
+ continue;
+ unsigned HWReg = RI->getHWRegIndex(MO.getReg());
+
+        // Registers with a value > 127 aren't GPRs.
+ if (HWReg > 127)
+ continue;
+ MaxGPR = std::max(MaxGPR, HWReg);
+ }
+ }
+ }
+
+ unsigned RsrcReg;
+ if (STM.getGeneration() >= AMDGPUSubtarget::EVERGREEN) {
+ // Evergreen / Northern Islands
+ switch (MF.getFunction().getCallingConv()) {
+ default: LLVM_FALLTHROUGH;
+ case CallingConv::AMDGPU_CS: RsrcReg = R_0288D4_SQ_PGM_RESOURCES_LS; break;
+ case CallingConv::AMDGPU_GS: RsrcReg = R_028878_SQ_PGM_RESOURCES_GS; break;
+ case CallingConv::AMDGPU_PS: RsrcReg = R_028844_SQ_PGM_RESOURCES_PS; break;
+ case CallingConv::AMDGPU_VS: RsrcReg = R_028860_SQ_PGM_RESOURCES_VS; break;
+ }
+ } else {
+ // R600 / R700
+ switch (MF.getFunction().getCallingConv()) {
+ default: LLVM_FALLTHROUGH;
+ case CallingConv::AMDGPU_GS: LLVM_FALLTHROUGH;
+ case CallingConv::AMDGPU_CS: LLVM_FALLTHROUGH;
+ case CallingConv::AMDGPU_VS: RsrcReg = R_028868_SQ_PGM_RESOURCES_VS; break;
+ case CallingConv::AMDGPU_PS: RsrcReg = R_028850_SQ_PGM_RESOURCES_PS; break;
+ }
+ }
+
+ OutStreamer->EmitIntValue(RsrcReg, 4);
+ OutStreamer->EmitIntValue(S_NUM_GPRS(MaxGPR + 1) |
+ S_STACK_SIZE(MFI->CFStackSize), 4);
+ OutStreamer->EmitIntValue(R_02880C_DB_SHADER_CONTROL, 4);
+ OutStreamer->EmitIntValue(S_02880C_KILL_ENABLE(killPixel), 4);
+
+ if (AMDGPU::isCompute(MF.getFunction().getCallingConv())) {
+ OutStreamer->EmitIntValue(R_0288E8_SQ_LDS_ALLOC, 4);
+ OutStreamer->EmitIntValue(alignTo(MFI->getLDSSize(), 4) >> 2, 4);
+ }
+}
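[editor's note] EmitProgramInfoR600 lays the program description out as consecutive 32-bit (register, value) pairs in the .AMDGPU.config section. A minimal sketch of how a consumer could walk that section, assuming it already has the raw section bytes in host byte order; the function below is hypothetical and not part of LLVM.

  #include <cstddef>
  #include <cstdint>
  #include <cstring>
  #include <utility>
  #include <vector>

  // Walk .AMDGPU.config as (register, value) dword pairs, mirroring the
  // EmitIntValue(Reg, 4) / EmitIntValue(Value, 4) sequence emitted above.
  static std::vector<std::pair<uint32_t, uint32_t>>
  parseAMDGPUConfig(const uint8_t *Data, size_t Size) {
    std::vector<std::pair<uint32_t, uint32_t>> Pairs;
    for (size_t I = 0; I + 8 <= Size; I += 8) {
      uint32_t Reg, Value;
      std::memcpy(&Reg, Data + I, sizeof(Reg));
      std::memcpy(&Value, Data + I + 4, sizeof(Value));
      Pairs.emplace_back(Reg, Value);
    }
    return Pairs;
  }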
+
+bool R600AsmPrinter::runOnMachineFunction(MachineFunction &MF) {
+
+
+  // Functions need to be cacheline (256B) aligned.
+ MF.ensureAlignment(8);
+
+ SetupMachineFunction(MF);
+
+ MCContext &Context = getObjFileLowering().getContext();
+ MCSectionELF *ConfigSection =
+ Context.getELFSection(".AMDGPU.config", ELF::SHT_PROGBITS, 0);
+ OutStreamer->SwitchSection(ConfigSection);
+
+ EmitProgramInfoR600(MF);
+
+ EmitFunctionBody();
+
+ if (isVerbose()) {
+ MCSectionELF *CommentSection =
+ Context.getELFSection(".AMDGPU.csdata", ELF::SHT_PROGBITS, 0);
+ OutStreamer->SwitchSection(CommentSection);
+
+ R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
+ OutStreamer->emitRawComment(
+ Twine("SQ_PGM_RESOURCES:STACK_SIZE = " + Twine(MFI->CFStackSize)));
+ }
+
+ return false;
+}
+
diff --git a/lib/Target/AMDGPU/R600AsmPrinter.h b/lib/Target/AMDGPU/R600AsmPrinter.h
new file mode 100644
index 000000000000..079fc707b03c
--- /dev/null
+++ b/lib/Target/AMDGPU/R600AsmPrinter.h
@@ -0,0 +1,46 @@
+//===-- R600AsmPrinter.h - Print R600 assembly code -------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// R600 Assembly printer class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_R600ASMPRINTER_H
+#define LLVM_LIB_TARGET_AMDGPU_R600ASMPRINTER_H
+
+#include "llvm/CodeGen/AsmPrinter.h"
+
+namespace llvm {
+
+class R600AsmPrinter final : public AsmPrinter {
+
+public:
+ explicit R600AsmPrinter(TargetMachine &TM,
+ std::unique_ptr<MCStreamer> Streamer);
+ StringRef getPassName() const override;
+ bool runOnMachineFunction(MachineFunction &MF) override;
+ /// Implemented in AMDGPUMCInstLower.cpp
+ void EmitInstruction(const MachineInstr *MI) override;
+ /// Lower the specified LLVM Constant to an MCExpr.
+  /// AsmPrinter::lowerConstant does not know how to lower addrspacecast,
+  /// so such constants are lowered by this function instead.
+ const MCExpr *lowerConstant(const Constant *CV) override;
+
+private:
+ void EmitProgramInfoR600(const MachineFunction &MF);
+};
+
+AsmPrinter *
+createR600AsmPrinterPass(TargetMachine &TM,
+ std::unique_ptr<MCStreamer> &&Streamer);
+
+} // namespace llvm
+
+#endif // LLVM_LIB_TARGET_AMDGPU_R600ASMPRINTER_H
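[editor's note] The factory above mirrors the usual AsmPrinter constructor signature, so whether textual assembly or an object file comes out is decided entirely by the MCStreamer that is handed in, as the .cpp header comment notes. A sketch of how it is presumably hooked into the TargetRegistry follows; the target accessor and the registration site are assumptions based on the common pattern, not taken from this patch.

  #include "R600AsmPrinter.h"
  #include "llvm/Support/TargetRegistry.h"
  #include <memory>

  namespace llvm {
  // Assumed accessor for the r600 Target object (provided by the MC layer).
  Target &getTheAMDGPUTarget();
  } // namespace llvm

  // Sketch: tell the TargetRegistry to build R600AsmPrinter for the r600
  // target; the registry later calls createR600AsmPrinterPass with either an
  // MCAsmStreamer (textual output) or an MCObjectStreamer (binary output).
  static void registerR600AsmPrinter() {
    llvm::TargetRegistry::RegisterAsmPrinter(llvm::getTheAMDGPUTarget(),
                                             llvm::createR600AsmPrinterPass);
  }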
diff --git a/lib/Target/AMDGPU/R600ClauseMergePass.cpp b/lib/Target/AMDGPU/R600ClauseMergePass.cpp
index 5e1ba6b506da..0c62d6a4b3d9 100644
--- a/lib/Target/AMDGPU/R600ClauseMergePass.cpp
+++ b/lib/Target/AMDGPU/R600ClauseMergePass.cpp
@@ -19,6 +19,7 @@
#include "R600InstrInfo.h"
#include "R600MachineFunctionInfo.h"
#include "R600RegisterInfo.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
@@ -33,8 +34,8 @@ namespace {
static bool isCFAlu(const MachineInstr &MI) {
switch (MI.getOpcode()) {
- case AMDGPU::CF_ALU:
- case AMDGPU::CF_ALU_PUSH_BEFORE:
+ case R600::CF_ALU:
+ case R600::CF_ALU_PUSH_BEFORE:
return true;
default:
return false;
@@ -84,20 +85,20 @@ char &llvm::R600ClauseMergePassID = R600ClauseMergePass::ID;
unsigned R600ClauseMergePass::getCFAluSize(const MachineInstr &MI) const {
assert(isCFAlu(MI));
return MI
- .getOperand(TII->getOperandIdx(MI.getOpcode(), AMDGPU::OpName::COUNT))
+ .getOperand(TII->getOperandIdx(MI.getOpcode(), R600::OpName::COUNT))
.getImm();
}
bool R600ClauseMergePass::isCFAluEnabled(const MachineInstr &MI) const {
assert(isCFAlu(MI));
return MI
- .getOperand(TII->getOperandIdx(MI.getOpcode(), AMDGPU::OpName::Enabled))
+ .getOperand(TII->getOperandIdx(MI.getOpcode(), R600::OpName::Enabled))
.getImm();
}
void R600ClauseMergePass::cleanPotentialDisabledCFAlu(
MachineInstr &CFAlu) const {
- int CntIdx = TII->getOperandIdx(AMDGPU::CF_ALU, AMDGPU::OpName::COUNT);
+ int CntIdx = TII->getOperandIdx(R600::CF_ALU, R600::OpName::COUNT);
MachineBasicBlock::iterator I = CFAlu, E = CFAlu.getParent()->end();
I++;
do {
@@ -116,46 +117,46 @@ void R600ClauseMergePass::cleanPotentialDisabledCFAlu(
bool R600ClauseMergePass::mergeIfPossible(MachineInstr &RootCFAlu,
const MachineInstr &LatrCFAlu) const {
assert(isCFAlu(RootCFAlu) && isCFAlu(LatrCFAlu));
- int CntIdx = TII->getOperandIdx(AMDGPU::CF_ALU, AMDGPU::OpName::COUNT);
+ int CntIdx = TII->getOperandIdx(R600::CF_ALU, R600::OpName::COUNT);
unsigned RootInstCount = getCFAluSize(RootCFAlu),
LaterInstCount = getCFAluSize(LatrCFAlu);
unsigned CumuledInsts = RootInstCount + LaterInstCount;
if (CumuledInsts >= TII->getMaxAlusPerClause()) {
- DEBUG(dbgs() << "Excess inst counts\n");
+ LLVM_DEBUG(dbgs() << "Excess inst counts\n");
return false;
}
- if (RootCFAlu.getOpcode() == AMDGPU::CF_ALU_PUSH_BEFORE)
+ if (RootCFAlu.getOpcode() == R600::CF_ALU_PUSH_BEFORE)
return false;
// Is KCache Bank 0 compatible ?
int Mode0Idx =
- TII->getOperandIdx(AMDGPU::CF_ALU, AMDGPU::OpName::KCACHE_MODE0);
+ TII->getOperandIdx(R600::CF_ALU, R600::OpName::KCACHE_MODE0);
int KBank0Idx =
- TII->getOperandIdx(AMDGPU::CF_ALU, AMDGPU::OpName::KCACHE_BANK0);
+ TII->getOperandIdx(R600::CF_ALU, R600::OpName::KCACHE_BANK0);
int KBank0LineIdx =
- TII->getOperandIdx(AMDGPU::CF_ALU, AMDGPU::OpName::KCACHE_ADDR0);
+ TII->getOperandIdx(R600::CF_ALU, R600::OpName::KCACHE_ADDR0);
if (LatrCFAlu.getOperand(Mode0Idx).getImm() &&
RootCFAlu.getOperand(Mode0Idx).getImm() &&
(LatrCFAlu.getOperand(KBank0Idx).getImm() !=
RootCFAlu.getOperand(KBank0Idx).getImm() ||
LatrCFAlu.getOperand(KBank0LineIdx).getImm() !=
RootCFAlu.getOperand(KBank0LineIdx).getImm())) {
- DEBUG(dbgs() << "Wrong KC0\n");
+ LLVM_DEBUG(dbgs() << "Wrong KC0\n");
return false;
}
// Is KCache Bank 1 compatible ?
int Mode1Idx =
- TII->getOperandIdx(AMDGPU::CF_ALU, AMDGPU::OpName::KCACHE_MODE1);
+ TII->getOperandIdx(R600::CF_ALU, R600::OpName::KCACHE_MODE1);
int KBank1Idx =
- TII->getOperandIdx(AMDGPU::CF_ALU, AMDGPU::OpName::KCACHE_BANK1);
+ TII->getOperandIdx(R600::CF_ALU, R600::OpName::KCACHE_BANK1);
int KBank1LineIdx =
- TII->getOperandIdx(AMDGPU::CF_ALU, AMDGPU::OpName::KCACHE_ADDR1);
+ TII->getOperandIdx(R600::CF_ALU, R600::OpName::KCACHE_ADDR1);
if (LatrCFAlu.getOperand(Mode1Idx).getImm() &&
RootCFAlu.getOperand(Mode1Idx).getImm() &&
(LatrCFAlu.getOperand(KBank1Idx).getImm() !=
RootCFAlu.getOperand(KBank1Idx).getImm() ||
LatrCFAlu.getOperand(KBank1LineIdx).getImm() !=
RootCFAlu.getOperand(KBank1LineIdx).getImm())) {
- DEBUG(dbgs() << "Wrong KC0\n");
+    LLVM_DEBUG(dbgs() << "Wrong KC1\n");
return false;
}
if (LatrCFAlu.getOperand(Mode0Idx).getImm()) {
diff --git a/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp b/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp
index 0e788df1c9c0..a19020276f35 100644
--- a/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp
+++ b/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp
@@ -19,6 +19,7 @@
#include "R600InstrInfo.h"
#include "R600MachineFunctionInfo.h"
#include "R600RegisterInfo.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
@@ -93,7 +94,7 @@ bool CFStack::branchStackContains(CFStack::StackItem Item) {
}
bool CFStack::requiresWorkAroundForInst(unsigned Opcode) {
- if (Opcode == AMDGPU::CF_ALU_PUSH_BEFORE && ST->hasCaymanISA() &&
+ if (Opcode == R600::CF_ALU_PUSH_BEFORE && ST->hasCaymanISA() &&
getLoopDepth() > 1)
return true;
@@ -102,10 +103,10 @@ bool CFStack::requiresWorkAroundForInst(unsigned Opcode) {
switch(Opcode) {
default: return false;
- case AMDGPU::CF_ALU_PUSH_BEFORE:
- case AMDGPU::CF_ALU_ELSE_AFTER:
- case AMDGPU::CF_ALU_BREAK:
- case AMDGPU::CF_ALU_CONTINUE:
+ case R600::CF_ALU_PUSH_BEFORE:
+ case R600::CF_ALU_ELSE_AFTER:
+ case R600::CF_ALU_BREAK:
+ case R600::CF_ALU_CONTINUE:
if (CurrentSubEntries == 0)
return false;
if (ST->getWavefrontSize() == 64) {
@@ -136,7 +137,7 @@ unsigned CFStack::getSubEntrySize(CFStack::StackItem Item) {
return 0;
case CFStack::FIRST_NON_WQM_PUSH:
assert(!ST->hasCaymanISA());
- if (ST->getGeneration() <= R600Subtarget::R700) {
+ if (ST->getGeneration() <= AMDGPUSubtarget::R700) {
// +1 For the push operation.
// +2 Extra space required.
return 3;
@@ -149,7 +150,7 @@ unsigned CFStack::getSubEntrySize(CFStack::StackItem Item) {
return 2;
}
case CFStack::FIRST_NON_WQM_PUSH_W_FULL_ENTRY:
- assert(ST->getGeneration() >= R600Subtarget::EVERGREEN);
+ assert(ST->getGeneration() >= AMDGPUSubtarget::EVERGREEN);
// +1 For the push operation.
// +1 Extra space required.
return 2;
@@ -167,8 +168,8 @@ void CFStack::updateMaxStackSize() {
void CFStack::pushBranch(unsigned Opcode, bool isWQM) {
CFStack::StackItem Item = CFStack::ENTRY;
switch(Opcode) {
- case AMDGPU::CF_PUSH_EG:
- case AMDGPU::CF_ALU_PUSH_BEFORE:
+ case R600::CF_PUSH_EG:
+ case R600::CF_ALU_PUSH_BEFORE:
if (!isWQM) {
if (!ST->hasCaymanISA() &&
!branchStackContains(CFStack::FIRST_NON_WQM_PUSH))
@@ -176,7 +177,7 @@ void CFStack::pushBranch(unsigned Opcode, bool isWQM) {
// See comment in
// CFStack::getSubEntrySize()
else if (CurrentEntries > 0 &&
- ST->getGeneration() > R600Subtarget::EVERGREEN &&
+ ST->getGeneration() > AMDGPUSubtarget::EVERGREEN &&
!ST->hasCaymanISA() &&
!branchStackContains(CFStack::FIRST_NON_WQM_PUSH_W_FULL_ENTRY))
Item = CFStack::FIRST_NON_WQM_PUSH_W_FULL_ENTRY;
@@ -239,8 +240,8 @@ private:
bool IsTrivialInst(MachineInstr &MI) const {
switch (MI.getOpcode()) {
- case AMDGPU::KILL:
- case AMDGPU::RETURN:
+ case R600::KILL:
+ case R600::RETURN:
return true;
default:
return false;
@@ -249,44 +250,44 @@ private:
const MCInstrDesc &getHWInstrDesc(ControlFlowInstruction CFI) const {
unsigned Opcode = 0;
- bool isEg = (ST->getGeneration() >= R600Subtarget::EVERGREEN);
+ bool isEg = (ST->getGeneration() >= AMDGPUSubtarget::EVERGREEN);
switch (CFI) {
case CF_TC:
- Opcode = isEg ? AMDGPU::CF_TC_EG : AMDGPU::CF_TC_R600;
+ Opcode = isEg ? R600::CF_TC_EG : R600::CF_TC_R600;
break;
case CF_VC:
- Opcode = isEg ? AMDGPU::CF_VC_EG : AMDGPU::CF_VC_R600;
+ Opcode = isEg ? R600::CF_VC_EG : R600::CF_VC_R600;
break;
case CF_CALL_FS:
- Opcode = isEg ? AMDGPU::CF_CALL_FS_EG : AMDGPU::CF_CALL_FS_R600;
+ Opcode = isEg ? R600::CF_CALL_FS_EG : R600::CF_CALL_FS_R600;
break;
case CF_WHILE_LOOP:
- Opcode = isEg ? AMDGPU::WHILE_LOOP_EG : AMDGPU::WHILE_LOOP_R600;
+ Opcode = isEg ? R600::WHILE_LOOP_EG : R600::WHILE_LOOP_R600;
break;
case CF_END_LOOP:
- Opcode = isEg ? AMDGPU::END_LOOP_EG : AMDGPU::END_LOOP_R600;
+ Opcode = isEg ? R600::END_LOOP_EG : R600::END_LOOP_R600;
break;
case CF_LOOP_BREAK:
- Opcode = isEg ? AMDGPU::LOOP_BREAK_EG : AMDGPU::LOOP_BREAK_R600;
+ Opcode = isEg ? R600::LOOP_BREAK_EG : R600::LOOP_BREAK_R600;
break;
case CF_LOOP_CONTINUE:
- Opcode = isEg ? AMDGPU::CF_CONTINUE_EG : AMDGPU::CF_CONTINUE_R600;
+ Opcode = isEg ? R600::CF_CONTINUE_EG : R600::CF_CONTINUE_R600;
break;
case CF_JUMP:
- Opcode = isEg ? AMDGPU::CF_JUMP_EG : AMDGPU::CF_JUMP_R600;
+ Opcode = isEg ? R600::CF_JUMP_EG : R600::CF_JUMP_R600;
break;
case CF_ELSE:
- Opcode = isEg ? AMDGPU::CF_ELSE_EG : AMDGPU::CF_ELSE_R600;
+ Opcode = isEg ? R600::CF_ELSE_EG : R600::CF_ELSE_R600;
break;
case CF_POP:
- Opcode = isEg ? AMDGPU::POP_EG : AMDGPU::POP_R600;
+ Opcode = isEg ? R600::POP_EG : R600::POP_R600;
break;
case CF_END:
if (ST->hasCaymanISA()) {
- Opcode = AMDGPU::CF_END_CM;
+ Opcode = R600::CF_END_CM;
break;
}
- Opcode = isEg ? AMDGPU::CF_END_EG : AMDGPU::CF_END_R600;
+ Opcode = isEg ? R600::CF_END_EG : R600::CF_END_R600;
break;
}
assert (Opcode && "No opcode selected");
@@ -304,21 +305,21 @@ private:
continue;
if (MO.isDef()) {
unsigned Reg = MO.getReg();
- if (AMDGPU::R600_Reg128RegClass.contains(Reg))
+ if (R600::R600_Reg128RegClass.contains(Reg))
DstMI = Reg;
else
DstMI = TRI->getMatchingSuperReg(Reg,
- TRI->getSubRegFromChannel(TRI->getHWRegChan(Reg)),
- &AMDGPU::R600_Reg128RegClass);
+ AMDGPURegisterInfo::getSubRegFromChannel(TRI->getHWRegChan(Reg)),
+ &R600::R600_Reg128RegClass);
}
if (MO.isUse()) {
unsigned Reg = MO.getReg();
- if (AMDGPU::R600_Reg128RegClass.contains(Reg))
+ if (R600::R600_Reg128RegClass.contains(Reg))
SrcMI = Reg;
else
SrcMI = TRI->getMatchingSuperReg(Reg,
- TRI->getSubRegFromChannel(TRI->getHWRegChan(Reg)),
- &AMDGPU::R600_Reg128RegClass);
+ AMDGPURegisterInfo::getSubRegFromChannel(TRI->getHWRegChan(Reg)),
+ &R600::R600_Reg128RegClass);
}
}
if ((DstRegs.find(SrcMI) == DstRegs.end())) {
@@ -358,15 +359,15 @@ private:
void getLiteral(MachineInstr &MI, std::vector<MachineOperand *> &Lits) const {
static const unsigned LiteralRegs[] = {
- AMDGPU::ALU_LITERAL_X,
- AMDGPU::ALU_LITERAL_Y,
- AMDGPU::ALU_LITERAL_Z,
- AMDGPU::ALU_LITERAL_W
+ R600::ALU_LITERAL_X,
+ R600::ALU_LITERAL_Y,
+ R600::ALU_LITERAL_Z,
+ R600::ALU_LITERAL_W
};
const SmallVector<std::pair<MachineOperand *, int64_t>, 3> Srcs =
TII->getSrcs(MI);
for (const auto &Src:Srcs) {
- if (Src.first->getReg() != AMDGPU::ALU_LITERAL_X)
+ if (Src.first->getReg() != R600::ALU_LITERAL_X)
continue;
int64_t Imm = Src.second;
std::vector<MachineOperand *>::iterator It =
@@ -376,7 +377,7 @@ private:
// Get corresponding Operand
MachineOperand &Operand = MI.getOperand(
- TII->getOperandIdx(MI.getOpcode(), AMDGPU::OpName::literal));
+ TII->getOperandIdx(MI.getOpcode(), R600::OpName::literal));
if (It != Lits.end()) {
// Reuse existing literal reg
@@ -399,7 +400,7 @@ private:
unsigned LiteralPair0 = Literals[i];
unsigned LiteralPair1 = (i + 1 < e)?Literals[i + 1]:0;
InsertPos = BuildMI(MBB, InsertPos->getDebugLoc(),
- TII->get(AMDGPU::LITERALS))
+ TII->get(R600::LITERALS))
.addImm(LiteralPair0)
.addImm(LiteralPair1);
}
@@ -441,7 +442,7 @@ private:
}
for (unsigned i = 0, e = Literals.size(); i < e; i += 2) {
MachineInstrBuilder MILit = BuildMI(MBB, I, I->getDebugLoc(),
- TII->get(AMDGPU::LITERALS));
+ TII->get(R600::LITERALS));
if (Literals[i]->isImm()) {
MILit.addImm(Literals[i]->getImm());
} else {
@@ -470,7 +471,7 @@ private:
unsigned &CfCount) {
CounterPropagateAddr(*Clause.first, CfCount);
MachineBasicBlock *BB = Clause.first->getParent();
- BuildMI(BB, DL, TII->get(AMDGPU::FETCH_CLAUSE)).addImm(CfCount);
+ BuildMI(BB, DL, TII->get(R600::FETCH_CLAUSE)).addImm(CfCount);
for (unsigned i = 0, e = Clause.second.size(); i < e; ++i) {
BB->splice(InsertPos, BB, Clause.second[i]);
}
@@ -482,7 +483,7 @@ private:
Clause.first->getOperand(0).setImm(0);
CounterPropagateAddr(*Clause.first, CfCount);
MachineBasicBlock *BB = Clause.first->getParent();
- BuildMI(BB, DL, TII->get(AMDGPU::ALU_CLAUSE)).addImm(CfCount);
+ BuildMI(BB, DL, TII->get(R600::ALU_CLAUSE)).addImm(CfCount);
for (unsigned i = 0, e = Clause.second.size(); i < e; ++i) {
BB->splice(InsertPos, BB, Clause.second[i]);
}
@@ -531,7 +532,7 @@ public:
for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
I != E;) {
if (TII->usesTextureCache(*I) || TII->usesVertexCache(*I)) {
- DEBUG(dbgs() << CfCount << ":"; I->dump(););
+ LLVM_DEBUG(dbgs() << CfCount << ":"; I->dump(););
FetchClauses.push_back(MakeFetchClause(MBB, I));
CfCount++;
LastAlu.back() = nullptr;
@@ -539,33 +540,34 @@ public:
}
MachineBasicBlock::iterator MI = I;
- if (MI->getOpcode() != AMDGPU::ENDIF)
+ if (MI->getOpcode() != R600::ENDIF)
LastAlu.back() = nullptr;
- if (MI->getOpcode() == AMDGPU::CF_ALU)
+ if (MI->getOpcode() == R600::CF_ALU)
LastAlu.back() = &*MI;
I++;
bool RequiresWorkAround =
CFStack.requiresWorkAroundForInst(MI->getOpcode());
switch (MI->getOpcode()) {
- case AMDGPU::CF_ALU_PUSH_BEFORE:
+ case R600::CF_ALU_PUSH_BEFORE:
if (RequiresWorkAround) {
- DEBUG(dbgs() << "Applying bug work-around for ALU_PUSH_BEFORE\n");
- BuildMI(MBB, MI, MBB.findDebugLoc(MI), TII->get(AMDGPU::CF_PUSH_EG))
+ LLVM_DEBUG(dbgs()
+ << "Applying bug work-around for ALU_PUSH_BEFORE\n");
+ BuildMI(MBB, MI, MBB.findDebugLoc(MI), TII->get(R600::CF_PUSH_EG))
.addImm(CfCount + 1)
.addImm(1);
- MI->setDesc(TII->get(AMDGPU::CF_ALU));
+ MI->setDesc(TII->get(R600::CF_ALU));
CfCount++;
- CFStack.pushBranch(AMDGPU::CF_PUSH_EG);
+ CFStack.pushBranch(R600::CF_PUSH_EG);
} else
- CFStack.pushBranch(AMDGPU::CF_ALU_PUSH_BEFORE);
+ CFStack.pushBranch(R600::CF_ALU_PUSH_BEFORE);
LLVM_FALLTHROUGH;
- case AMDGPU::CF_ALU:
+ case R600::CF_ALU:
I = MI;
AluClauses.push_back(MakeALUClause(MBB, I));
- DEBUG(dbgs() << CfCount << ":"; MI->dump(););
+ LLVM_DEBUG(dbgs() << CfCount << ":"; MI->dump(););
CfCount++;
break;
- case AMDGPU::WHILELOOP: {
+ case R600::WHILELOOP: {
CFStack.pushLoop();
MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI),
getHWInstrDesc(CF_WHILE_LOOP))
@@ -578,7 +580,7 @@ public:
CfCount++;
break;
}
- case AMDGPU::ENDLOOP: {
+ case R600::ENDLOOP: {
CFStack.popLoop();
std::pair<unsigned, std::set<MachineInstr *>> Pair =
std::move(LoopStack.back());
@@ -590,19 +592,19 @@ public:
CfCount++;
break;
}
- case AMDGPU::IF_PREDICATE_SET: {
+ case R600::IF_PREDICATE_SET: {
LastAlu.push_back(nullptr);
MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI),
getHWInstrDesc(CF_JUMP))
.addImm(0)
.addImm(0);
IfThenElseStack.push_back(MIb);
- DEBUG(dbgs() << CfCount << ":"; MIb->dump(););
+ LLVM_DEBUG(dbgs() << CfCount << ":"; MIb->dump(););
MI->eraseFromParent();
CfCount++;
break;
}
- case AMDGPU::ELSE: {
+ case R600::ELSE: {
MachineInstr * JumpInst = IfThenElseStack.back();
IfThenElseStack.pop_back();
CounterPropagateAddr(*JumpInst, CfCount);
@@ -610,13 +612,13 @@ public:
getHWInstrDesc(CF_ELSE))
.addImm(0)
.addImm(0);
- DEBUG(dbgs() << CfCount << ":"; MIb->dump(););
+ LLVM_DEBUG(dbgs() << CfCount << ":"; MIb->dump(););
IfThenElseStack.push_back(MIb);
MI->eraseFromParent();
CfCount++;
break;
}
- case AMDGPU::ENDIF: {
+ case R600::ENDIF: {
CFStack.popBranch();
if (LastAlu.back()) {
ToPopAfter.push_back(LastAlu.back());
@@ -626,7 +628,7 @@ public:
.addImm(CfCount + 1)
.addImm(1);
(void)MIb;
- DEBUG(dbgs() << CfCount << ":"; MIb->dump(););
+ LLVM_DEBUG(dbgs() << CfCount << ":"; MIb->dump(););
CfCount++;
}
@@ -638,7 +640,7 @@ public:
MI->eraseFromParent();
break;
}
- case AMDGPU::BREAK: {
+ case R600::BREAK: {
CfCount ++;
MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI),
getHWInstrDesc(CF_LOOP_BREAK))
@@ -647,7 +649,7 @@ public:
MI->eraseFromParent();
break;
}
- case AMDGPU::CONTINUE: {
+ case R600::CONTINUE: {
MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI),
getHWInstrDesc(CF_LOOP_CONTINUE))
.addImm(0);
@@ -656,12 +658,12 @@ public:
CfCount++;
break;
}
- case AMDGPU::RETURN: {
+ case R600::RETURN: {
DebugLoc DL = MBB.findDebugLoc(MI);
BuildMI(MBB, MI, DL, getHWInstrDesc(CF_END));
CfCount++;
if (CfCount % 2) {
- BuildMI(MBB, I, DL, TII->get(AMDGPU::PAD));
+ BuildMI(MBB, I, DL, TII->get(R600::PAD));
CfCount++;
}
MI->eraseFromParent();
@@ -673,7 +675,7 @@ public:
}
default:
if (TII->isExport(MI->getOpcode())) {
- DEBUG(dbgs() << CfCount << ":"; MI->dump(););
+ LLVM_DEBUG(dbgs() << CfCount << ":"; MI->dump(););
CfCount++;
}
break;
@@ -682,7 +684,7 @@ public:
for (unsigned i = 0, e = ToPopAfter.size(); i < e; ++i) {
MachineInstr *Alu = ToPopAfter[i];
BuildMI(MBB, Alu, MBB.findDebugLoc((MachineBasicBlock::iterator)Alu),
- TII->get(AMDGPU::CF_ALU_POP_AFTER))
+ TII->get(R600::CF_ALU_POP_AFTER))
.addImm(Alu->getOperand(0).getImm())
.addImm(Alu->getOperand(1).getImm())
.addImm(Alu->getOperand(2).getImm())
diff --git a/lib/Target/AMDGPU/R600Defines.h b/lib/Target/AMDGPU/R600Defines.h
index 534461adc59f..0d33d82e8e0f 100644
--- a/lib/Target/AMDGPU/R600Defines.h
+++ b/lib/Target/AMDGPU/R600Defines.h
@@ -23,7 +23,7 @@
#define MO_FLAG_LAST (1 << 6)
#define NUM_MO_FLAGS 7
-/// \brief Helper for getting the operand index for the instruction flags
+/// Helper for getting the operand index for the instruction flags
/// operand.
#define GET_FLAG_OPERAND_IDX(Flags) (((Flags) >> 7) & 0x3)
@@ -52,7 +52,7 @@ namespace R600_InstFlag {
#define HAS_NATIVE_OPERANDS(Flags) ((Flags) & R600_InstFlag::NATIVE_OPERANDS)
-/// \brief Defines for extracting register information from register encoding
+/// Defines for extracting register information from register encoding
#define HW_REG_MASK 0x1ff
#define HW_CHAN_SHIFT 9
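[editor's note] HW_REG_MASK and HW_CHAN_SHIFT describe how an R600 register's hardware encoding packs the GPR index and the X/Y/Z/W channel; the real queries go through R600RegisterInfo::getHWRegIndex and getHWRegChan. The implied bit layout is roughly the following sketch, with the constants inlined; it is an illustration, not a drop-in helper.

  // Bit layout implied by the defines above: bits [8:0] hold the hardware
  // register index, the bits from bit 9 upward hold the channel.
  static inline unsigned hwRegIndexOf(unsigned Encoding) {
    return Encoding & 0x1ff;   // HW_REG_MASK
  }
  static inline unsigned hwRegChanOf(unsigned Encoding) {
    return Encoding >> 9;      // HW_CHAN_SHIFT
  }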
diff --git a/lib/Target/AMDGPU/R600EmitClauseMarkers.cpp b/lib/Target/AMDGPU/R600EmitClauseMarkers.cpp
index 0d8ccd088ec4..1683fe6c9a57 100644
--- a/lib/Target/AMDGPU/R600EmitClauseMarkers.cpp
+++ b/lib/Target/AMDGPU/R600EmitClauseMarkers.cpp
@@ -19,6 +19,7 @@
#include "R600Defines.h"
#include "R600InstrInfo.h"
#include "R600RegisterInfo.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
@@ -51,12 +52,12 @@ private:
unsigned OccupiedDwords(MachineInstr &MI) const {
switch (MI.getOpcode()) {
- case AMDGPU::INTERP_PAIR_XY:
- case AMDGPU::INTERP_PAIR_ZW:
- case AMDGPU::INTERP_VEC_LOAD:
- case AMDGPU::DOT_4:
+ case R600::INTERP_PAIR_XY:
+ case R600::INTERP_PAIR_ZW:
+ case R600::INTERP_VEC_LOAD:
+ case R600::DOT_4:
return 4;
- case AMDGPU::KILL:
+ case R600::KILL:
return 0;
default:
break;
@@ -76,7 +77,7 @@ private:
E = MI.operands_end();
It != E; ++It) {
MachineOperand &MO = *It;
- if (MO.isReg() && MO.getReg() == AMDGPU::ALU_LITERAL_X)
+ if (MO.isReg() && MO.getReg() == R600::ALU_LITERAL_X)
++NumLiteral;
}
return 1 + NumLiteral;
@@ -88,12 +89,12 @@ private:
if (TII->isVector(MI) || TII->isCubeOp(MI.getOpcode()))
return true;
switch (MI.getOpcode()) {
- case AMDGPU::PRED_X:
- case AMDGPU::INTERP_PAIR_XY:
- case AMDGPU::INTERP_PAIR_ZW:
- case AMDGPU::INTERP_VEC_LOAD:
- case AMDGPU::COPY:
- case AMDGPU::DOT_4:
+ case R600::PRED_X:
+ case R600::INTERP_PAIR_XY:
+ case R600::INTERP_PAIR_ZW:
+ case R600::INTERP_VEC_LOAD:
+ case R600::COPY:
+ case R600::DOT_4:
return true;
default:
return false;
@@ -102,9 +103,9 @@ private:
bool IsTrivialInst(MachineInstr &MI) const {
switch (MI.getOpcode()) {
- case AMDGPU::KILL:
- case AMDGPU::RETURN:
- case AMDGPU::IMPLICIT_DEF:
+ case R600::KILL:
+ case R600::RETURN:
+ case R600::IMPLICIT_DEF:
return true;
default:
return false;
@@ -131,16 +132,16 @@ private:
bool UpdateInstr = true) const {
std::vector<std::pair<unsigned, unsigned>> UsedKCache;
- if (!TII->isALUInstr(MI.getOpcode()) && MI.getOpcode() != AMDGPU::DOT_4)
+ if (!TII->isALUInstr(MI.getOpcode()) && MI.getOpcode() != R600::DOT_4)
return true;
const SmallVectorImpl<std::pair<MachineOperand *, int64_t>> &Consts =
TII->getSrcs(MI);
assert(
- (TII->isALUInstr(MI.getOpcode()) || MI.getOpcode() == AMDGPU::DOT_4) &&
+ (TII->isALUInstr(MI.getOpcode()) || MI.getOpcode() == R600::DOT_4) &&
"Can't assign Const");
for (unsigned i = 0, n = Consts.size(); i < n; ++i) {
- if (Consts[i].first->getReg() != AMDGPU::ALU_CONST)
+ if (Consts[i].first->getReg() != R600::ALU_CONST)
continue;
unsigned Sel = Consts[i].second;
unsigned Chan = Sel & 3, Index = ((Sel >> 2) - 512) & 31;
@@ -171,16 +172,16 @@ private:
return true;
for (unsigned i = 0, j = 0, n = Consts.size(); i < n; ++i) {
- if (Consts[i].first->getReg() != AMDGPU::ALU_CONST)
+ if (Consts[i].first->getReg() != R600::ALU_CONST)
continue;
switch(UsedKCache[j].first) {
case 0:
Consts[i].first->setReg(
- AMDGPU::R600_KC0RegClass.getRegister(UsedKCache[j].second));
+ R600::R600_KC0RegClass.getRegister(UsedKCache[j].second));
break;
case 1:
Consts[i].first->setReg(
- AMDGPU::R600_KC1RegClass.getRegister(UsedKCache[j].second));
+ R600::R600_KC1RegClass.getRegister(UsedKCache[j].second));
break;
default:
llvm_unreachable("Wrong Cache Line");
@@ -252,7 +253,7 @@ private:
break;
if (AluInstCount > TII->getMaxAlusPerClause())
break;
- if (I->getOpcode() == AMDGPU::PRED_X) {
+ if (I->getOpcode() == R600::PRED_X) {
// We put PRED_X in its own clause to ensure that ifcvt won't create
// clauses with more than 128 insts.
// IfCvt is indeed checking that "then" and "else" branches of an if
@@ -288,7 +289,7 @@ private:
AluInstCount += OccupiedDwords(*I);
}
unsigned Opcode = PushBeforeModifier ?
- AMDGPU::CF_ALU_PUSH_BEFORE : AMDGPU::CF_ALU;
+ R600::CF_ALU_PUSH_BEFORE : R600::CF_ALU;
BuildMI(MBB, ClauseHead, MBB.findDebugLoc(ClauseHead), TII->get(Opcode))
// We don't use the ADDR field until R600ControlFlowFinalizer pass, where
// it is safe to assume it is 0. However if we always put 0 here, the ifcvt
@@ -321,7 +322,7 @@ public:
BB != BB_E; ++BB) {
MachineBasicBlock &MBB = *BB;
MachineBasicBlock::iterator I = MBB.begin();
- if (I != MBB.end() && I->getOpcode() == AMDGPU::CF_ALU)
+ if (I != MBB.end() && I->getOpcode() == R600::CF_ALU)
continue; // BB was already parsed
for (MachineBasicBlock::iterator E = MBB.end(); I != E;) {
if (isALU(*I)) {
diff --git a/lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp b/lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp
index ffea231ee4d0..b924ff019dd1 100644
--- a/lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp
+++ b/lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp
@@ -21,6 +21,7 @@
#include "R600RegisterInfo.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
@@ -95,16 +96,16 @@ bool R600ExpandSpecialInstrsPass::runOnMachineFunction(MachineFunction &MF) {
// Expand LDS_*_RET instructions
if (TII->isLDSRetInstr(MI.getOpcode())) {
- int DstIdx = TII->getOperandIdx(MI.getOpcode(), AMDGPU::OpName::dst);
+ int DstIdx = TII->getOperandIdx(MI.getOpcode(), R600::OpName::dst);
assert(DstIdx != -1);
MachineOperand &DstOp = MI.getOperand(DstIdx);
MachineInstr *Mov = TII->buildMovInstr(&MBB, I,
- DstOp.getReg(), AMDGPU::OQAP);
- DstOp.setReg(AMDGPU::OQAP);
+ DstOp.getReg(), R600::OQAP);
+ DstOp.setReg(R600::OQAP);
int LDSPredSelIdx = TII->getOperandIdx(MI.getOpcode(),
- AMDGPU::OpName::pred_sel);
+ R600::OpName::pred_sel);
int MovPredSelIdx = TII->getOperandIdx(Mov->getOpcode(),
- AMDGPU::OpName::pred_sel);
+ R600::OpName::pred_sel);
// Copy the pred_sel bit
Mov->getOperand(MovPredSelIdx).setReg(
MI.getOperand(LDSPredSelIdx).getReg());
@@ -113,7 +114,7 @@ bool R600ExpandSpecialInstrsPass::runOnMachineFunction(MachineFunction &MF) {
switch (MI.getOpcode()) {
default: break;
// Expand PRED_X to one of the PRED_SET instructions.
- case AMDGPU::PRED_X: {
+ case R600::PRED_X: {
uint64_t Flags = MI.getOperand(3).getImm();
// The native opcode used by PRED_X is stored as an immediate in the
// third operand.
@@ -121,17 +122,18 @@ bool R600ExpandSpecialInstrsPass::runOnMachineFunction(MachineFunction &MF) {
MI.getOperand(2).getImm(), // opcode
MI.getOperand(0).getReg(), // dst
MI.getOperand(1).getReg(), // src0
- AMDGPU::ZERO); // src1
+ R600::ZERO); // src1
TII->addFlag(*PredSet, 0, MO_FLAG_MASK);
if (Flags & MO_FLAG_PUSH) {
- TII->setImmOperand(*PredSet, AMDGPU::OpName::update_exec_mask, 1);
+ TII->setImmOperand(*PredSet, R600::OpName::update_exec_mask, 1);
} else {
- TII->setImmOperand(*PredSet, AMDGPU::OpName::update_pred, 1);
+ TII->setImmOperand(*PredSet, R600::OpName::update_pred, 1);
}
MI.eraseFromParent();
continue;
}
- case AMDGPU::DOT_4: {
+ case R600::DOT_4: {
+
const R600RegisterInfo &TRI = TII->getRegisterInfo();
unsigned DstReg = MI.getOperand(0).getReg();
@@ -140,7 +142,7 @@ bool R600ExpandSpecialInstrsPass::runOnMachineFunction(MachineFunction &MF) {
for (unsigned Chan = 0; Chan < 4; ++Chan) {
bool Mask = (Chan != TRI.getHWRegChan(DstReg));
unsigned SubDstReg =
- AMDGPU::R600_TReg32RegClass.getRegister((DstBase * 4) + Chan);
+ R600::R600_TReg32RegClass.getRegister((DstBase * 4) + Chan);
MachineInstr *BMI =
TII->buildSlotOfVectorInstruction(MBB, &MI, Chan, SubDstReg);
if (Chan > 0) {
@@ -155,10 +157,10 @@ bool R600ExpandSpecialInstrsPass::runOnMachineFunction(MachineFunction &MF) {
// While not strictly necessary from hw point of view, we force
// all src operands of a dot4 inst to belong to the same slot.
unsigned Src0 = BMI->getOperand(
- TII->getOperandIdx(Opcode, AMDGPU::OpName::src0))
+ TII->getOperandIdx(Opcode, R600::OpName::src0))
.getReg();
unsigned Src1 = BMI->getOperand(
- TII->getOperandIdx(Opcode, AMDGPU::OpName::src1))
+ TII->getOperandIdx(Opcode, R600::OpName::src1))
.getReg();
(void) Src0;
(void) Src1;
@@ -205,26 +207,26 @@ bool R600ExpandSpecialInstrsPass::runOnMachineFunction(MachineFunction &MF) {
// T0_W = CUBE T1_Y, T1_Z
for (unsigned Chan = 0; Chan < 4; Chan++) {
unsigned DstReg = MI.getOperand(
- TII->getOperandIdx(MI, AMDGPU::OpName::dst)).getReg();
+ TII->getOperandIdx(MI, R600::OpName::dst)).getReg();
unsigned Src0 = MI.getOperand(
- TII->getOperandIdx(MI, AMDGPU::OpName::src0)).getReg();
+ TII->getOperandIdx(MI, R600::OpName::src0)).getReg();
unsigned Src1 = 0;
// Determine the correct source registers
if (!IsCube) {
- int Src1Idx = TII->getOperandIdx(MI, AMDGPU::OpName::src1);
+ int Src1Idx = TII->getOperandIdx(MI, R600::OpName::src1);
if (Src1Idx != -1) {
Src1 = MI.getOperand(Src1Idx).getReg();
}
}
if (IsReduction) {
- unsigned SubRegIndex = TRI.getSubRegFromChannel(Chan);
+ unsigned SubRegIndex = AMDGPURegisterInfo::getSubRegFromChannel(Chan);
Src0 = TRI.getSubReg(Src0, SubRegIndex);
Src1 = TRI.getSubReg(Src1, SubRegIndex);
} else if (IsCube) {
static const int CubeSrcSwz[] = {2, 2, 0, 1};
- unsigned SubRegIndex0 = TRI.getSubRegFromChannel(CubeSrcSwz[Chan]);
- unsigned SubRegIndex1 = TRI.getSubRegFromChannel(CubeSrcSwz[3 - Chan]);
+ unsigned SubRegIndex0 = AMDGPURegisterInfo::getSubRegFromChannel(CubeSrcSwz[Chan]);
+ unsigned SubRegIndex1 = AMDGPURegisterInfo::getSubRegFromChannel(CubeSrcSwz[3 - Chan]);
Src1 = TRI.getSubReg(Src0, SubRegIndex1);
Src0 = TRI.getSubReg(Src0, SubRegIndex0);
}
@@ -233,14 +235,14 @@ bool R600ExpandSpecialInstrsPass::runOnMachineFunction(MachineFunction &MF) {
bool Mask = false;
bool NotLast = true;
if (IsCube) {
- unsigned SubRegIndex = TRI.getSubRegFromChannel(Chan);
+ unsigned SubRegIndex = AMDGPURegisterInfo::getSubRegFromChannel(Chan);
DstReg = TRI.getSubReg(DstReg, SubRegIndex);
} else {
// Mask the write if the original instruction does not write to
// the current Channel.
Mask = (Chan != TRI.getHWRegChan(DstReg));
unsigned DstBase = TRI.getEncodingValue(DstReg) & HW_REG_MASK;
- DstReg = AMDGPU::R600_TReg32RegClass.getRegister((DstBase * 4) + Chan);
+ DstReg = R600::R600_TReg32RegClass.getRegister((DstBase * 4) + Chan);
}
// Set the IsLast bit
@@ -249,11 +251,11 @@ bool R600ExpandSpecialInstrsPass::runOnMachineFunction(MachineFunction &MF) {
// Add the new instruction
unsigned Opcode = MI.getOpcode();
switch (Opcode) {
- case AMDGPU::CUBE_r600_pseudo:
- Opcode = AMDGPU::CUBE_r600_real;
+ case R600::CUBE_r600_pseudo:
+ Opcode = R600::CUBE_r600_real;
break;
- case AMDGPU::CUBE_eg_pseudo:
- Opcode = AMDGPU::CUBE_eg_real;
+ case R600::CUBE_eg_pseudo:
+ Opcode = R600::CUBE_eg_real;
break;
default:
break;
@@ -270,12 +272,12 @@ bool R600ExpandSpecialInstrsPass::runOnMachineFunction(MachineFunction &MF) {
if (NotLast) {
TII->addFlag(*NewMI, 0, MO_FLAG_NOT_LAST);
}
- SetFlagInNewMI(NewMI, &MI, AMDGPU::OpName::clamp);
- SetFlagInNewMI(NewMI, &MI, AMDGPU::OpName::literal);
- SetFlagInNewMI(NewMI, &MI, AMDGPU::OpName::src0_abs);
- SetFlagInNewMI(NewMI, &MI, AMDGPU::OpName::src1_abs);
- SetFlagInNewMI(NewMI, &MI, AMDGPU::OpName::src0_neg);
- SetFlagInNewMI(NewMI, &MI, AMDGPU::OpName::src1_neg);
+ SetFlagInNewMI(NewMI, &MI, R600::OpName::clamp);
+ SetFlagInNewMI(NewMI, &MI, R600::OpName::literal);
+ SetFlagInNewMI(NewMI, &MI, R600::OpName::src0_abs);
+ SetFlagInNewMI(NewMI, &MI, R600::OpName::src1_abs);
+ SetFlagInNewMI(NewMI, &MI, R600::OpName::src0_neg);
+ SetFlagInNewMI(NewMI, &MI, R600::OpName::src1_neg);
}
MI.eraseFromParent();
}
diff --git a/lib/Target/AMDGPU/R600ISelLowering.cpp b/lib/Target/AMDGPU/R600ISelLowering.cpp
index 66291d0be4e6..113d6249fa60 100644
--- a/lib/Target/AMDGPU/R600ISelLowering.cpp
+++ b/lib/Target/AMDGPU/R600ISelLowering.cpp
@@ -8,18 +8,18 @@
//===----------------------------------------------------------------------===//
//
/// \file
-/// \brief Custom DAG lowering for R600
+/// Custom DAG lowering for R600
//
//===----------------------------------------------------------------------===//
#include "R600ISelLowering.h"
#include "AMDGPUFrameLowering.h"
-#include "AMDGPUIntrinsicInfo.h"
#include "AMDGPUSubtarget.h"
#include "R600Defines.h"
#include "R600FrameLowering.h"
#include "R600InstrInfo.h"
#include "R600MachineFunctionInfo.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/APFloat.h"
#include "llvm/ADT/APInt.h"
@@ -35,13 +35,13 @@
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/CodeGen/MachineValueType.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MachineValueType.h"
#include <cassert>
#include <cstdint>
#include <iterator>
@@ -50,17 +50,19 @@
using namespace llvm;
+#include "R600GenCallingConv.inc"
+
R600TargetLowering::R600TargetLowering(const TargetMachine &TM,
const R600Subtarget &STI)
- : AMDGPUTargetLowering(TM, STI), Gen(STI.getGeneration()) {
- addRegisterClass(MVT::f32, &AMDGPU::R600_Reg32RegClass);
- addRegisterClass(MVT::i32, &AMDGPU::R600_Reg32RegClass);
- addRegisterClass(MVT::v2f32, &AMDGPU::R600_Reg64RegClass);
- addRegisterClass(MVT::v2i32, &AMDGPU::R600_Reg64RegClass);
- addRegisterClass(MVT::v4f32, &AMDGPU::R600_Reg128RegClass);
- addRegisterClass(MVT::v4i32, &AMDGPU::R600_Reg128RegClass);
+ : AMDGPUTargetLowering(TM, STI), Subtarget(&STI), Gen(STI.getGeneration()) {
+ addRegisterClass(MVT::f32, &R600::R600_Reg32RegClass);
+ addRegisterClass(MVT::i32, &R600::R600_Reg32RegClass);
+ addRegisterClass(MVT::v2f32, &R600::R600_Reg64RegClass);
+ addRegisterClass(MVT::v2i32, &R600::R600_Reg64RegClass);
+ addRegisterClass(MVT::v4f32, &R600::R600_Reg128RegClass);
+ addRegisterClass(MVT::v4i32, &R600::R600_Reg128RegClass);
- computeRegisterProperties(STI.getRegisterInfo());
+ computeRegisterProperties(Subtarget->getRegisterInfo());
// Legalize loads and stores to the private address space.
setOperationAction(ISD::LOAD, MVT::i32, Custom);
@@ -147,6 +149,11 @@ R600TargetLowering::R600TargetLowering(const TargetMachine &TM,
setOperationAction(ISD::FSUB, MVT::f32, Expand);
+ setOperationAction(ISD::FCEIL, MVT::f64, Custom);
+ setOperationAction(ISD::FTRUNC, MVT::f64, Custom);
+ setOperationAction(ISD::FRINT, MVT::f64, Custom);
+ setOperationAction(ISD::FFLOOR, MVT::f64, Custom);
+
setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
@@ -216,6 +223,34 @@ R600TargetLowering::R600TargetLowering(const TargetMachine &TM,
setOperationAction(ISD::FMA, MVT::f64, Expand);
}
+ // FIXME: This was moved from AMDGPUTargetLowering, I'm not sure if we
+ // need it for R600.
+ if (!Subtarget->hasFP32Denormals())
+ setOperationAction(ISD::FMAD, MVT::f32, Legal);
+
+ if (!Subtarget->hasBFI()) {
+ // fcopysign can be done in a single instruction with BFI.
+ setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
+ setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
+ }
+
+ if (!Subtarget->hasBCNT(32))
+ setOperationAction(ISD::CTPOP, MVT::i32, Expand);
+
+ if (!Subtarget->hasBCNT(64))
+ setOperationAction(ISD::CTPOP, MVT::i64, Expand);
+
+ if (Subtarget->hasFFBH())
+ setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Custom);
+
+ if (Subtarget->hasFFBL())
+ setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Custom);
+
+ // FIXME: This was moved from AMDGPUTargetLowering, I'm not sure if we
+ // need it for R600.
+ if (Subtarget->hasBFE())
+ setHasExtractBitsInsn(true);
+
setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 };
@@ -245,14 +280,10 @@ R600TargetLowering::R600TargetLowering(const TargetMachine &TM,
setTargetDAGCombine(ISD::LOAD);
}
-const R600Subtarget *R600TargetLowering::getSubtarget() const {
- return static_cast<const R600Subtarget *>(Subtarget);
-}
-
static inline bool isEOP(MachineBasicBlock::iterator I) {
if (std::next(I) == I->getParent()->end())
return false;
- return std::next(I)->getOpcode() == AMDGPU::RETURN;
+ return std::next(I)->getOpcode() == R600::RETURN;
}
MachineBasicBlock *
@@ -261,24 +292,24 @@ R600TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
MachineFunction *MF = BB->getParent();
MachineRegisterInfo &MRI = MF->getRegInfo();
MachineBasicBlock::iterator I = MI;
- const R600InstrInfo *TII = getSubtarget()->getInstrInfo();
+ const R600InstrInfo *TII = Subtarget->getInstrInfo();
switch (MI.getOpcode()) {
default:
// Replace LDS_*_RET instruction that don't have any uses with the
// equivalent LDS_*_NORET instruction.
if (TII->isLDSRetInstr(MI.getOpcode())) {
- int DstIdx = TII->getOperandIdx(MI.getOpcode(), AMDGPU::OpName::dst);
+ int DstIdx = TII->getOperandIdx(MI.getOpcode(), R600::OpName::dst);
assert(DstIdx != -1);
MachineInstrBuilder NewMI;
// FIXME: getLDSNoRetOp method only handles LDS_1A1D LDS ops. Add
// LDS_1A2D support and remove this special case.
if (!MRI.use_empty(MI.getOperand(DstIdx).getReg()) ||
- MI.getOpcode() == AMDGPU::LDS_CMPST_RET)
+ MI.getOpcode() == R600::LDS_CMPST_RET)
return BB;
NewMI = BuildMI(*BB, I, BB->findDebugLoc(I),
- TII->get(AMDGPU::getLDSNoRetOp(MI.getOpcode())));
+ TII->get(R600::getLDSNoRetOp(MI.getOpcode())));
for (unsigned i = 1, e = MI.getNumOperands(); i < e; ++i) {
NewMI.add(MI.getOperand(i));
}
@@ -286,31 +317,24 @@ R600TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
}
break;
- case AMDGPU::CLAMP_R600: {
- MachineInstr *NewMI = TII->buildDefaultInstruction(
- *BB, I, AMDGPU::MOV, MI.getOperand(0).getReg(),
- MI.getOperand(1).getReg());
- TII->addFlag(*NewMI, 0, MO_FLAG_CLAMP);
- break;
- }
- case AMDGPU::FABS_R600: {
+ case R600::FABS_R600: {
MachineInstr *NewMI = TII->buildDefaultInstruction(
- *BB, I, AMDGPU::MOV, MI.getOperand(0).getReg(),
+ *BB, I, R600::MOV, MI.getOperand(0).getReg(),
MI.getOperand(1).getReg());
TII->addFlag(*NewMI, 0, MO_FLAG_ABS);
break;
}
- case AMDGPU::FNEG_R600: {
+ case R600::FNEG_R600: {
MachineInstr *NewMI = TII->buildDefaultInstruction(
- *BB, I, AMDGPU::MOV, MI.getOperand(0).getReg(),
+ *BB, I, R600::MOV, MI.getOperand(0).getReg(),
MI.getOperand(1).getReg());
TII->addFlag(*NewMI, 0, MO_FLAG_NEG);
break;
}
- case AMDGPU::MASK_WRITE: {
+ case R600::MASK_WRITE: {
unsigned maskedRegister = MI.getOperand(0).getReg();
assert(TargetRegisterInfo::isVirtualRegister(maskedRegister));
MachineInstr * defInstr = MRI.getVRegDef(maskedRegister);
@@ -318,7 +342,7 @@ R600TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
break;
}
- case AMDGPU::MOV_IMM_F32:
+ case R600::MOV_IMM_F32:
TII->buildMovImm(*BB, I, MI.getOperand(0).getReg(), MI.getOperand(1)
.getFPImm()
->getValueAPF()
@@ -326,39 +350,39 @@ R600TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
.getZExtValue());
break;
- case AMDGPU::MOV_IMM_I32:
+ case R600::MOV_IMM_I32:
TII->buildMovImm(*BB, I, MI.getOperand(0).getReg(),
MI.getOperand(1).getImm());
break;
- case AMDGPU::MOV_IMM_GLOBAL_ADDR: {
+ case R600::MOV_IMM_GLOBAL_ADDR: {
//TODO: Perhaps combine this instruction with the next if possible
auto MIB = TII->buildDefaultInstruction(
- *BB, MI, AMDGPU::MOV, MI.getOperand(0).getReg(), AMDGPU::ALU_LITERAL_X);
- int Idx = TII->getOperandIdx(*MIB, AMDGPU::OpName::literal);
+ *BB, MI, R600::MOV, MI.getOperand(0).getReg(), R600::ALU_LITERAL_X);
+ int Idx = TII->getOperandIdx(*MIB, R600::OpName::literal);
//TODO: Ugh this is rather ugly
MIB->getOperand(Idx) = MI.getOperand(1);
break;
}
- case AMDGPU::CONST_COPY: {
+ case R600::CONST_COPY: {
MachineInstr *NewMI = TII->buildDefaultInstruction(
- *BB, MI, AMDGPU::MOV, MI.getOperand(0).getReg(), AMDGPU::ALU_CONST);
- TII->setImmOperand(*NewMI, AMDGPU::OpName::src0_sel,
+ *BB, MI, R600::MOV, MI.getOperand(0).getReg(), R600::ALU_CONST);
+ TII->setImmOperand(*NewMI, R600::OpName::src0_sel,
MI.getOperand(1).getImm());
break;
}
- case AMDGPU::RAT_WRITE_CACHELESS_32_eg:
- case AMDGPU::RAT_WRITE_CACHELESS_64_eg:
- case AMDGPU::RAT_WRITE_CACHELESS_128_eg:
+ case R600::RAT_WRITE_CACHELESS_32_eg:
+ case R600::RAT_WRITE_CACHELESS_64_eg:
+ case R600::RAT_WRITE_CACHELESS_128_eg:
BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI.getOpcode()))
.add(MI.getOperand(0))
.add(MI.getOperand(1))
.addImm(isEOP(I)); // Set End of program bit
break;
- case AMDGPU::RAT_STORE_TYPED_eg:
+ case R600::RAT_STORE_TYPED_eg:
BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI.getOpcode()))
.add(MI.getOperand(0))
.add(MI.getOperand(1))
@@ -366,49 +390,49 @@ R600TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
.addImm(isEOP(I)); // Set End of program bit
break;
- case AMDGPU::BRANCH:
- BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP))
+ case R600::BRANCH:
+ BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(R600::JUMP))
.add(MI.getOperand(0));
break;
- case AMDGPU::BRANCH_COND_f32: {
+ case R600::BRANCH_COND_f32: {
MachineInstr *NewMI =
- BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X),
- AMDGPU::PREDICATE_BIT)
+ BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(R600::PRED_X),
+ R600::PREDICATE_BIT)
.add(MI.getOperand(1))
- .addImm(AMDGPU::PRED_SETNE)
+ .addImm(R600::PRED_SETNE)
.addImm(0); // Flags
TII->addFlag(*NewMI, 0, MO_FLAG_PUSH);
- BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND))
+ BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(R600::JUMP_COND))
.add(MI.getOperand(0))
- .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
+ .addReg(R600::PREDICATE_BIT, RegState::Kill);
break;
}
- case AMDGPU::BRANCH_COND_i32: {
+ case R600::BRANCH_COND_i32: {
MachineInstr *NewMI =
- BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X),
- AMDGPU::PREDICATE_BIT)
+ BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(R600::PRED_X),
+ R600::PREDICATE_BIT)
.add(MI.getOperand(1))
- .addImm(AMDGPU::PRED_SETNE_INT)
+ .addImm(R600::PRED_SETNE_INT)
.addImm(0); // Flags
TII->addFlag(*NewMI, 0, MO_FLAG_PUSH);
- BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND))
+ BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(R600::JUMP_COND))
.add(MI.getOperand(0))
- .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
+ .addReg(R600::PREDICATE_BIT, RegState::Kill);
break;
}
- case AMDGPU::EG_ExportSwz:
- case AMDGPU::R600_ExportSwz: {
+ case R600::EG_ExportSwz:
+ case R600::R600_ExportSwz: {
// Instruction is left unmodified if its not the last one of its type
bool isLastInstructionOfItsType = true;
unsigned InstExportType = MI.getOperand(1).getImm();
for (MachineBasicBlock::iterator NextExportInst = std::next(I),
EndBlock = BB->end(); NextExportInst != EndBlock;
NextExportInst = std::next(NextExportInst)) {
- if (NextExportInst->getOpcode() == AMDGPU::EG_ExportSwz ||
- NextExportInst->getOpcode() == AMDGPU::R600_ExportSwz) {
+ if (NextExportInst->getOpcode() == R600::EG_ExportSwz ||
+ NextExportInst->getOpcode() == R600::R600_ExportSwz) {
unsigned CurrentInstExportType = NextExportInst->getOperand(1)
.getImm();
if (CurrentInstExportType == InstExportType) {
@@ -420,7 +444,7 @@ R600TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
bool EOP = isEOP(I);
if (!EOP && !isLastInstructionOfItsType)
return BB;
- unsigned CfInst = (MI.getOpcode() == AMDGPU::EG_ExportSwz) ? 84 : 40;
+ unsigned CfInst = (MI.getOpcode() == R600::EG_ExportSwz) ? 84 : 40;
BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI.getOpcode()))
.add(MI.getOperand(0))
.add(MI.getOperand(1))
@@ -433,7 +457,7 @@ R600TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
.addImm(EOP);
break;
}
- case AMDGPU::RETURN: {
+ case R600::RETURN: {
return BB;
}
}
@@ -478,7 +502,7 @@ SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const
unsigned IntrinsicID =
cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
switch (IntrinsicID) {
- case AMDGPUIntrinsic::r600_store_swizzle: {
+ case Intrinsic::r600_store_swizzle: {
SDLoc DL(Op);
const SDValue Args[8] = {
Chain,
@@ -505,14 +529,14 @@ SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const
EVT VT = Op.getValueType();
SDLoc DL(Op);
switch (IntrinsicID) {
- case AMDGPUIntrinsic::r600_tex:
- case AMDGPUIntrinsic::r600_texc: {
+ case Intrinsic::r600_tex:
+ case Intrinsic::r600_texc: {
unsigned TextureOp;
switch (IntrinsicID) {
- case AMDGPUIntrinsic::r600_tex:
+ case Intrinsic::r600_tex:
TextureOp = 0;
break;
- case AMDGPUIntrinsic::r600_texc:
+ case Intrinsic::r600_texc:
TextureOp = 1;
break;
default:
@@ -542,7 +566,7 @@ SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const
};
return DAG.getNode(AMDGPUISD::TEXTURE_FETCH, DL, MVT::v4f32, TexArgs);
}
- case AMDGPUIntrinsic::r600_dot4: {
+ case Intrinsic::r600_dot4: {
SDValue Args[8] = {
DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
DAG.getConstant(0, DL, MVT::i32)),
@@ -566,7 +590,7 @@ SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const
case Intrinsic::r600_implicitarg_ptr: {
MVT PtrVT = getPointerTy(DAG.getDataLayout(), AMDGPUASI.PARAM_I_ADDRESS);
- uint32_t ByteOffset = getImplicitParameterOffset(MFI, FIRST_IMPLICIT);
+ uint32_t ByteOffset = getImplicitParameterOffset(MF, FIRST_IMPLICIT);
return DAG.getConstant(ByteOffset, DL, PtrVT);
}
case Intrinsic::r600_read_ngroups_x:
@@ -589,23 +613,23 @@ SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const
return LowerImplicitParameter(DAG, VT, DL, 8);
case Intrinsic::r600_read_tgid_x:
- return CreateLiveInRegisterRaw(DAG, &AMDGPU::R600_TReg32RegClass,
- AMDGPU::T1_X, VT);
+ return CreateLiveInRegisterRaw(DAG, &R600::R600_TReg32RegClass,
+ R600::T1_X, VT);
case Intrinsic::r600_read_tgid_y:
- return CreateLiveInRegisterRaw(DAG, &AMDGPU::R600_TReg32RegClass,
- AMDGPU::T1_Y, VT);
+ return CreateLiveInRegisterRaw(DAG, &R600::R600_TReg32RegClass,
+ R600::T1_Y, VT);
case Intrinsic::r600_read_tgid_z:
- return CreateLiveInRegisterRaw(DAG, &AMDGPU::R600_TReg32RegClass,
- AMDGPU::T1_Z, VT);
+ return CreateLiveInRegisterRaw(DAG, &R600::R600_TReg32RegClass,
+ R600::T1_Z, VT);
case Intrinsic::r600_read_tidig_x:
- return CreateLiveInRegisterRaw(DAG, &AMDGPU::R600_TReg32RegClass,
- AMDGPU::T0_X, VT);
+ return CreateLiveInRegisterRaw(DAG, &R600::R600_TReg32RegClass,
+ R600::T0_X, VT);
case Intrinsic::r600_read_tidig_y:
- return CreateLiveInRegisterRaw(DAG, &AMDGPU::R600_TReg32RegClass,
- AMDGPU::T0_Y, VT);
+ return CreateLiveInRegisterRaw(DAG, &R600::R600_TReg32RegClass,
+ R600::T0_Y, VT);
case Intrinsic::r600_read_tidig_z:
- return CreateLiveInRegisterRaw(DAG, &AMDGPU::R600_TReg32RegClass,
- AMDGPU::T0_Z, VT);
+ return CreateLiveInRegisterRaw(DAG, &R600::R600_TReg32RegClass,
+ R600::T0_Z, VT);
case Intrinsic::r600_recipsqrt_ieee:
return DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
@@ -755,7 +779,7 @@ SDValue R600TargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
SDValue TrigVal = DAG.getNode(TrigNode, DL, VT,
DAG.getNode(ISD::FADD, DL, VT, FractPart,
DAG.getConstantFP(-0.5, DL, MVT::f32)));
- if (Gen >= R600Subtarget::R700)
+ if (Gen >= AMDGPUSubtarget::R700)
return TrigVal;
// On R600 hw, COS/SIN input must be between -Pi and Pi.
return DAG.getNode(ISD::FMUL, DL, VT, TrigVal,
@@ -1527,7 +1551,7 @@ SDValue R600TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
SDValue R600TargetLowering::lowerFrameIndex(SDValue Op,
SelectionDAG &DAG) const {
MachineFunction &MF = DAG.getMachineFunction();
- const R600FrameLowering *TFL = getSubtarget()->getFrameLowering();
+ const R600FrameLowering *TFL = Subtarget->getFrameLowering();
FrameIndexSDNode *FIN = cast<FrameIndexSDNode>(Op);
@@ -1539,6 +1563,28 @@ SDValue R600TargetLowering::lowerFrameIndex(SDValue Op,
Op.getValueType());
}
+CCAssignFn *R600TargetLowering::CCAssignFnForCall(CallingConv::ID CC,
+ bool IsVarArg) const {
+ switch (CC) {
+ case CallingConv::AMDGPU_KERNEL:
+ case CallingConv::SPIR_KERNEL:
+ case CallingConv::C:
+ case CallingConv::Fast:
+ case CallingConv::Cold:
+ llvm_unreachable("kernels should not be handled here");
+ case CallingConv::AMDGPU_VS:
+ case CallingConv::AMDGPU_GS:
+ case CallingConv::AMDGPU_PS:
+ case CallingConv::AMDGPU_CS:
+ case CallingConv::AMDGPU_HS:
+ case CallingConv::AMDGPU_ES:
+ case CallingConv::AMDGPU_LS:
+ return CC_R600;
+ default:
+ report_fatal_error("Unsupported calling convention.");
+ }
+}
+
/// XXX Only kernel functions are supported, so we can assume for now that
/// every function is a kernel function, but in the future we should use
/// separate calling conventions for kernel and non-kernel functions.
@@ -1550,8 +1596,6 @@ SDValue R600TargetLowering::LowerFormalArguments(
CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
*DAG.getContext());
MachineFunction &MF = DAG.getMachineFunction();
- R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
-
SmallVector<ISD::InputArg, 8> LocalIns;
if (AMDGPU::isShader(CallConv)) {
@@ -1571,7 +1615,7 @@ SDValue R600TargetLowering::LowerFormalArguments(
}
if (AMDGPU::isShader(CallConv)) {
- unsigned Reg = MF.addLiveIn(VA.getLocReg(), &AMDGPU::R600_Reg128RegClass);
+ unsigned Reg = MF.addLiveIn(VA.getLocReg(), &R600::R600_Reg128RegClass);
SDValue Register = DAG.getCopyFromReg(Chain, DL, Reg, VT);
InVals.push_back(Register);
continue;
@@ -1602,19 +1646,18 @@ SDValue R600TargetLowering::LowerFormalArguments(
unsigned ValBase = ArgLocs[In.getOrigArgIndex()].getLocMemOffset();
unsigned PartOffset = VA.getLocMemOffset();
- unsigned Offset = Subtarget->getExplicitKernelArgOffset(MF) + VA.getLocMemOffset();
MachinePointerInfo PtrInfo(UndefValue::get(PtrTy), PartOffset - ValBase);
SDValue Arg = DAG.getLoad(
ISD::UNINDEXED, Ext, VT, DL, Chain,
- DAG.getConstant(Offset, DL, MVT::i32), DAG.getUNDEF(MVT::i32), PtrInfo,
+ DAG.getConstant(PartOffset, DL, MVT::i32), DAG.getUNDEF(MVT::i32),
+ PtrInfo,
MemVT, /* Alignment = */ 4, MachineMemOperand::MONonTemporal |
MachineMemOperand::MODereferenceable |
MachineMemOperand::MOInvariant);
// 4 is the preferred alignment for the CONSTANT memory space.
InVals.push_back(Arg);
- MFI->setABIArgOffset(Offset + MemVT.getStoreSize());
}
return Chain;
}
@@ -1989,26 +2032,26 @@ bool R600TargetLowering::FoldOperand(SDNode *ParentNode, unsigned SrcIdx,
SDValue &Src, SDValue &Neg, SDValue &Abs,
SDValue &Sel, SDValue &Imm,
SelectionDAG &DAG) const {
- const R600InstrInfo *TII = getSubtarget()->getInstrInfo();
+ const R600InstrInfo *TII = Subtarget->getInstrInfo();
if (!Src.isMachineOpcode())
return false;
switch (Src.getMachineOpcode()) {
- case AMDGPU::FNEG_R600:
+ case R600::FNEG_R600:
if (!Neg.getNode())
return false;
Src = Src.getOperand(0);
Neg = DAG.getTargetConstant(1, SDLoc(ParentNode), MVT::i32);
return true;
- case AMDGPU::FABS_R600:
+ case R600::FABS_R600:
if (!Abs.getNode())
return false;
Src = Src.getOperand(0);
Abs = DAG.getTargetConstant(1, SDLoc(ParentNode), MVT::i32);
return true;
- case AMDGPU::CONST_COPY: {
+ case R600::CONST_COPY: {
unsigned Opcode = ParentNode->getMachineOpcode();
- bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1;
+ bool HasDst = TII->getOperandIdx(Opcode, R600::OpName::dst) > -1;
if (!Sel.getNode())
return false;
@@ -2019,17 +2062,17 @@ bool R600TargetLowering::FoldOperand(SDNode *ParentNode, unsigned SrcIdx,
// Gather constants values
int SrcIndices[] = {
- TII->getOperandIdx(Opcode, AMDGPU::OpName::src0),
- TII->getOperandIdx(Opcode, AMDGPU::OpName::src1),
- TII->getOperandIdx(Opcode, AMDGPU::OpName::src2),
- TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_X),
- TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Y),
- TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Z),
- TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_W),
- TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_X),
- TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Y),
- TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Z),
- TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_W)
+ TII->getOperandIdx(Opcode, R600::OpName::src0),
+ TII->getOperandIdx(Opcode, R600::OpName::src1),
+ TII->getOperandIdx(Opcode, R600::OpName::src2),
+ TII->getOperandIdx(Opcode, R600::OpName::src0_X),
+ TII->getOperandIdx(Opcode, R600::OpName::src0_Y),
+ TII->getOperandIdx(Opcode, R600::OpName::src0_Z),
+ TII->getOperandIdx(Opcode, R600::OpName::src0_W),
+ TII->getOperandIdx(Opcode, R600::OpName::src1_X),
+ TII->getOperandIdx(Opcode, R600::OpName::src1_Y),
+ TII->getOperandIdx(Opcode, R600::OpName::src1_Z),
+ TII->getOperandIdx(Opcode, R600::OpName::src1_W)
};
std::vector<unsigned> Consts;
for (int OtherSrcIdx : SrcIndices) {
@@ -2042,7 +2085,7 @@ bool R600TargetLowering::FoldOperand(SDNode *ParentNode, unsigned SrcIdx,
}
if (RegisterSDNode *Reg =
dyn_cast<RegisterSDNode>(ParentNode->getOperand(OtherSrcIdx))) {
- if (Reg->getReg() == AMDGPU::ALU_CONST) {
+ if (Reg->getReg() == R600::ALU_CONST) {
ConstantSDNode *Cst
= cast<ConstantSDNode>(ParentNode->getOperand(OtherSelIdx));
Consts.push_back(Cst->getZExtValue());
@@ -2057,30 +2100,30 @@ bool R600TargetLowering::FoldOperand(SDNode *ParentNode, unsigned SrcIdx,
}
Sel = CstOffset;
- Src = DAG.getRegister(AMDGPU::ALU_CONST, MVT::f32);
+ Src = DAG.getRegister(R600::ALU_CONST, MVT::f32);
return true;
}
- case AMDGPU::MOV_IMM_GLOBAL_ADDR:
+ case R600::MOV_IMM_GLOBAL_ADDR:
// Check if the Imm slot is used. Taken from below.
if (cast<ConstantSDNode>(Imm)->getZExtValue())
return false;
Imm = Src.getOperand(0);
- Src = DAG.getRegister(AMDGPU::ALU_LITERAL_X, MVT::i32);
+ Src = DAG.getRegister(R600::ALU_LITERAL_X, MVT::i32);
return true;
- case AMDGPU::MOV_IMM_I32:
- case AMDGPU::MOV_IMM_F32: {
- unsigned ImmReg = AMDGPU::ALU_LITERAL_X;
+ case R600::MOV_IMM_I32:
+ case R600::MOV_IMM_F32: {
+ unsigned ImmReg = R600::ALU_LITERAL_X;
uint64_t ImmValue = 0;
- if (Src.getMachineOpcode() == AMDGPU::MOV_IMM_F32) {
+ if (Src.getMachineOpcode() == R600::MOV_IMM_F32) {
ConstantFPSDNode *FPC = dyn_cast<ConstantFPSDNode>(Src.getOperand(0));
float FloatValue = FPC->getValueAPF().convertToFloat();
if (FloatValue == 0.0) {
- ImmReg = AMDGPU::ZERO;
+ ImmReg = R600::ZERO;
} else if (FloatValue == 0.5) {
- ImmReg = AMDGPU::HALF;
+ ImmReg = R600::HALF;
} else if (FloatValue == 1.0) {
- ImmReg = AMDGPU::ONE;
+ ImmReg = R600::ONE;
} else {
ImmValue = FPC->getValueAPF().bitcastToAPInt().getZExtValue();
}
@@ -2088,9 +2131,9 @@ bool R600TargetLowering::FoldOperand(SDNode *ParentNode, unsigned SrcIdx,
ConstantSDNode *C = dyn_cast<ConstantSDNode>(Src.getOperand(0));
uint64_t Value = C->getZExtValue();
if (Value == 0) {
- ImmReg = AMDGPU::ZERO;
+ ImmReg = R600::ZERO;
} else if (Value == 1) {
- ImmReg = AMDGPU::ONE_INT;
+ ImmReg = R600::ONE_INT;
} else {
ImmValue = Value;
}
@@ -2099,7 +2142,7 @@ bool R600TargetLowering::FoldOperand(SDNode *ParentNode, unsigned SrcIdx,
// Check that we aren't already using an immediate.
// XXX: It's possible for an instruction to have more than one
// immediate operand, but this is not supported yet.
- if (ImmReg == AMDGPU::ALU_LITERAL_X) {
+ if (ImmReg == R600::ALU_LITERAL_X) {
if (!Imm.getNode())
return false;
ConstantSDNode *C = dyn_cast<ConstantSDNode>(Imm);
@@ -2116,10 +2159,10 @@ bool R600TargetLowering::FoldOperand(SDNode *ParentNode, unsigned SrcIdx,
}
}
-/// \brief Fold the instructions after selecting them
+/// Fold the instructions after selecting them
SDNode *R600TargetLowering::PostISelFolding(MachineSDNode *Node,
SelectionDAG &DAG) const {
- const R600InstrInfo *TII = getSubtarget()->getInstrInfo();
+ const R600InstrInfo *TII = Subtarget->getInstrInfo();
if (!Node->isMachineOpcode())
return Node;
@@ -2128,36 +2171,36 @@ SDNode *R600TargetLowering::PostISelFolding(MachineSDNode *Node,
std::vector<SDValue> Ops(Node->op_begin(), Node->op_end());
- if (Opcode == AMDGPU::DOT_4) {
+ if (Opcode == R600::DOT_4) {
int OperandIdx[] = {
- TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_X),
- TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Y),
- TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Z),
- TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_W),
- TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_X),
- TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Y),
- TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Z),
- TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_W)
+ TII->getOperandIdx(Opcode, R600::OpName::src0_X),
+ TII->getOperandIdx(Opcode, R600::OpName::src0_Y),
+ TII->getOperandIdx(Opcode, R600::OpName::src0_Z),
+ TII->getOperandIdx(Opcode, R600::OpName::src0_W),
+ TII->getOperandIdx(Opcode, R600::OpName::src1_X),
+ TII->getOperandIdx(Opcode, R600::OpName::src1_Y),
+ TII->getOperandIdx(Opcode, R600::OpName::src1_Z),
+ TII->getOperandIdx(Opcode, R600::OpName::src1_W)
};
int NegIdx[] = {
- TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_X),
- TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_Y),
- TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_Z),
- TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_W),
- TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_X),
- TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_Y),
- TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_Z),
- TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_W)
+ TII->getOperandIdx(Opcode, R600::OpName::src0_neg_X),
+ TII->getOperandIdx(Opcode, R600::OpName::src0_neg_Y),
+ TII->getOperandIdx(Opcode, R600::OpName::src0_neg_Z),
+ TII->getOperandIdx(Opcode, R600::OpName::src0_neg_W),
+ TII->getOperandIdx(Opcode, R600::OpName::src1_neg_X),
+ TII->getOperandIdx(Opcode, R600::OpName::src1_neg_Y),
+ TII->getOperandIdx(Opcode, R600::OpName::src1_neg_Z),
+ TII->getOperandIdx(Opcode, R600::OpName::src1_neg_W)
};
int AbsIdx[] = {
- TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_X),
- TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_Y),
- TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_Z),
- TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_W),
- TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_X),
- TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_Y),
- TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_Z),
- TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_W)
+ TII->getOperandIdx(Opcode, R600::OpName::src0_abs_X),
+ TII->getOperandIdx(Opcode, R600::OpName::src0_abs_Y),
+ TII->getOperandIdx(Opcode, R600::OpName::src0_abs_Z),
+ TII->getOperandIdx(Opcode, R600::OpName::src0_abs_W),
+ TII->getOperandIdx(Opcode, R600::OpName::src1_abs_X),
+ TII->getOperandIdx(Opcode, R600::OpName::src1_abs_Y),
+ TII->getOperandIdx(Opcode, R600::OpName::src1_abs_Z),
+ TII->getOperandIdx(Opcode, R600::OpName::src1_abs_W)
};
for (unsigned i = 0; i < 8; i++) {
if (OperandIdx[i] < 0)
@@ -2165,7 +2208,7 @@ SDNode *R600TargetLowering::PostISelFolding(MachineSDNode *Node,
SDValue &Src = Ops[OperandIdx[i] - 1];
SDValue &Neg = Ops[NegIdx[i] - 1];
SDValue &Abs = Ops[AbsIdx[i] - 1];
- bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1;
+ bool HasDst = TII->getOperandIdx(Opcode, R600::OpName::dst) > -1;
int SelIdx = TII->getSelIdx(Opcode, OperandIdx[i]);
if (HasDst)
SelIdx--;
@@ -2173,42 +2216,28 @@ SDNode *R600TargetLowering::PostISelFolding(MachineSDNode *Node,
if (FoldOperand(Node, i, Src, Neg, Abs, Sel, FakeOp, DAG))
return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
}
- } else if (Opcode == AMDGPU::REG_SEQUENCE) {
+ } else if (Opcode == R600::REG_SEQUENCE) {
for (unsigned i = 1, e = Node->getNumOperands(); i < e; i += 2) {
SDValue &Src = Ops[i];
if (FoldOperand(Node, i, Src, FakeOp, FakeOp, FakeOp, FakeOp, DAG))
return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
}
- } else if (Opcode == AMDGPU::CLAMP_R600) {
- SDValue Src = Node->getOperand(0);
- if (!Src.isMachineOpcode() ||
- !TII->hasInstrModifiers(Src.getMachineOpcode()))
- return Node;
- int ClampIdx = TII->getOperandIdx(Src.getMachineOpcode(),
- AMDGPU::OpName::clamp);
- if (ClampIdx < 0)
- return Node;
- SDLoc DL(Node);
- std::vector<SDValue> Ops(Src->op_begin(), Src->op_end());
- Ops[ClampIdx - 1] = DAG.getTargetConstant(1, DL, MVT::i32);
- return DAG.getMachineNode(Src.getMachineOpcode(), DL,
- Node->getVTList(), Ops);
} else {
if (!TII->hasInstrModifiers(Opcode))
return Node;
int OperandIdx[] = {
- TII->getOperandIdx(Opcode, AMDGPU::OpName::src0),
- TII->getOperandIdx(Opcode, AMDGPU::OpName::src1),
- TII->getOperandIdx(Opcode, AMDGPU::OpName::src2)
+ TII->getOperandIdx(Opcode, R600::OpName::src0),
+ TII->getOperandIdx(Opcode, R600::OpName::src1),
+ TII->getOperandIdx(Opcode, R600::OpName::src2)
};
int NegIdx[] = {
- TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg),
- TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg),
- TII->getOperandIdx(Opcode, AMDGPU::OpName::src2_neg)
+ TII->getOperandIdx(Opcode, R600::OpName::src0_neg),
+ TII->getOperandIdx(Opcode, R600::OpName::src1_neg),
+ TII->getOperandIdx(Opcode, R600::OpName::src2_neg)
};
int AbsIdx[] = {
- TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs),
- TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs),
+ TII->getOperandIdx(Opcode, R600::OpName::src0_abs),
+ TII->getOperandIdx(Opcode, R600::OpName::src1_abs),
-1
};
for (unsigned i = 0; i < 3; i++) {
@@ -2218,9 +2247,9 @@ SDNode *R600TargetLowering::PostISelFolding(MachineSDNode *Node,
SDValue &Neg = Ops[NegIdx[i] - 1];
SDValue FakeAbs;
SDValue &Abs = (AbsIdx[i] > -1) ? Ops[AbsIdx[i] - 1] : FakeAbs;
- bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1;
+ bool HasDst = TII->getOperandIdx(Opcode, R600::OpName::dst) > -1;
int SelIdx = TII->getSelIdx(Opcode, OperandIdx[i]);
- int ImmIdx = TII->getOperandIdx(Opcode, AMDGPU::OpName::literal);
+ int ImmIdx = TII->getOperandIdx(Opcode, R600::OpName::literal);
if (HasDst) {
SelIdx--;
ImmIdx--;
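
The FoldOperand hunks above only rename the opcode and register enums from AMDGPU:: to R600::; the immediate-folding logic is unchanged. As a minimal sketch of that logic (assuming the generated R600 register enums and LLVM's APFloat are available; the helper name selectInlineFPReg is illustrative, not part of the patch):

  #include "llvm/ADT/APFloat.h"
  using namespace llvm;

  // Map a float immediate to an R600 inline-constant register when possible,
  // otherwise fall back to ALU_LITERAL_X and pass the raw bits as a literal.
  static unsigned selectInlineFPReg(float F, uint64_t &Literal) {
    if (F == 0.0f) return R600::ZERO;   // inline 0.0
    if (F == 0.5f) return R600::HALF;   // inline 0.5
    if (F == 1.0f) return R600::ONE;    // inline 1.0
    Literal = APFloat(F).bitcastToAPInt().getZExtValue();
    return R600::ALU_LITERAL_X;         // literal slot carries the bits
  }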
diff --git a/lib/Target/AMDGPU/R600ISelLowering.h b/lib/Target/AMDGPU/R600ISelLowering.h
index 2a774693f02b..907d1f10e151 100644
--- a/lib/Target/AMDGPU/R600ISelLowering.h
+++ b/lib/Target/AMDGPU/R600ISelLowering.h
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
//
/// \file
-/// \brief R600 DAG Lowering interface definition
+/// R600 DAG Lowering interface definition
//
//===----------------------------------------------------------------------===//
@@ -23,6 +23,8 @@ class R600InstrInfo;
class R600Subtarget;
class R600TargetLowering final : public AMDGPUTargetLowering {
+
+ const R600Subtarget *Subtarget;
public:
R600TargetLowering(const TargetMachine &TM, const R600Subtarget &STI);
@@ -36,6 +38,7 @@ public:
void ReplaceNodeResults(SDNode * N,
SmallVectorImpl<SDValue> &Results,
SelectionDAG &DAG) const override;
+ CCAssignFn *CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg) const;
SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv,
bool isVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins,
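
The new CCAssignFnForCall hook is only declared here. A typical call site in LowerFormalArguments would look roughly like the following sketch, assuming the usual CCState analysis pattern (the surrounding CallConv, isVarArg, Ins and DAG come from LowerFormalArguments as in the hunks above):

  // Hypothetical call-site shape, not taken verbatim from this patch.
  SmallVector<CCValAssign, 16> ArgLocs;
  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
                 *DAG.getContext());
  // Pick the R600 argument-assignment function for this calling convention.
  CCInfo.AnalyzeFormalArguments(Ins, CCAssignFnForCall(CallConv, isVarArg));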
diff --git a/lib/Target/AMDGPU/R600InstrFormats.td b/lib/Target/AMDGPU/R600InstrFormats.td
index 61106ed42e64..687a9affa138 100644
--- a/lib/Target/AMDGPU/R600InstrFormats.td
+++ b/lib/Target/AMDGPU/R600InstrFormats.td
@@ -11,10 +11,10 @@
//
//===----------------------------------------------------------------------===//
-def isR600 : Predicate<"Subtarget->getGeneration() <= R600Subtarget::R700">;
+def isR600 : Predicate<"Subtarget->getGeneration() <= AMDGPUSubtarget::R700">;
def isR600toCayman : Predicate<
- "Subtarget->getGeneration() <= R600Subtarget::NORTHERN_ISLANDS">;
+ "Subtarget->getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS">;
class R600Pat<dag pattern, dag result> : AMDGPUPat<pattern, result> {
let SubtargetPredicate = isR600toCayman;
@@ -41,7 +41,7 @@ class InstR600 <dag outs, dag ins, string asm, list<dag> pattern,
bit LDS_1A2D = 0;
let SubtargetPredicate = isR600toCayman;
- let Namespace = "AMDGPU";
+ let Namespace = "R600";
let OutOperandList = outs;
let InOperandList = ins;
let AsmString = asm;
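
Setting Namespace = "R600" moves the TableGen-generated instruction enum out of the AMDGPU namespace, which is what the AMDGPU:: to R600:: renames elsewhere in this patch track. The generated R600GenInstrInfo.inc then has roughly this shape (illustrative only; the real enumerator values are assigned by TableGen):

  namespace llvm {
  namespace R600 {          // previously "namespace AMDGPU"
  enum {                    // placeholder values; TableGen assigns the real ones
    MOV = 1,
    MOV_IMM_F32 = 2,
    MOV_IMM_I32 = 3
  };
  } // end namespace R600
  } // end namespace llvm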
diff --git a/lib/Target/AMDGPU/R600InstrInfo.cpp b/lib/Target/AMDGPU/R600InstrInfo.cpp
index 23e646c8147c..5397e779474c 100644
--- a/lib/Target/AMDGPU/R600InstrInfo.cpp
+++ b/lib/Target/AMDGPU/R600InstrInfo.cpp
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
//
/// \file
-/// \brief R600 Implementation of TargetInstrInfo.
+/// R600 Implementation of TargetInstrInfo.
//
//===----------------------------------------------------------------------===//
@@ -19,6 +19,7 @@
#include "R600Defines.h"
#include "R600FrameLowering.h"
#include "R600RegisterInfo.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/SmallSet.h"
@@ -44,10 +45,15 @@
using namespace llvm;
#define GET_INSTRINFO_CTOR_DTOR
-#include "AMDGPUGenDFAPacketizer.inc"
+#include "R600GenDFAPacketizer.inc"
+
+#define GET_INSTRINFO_CTOR_DTOR
+#define GET_INSTRMAP_INFO
+#define GET_INSTRINFO_NAMED_OPS
+#include "R600GenInstrInfo.inc"
R600InstrInfo::R600InstrInfo(const R600Subtarget &ST)
- : AMDGPUInstrInfo(ST), RI(), ST(ST) {}
+ : R600GenInstrInfo(-1, -1), RI(), ST(ST) {}
bool R600InstrInfo::isVector(const MachineInstr &MI) const {
return get(MI.getOpcode()).TSFlags & R600_InstFlag::VECTOR;
@@ -58,31 +64,31 @@ void R600InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
const DebugLoc &DL, unsigned DestReg,
unsigned SrcReg, bool KillSrc) const {
unsigned VectorComponents = 0;
- if ((AMDGPU::R600_Reg128RegClass.contains(DestReg) ||
- AMDGPU::R600_Reg128VerticalRegClass.contains(DestReg)) &&
- (AMDGPU::R600_Reg128RegClass.contains(SrcReg) ||
- AMDGPU::R600_Reg128VerticalRegClass.contains(SrcReg))) {
+ if ((R600::R600_Reg128RegClass.contains(DestReg) ||
+ R600::R600_Reg128VerticalRegClass.contains(DestReg)) &&
+ (R600::R600_Reg128RegClass.contains(SrcReg) ||
+ R600::R600_Reg128VerticalRegClass.contains(SrcReg))) {
VectorComponents = 4;
- } else if((AMDGPU::R600_Reg64RegClass.contains(DestReg) ||
- AMDGPU::R600_Reg64VerticalRegClass.contains(DestReg)) &&
- (AMDGPU::R600_Reg64RegClass.contains(SrcReg) ||
- AMDGPU::R600_Reg64VerticalRegClass.contains(SrcReg))) {
+ } else if((R600::R600_Reg64RegClass.contains(DestReg) ||
+ R600::R600_Reg64VerticalRegClass.contains(DestReg)) &&
+ (R600::R600_Reg64RegClass.contains(SrcReg) ||
+ R600::R600_Reg64VerticalRegClass.contains(SrcReg))) {
VectorComponents = 2;
}
if (VectorComponents > 0) {
for (unsigned I = 0; I < VectorComponents; I++) {
- unsigned SubRegIndex = RI.getSubRegFromChannel(I);
- buildDefaultInstruction(MBB, MI, AMDGPU::MOV,
+ unsigned SubRegIndex = AMDGPURegisterInfo::getSubRegFromChannel(I);
+ buildDefaultInstruction(MBB, MI, R600::MOV,
RI.getSubReg(DestReg, SubRegIndex),
RI.getSubReg(SrcReg, SubRegIndex))
.addReg(DestReg,
RegState::Define | RegState::Implicit);
}
} else {
- MachineInstr *NewMI = buildDefaultInstruction(MBB, MI, AMDGPU::MOV,
+ MachineInstr *NewMI = buildDefaultInstruction(MBB, MI, R600::MOV,
DestReg, SrcReg);
- NewMI->getOperand(getOperandIdx(*NewMI, AMDGPU::OpName::src0))
+ NewMI->getOperand(getOperandIdx(*NewMI, R600::OpName::src0))
.setIsKill(KillSrc);
}
}
@@ -103,9 +109,9 @@ bool R600InstrInfo::isMov(unsigned Opcode) const {
switch(Opcode) {
default:
return false;
- case AMDGPU::MOV:
- case AMDGPU::MOV_IMM_F32:
- case AMDGPU::MOV_IMM_I32:
+ case R600::MOV:
+ case R600::MOV_IMM_F32:
+ case R600::MOV_IMM_I32:
return true;
}
}
@@ -117,10 +123,10 @@ bool R600InstrInfo::isReductionOp(unsigned Opcode) const {
bool R600InstrInfo::isCubeOp(unsigned Opcode) const {
switch(Opcode) {
default: return false;
- case AMDGPU::CUBE_r600_pseudo:
- case AMDGPU::CUBE_r600_real:
- case AMDGPU::CUBE_eg_pseudo:
- case AMDGPU::CUBE_eg_real:
+ case R600::CUBE_r600_pseudo:
+ case R600::CUBE_r600_real:
+ case R600::CUBE_eg_pseudo:
+ case R600::CUBE_eg_real:
return true;
}
}
@@ -148,7 +154,7 @@ bool R600InstrInfo::isLDSInstr(unsigned Opcode) const {
}
bool R600InstrInfo::isLDSRetInstr(unsigned Opcode) const {
- return isLDSInstr(Opcode) && getOperandIdx(Opcode, AMDGPU::OpName::dst) != -1;
+ return isLDSInstr(Opcode) && getOperandIdx(Opcode, R600::OpName::dst) != -1;
}
bool R600InstrInfo::canBeConsideredALU(const MachineInstr &MI) const {
@@ -157,12 +163,12 @@ bool R600InstrInfo::canBeConsideredALU(const MachineInstr &MI) const {
if (isVector(MI) || isCubeOp(MI.getOpcode()))
return true;
switch (MI.getOpcode()) {
- case AMDGPU::PRED_X:
- case AMDGPU::INTERP_PAIR_XY:
- case AMDGPU::INTERP_PAIR_ZW:
- case AMDGPU::INTERP_VEC_LOAD:
- case AMDGPU::COPY:
- case AMDGPU::DOT_4:
+ case R600::PRED_X:
+ case R600::INTERP_PAIR_XY:
+ case R600::INTERP_PAIR_ZW:
+ case R600::INTERP_VEC_LOAD:
+ case R600::COPY:
+ case R600::DOT_4:
return true;
default:
return false;
@@ -172,7 +178,7 @@ bool R600InstrInfo::canBeConsideredALU(const MachineInstr &MI) const {
bool R600InstrInfo::isTransOnly(unsigned Opcode) const {
if (ST.hasCaymanISA())
return false;
- return (get(Opcode).getSchedClass() == AMDGPU::Sched::TransALU);
+ return (get(Opcode).getSchedClass() == R600::Sched::TransALU);
}
bool R600InstrInfo::isTransOnly(const MachineInstr &MI) const {
@@ -180,7 +186,7 @@ bool R600InstrInfo::isTransOnly(const MachineInstr &MI) const {
}
bool R600InstrInfo::isVectorOnly(unsigned Opcode) const {
- return (get(Opcode).getSchedClass() == AMDGPU::Sched::VecALU);
+ return (get(Opcode).getSchedClass() == R600::Sched::VecALU);
}
bool R600InstrInfo::isVectorOnly(const MachineInstr &MI) const {
@@ -214,8 +220,8 @@ bool R600InstrInfo::usesTextureCache(const MachineInstr &MI) const {
bool R600InstrInfo::mustBeLastInClause(unsigned Opcode) const {
switch (Opcode) {
- case AMDGPU::KILLGT:
- case AMDGPU::GROUP_BARRIER:
+ case R600::KILLGT:
+ case R600::GROUP_BARRIER:
return true;
default:
return false;
@@ -223,11 +229,11 @@ bool R600InstrInfo::mustBeLastInClause(unsigned Opcode) const {
}
bool R600InstrInfo::usesAddressRegister(MachineInstr &MI) const {
- return MI.findRegisterUseOperandIdx(AMDGPU::AR_X) != -1;
+ return MI.findRegisterUseOperandIdx(R600::AR_X) != -1;
}
bool R600InstrInfo::definesAddressRegister(MachineInstr &MI) const {
- return MI.findRegisterDefOperandIdx(AMDGPU::AR_X) != -1;
+ return MI.findRegisterDefOperandIdx(R600::AR_X) != -1;
}
bool R600InstrInfo::readsLDSSrcReg(const MachineInstr &MI) const {
@@ -241,7 +247,7 @@ bool R600InstrInfo::readsLDSSrcReg(const MachineInstr &MI) const {
TargetRegisterInfo::isVirtualRegister(I->getReg()))
continue;
- if (AMDGPU::R600_LDS_SRC_REGRegClass.contains(I->getReg()))
+ if (R600::R600_LDS_SRC_REGRegClass.contains(I->getReg()))
return true;
}
return false;
@@ -249,17 +255,17 @@ bool R600InstrInfo::readsLDSSrcReg(const MachineInstr &MI) const {
int R600InstrInfo::getSelIdx(unsigned Opcode, unsigned SrcIdx) const {
static const unsigned SrcSelTable[][2] = {
- {AMDGPU::OpName::src0, AMDGPU::OpName::src0_sel},
- {AMDGPU::OpName::src1, AMDGPU::OpName::src1_sel},
- {AMDGPU::OpName::src2, AMDGPU::OpName::src2_sel},
- {AMDGPU::OpName::src0_X, AMDGPU::OpName::src0_sel_X},
- {AMDGPU::OpName::src0_Y, AMDGPU::OpName::src0_sel_Y},
- {AMDGPU::OpName::src0_Z, AMDGPU::OpName::src0_sel_Z},
- {AMDGPU::OpName::src0_W, AMDGPU::OpName::src0_sel_W},
- {AMDGPU::OpName::src1_X, AMDGPU::OpName::src1_sel_X},
- {AMDGPU::OpName::src1_Y, AMDGPU::OpName::src1_sel_Y},
- {AMDGPU::OpName::src1_Z, AMDGPU::OpName::src1_sel_Z},
- {AMDGPU::OpName::src1_W, AMDGPU::OpName::src1_sel_W}
+ {R600::OpName::src0, R600::OpName::src0_sel},
+ {R600::OpName::src1, R600::OpName::src1_sel},
+ {R600::OpName::src2, R600::OpName::src2_sel},
+ {R600::OpName::src0_X, R600::OpName::src0_sel_X},
+ {R600::OpName::src0_Y, R600::OpName::src0_sel_Y},
+ {R600::OpName::src0_Z, R600::OpName::src0_sel_Z},
+ {R600::OpName::src0_W, R600::OpName::src0_sel_W},
+ {R600::OpName::src1_X, R600::OpName::src1_sel_X},
+ {R600::OpName::src1_Y, R600::OpName::src1_sel_Y},
+ {R600::OpName::src1_Z, R600::OpName::src1_sel_Z},
+ {R600::OpName::src1_W, R600::OpName::src1_sel_W}
};
for (const auto &Row : SrcSelTable) {
@@ -274,23 +280,23 @@ SmallVector<std::pair<MachineOperand *, int64_t>, 3>
R600InstrInfo::getSrcs(MachineInstr &MI) const {
SmallVector<std::pair<MachineOperand *, int64_t>, 3> Result;
- if (MI.getOpcode() == AMDGPU::DOT_4) {
+ if (MI.getOpcode() == R600::DOT_4) {
static const unsigned OpTable[8][2] = {
- {AMDGPU::OpName::src0_X, AMDGPU::OpName::src0_sel_X},
- {AMDGPU::OpName::src0_Y, AMDGPU::OpName::src0_sel_Y},
- {AMDGPU::OpName::src0_Z, AMDGPU::OpName::src0_sel_Z},
- {AMDGPU::OpName::src0_W, AMDGPU::OpName::src0_sel_W},
- {AMDGPU::OpName::src1_X, AMDGPU::OpName::src1_sel_X},
- {AMDGPU::OpName::src1_Y, AMDGPU::OpName::src1_sel_Y},
- {AMDGPU::OpName::src1_Z, AMDGPU::OpName::src1_sel_Z},
- {AMDGPU::OpName::src1_W, AMDGPU::OpName::src1_sel_W},
+ {R600::OpName::src0_X, R600::OpName::src0_sel_X},
+ {R600::OpName::src0_Y, R600::OpName::src0_sel_Y},
+ {R600::OpName::src0_Z, R600::OpName::src0_sel_Z},
+ {R600::OpName::src0_W, R600::OpName::src0_sel_W},
+ {R600::OpName::src1_X, R600::OpName::src1_sel_X},
+ {R600::OpName::src1_Y, R600::OpName::src1_sel_Y},
+ {R600::OpName::src1_Z, R600::OpName::src1_sel_Z},
+ {R600::OpName::src1_W, R600::OpName::src1_sel_W},
};
for (unsigned j = 0; j < 8; j++) {
MachineOperand &MO =
MI.getOperand(getOperandIdx(MI.getOpcode(), OpTable[j][0]));
unsigned Reg = MO.getReg();
- if (Reg == AMDGPU::ALU_CONST) {
+ if (Reg == R600::ALU_CONST) {
MachineOperand &Sel =
MI.getOperand(getOperandIdx(MI.getOpcode(), OpTable[j][1]));
Result.push_back(std::make_pair(&MO, Sel.getImm()));
@@ -302,9 +308,9 @@ R600InstrInfo::getSrcs(MachineInstr &MI) const {
}
static const unsigned OpTable[3][2] = {
- {AMDGPU::OpName::src0, AMDGPU::OpName::src0_sel},
- {AMDGPU::OpName::src1, AMDGPU::OpName::src1_sel},
- {AMDGPU::OpName::src2, AMDGPU::OpName::src2_sel},
+ {R600::OpName::src0, R600::OpName::src0_sel},
+ {R600::OpName::src1, R600::OpName::src1_sel},
+ {R600::OpName::src2, R600::OpName::src2_sel},
};
for (unsigned j = 0; j < 3; j++) {
@@ -313,15 +319,15 @@ R600InstrInfo::getSrcs(MachineInstr &MI) const {
break;
MachineOperand &MO = MI.getOperand(SrcIdx);
unsigned Reg = MO.getReg();
- if (Reg == AMDGPU::ALU_CONST) {
+ if (Reg == R600::ALU_CONST) {
MachineOperand &Sel =
MI.getOperand(getOperandIdx(MI.getOpcode(), OpTable[j][1]));
Result.push_back(std::make_pair(&MO, Sel.getImm()));
continue;
}
- if (Reg == AMDGPU::ALU_LITERAL_X) {
+ if (Reg == R600::ALU_LITERAL_X) {
MachineOperand &Operand =
- MI.getOperand(getOperandIdx(MI.getOpcode(), AMDGPU::OpName::literal));
+ MI.getOperand(getOperandIdx(MI.getOpcode(), R600::OpName::literal));
if (Operand.isImm()) {
Result.push_back(std::make_pair(&MO, Operand.getImm()));
continue;
@@ -345,7 +351,7 @@ R600InstrInfo::ExtractSrcs(MachineInstr &MI,
++i;
unsigned Reg = Src.first->getReg();
int Index = RI.getEncodingValue(Reg) & 0xff;
- if (Reg == AMDGPU::OQAP) {
+ if (Reg == R600::OQAP) {
Result.push_back(std::make_pair(Index, 0U));
}
if (PV.find(Reg) != PV.end()) {
@@ -435,7 +441,7 @@ unsigned R600InstrInfo::isLegalUpTo(
const std::pair<int, unsigned> &Src = Srcs[j];
if (Src.first < 0 || Src.first == 255)
continue;
- if (Src.first == GET_REG_INDEX(RI.getEncodingValue(AMDGPU::OQAP))) {
+ if (Src.first == GET_REG_INDEX(RI.getEncodingValue(R600::OQAP))) {
if (Swz[i] != R600InstrInfo::ALU_VEC_012_SCL_210 &&
Swz[i] != R600InstrInfo::ALU_VEC_021_SCL_122) {
// The value from output queue A (denoted by register OQAP) can
@@ -541,7 +547,7 @@ R600InstrInfo::fitsReadPortLimitations(const std::vector<MachineInstr *> &IG,
for (unsigned i = 0, e = IG.size(); i < e; ++i) {
IGSrcs.push_back(ExtractSrcs(*IG[i], PV, ConstCount));
unsigned Op = getOperandIdx(IG[i]->getOpcode(),
- AMDGPU::OpName::bank_swizzle);
+ R600::OpName::bank_swizzle);
ValidSwizzle.push_back( (R600InstrInfo::BankSwizzle)
IG[i]->getOperand(Op).getImm());
}
@@ -610,14 +616,14 @@ R600InstrInfo::fitsConstReadLimitations(const std::vector<MachineInstr *> &MIs)
continue;
for (const auto &Src : getSrcs(MI)) {
- if (Src.first->getReg() == AMDGPU::ALU_LITERAL_X)
+ if (Src.first->getReg() == R600::ALU_LITERAL_X)
Literals.insert(Src.second);
if (Literals.size() > 4)
return false;
- if (Src.first->getReg() == AMDGPU::ALU_CONST)
+ if (Src.first->getReg() == R600::ALU_CONST)
Consts.push_back(Src.second);
- if (AMDGPU::R600_KC0RegClass.contains(Src.first->getReg()) ||
- AMDGPU::R600_KC1RegClass.contains(Src.first->getReg())) {
+ if (R600::R600_KC0RegClass.contains(Src.first->getReg()) ||
+ R600::R600_KC1RegClass.contains(Src.first->getReg())) {
unsigned Index = RI.getEncodingValue(Src.first->getReg()) & 0xff;
unsigned Chan = RI.getHWRegChan(Src.first->getReg());
Consts.push_back((Index << 2) | Chan);
@@ -636,7 +642,7 @@ R600InstrInfo::CreateTargetScheduleState(const TargetSubtargetInfo &STI) const {
static bool
isPredicateSetter(unsigned Opcode) {
switch (Opcode) {
- case AMDGPU::PRED_X:
+ case R600::PRED_X:
return true;
default:
return false;
@@ -658,12 +664,12 @@ findFirstPredicateSetterFrom(MachineBasicBlock &MBB,
static
bool isJump(unsigned Opcode) {
- return Opcode == AMDGPU::JUMP || Opcode == AMDGPU::JUMP_COND;
+ return Opcode == R600::JUMP || Opcode == R600::JUMP_COND;
}
static bool isBranch(unsigned Opcode) {
- return Opcode == AMDGPU::BRANCH || Opcode == AMDGPU::BRANCH_COND_i32 ||
- Opcode == AMDGPU::BRANCH_COND_f32;
+ return Opcode == R600::BRANCH || Opcode == R600::BRANCH_COND_i32 ||
+ Opcode == R600::BRANCH_COND_f32;
}
bool R600InstrInfo::analyzeBranch(MachineBasicBlock &MBB,
@@ -678,7 +684,7 @@ bool R600InstrInfo::analyzeBranch(MachineBasicBlock &MBB,
if (I == MBB.end())
return false;
- // AMDGPU::BRANCH* instructions are only available after isel and are not
+ // R600::BRANCH* instructions are only available after isel and are not
// handled
if (isBranch(I->getOpcode()))
return true;
@@ -687,7 +693,7 @@ bool R600InstrInfo::analyzeBranch(MachineBasicBlock &MBB,
}
// Remove successive JUMP
- while (I != MBB.begin() && std::prev(I)->getOpcode() == AMDGPU::JUMP) {
+ while (I != MBB.begin() && std::prev(I)->getOpcode() == R600::JUMP) {
MachineBasicBlock::iterator PriorI = std::prev(I);
if (AllowModify)
I->removeFromParent();
@@ -698,10 +704,10 @@ bool R600InstrInfo::analyzeBranch(MachineBasicBlock &MBB,
// If there is only one terminator instruction, process it.
unsigned LastOpc = LastInst.getOpcode();
if (I == MBB.begin() || !isJump((--I)->getOpcode())) {
- if (LastOpc == AMDGPU::JUMP) {
+ if (LastOpc == R600::JUMP) {
TBB = LastInst.getOperand(0).getMBB();
return false;
- } else if (LastOpc == AMDGPU::JUMP_COND) {
+ } else if (LastOpc == R600::JUMP_COND) {
auto predSet = I;
while (!isPredicateSetter(predSet->getOpcode())) {
predSet = --I;
@@ -709,7 +715,7 @@ bool R600InstrInfo::analyzeBranch(MachineBasicBlock &MBB,
TBB = LastInst.getOperand(0).getMBB();
Cond.push_back(predSet->getOperand(1));
Cond.push_back(predSet->getOperand(2));
- Cond.push_back(MachineOperand::CreateReg(AMDGPU::PRED_SEL_ONE, false));
+ Cond.push_back(MachineOperand::CreateReg(R600::PRED_SEL_ONE, false));
return false;
}
return true; // Can't handle indirect branch.
@@ -720,7 +726,7 @@ bool R600InstrInfo::analyzeBranch(MachineBasicBlock &MBB,
unsigned SecondLastOpc = SecondLastInst.getOpcode();
// If the block ends with a B and a Bcc, handle it.
- if (SecondLastOpc == AMDGPU::JUMP_COND && LastOpc == AMDGPU::JUMP) {
+ if (SecondLastOpc == R600::JUMP_COND && LastOpc == R600::JUMP) {
auto predSet = --I;
while (!isPredicateSetter(predSet->getOpcode())) {
predSet = --I;
@@ -729,7 +735,7 @@ bool R600InstrInfo::analyzeBranch(MachineBasicBlock &MBB,
FBB = LastInst.getOperand(0).getMBB();
Cond.push_back(predSet->getOperand(1));
Cond.push_back(predSet->getOperand(2));
- Cond.push_back(MachineOperand::CreateReg(AMDGPU::PRED_SEL_ONE, false));
+ Cond.push_back(MachineOperand::CreateReg(R600::PRED_SEL_ONE, false));
return false;
}
@@ -741,8 +747,8 @@ static
MachineBasicBlock::iterator FindLastAluClause(MachineBasicBlock &MBB) {
for (MachineBasicBlock::reverse_iterator It = MBB.rbegin(), E = MBB.rend();
It != E; ++It) {
- if (It->getOpcode() == AMDGPU::CF_ALU ||
- It->getOpcode() == AMDGPU::CF_ALU_PUSH_BEFORE)
+ if (It->getOpcode() == R600::CF_ALU ||
+ It->getOpcode() == R600::CF_ALU_PUSH_BEFORE)
return It.getReverse();
}
return MBB.end();
@@ -759,7 +765,7 @@ unsigned R600InstrInfo::insertBranch(MachineBasicBlock &MBB,
if (!FBB) {
if (Cond.empty()) {
- BuildMI(&MBB, DL, get(AMDGPU::JUMP)).addMBB(TBB);
+ BuildMI(&MBB, DL, get(R600::JUMP)).addMBB(TBB);
return 1;
} else {
MachineInstr *PredSet = findFirstPredicateSetterFrom(MBB, MBB.end());
@@ -767,14 +773,14 @@ unsigned R600InstrInfo::insertBranch(MachineBasicBlock &MBB,
addFlag(*PredSet, 0, MO_FLAG_PUSH);
PredSet->getOperand(2).setImm(Cond[1].getImm());
- BuildMI(&MBB, DL, get(AMDGPU::JUMP_COND))
+ BuildMI(&MBB, DL, get(R600::JUMP_COND))
.addMBB(TBB)
- .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
+ .addReg(R600::PREDICATE_BIT, RegState::Kill);
MachineBasicBlock::iterator CfAlu = FindLastAluClause(MBB);
if (CfAlu == MBB.end())
return 1;
- assert (CfAlu->getOpcode() == AMDGPU::CF_ALU);
- CfAlu->setDesc(get(AMDGPU::CF_ALU_PUSH_BEFORE));
+ assert (CfAlu->getOpcode() == R600::CF_ALU);
+ CfAlu->setDesc(get(R600::CF_ALU_PUSH_BEFORE));
return 1;
}
} else {
@@ -782,15 +788,15 @@ unsigned R600InstrInfo::insertBranch(MachineBasicBlock &MBB,
assert(PredSet && "No previous predicate !");
addFlag(*PredSet, 0, MO_FLAG_PUSH);
PredSet->getOperand(2).setImm(Cond[1].getImm());
- BuildMI(&MBB, DL, get(AMDGPU::JUMP_COND))
+ BuildMI(&MBB, DL, get(R600::JUMP_COND))
.addMBB(TBB)
- .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
- BuildMI(&MBB, DL, get(AMDGPU::JUMP)).addMBB(FBB);
+ .addReg(R600::PREDICATE_BIT, RegState::Kill);
+ BuildMI(&MBB, DL, get(R600::JUMP)).addMBB(FBB);
MachineBasicBlock::iterator CfAlu = FindLastAluClause(MBB);
if (CfAlu == MBB.end())
return 2;
- assert (CfAlu->getOpcode() == AMDGPU::CF_ALU);
- CfAlu->setDesc(get(AMDGPU::CF_ALU_PUSH_BEFORE));
+ assert (CfAlu->getOpcode() == R600::CF_ALU);
+ CfAlu->setDesc(get(R600::CF_ALU_PUSH_BEFORE));
return 2;
}
}
@@ -811,18 +817,18 @@ unsigned R600InstrInfo::removeBranch(MachineBasicBlock &MBB,
switch (I->getOpcode()) {
default:
return 0;
- case AMDGPU::JUMP_COND: {
+ case R600::JUMP_COND: {
MachineInstr *predSet = findFirstPredicateSetterFrom(MBB, I);
clearFlag(*predSet, 0, MO_FLAG_PUSH);
I->eraseFromParent();
MachineBasicBlock::iterator CfAlu = FindLastAluClause(MBB);
if (CfAlu == MBB.end())
break;
- assert (CfAlu->getOpcode() == AMDGPU::CF_ALU_PUSH_BEFORE);
- CfAlu->setDesc(get(AMDGPU::CF_ALU));
+ assert (CfAlu->getOpcode() == R600::CF_ALU_PUSH_BEFORE);
+ CfAlu->setDesc(get(R600::CF_ALU));
break;
}
- case AMDGPU::JUMP:
+ case R600::JUMP:
I->eraseFromParent();
break;
}
@@ -836,18 +842,18 @@ unsigned R600InstrInfo::removeBranch(MachineBasicBlock &MBB,
// FIXME: only one case??
default:
return 1;
- case AMDGPU::JUMP_COND: {
+ case R600::JUMP_COND: {
MachineInstr *predSet = findFirstPredicateSetterFrom(MBB, I);
clearFlag(*predSet, 0, MO_FLAG_PUSH);
I->eraseFromParent();
MachineBasicBlock::iterator CfAlu = FindLastAluClause(MBB);
if (CfAlu == MBB.end())
break;
- assert (CfAlu->getOpcode() == AMDGPU::CF_ALU_PUSH_BEFORE);
- CfAlu->setDesc(get(AMDGPU::CF_ALU));
+ assert (CfAlu->getOpcode() == R600::CF_ALU_PUSH_BEFORE);
+ CfAlu->setDesc(get(R600::CF_ALU));
break;
}
- case AMDGPU::JUMP:
+ case R600::JUMP:
I->eraseFromParent();
break;
}
@@ -862,9 +868,9 @@ bool R600InstrInfo::isPredicated(const MachineInstr &MI) const {
unsigned Reg = MI.getOperand(idx).getReg();
switch (Reg) {
default: return false;
- case AMDGPU::PRED_SEL_ONE:
- case AMDGPU::PRED_SEL_ZERO:
- case AMDGPU::PREDICATE_BIT:
+ case R600::PRED_SEL_ONE:
+ case R600::PRED_SEL_ZERO:
+ case R600::PREDICATE_BIT:
return true;
}
}
@@ -875,9 +881,9 @@ bool R600InstrInfo::isPredicable(const MachineInstr &MI) const {
// be predicated. Until we have proper support for instruction clauses in the
// backend, we will mark KILL* instructions as unpredicable.
- if (MI.getOpcode() == AMDGPU::KILLGT) {
+ if (MI.getOpcode() == R600::KILLGT) {
return false;
- } else if (MI.getOpcode() == AMDGPU::CF_ALU) {
+ } else if (MI.getOpcode() == R600::CF_ALU) {
// If the clause starts in the middle of MBB then the MBB has more
// than a single clause, unable to predicate several clauses.
if (MI.getParent()->begin() != MachineBasicBlock::const_iterator(MI))
@@ -887,7 +893,7 @@ bool R600InstrInfo::isPredicable(const MachineInstr &MI) const {
} else if (isVector(MI)) {
return false;
} else {
- return AMDGPUInstrInfo::isPredicable(MI);
+ return TargetInstrInfo::isPredicable(MI);
}
}
@@ -928,17 +934,17 @@ bool
R600InstrInfo::reverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const {
MachineOperand &MO = Cond[1];
switch (MO.getImm()) {
- case AMDGPU::PRED_SETE_INT:
- MO.setImm(AMDGPU::PRED_SETNE_INT);
+ case R600::PRED_SETE_INT:
+ MO.setImm(R600::PRED_SETNE_INT);
break;
- case AMDGPU::PRED_SETNE_INT:
- MO.setImm(AMDGPU::PRED_SETE_INT);
+ case R600::PRED_SETNE_INT:
+ MO.setImm(R600::PRED_SETE_INT);
break;
- case AMDGPU::PRED_SETE:
- MO.setImm(AMDGPU::PRED_SETNE);
+ case R600::PRED_SETE:
+ MO.setImm(R600::PRED_SETNE);
break;
- case AMDGPU::PRED_SETNE:
- MO.setImm(AMDGPU::PRED_SETE);
+ case R600::PRED_SETNE:
+ MO.setImm(R600::PRED_SETE);
break;
default:
return true;
@@ -946,11 +952,11 @@ R600InstrInfo::reverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) con
MachineOperand &MO2 = Cond[2];
switch (MO2.getReg()) {
- case AMDGPU::PRED_SEL_ZERO:
- MO2.setReg(AMDGPU::PRED_SEL_ONE);
+ case R600::PRED_SEL_ZERO:
+ MO2.setReg(R600::PRED_SEL_ONE);
break;
- case AMDGPU::PRED_SEL_ONE:
- MO2.setReg(AMDGPU::PRED_SEL_ZERO);
+ case R600::PRED_SEL_ONE:
+ MO2.setReg(R600::PRED_SEL_ZERO);
break;
default:
return true;
@@ -967,22 +973,22 @@ bool R600InstrInfo::PredicateInstruction(MachineInstr &MI,
ArrayRef<MachineOperand> Pred) const {
int PIdx = MI.findFirstPredOperandIdx();
- if (MI.getOpcode() == AMDGPU::CF_ALU) {
+ if (MI.getOpcode() == R600::CF_ALU) {
MI.getOperand(8).setImm(0);
return true;
}
- if (MI.getOpcode() == AMDGPU::DOT_4) {
- MI.getOperand(getOperandIdx(MI, AMDGPU::OpName::pred_sel_X))
+ if (MI.getOpcode() == R600::DOT_4) {
+ MI.getOperand(getOperandIdx(MI, R600::OpName::pred_sel_X))
.setReg(Pred[2].getReg());
- MI.getOperand(getOperandIdx(MI, AMDGPU::OpName::pred_sel_Y))
+ MI.getOperand(getOperandIdx(MI, R600::OpName::pred_sel_Y))
.setReg(Pred[2].getReg());
- MI.getOperand(getOperandIdx(MI, AMDGPU::OpName::pred_sel_Z))
+ MI.getOperand(getOperandIdx(MI, R600::OpName::pred_sel_Z))
.setReg(Pred[2].getReg());
- MI.getOperand(getOperandIdx(MI, AMDGPU::OpName::pred_sel_W))
+ MI.getOperand(getOperandIdx(MI, R600::OpName::pred_sel_W))
.setReg(Pred[2].getReg());
MachineInstrBuilder MIB(*MI.getParent()->getParent(), MI);
- MIB.addReg(AMDGPU::PREDICATE_BIT, RegState::Implicit);
+ MIB.addReg(R600::PREDICATE_BIT, RegState::Implicit);
return true;
}
@@ -990,7 +996,7 @@ bool R600InstrInfo::PredicateInstruction(MachineInstr &MI,
MachineOperand &PMO = MI.getOperand(PIdx);
PMO.setReg(Pred[2].getReg());
MachineInstrBuilder MIB(*MI.getParent()->getParent(), MI);
- MIB.addReg(AMDGPU::PREDICATE_BIT, RegState::Implicit);
+ MIB.addReg(R600::PREDICATE_BIT, RegState::Implicit);
return true;
}
@@ -1020,20 +1026,20 @@ bool R600InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
default: {
MachineBasicBlock *MBB = MI.getParent();
int OffsetOpIdx =
- AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::addr);
+ R600::getNamedOperandIdx(MI.getOpcode(), R600::OpName::addr);
// addr is a custom operand with multiple MI operands, and only the
// first MI operand is given a name.
int RegOpIdx = OffsetOpIdx + 1;
int ChanOpIdx =
- AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::chan);
+ R600::getNamedOperandIdx(MI.getOpcode(), R600::OpName::chan);
if (isRegisterLoad(MI)) {
int DstOpIdx =
- AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::dst);
+ R600::getNamedOperandIdx(MI.getOpcode(), R600::OpName::dst);
unsigned RegIndex = MI.getOperand(RegOpIdx).getImm();
unsigned Channel = MI.getOperand(ChanOpIdx).getImm();
unsigned Address = calculateIndirectAddress(RegIndex, Channel);
unsigned OffsetReg = MI.getOperand(OffsetOpIdx).getReg();
- if (OffsetReg == AMDGPU::INDIRECT_BASE_ADDR) {
+ if (OffsetReg == R600::INDIRECT_BASE_ADDR) {
buildMovInstr(MBB, MI, MI.getOperand(DstOpIdx).getReg(),
getIndirectAddrRegClass()->getRegister(Address));
} else {
@@ -1042,12 +1048,12 @@ bool R600InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
}
} else if (isRegisterStore(MI)) {
int ValOpIdx =
- AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::val);
+ R600::getNamedOperandIdx(MI.getOpcode(), R600::OpName::val);
unsigned RegIndex = MI.getOperand(RegOpIdx).getImm();
unsigned Channel = MI.getOperand(ChanOpIdx).getImm();
unsigned Address = calculateIndirectAddress(RegIndex, Channel);
unsigned OffsetReg = MI.getOperand(OffsetOpIdx).getReg();
- if (OffsetReg == AMDGPU::INDIRECT_BASE_ADDR) {
+ if (OffsetReg == R600::INDIRECT_BASE_ADDR) {
buildMovInstr(MBB, MI, getIndirectAddrRegClass()->getRegister(Address),
MI.getOperand(ValOpIdx).getReg());
} else {
@@ -1062,15 +1068,15 @@ bool R600InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
MBB->erase(MI);
return true;
}
- case AMDGPU::R600_EXTRACT_ELT_V2:
- case AMDGPU::R600_EXTRACT_ELT_V4:
+ case R600::R600_EXTRACT_ELT_V2:
+ case R600::R600_EXTRACT_ELT_V4:
buildIndirectRead(MI.getParent(), MI, MI.getOperand(0).getReg(),
RI.getHWRegIndex(MI.getOperand(1).getReg()), // Address
MI.getOperand(2).getReg(),
RI.getHWRegChan(MI.getOperand(1).getReg()));
break;
- case AMDGPU::R600_INSERT_ELT_V2:
- case AMDGPU::R600_INSERT_ELT_V4:
+ case R600::R600_INSERT_ELT_V2:
+ case R600::R600_INSERT_ELT_V4:
buildIndirectWrite(MI.getParent(), MI, MI.getOperand(2).getReg(), // Value
RI.getHWRegIndex(MI.getOperand(1).getReg()), // Address
MI.getOperand(3).getReg(), // Offset
@@ -1082,7 +1088,8 @@ bool R600InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
}
void R600InstrInfo::reserveIndirectRegisters(BitVector &Reserved,
- const MachineFunction &MF) const {
+ const MachineFunction &MF,
+ const R600RegisterInfo &TRI) const {
const R600Subtarget &ST = MF.getSubtarget<R600Subtarget>();
const R600FrameLowering *TFL = ST.getFrameLowering();
@@ -1093,17 +1100,15 @@ void R600InstrInfo::reserveIndirectRegisters(BitVector &Reserved,
return;
for (int Index = getIndirectIndexBegin(MF); Index <= End; ++Index) {
- unsigned SuperReg = AMDGPU::R600_Reg128RegClass.getRegister(Index);
- Reserved.set(SuperReg);
for (unsigned Chan = 0; Chan < StackWidth; ++Chan) {
- unsigned Reg = AMDGPU::R600_TReg32RegClass.getRegister((4 * Index) + Chan);
- Reserved.set(Reg);
+ unsigned Reg = R600::R600_TReg32RegClass.getRegister((4 * Index) + Chan);
+ TRI.reserveRegisterTuples(Reserved, Reg);
}
}
}
const TargetRegisterClass *R600InstrInfo::getIndirectAddrRegClass() const {
- return &AMDGPU::R600_TReg32_XRegClass;
+ return &R600::R600_TReg32_XRegClass;
}
MachineInstrBuilder R600InstrInfo::buildIndirectWrite(MachineBasicBlock *MBB,
@@ -1121,20 +1126,20 @@ MachineInstrBuilder R600InstrInfo::buildIndirectWrite(MachineBasicBlock *MBB,
unsigned AddrReg;
switch (AddrChan) {
default: llvm_unreachable("Invalid Channel");
- case 0: AddrReg = AMDGPU::R600_AddrRegClass.getRegister(Address); break;
- case 1: AddrReg = AMDGPU::R600_Addr_YRegClass.getRegister(Address); break;
- case 2: AddrReg = AMDGPU::R600_Addr_ZRegClass.getRegister(Address); break;
- case 3: AddrReg = AMDGPU::R600_Addr_WRegClass.getRegister(Address); break;
+ case 0: AddrReg = R600::R600_AddrRegClass.getRegister(Address); break;
+ case 1: AddrReg = R600::R600_Addr_YRegClass.getRegister(Address); break;
+ case 2: AddrReg = R600::R600_Addr_ZRegClass.getRegister(Address); break;
+ case 3: AddrReg = R600::R600_Addr_WRegClass.getRegister(Address); break;
}
- MachineInstr *MOVA = buildDefaultInstruction(*MBB, I, AMDGPU::MOVA_INT_eg,
- AMDGPU::AR_X, OffsetReg);
- setImmOperand(*MOVA, AMDGPU::OpName::write, 0);
+ MachineInstr *MOVA = buildDefaultInstruction(*MBB, I, R600::MOVA_INT_eg,
+ R600::AR_X, OffsetReg);
+ setImmOperand(*MOVA, R600::OpName::write, 0);
- MachineInstrBuilder Mov = buildDefaultInstruction(*MBB, I, AMDGPU::MOV,
+ MachineInstrBuilder Mov = buildDefaultInstruction(*MBB, I, R600::MOV,
AddrReg, ValueReg)
- .addReg(AMDGPU::AR_X,
+ .addReg(R600::AR_X,
RegState::Implicit | RegState::Kill);
- setImmOperand(*Mov, AMDGPU::OpName::dst_rel, 1);
+ setImmOperand(*Mov, R600::OpName::dst_rel, 1);
return Mov;
}
@@ -1153,21 +1158,21 @@ MachineInstrBuilder R600InstrInfo::buildIndirectRead(MachineBasicBlock *MBB,
unsigned AddrReg;
switch (AddrChan) {
default: llvm_unreachable("Invalid Channel");
- case 0: AddrReg = AMDGPU::R600_AddrRegClass.getRegister(Address); break;
- case 1: AddrReg = AMDGPU::R600_Addr_YRegClass.getRegister(Address); break;
- case 2: AddrReg = AMDGPU::R600_Addr_ZRegClass.getRegister(Address); break;
- case 3: AddrReg = AMDGPU::R600_Addr_WRegClass.getRegister(Address); break;
+ case 0: AddrReg = R600::R600_AddrRegClass.getRegister(Address); break;
+ case 1: AddrReg = R600::R600_Addr_YRegClass.getRegister(Address); break;
+ case 2: AddrReg = R600::R600_Addr_ZRegClass.getRegister(Address); break;
+ case 3: AddrReg = R600::R600_Addr_WRegClass.getRegister(Address); break;
}
- MachineInstr *MOVA = buildDefaultInstruction(*MBB, I, AMDGPU::MOVA_INT_eg,
- AMDGPU::AR_X,
+ MachineInstr *MOVA = buildDefaultInstruction(*MBB, I, R600::MOVA_INT_eg,
+ R600::AR_X,
OffsetReg);
- setImmOperand(*MOVA, AMDGPU::OpName::write, 0);
- MachineInstrBuilder Mov = buildDefaultInstruction(*MBB, I, AMDGPU::MOV,
+ setImmOperand(*MOVA, R600::OpName::write, 0);
+ MachineInstrBuilder Mov = buildDefaultInstruction(*MBB, I, R600::MOV,
ValueReg,
AddrReg)
- .addReg(AMDGPU::AR_X,
+ .addReg(R600::AR_X,
RegState::Implicit | RegState::Kill);
- setImmOperand(*Mov, AMDGPU::OpName::src0_rel, 1);
+ setImmOperand(*Mov, R600::OpName::src0_rel, 1);
return Mov;
}
@@ -1265,7 +1270,7 @@ MachineInstrBuilder R600InstrInfo::buildDefaultInstruction(MachineBasicBlock &MB
//XXX: The r600g finalizer expects this to be 1, once we've moved the
//scheduling to the backend, we can change the default to 0.
MIB.addImm(1) // $last
- .addReg(AMDGPU::PRED_SEL_OFF) // $pred_sel
+ .addReg(R600::PRED_SEL_OFF) // $pred_sel
.addImm(0) // $literal
.addImm(0); // $bank_swizzle
@@ -1286,23 +1291,23 @@ MachineInstrBuilder R600InstrInfo::buildDefaultInstruction(MachineBasicBlock &MB
static unsigned getSlotedOps(unsigned Op, unsigned Slot) {
switch (Op) {
- OPERAND_CASE(AMDGPU::OpName::update_exec_mask)
- OPERAND_CASE(AMDGPU::OpName::update_pred)
- OPERAND_CASE(AMDGPU::OpName::write)
- OPERAND_CASE(AMDGPU::OpName::omod)
- OPERAND_CASE(AMDGPU::OpName::dst_rel)
- OPERAND_CASE(AMDGPU::OpName::clamp)
- OPERAND_CASE(AMDGPU::OpName::src0)
- OPERAND_CASE(AMDGPU::OpName::src0_neg)
- OPERAND_CASE(AMDGPU::OpName::src0_rel)
- OPERAND_CASE(AMDGPU::OpName::src0_abs)
- OPERAND_CASE(AMDGPU::OpName::src0_sel)
- OPERAND_CASE(AMDGPU::OpName::src1)
- OPERAND_CASE(AMDGPU::OpName::src1_neg)
- OPERAND_CASE(AMDGPU::OpName::src1_rel)
- OPERAND_CASE(AMDGPU::OpName::src1_abs)
- OPERAND_CASE(AMDGPU::OpName::src1_sel)
- OPERAND_CASE(AMDGPU::OpName::pred_sel)
+ OPERAND_CASE(R600::OpName::update_exec_mask)
+ OPERAND_CASE(R600::OpName::update_pred)
+ OPERAND_CASE(R600::OpName::write)
+ OPERAND_CASE(R600::OpName::omod)
+ OPERAND_CASE(R600::OpName::dst_rel)
+ OPERAND_CASE(R600::OpName::clamp)
+ OPERAND_CASE(R600::OpName::src0)
+ OPERAND_CASE(R600::OpName::src0_neg)
+ OPERAND_CASE(R600::OpName::src0_rel)
+ OPERAND_CASE(R600::OpName::src0_abs)
+ OPERAND_CASE(R600::OpName::src0_sel)
+ OPERAND_CASE(R600::OpName::src1)
+ OPERAND_CASE(R600::OpName::src1_neg)
+ OPERAND_CASE(R600::OpName::src1_rel)
+ OPERAND_CASE(R600::OpName::src1_abs)
+ OPERAND_CASE(R600::OpName::src1_sel)
+ OPERAND_CASE(R600::OpName::pred_sel)
default:
llvm_unreachable("Wrong Operand");
}
@@ -1313,39 +1318,39 @@ static unsigned getSlotedOps(unsigned Op, unsigned Slot) {
MachineInstr *R600InstrInfo::buildSlotOfVectorInstruction(
MachineBasicBlock &MBB, MachineInstr *MI, unsigned Slot, unsigned DstReg)
const {
- assert (MI->getOpcode() == AMDGPU::DOT_4 && "Not Implemented");
+ assert (MI->getOpcode() == R600::DOT_4 && "Not Implemented");
unsigned Opcode;
- if (ST.getGeneration() <= R600Subtarget::R700)
- Opcode = AMDGPU::DOT4_r600;
+ if (ST.getGeneration() <= AMDGPUSubtarget::R700)
+ Opcode = R600::DOT4_r600;
else
- Opcode = AMDGPU::DOT4_eg;
+ Opcode = R600::DOT4_eg;
MachineBasicBlock::iterator I = MI;
MachineOperand &Src0 = MI->getOperand(
- getOperandIdx(MI->getOpcode(), getSlotedOps(AMDGPU::OpName::src0, Slot)));
+ getOperandIdx(MI->getOpcode(), getSlotedOps(R600::OpName::src0, Slot)));
MachineOperand &Src1 = MI->getOperand(
- getOperandIdx(MI->getOpcode(), getSlotedOps(AMDGPU::OpName::src1, Slot)));
+ getOperandIdx(MI->getOpcode(), getSlotedOps(R600::OpName::src1, Slot)));
MachineInstr *MIB = buildDefaultInstruction(
MBB, I, Opcode, DstReg, Src0.getReg(), Src1.getReg());
static const unsigned Operands[14] = {
- AMDGPU::OpName::update_exec_mask,
- AMDGPU::OpName::update_pred,
- AMDGPU::OpName::write,
- AMDGPU::OpName::omod,
- AMDGPU::OpName::dst_rel,
- AMDGPU::OpName::clamp,
- AMDGPU::OpName::src0_neg,
- AMDGPU::OpName::src0_rel,
- AMDGPU::OpName::src0_abs,
- AMDGPU::OpName::src0_sel,
- AMDGPU::OpName::src1_neg,
- AMDGPU::OpName::src1_rel,
- AMDGPU::OpName::src1_abs,
- AMDGPU::OpName::src1_sel,
+ R600::OpName::update_exec_mask,
+ R600::OpName::update_pred,
+ R600::OpName::write,
+ R600::OpName::omod,
+ R600::OpName::dst_rel,
+ R600::OpName::clamp,
+ R600::OpName::src0_neg,
+ R600::OpName::src0_rel,
+ R600::OpName::src0_abs,
+ R600::OpName::src0_sel,
+ R600::OpName::src1_neg,
+ R600::OpName::src1_rel,
+ R600::OpName::src1_abs,
+ R600::OpName::src1_sel,
};
MachineOperand &MO = MI->getOperand(getOperandIdx(MI->getOpcode(),
- getSlotedOps(AMDGPU::OpName::pred_sel, Slot)));
- MIB->getOperand(getOperandIdx(Opcode, AMDGPU::OpName::pred_sel))
+ getSlotedOps(R600::OpName::pred_sel, Slot)));
+ MIB->getOperand(getOperandIdx(Opcode, R600::OpName::pred_sel))
.setReg(MO.getReg());
for (unsigned i = 0; i < 14; i++) {
@@ -1362,16 +1367,16 @@ MachineInstr *R600InstrInfo::buildMovImm(MachineBasicBlock &BB,
MachineBasicBlock::iterator I,
unsigned DstReg,
uint64_t Imm) const {
- MachineInstr *MovImm = buildDefaultInstruction(BB, I, AMDGPU::MOV, DstReg,
- AMDGPU::ALU_LITERAL_X);
- setImmOperand(*MovImm, AMDGPU::OpName::literal, Imm);
+ MachineInstr *MovImm = buildDefaultInstruction(BB, I, R600::MOV, DstReg,
+ R600::ALU_LITERAL_X);
+ setImmOperand(*MovImm, R600::OpName::literal, Imm);
return MovImm;
}
MachineInstr *R600InstrInfo::buildMovInstr(MachineBasicBlock *MBB,
MachineBasicBlock::iterator I,
unsigned DstReg, unsigned SrcReg) const {
- return buildDefaultInstruction(*MBB, I, AMDGPU::MOV, DstReg, SrcReg);
+ return buildDefaultInstruction(*MBB, I, R600::MOV, DstReg, SrcReg);
}
int R600InstrInfo::getOperandIdx(const MachineInstr &MI, unsigned Op) const {
@@ -1379,7 +1384,7 @@ int R600InstrInfo::getOperandIdx(const MachineInstr &MI, unsigned Op) const {
}
int R600InstrInfo::getOperandIdx(unsigned Opcode, unsigned Op) const {
- return AMDGPU::getNamedOperandIdx(Opcode, Op);
+ return R600::getNamedOperandIdx(Opcode, Op);
}
void R600InstrInfo::setImmOperand(MachineInstr &MI, unsigned Op,
@@ -1406,25 +1411,25 @@ MachineOperand &R600InstrInfo::getFlagOp(MachineInstr &MI, unsigned SrcIdx,
bool IsOP3 = (TargetFlags & R600_InstFlag::OP3) == R600_InstFlag::OP3;
switch (Flag) {
case MO_FLAG_CLAMP:
- FlagIndex = getOperandIdx(MI, AMDGPU::OpName::clamp);
+ FlagIndex = getOperandIdx(MI, R600::OpName::clamp);
break;
case MO_FLAG_MASK:
- FlagIndex = getOperandIdx(MI, AMDGPU::OpName::write);
+ FlagIndex = getOperandIdx(MI, R600::OpName::write);
break;
case MO_FLAG_NOT_LAST:
case MO_FLAG_LAST:
- FlagIndex = getOperandIdx(MI, AMDGPU::OpName::last);
+ FlagIndex = getOperandIdx(MI, R600::OpName::last);
break;
case MO_FLAG_NEG:
switch (SrcIdx) {
case 0:
- FlagIndex = getOperandIdx(MI, AMDGPU::OpName::src0_neg);
+ FlagIndex = getOperandIdx(MI, R600::OpName::src0_neg);
break;
case 1:
- FlagIndex = getOperandIdx(MI, AMDGPU::OpName::src1_neg);
+ FlagIndex = getOperandIdx(MI, R600::OpName::src1_neg);
break;
case 2:
- FlagIndex = getOperandIdx(MI, AMDGPU::OpName::src2_neg);
+ FlagIndex = getOperandIdx(MI, R600::OpName::src2_neg);
break;
}
break;
@@ -1435,10 +1440,10 @@ MachineOperand &R600InstrInfo::getFlagOp(MachineInstr &MI, unsigned SrcIdx,
(void)IsOP3;
switch (SrcIdx) {
case 0:
- FlagIndex = getOperandIdx(MI, AMDGPU::OpName::src0_abs);
+ FlagIndex = getOperandIdx(MI, R600::OpName::src0_abs);
break;
case 1:
- FlagIndex = getOperandIdx(MI, AMDGPU::OpName::src1_abs);
+ FlagIndex = getOperandIdx(MI, R600::OpName::src1_abs);
break;
}
break;
@@ -1499,15 +1504,15 @@ unsigned R600InstrInfo::getAddressSpaceForPseudoSourceKind(
switch (Kind) {
case PseudoSourceValue::Stack:
case PseudoSourceValue::FixedStack:
- return AMDGPUASI.PRIVATE_ADDRESS;
+ return ST.getAMDGPUAS().PRIVATE_ADDRESS;
case PseudoSourceValue::ConstantPool:
case PseudoSourceValue::GOT:
case PseudoSourceValue::JumpTable:
case PseudoSourceValue::GlobalValueCallEntry:
case PseudoSourceValue::ExternalSymbolCallEntry:
case PseudoSourceValue::TargetCustom:
- return AMDGPUASI.CONSTANT_ADDRESS;
+ return ST.getAMDGPUAS().CONSTANT_ADDRESS;
}
llvm_unreachable("Invalid pseudo source kind");
- return AMDGPUASI.PRIVATE_ADDRESS;
+ return ST.getAMDGPUAS().PRIVATE_ADDRESS;
}
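
With GET_INSTRINFO_NAMED_OPS now pulled from R600GenInstrInfo.inc, named-operand lookups resolve through R600::getNamedOperandIdx rather than the old AMDGPU tables. A small usage sketch (hasSrc0 is an illustrative helper, not part of the patch):

  // Returns true when the instruction's opcode defines a src0 operand.
  // getOperandIdx forwards to R600::getNamedOperandIdx and yields -1 otherwise.
  static bool hasSrc0(const R600InstrInfo &TII, const MachineInstr &MI) {
    return TII.getOperandIdx(MI.getOpcode(), R600::OpName::src0) != -1;
  }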
diff --git a/lib/Target/AMDGPU/R600InstrInfo.h b/lib/Target/AMDGPU/R600InstrInfo.h
index abaa37450758..7a3dece31665 100644
--- a/lib/Target/AMDGPU/R600InstrInfo.h
+++ b/lib/Target/AMDGPU/R600InstrInfo.h
@@ -8,15 +8,18 @@
//===----------------------------------------------------------------------===//
//
/// \file
-/// \brief Interface definition for R600InstrInfo
+/// Interface definition for R600InstrInfo
//
//===----------------------------------------------------------------------===//
#ifndef LLVM_LIB_TARGET_AMDGPU_R600INSTRINFO_H
#define LLVM_LIB_TARGET_AMDGPU_R600INSTRINFO_H
-#include "AMDGPUInstrInfo.h"
#include "R600RegisterInfo.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+
+#define GET_INSTRINFO_HEADER
+#include "R600GenInstrInfo.inc"
namespace llvm {
@@ -34,7 +37,7 @@ class MachineInstr;
class MachineInstrBuilder;
class R600Subtarget;
-class R600InstrInfo final : public AMDGPUInstrInfo {
+class R600InstrInfo final : public R600GenInstrInfo {
private:
const R600RegisterInfo RI;
const R600Subtarget &ST;
@@ -150,7 +153,7 @@ public:
/// Same but using const index set instead of MI set.
bool fitsConstReadLimitations(const std::vector<unsigned>&) const;
- /// \brief Vector instructions are instructions that must fill all
+ /// Vector instructions are instructions that must fill all
/// instruction slots within an instruction group.
bool isVector(const MachineInstr &MI) const;
@@ -209,9 +212,10 @@ public:
bool expandPostRAPseudo(MachineInstr &MI) const override;
- /// \brief Reserve the registers that may be accesed using indirect addressing.
+ /// Reserve the registers that may be accessed using indirect addressing.
void reserveIndirectRegisters(BitVector &Reserved,
- const MachineFunction &MF) const;
+ const MachineFunction &MF,
+ const R600RegisterInfo &TRI) const;
/// Calculate the "Indirect Address" for the given \p RegIndex and
/// \p Channel
@@ -235,7 +239,7 @@ public:
/// read or write or -1 if indirect addressing is not used by this program.
int getIndirectIndexEnd(const MachineFunction &MF) const;
- /// \brief Build instruction(s) for an indirect register write.
+ /// Build instruction(s) for an indirect register write.
///
/// \returns The instruction that performs the indirect register write
MachineInstrBuilder buildIndirectWrite(MachineBasicBlock *MBB,
@@ -243,7 +247,7 @@ public:
unsigned ValueReg, unsigned Address,
unsigned OffsetReg) const;
- /// \brief Build instruction(s) for an indirect register read.
+ /// Build instruction(s) for an indirect register read.
///
/// \returns The instruction that performs the indirect register read
MachineInstrBuilder buildIndirectRead(MachineBasicBlock *MBB,
@@ -281,23 +285,23 @@ public:
MachineBasicBlock::iterator I,
unsigned DstReg, unsigned SrcReg) const;
- /// \brief Get the index of Op in the MachineInstr.
+ /// Get the index of Op in the MachineInstr.
///
/// \returns -1 if the Instruction does not contain the specified \p Op.
int getOperandIdx(const MachineInstr &MI, unsigned Op) const;
- /// \brief Get the index of \p Op for the given Opcode.
+ /// Get the index of \p Op for the given Opcode.
///
/// \returns -1 if the Instruction does not contain the specified \p Op.
int getOperandIdx(unsigned Opcode, unsigned Op) const;
- /// \brief Helper function for setting instruction flag values.
+ /// Helper function for setting instruction flag values.
void setImmOperand(MachineInstr &MI, unsigned Op, int64_t Imm) const;
- ///\brief Add one of the MO_FLAG* flags to the specified \p Operand.
+ /// Add one of the MO_FLAG* flags to the specified \p Operand.
void addFlag(MachineInstr &MI, unsigned Operand, unsigned Flag) const;
- ///\brief Determine if the specified \p Flag is set on this \p Operand.
+ /// Determine if the specified \p Flag is set on this \p Operand.
bool isFlagSet(const MachineInstr &MI, unsigned Operand, unsigned Flag) const;
/// \param SrcIdx The register source to set the flag on (e.g src0, src1, src2)
@@ -307,7 +311,7 @@ public:
MachineOperand &getFlagOp(MachineInstr &MI, unsigned SrcIdx = 0,
unsigned Flag = 0) const;
- /// \brief Clear the specified flag on the instruction.
+ /// Clear the specified flag on the instruction.
void clearFlag(MachineInstr &MI, unsigned Operand, unsigned Flag) const;
// Helper functions that check the opcode for status information
@@ -323,7 +327,7 @@ public:
PseudoSourceValue::PSVKind Kind) const override;
};
-namespace AMDGPU {
+namespace R600 {
int getLDSNoRetOp(uint16_t Opcode);
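
The reserveIndirectRegisters signature gains an R600RegisterInfo parameter so it can reserve whole register tuples. A plausible, simplified shape of the caller (a sketch under that assumption, not taken from this patch, and omitting the other reservations a real getReservedRegs would make):

  // Illustrative only: how the new signature might be used from the register info.
  BitVector R600RegisterInfo::getReservedRegs(const MachineFunction &MF) const {
    BitVector Reserved(getNumRegs());
    const R600InstrInfo *TII = MF.getSubtarget<R600Subtarget>().getInstrInfo();
    TII->reserveIndirectRegisters(Reserved, MF, *this); // reserves full tuples now
    return Reserved;
  }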
diff --git a/lib/Target/AMDGPU/R600Instructions.td b/lib/Target/AMDGPU/R600Instructions.td
index 801e4e61fca6..7bf174f4cd86 100644
--- a/lib/Target/AMDGPU/R600Instructions.td
+++ b/lib/Target/AMDGPU/R600Instructions.td
@@ -12,20 +12,19 @@
//
//===----------------------------------------------------------------------===//
-include "R600Intrinsics.td"
include "R600InstrFormats.td"
// FIXME: Should not be arbitrarily split from other R600 inst classes.
class R600WrapperInst <dag outs, dag ins, string asm = "", list<dag> pattern = []> :
AMDGPUInst<outs, ins, asm, pattern>, PredicateControl {
let SubtargetPredicate = isR600toCayman;
+ let Namespace = "R600";
}
class InstR600ISA <dag outs, dag ins, string asm, list<dag> pattern = []> :
InstR600 <outs, ins, asm, pattern, NullALU> {
- let Namespace = "AMDGPU";
}
def MEMxi : Operand<iPTR> {
@@ -81,11 +80,18 @@ def ADDRDWord : ComplexPattern<i32, 1, "SelectADDRDWord", [], []>;
def ADDRVTX_READ : ComplexPattern<i32, 2, "SelectADDRVTX_READ", [], []>;
def ADDRGA_CONST_OFFSET : ComplexPattern<i32, 1, "SelectGlobalValueConstantOffset", [], []>;
def ADDRGA_VAR_OFFSET : ComplexPattern<i32, 2, "SelectGlobalValueVariableOffset", [], []>;
+def ADDRIndirect : ComplexPattern<iPTR, 2, "SelectADDRIndirect", [], []>;
def R600_Pred : PredicateOperand<i32, (ops R600_Predicate),
(ops PRED_SEL_OFF)>;
+let isTerminator = 1, isReturn = 1, hasCtrlDep = 1,
+ usesCustomInserter = 1, Namespace = "R600" in {
+ def RETURN : ILFormat<(outs), (ins variable_ops),
+ "RETURN", [(AMDGPUendpgm)]
+ >;
+}
let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in {
@@ -219,34 +225,6 @@ class R600_REDUCTION <bits<11> inst, dag ins, string asm, list<dag> pattern,
} // End mayLoad = 1, mayStore = 0, hasSideEffects = 0
-def TEX_SHADOW : PatLeaf<
- (imm),
- [{uint32_t TType = (uint32_t)N->getZExtValue();
- return (TType >= 6 && TType <= 8) || TType == 13;
- }]
->;
-
-def TEX_RECT : PatLeaf<
- (imm),
- [{uint32_t TType = (uint32_t)N->getZExtValue();
- return TType == 5;
- }]
->;
-
-def TEX_ARRAY : PatLeaf<
- (imm),
- [{uint32_t TType = (uint32_t)N->getZExtValue();
- return TType == 9 || TType == 10 || TType == 16;
- }]
->;
-
-def TEX_SHADOW_ARRAY : PatLeaf<
- (imm),
- [{uint32_t TType = (uint32_t)N->getZExtValue();
- return TType == 11 || TType == 12 || TType == 17;
- }]
->;
-
class EG_CF_RAT <bits <8> cfinst, bits <6> ratinst, bits<4> ratid, bits<4> mask,
dag outs, dag ins, string asm, list<dag> pattern> :
InstR600ISA <outs, ins, asm, pattern>,
@@ -357,6 +335,8 @@ def vtx_id2_load : LoadVtxId2 <load>;
// R600 SDNodes
//===----------------------------------------------------------------------===//
+let Namespace = "R600" in {
+
def INTERP_PAIR_XY : AMDGPUShaderInst <
(outs R600_TReg32_X:$dst0, R600_TReg32_Y:$dst1),
(ins i32imm:$src0, R600_TReg32_Y:$src1, R600_TReg32_X:$src2),
@@ -369,6 +349,8 @@ def INTERP_PAIR_ZW : AMDGPUShaderInst <
"INTERP_PAIR_ZW $src0 $src1 $src2 : $dst0 dst1",
[]>;
+}
+
def CONST_ADDRESS: SDNode<"AMDGPUISD::CONST_ADDRESS",
SDTypeProfile<1, -1, [SDTCisInt<0>, SDTCisPtrTy<1>]>,
[SDNPVariadic]
@@ -416,11 +398,15 @@ def : R600Pat<(TEXTURE_FETCH (i32 TextureOp), vt:$SRC_GPR,
// Interpolation Instructions
//===----------------------------------------------------------------------===//
+let Namespace = "R600" in {
+
def INTERP_VEC_LOAD : AMDGPUShaderInst <
(outs R600_Reg128:$dst),
(ins i32imm:$src0),
"INTERP_LOAD $src0 : $dst">;
+}
+
def INTERP_XY : R600_2OP <0xD6, "INTERP_XY", []> {
let bank_swizzle = 5;
}
@@ -660,14 +646,7 @@ def PAD : R600WrapperInst <(outs), (ins), "PAD", [] > {
let isCodeGenOnly = 1, isPseudo = 1 in {
-let usesCustomInserter = 1 in {
-
-class CLAMP <RegisterClass rc> : AMDGPUShaderInst <
- (outs rc:$dst),
- (ins rc:$src0),
- "CLAMP $dst, $src0",
- [(set f32:$dst, (AMDGPUclamp f32:$src0))]
->;
+let Namespace = "R600", usesCustomInserter = 1 in {
class FABS <RegisterClass rc> : AMDGPUShaderInst <
(outs rc:$dst),
@@ -799,7 +778,9 @@ class MOV_IMM <ValueType vt, Operand immType> : R600WrapperInst <
(ins immType:$imm),
"",
[]
->;
+> {
+ let Namespace = "R600";
+}
} // end let isPseudo = 1, isCodeGenOnly = 1, usesCustomInserter = 1
@@ -1014,7 +995,7 @@ class CNDGE_Common <bits<5> inst> : R600_3OP <
}
-let isCodeGenOnly = 1, isPseudo = 1, Namespace = "AMDGPU" in {
+let isCodeGenOnly = 1, isPseudo = 1, Namespace = "R600" in {
class R600_VEC2OP<list<dag> pattern> : InstR600 <(outs R600_Reg32:$dst), (ins
// Slot X
UEM:$update_exec_mask_X, UP:$update_pred_X, WRITE:$write_X,
@@ -1193,7 +1174,6 @@ class COS_Common <bits<11> inst> : R600_1OP <
let Itinerary = TransALU;
}
-def CLAMP_R600 : CLAMP <R600_Reg32>;
def FABS_R600 : FABS<R600_Reg32>;
def FNEG_R600 : FNEG<R600_Reg32>;
@@ -1334,7 +1314,9 @@ let Predicates = [isR600] in {
// Register loads and stores - for indirect addressing
//===----------------------------------------------------------------------===//
+let Namespace = "R600" in {
defm R600_ : RegisterLoadStore <R600_Reg32, FRAMEri, ADDRIndirect>;
+}
// Hardcode channel to 0
// NOTE: LSHR is not available here. LSHR is a per-family instruction
@@ -1386,11 +1368,12 @@ let usesCustomInserter = 1 in {
let mayLoad = 0, mayStore = 0, hasSideEffects = 1 in {
-def MASK_WRITE : AMDGPUShaderInst <
+def MASK_WRITE : InstR600 <
(outs),
(ins R600_Reg32:$src),
"MASK_WRITE $src",
- []
+ [],
+ NullALU
>;
} // End mayLoad = 0, mayStore = 0, hasSideEffects = 1
@@ -1421,7 +1404,7 @@ def TXD_SHADOW: InstR600 <
// Constant Buffer Addressing Support
//===----------------------------------------------------------------------===//
-let usesCustomInserter = 1, isCodeGenOnly = 1, isPseudo = 1, Namespace = "AMDGPU" in {
+let usesCustomInserter = 1, isCodeGenOnly = 1, isPseudo = 1, Namespace = "R600" in {
def CONST_COPY : Instruction {
let OutOperandList = (outs R600_Reg32:$dst);
let InOperandList = (ins i32imm:$src);
@@ -1544,23 +1527,6 @@ let Inst{63-32} = Word1;
//===---------------------------------------------------------------------===//
// Flow and Program control Instructions
//===---------------------------------------------------------------------===//
-class ILFormat<dag outs, dag ins, string asmstr, list<dag> pattern>
-: Instruction {
-
- let Namespace = "AMDGPU";
- dag OutOperandList = outs;
- dag InOperandList = ins;
- let Pattern = pattern;
- let AsmString = !strconcat(asmstr, "\n");
- let isPseudo = 1;
- let Itinerary = NullALU;
- bit hasIEEEFlag = 0;
- bit hasZeroOpFlag = 0;
- let mayLoad = 0;
- let mayStore = 0;
- let hasSideEffects = 0;
- let isCodeGenOnly = 1;
-}
multiclass BranchConditional<SDNode Op, RegisterClass rci, RegisterClass rcf> {
def _i32 : ILFormat<(outs),
@@ -1592,23 +1558,14 @@ multiclass BranchInstr2<string name> {
// Custom Inserter for Branches and returns, this eventually will be a
// separate pass
//===---------------------------------------------------------------------===//
-let isTerminator = 1, usesCustomInserter = 1, isBranch = 1, isBarrier = 1 in {
+let isTerminator = 1, usesCustomInserter = 1, isBranch = 1, isBarrier = 1,
+ Namespace = "R600" in {
def BRANCH : ILFormat<(outs), (ins brtarget:$target),
"; Pseudo unconditional branch instruction",
[(br bb:$target)]>;
defm BRANCH_COND : BranchConditional<IL_brcond, R600_Reg32, R600_Reg32>;
}
-//===---------------------------------------------------------------------===//
-// Return instruction
-//===---------------------------------------------------------------------===//
-let isTerminator = 1, isReturn = 1, hasCtrlDep = 1,
- usesCustomInserter = 1 in {
- def RETURN : ILFormat<(outs), (ins variable_ops),
- "RETURN", [(AMDGPUendpgm)]
- >;
-}
-
//===----------------------------------------------------------------------===//
// Branch Instructions
//===----------------------------------------------------------------------===//
@@ -1738,13 +1695,8 @@ def : R600Pat <
>;
// KIL Patterns
-def KILP : R600Pat <
- (int_AMDGPU_kilp),
- (MASK_WRITE (KILLGT (f32 ONE), (f32 ZERO)))
->;
-
def KIL : R600Pat <
- (int_AMDGPU_kill f32:$src0),
+ (int_r600_kill f32:$src0),
(MASK_WRITE (KILLGT (f32 ZERO), $src0))
>;
diff --git a/lib/Target/AMDGPU/R600Intrinsics.td b/lib/Target/AMDGPU/R600Intrinsics.td
deleted file mode 100644
index 4c9e1e8a5434..000000000000
--- a/lib/Target/AMDGPU/R600Intrinsics.td
+++ /dev/null
@@ -1,67 +0,0 @@
-//===-- R600Intrinsics.td - R600 Instrinsic defs -------*- tablegen -*-----===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// R600 Intrinsic Definitions
-//
-//===----------------------------------------------------------------------===//
-
-class TextureIntrinsicFloatInput : Intrinsic<[llvm_v4f32_ty], [
- llvm_v4f32_ty, // Coord
- llvm_i32_ty, // offset_x
- llvm_i32_ty, // offset_y,
- llvm_i32_ty, // offset_z,
- llvm_i32_ty, // resource_id
- llvm_i32_ty, // samplerid
- llvm_i32_ty, // coord_type_x
- llvm_i32_ty, // coord_type_y
- llvm_i32_ty, // coord_type_z
- llvm_i32_ty], // coord_type_w
- [IntrNoMem]
->;
-
-class TextureIntrinsicInt32Input : Intrinsic<[llvm_v4i32_ty], [
- llvm_v4i32_ty, // Coord
- llvm_i32_ty, // offset_x
- llvm_i32_ty, // offset_y,
- llvm_i32_ty, // offset_z,
- llvm_i32_ty, // resource_id
- llvm_i32_ty, // samplerid
- llvm_i32_ty, // coord_type_x
- llvm_i32_ty, // coord_type_y
- llvm_i32_ty, // coord_type_z
- llvm_i32_ty], // coord_type_w
- [IntrNoMem]
->;
-
-let TargetPrefix = "r600", isTarget = 1 in {
-
-def int_r600_store_swizzle :
- Intrinsic<[], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty], []
->;
-
-def int_r600_store_stream_output : Intrinsic<
- [], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], []
->;
-
-def int_r600_tex : TextureIntrinsicFloatInput;
-def int_r600_texc : TextureIntrinsicFloatInput;
-def int_r600_txl : TextureIntrinsicFloatInput;
-def int_r600_txlc : TextureIntrinsicFloatInput;
-def int_r600_txb : TextureIntrinsicFloatInput;
-def int_r600_txbc : TextureIntrinsicFloatInput;
-def int_r600_txf : TextureIntrinsicInt32Input;
-def int_r600_txq : TextureIntrinsicInt32Input;
-def int_r600_ddx : TextureIntrinsicFloatInput;
-def int_r600_ddy : TextureIntrinsicFloatInput;
-
-def int_r600_dot4 : Intrinsic<[llvm_float_ty],
- [llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem, IntrSpeculatable]
->;
-
-} // End TargetPrefix = "r600", isTarget = 1
diff --git a/lib/Target/AMDGPU/R600MachineScheduler.cpp b/lib/Target/AMDGPU/R600MachineScheduler.cpp
index a7e540f9d14d..a1429a2ac50f 100644
--- a/lib/Target/AMDGPU/R600MachineScheduler.cpp
+++ b/lib/Target/AMDGPU/R600MachineScheduler.cpp
@@ -8,13 +8,14 @@
//===----------------------------------------------------------------------===//
//
/// \file
-/// \brief R600 Machine Scheduler interface
+/// R600 Machine Scheduler interface
//
//===----------------------------------------------------------------------===//
#include "R600MachineScheduler.h"
#include "AMDGPUSubtarget.h"
#include "R600InstrInfo.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/IR/LegacyPassManager.h"
#include "llvm/Pass.h"
@@ -78,7 +79,7 @@ SUnit* R600SchedStrategy::pickNode(bool &IsTopNode) {
AllowSwitchFromAlu = true;
} else {
unsigned NeededWF = 62.5f / ALUFetchRationEstimate;
- DEBUG( dbgs() << NeededWF << " approx. Wavefronts Required\n" );
+ LLVM_DEBUG(dbgs() << NeededWF << " approx. Wavefronts Required\n");
// We assume the local GPR requirements to be "dominated" by the requirement
// of the TEX clause (which consumes 128-bit regs); ALU inst before and
// after TEX are indeed likely to consume or generate values from/for the
@@ -124,26 +125,24 @@ SUnit* R600SchedStrategy::pickNode(bool &IsTopNode) {
NextInstKind = IDOther;
}
- DEBUG(
- if (SU) {
- dbgs() << " ** Pick node **\n";
- SU->dump(DAG);
- } else {
- dbgs() << "NO NODE \n";
- for (unsigned i = 0; i < DAG->SUnits.size(); i++) {
- const SUnit &S = DAG->SUnits[i];
- if (!S.isScheduled)
- S.dump(DAG);
- }
- }
- );
+ LLVM_DEBUG(if (SU) {
+ dbgs() << " ** Pick node **\n";
+ SU->dump(DAG);
+ } else {
+ dbgs() << "NO NODE \n";
+ for (unsigned i = 0; i < DAG->SUnits.size(); i++) {
+ const SUnit &S = DAG->SUnits[i];
+ if (!S.isScheduled)
+ S.dump(DAG);
+ }
+ });
return SU;
}
void R600SchedStrategy::schedNode(SUnit *SU, bool IsTopNode) {
if (NextInstKind != CurInstKind) {
- DEBUG(dbgs() << "Instruction Type Switch\n");
+ LLVM_DEBUG(dbgs() << "Instruction Type Switch\n");
if (NextInstKind != IDAlu)
OccupedSlotsMask |= 31;
CurEmitted = 0;
@@ -163,7 +162,7 @@ void R600SchedStrategy::schedNode(SUnit *SU, bool IsTopNode) {
for (MachineInstr::mop_iterator It = SU->getInstr()->operands_begin(),
E = SU->getInstr()->operands_end(); It != E; ++It) {
MachineOperand &MO = *It;
- if (MO.isReg() && MO.getReg() == AMDGPU::ALU_LITERAL_X)
+ if (MO.isReg() && MO.getReg() == R600::ALU_LITERAL_X)
++CurEmitted;
}
}
@@ -172,8 +171,7 @@ void R600SchedStrategy::schedNode(SUnit *SU, bool IsTopNode) {
++CurEmitted;
}
-
- DEBUG(dbgs() << CurEmitted << " Instructions Emitted in this clause\n");
+ LLVM_DEBUG(dbgs() << CurEmitted << " Instructions Emitted in this clause\n");
if (CurInstKind != IDFetch) {
MoveUnits(Pending[IDFetch], Available[IDFetch]);
@@ -183,18 +181,18 @@ void R600SchedStrategy::schedNode(SUnit *SU, bool IsTopNode) {
static bool
isPhysicalRegCopy(MachineInstr *MI) {
- if (MI->getOpcode() != AMDGPU::COPY)
+ if (MI->getOpcode() != R600::COPY)
return false;
return !TargetRegisterInfo::isVirtualRegister(MI->getOperand(1).getReg());
}
void R600SchedStrategy::releaseTopNode(SUnit *SU) {
- DEBUG(dbgs() << "Top Releasing ";SU->dump(DAG););
+ LLVM_DEBUG(dbgs() << "Top Releasing "; SU->dump(DAG););
}
void R600SchedStrategy::releaseBottomNode(SUnit *SU) {
- DEBUG(dbgs() << "Bottom Releasing ";SU->dump(DAG););
+ LLVM_DEBUG(dbgs() << "Bottom Releasing "; SU->dump(DAG););
if (isPhysicalRegCopy(SU->getInstr())) {
PhysicalRegCopy.push_back(SU);
return;
@@ -226,14 +224,14 @@ R600SchedStrategy::AluKind R600SchedStrategy::getAluKind(SUnit *SU) const {
return AluTrans;
switch (MI->getOpcode()) {
- case AMDGPU::PRED_X:
+ case R600::PRED_X:
return AluPredX;
- case AMDGPU::INTERP_PAIR_XY:
- case AMDGPU::INTERP_PAIR_ZW:
- case AMDGPU::INTERP_VEC_LOAD:
- case AMDGPU::DOT_4:
+ case R600::INTERP_PAIR_XY:
+ case R600::INTERP_PAIR_ZW:
+ case R600::INTERP_VEC_LOAD:
+ case R600::DOT_4:
return AluT_XYZW;
- case AMDGPU::COPY:
+ case R600::COPY:
if (MI->getOperand(1).isUndef()) {
// MI will become a KILL, don't consider it in scheduling
return AluDiscarded;
@@ -248,7 +246,7 @@ R600SchedStrategy::AluKind R600SchedStrategy::getAluKind(SUnit *SU) const {
if(TII->isVector(*MI) ||
TII->isCubeOp(MI->getOpcode()) ||
TII->isReductionOp(MI->getOpcode()) ||
- MI->getOpcode() == AMDGPU::GROUP_BARRIER) {
+ MI->getOpcode() == R600::GROUP_BARRIER) {
return AluT_XYZW;
}
@@ -259,13 +257,13 @@ R600SchedStrategy::AluKind R600SchedStrategy::getAluKind(SUnit *SU) const {
// Is the result already assigned to a channel ?
unsigned DestSubReg = MI->getOperand(0).getSubReg();
switch (DestSubReg) {
- case AMDGPU::sub0:
+ case R600::sub0:
return AluT_X;
- case AMDGPU::sub1:
+ case R600::sub1:
return AluT_Y;
- case AMDGPU::sub2:
+ case R600::sub2:
return AluT_Z;
- case AMDGPU::sub3:
+ case R600::sub3:
return AluT_W;
default:
break;
@@ -273,16 +271,16 @@ R600SchedStrategy::AluKind R600SchedStrategy::getAluKind(SUnit *SU) const {
// Is the result already a member of an X/Y/Z/W class ?
unsigned DestReg = MI->getOperand(0).getReg();
- if (regBelongsToClass(DestReg, &AMDGPU::R600_TReg32_XRegClass) ||
- regBelongsToClass(DestReg, &AMDGPU::R600_AddrRegClass))
+ if (regBelongsToClass(DestReg, &R600::R600_TReg32_XRegClass) ||
+ regBelongsToClass(DestReg, &R600::R600_AddrRegClass))
return AluT_X;
- if (regBelongsToClass(DestReg, &AMDGPU::R600_TReg32_YRegClass))
+ if (regBelongsToClass(DestReg, &R600::R600_TReg32_YRegClass))
return AluT_Y;
- if (regBelongsToClass(DestReg, &AMDGPU::R600_TReg32_ZRegClass))
+ if (regBelongsToClass(DestReg, &R600::R600_TReg32_ZRegClass))
return AluT_Z;
- if (regBelongsToClass(DestReg, &AMDGPU::R600_TReg32_WRegClass))
+ if (regBelongsToClass(DestReg, &R600::R600_TReg32_WRegClass))
return AluT_W;
- if (regBelongsToClass(DestReg, &AMDGPU::R600_Reg128RegClass))
+ if (regBelongsToClass(DestReg, &R600::R600_Reg128RegClass))
return AluT_XYZW;
// LDS src registers cannot be used in the Trans slot.
@@ -303,13 +301,13 @@ int R600SchedStrategy::getInstKind(SUnit* SU) {
}
switch (Opcode) {
- case AMDGPU::PRED_X:
- case AMDGPU::COPY:
- case AMDGPU::CONST_COPY:
- case AMDGPU::INTERP_PAIR_XY:
- case AMDGPU::INTERP_PAIR_ZW:
- case AMDGPU::INTERP_VEC_LOAD:
- case AMDGPU::DOT_4:
+ case R600::PRED_X:
+ case R600::COPY:
+ case R600::CONST_COPY:
+ case R600::INTERP_PAIR_XY:
+ case R600::INTERP_PAIR_ZW:
+ case R600::INTERP_VEC_LOAD:
+ case R600::DOT_4:
return IDAlu;
default:
return IDOther;
@@ -345,17 +343,17 @@ void R600SchedStrategy::LoadAlu() {
}
void R600SchedStrategy::PrepareNextSlot() {
- DEBUG(dbgs() << "New Slot\n");
+ LLVM_DEBUG(dbgs() << "New Slot\n");
assert (OccupedSlotsMask && "Slot wasn't filled");
OccupedSlotsMask = 0;
-// if (HwGen == R600Subtarget::NORTHERN_ISLANDS)
+// if (HwGen == AMDGPUSubtarget::NORTHERN_ISLANDS)
// OccupedSlotsMask |= 16;
InstructionsGroupCandidate.clear();
LoadAlu();
}
void R600SchedStrategy::AssignSlot(MachineInstr* MI, unsigned Slot) {
- int DstIndex = TII->getOperandIdx(MI->getOpcode(), AMDGPU::OpName::dst);
+ int DstIndex = TII->getOperandIdx(MI->getOpcode(), R600::OpName::dst);
if (DstIndex == -1) {
return;
}
@@ -372,16 +370,16 @@ void R600SchedStrategy::AssignSlot(MachineInstr* MI, unsigned Slot) {
// Constrains the regclass of DestReg to assign it to Slot
switch (Slot) {
case 0:
- MRI->constrainRegClass(DestReg, &AMDGPU::R600_TReg32_XRegClass);
+ MRI->constrainRegClass(DestReg, &R600::R600_TReg32_XRegClass);
break;
case 1:
- MRI->constrainRegClass(DestReg, &AMDGPU::R600_TReg32_YRegClass);
+ MRI->constrainRegClass(DestReg, &R600::R600_TReg32_YRegClass);
break;
case 2:
- MRI->constrainRegClass(DestReg, &AMDGPU::R600_TReg32_ZRegClass);
+ MRI->constrainRegClass(DestReg, &R600::R600_TReg32_ZRegClass);
break;
case 3:
- MRI->constrainRegClass(DestReg, &AMDGPU::R600_TReg32_WRegClass);
+ MRI->constrainRegClass(DestReg, &R600::R600_TReg32_WRegClass);
break;
}
}
@@ -461,7 +459,7 @@ SUnit* R600SchedStrategy::pickOther(int QID) {
}
if (!AQ.empty()) {
SU = AQ.back();
- AQ.resize(AQ.size() - 1);
+ AQ.pop_back();
}
return SU;
}
diff --git a/lib/Target/AMDGPU/R600MachineScheduler.h b/lib/Target/AMDGPU/R600MachineScheduler.h
index 9a6770570477..8a9a8d3d1e23 100644
--- a/lib/Target/AMDGPU/R600MachineScheduler.h
+++ b/lib/Target/AMDGPU/R600MachineScheduler.h
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
//
/// \file
-/// \brief R600 Machine Scheduler interface
+/// R600 Machine Scheduler interface
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/AMDGPU/AMDGPUOpenCLImageTypeLoweringPass.cpp b/lib/Target/AMDGPU/R600OpenCLImageTypeLoweringPass.cpp
index cd71f19760b9..7de5e2c9577d 100644
--- a/lib/Target/AMDGPU/AMDGPUOpenCLImageTypeLoweringPass.cpp
+++ b/lib/Target/AMDGPU/R600OpenCLImageTypeLoweringPass.cpp
@@ -1,4 +1,4 @@
-//===- AMDGPUOpenCLImageTypeLoweringPass.cpp ------------------------------===//
+//===- R600OpenCLImageTypeLoweringPass.cpp ------------------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -153,7 +153,7 @@ PushArgMD(KernelArgMD &MD, const MDVector &V) {
namespace {
-class AMDGPUOpenCLImageTypeLoweringPass : public ModulePass {
+class R600OpenCLImageTypeLoweringPass : public ModulePass {
static char ID;
LLVMContext *Context;
@@ -364,7 +364,7 @@ class AMDGPUOpenCLImageTypeLoweringPass : public ModulePass {
}
public:
- AMDGPUOpenCLImageTypeLoweringPass() : ModulePass(ID) {}
+ R600OpenCLImageTypeLoweringPass() : ModulePass(ID) {}
bool runOnModule(Module &M) override {
Context = &M.getContext();
@@ -376,14 +376,14 @@ public:
}
StringRef getPassName() const override {
- return "AMDGPU OpenCL Image Type Pass";
+ return "R600 OpenCL Image Type Pass";
}
};
} // end anonymous namespace
-char AMDGPUOpenCLImageTypeLoweringPass::ID = 0;
+char R600OpenCLImageTypeLoweringPass::ID = 0;
-ModulePass *llvm::createAMDGPUOpenCLImageTypeLoweringPass() {
- return new AMDGPUOpenCLImageTypeLoweringPass();
+ModulePass *llvm::createR600OpenCLImageTypeLoweringPass() {
+ return new R600OpenCLImageTypeLoweringPass();
}
diff --git a/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp b/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp
index 4a14d95f1cc4..692451cb8fe0 100644
--- a/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp
+++ b/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp
@@ -31,6 +31,7 @@
#include "AMDGPUSubtarget.h"
#include "R600Defines.h"
#include "R600InstrInfo.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/StringRef.h"
@@ -78,7 +79,7 @@ public:
std::vector<unsigned> UndefReg;
RegSeqInfo(MachineRegisterInfo &MRI, MachineInstr *MI) : Instr(MI) {
- assert(MI->getOpcode() == AMDGPU::REG_SEQUENCE);
+ assert(MI->getOpcode() == R600::REG_SEQUENCE);
for (unsigned i = 1, e = Instr->getNumOperands(); i < e; i+=2) {
MachineOperand &MO = Instr->getOperand(i);
unsigned Chan = Instr->getOperand(i + 1).getImm();
@@ -158,8 +159,8 @@ bool R600VectorRegMerger::canSwizzle(const MachineInstr &MI)
if (TII->get(MI.getOpcode()).TSFlags & R600_InstFlag::TEX_INST)
return true;
switch (MI.getOpcode()) {
- case AMDGPU::R600_ExportSwz:
- case AMDGPU::EG_ExportSwz:
+ case R600::R600_ExportSwz:
+ case R600::EG_ExportSwz:
return true;
default:
return false;
@@ -212,12 +213,12 @@ MachineInstr *R600VectorRegMerger::RebuildVector(
std::vector<unsigned> UpdatedUndef = BaseRSI->UndefReg;
for (DenseMap<unsigned, unsigned>::iterator It = RSI->RegToChan.begin(),
E = RSI->RegToChan.end(); It != E; ++It) {
- unsigned DstReg = MRI->createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
+ unsigned DstReg = MRI->createVirtualRegister(&R600::R600_Reg128RegClass);
unsigned SubReg = (*It).first;
unsigned Swizzle = (*It).second;
unsigned Chan = getReassignedChan(RemapChan, Swizzle);
- MachineInstr *Tmp = BuildMI(MBB, Pos, DL, TII->get(AMDGPU::INSERT_SUBREG),
+ MachineInstr *Tmp = BuildMI(MBB, Pos, DL, TII->get(R600::INSERT_SUBREG),
DstReg)
.addReg(SrcVec)
.addReg(SubReg)
@@ -228,20 +229,20 @@ MachineInstr *R600VectorRegMerger::RebuildVector(
UpdatedUndef.erase(ChanPos);
assert(!is_contained(UpdatedUndef, Chan) &&
"UpdatedUndef shouldn't contain Chan more than once!");
- DEBUG(dbgs() << " ->"; Tmp->dump(););
+ LLVM_DEBUG(dbgs() << " ->"; Tmp->dump(););
(void)Tmp;
SrcVec = DstReg;
}
MachineInstr *NewMI =
- BuildMI(MBB, Pos, DL, TII->get(AMDGPU::COPY), Reg).addReg(SrcVec);
- DEBUG(dbgs() << " ->"; NewMI->dump(););
+ BuildMI(MBB, Pos, DL, TII->get(R600::COPY), Reg).addReg(SrcVec);
+ LLVM_DEBUG(dbgs() << " ->"; NewMI->dump(););
- DEBUG(dbgs() << " Updating Swizzle:\n");
+ LLVM_DEBUG(dbgs() << " Updating Swizzle:\n");
for (MachineRegisterInfo::use_instr_iterator It = MRI->use_instr_begin(Reg),
E = MRI->use_instr_end(); It != E; ++It) {
- DEBUG(dbgs() << " ";(*It).dump(); dbgs() << " ->");
+ LLVM_DEBUG(dbgs() << " "; (*It).dump(); dbgs() << " ->");
SwizzleInput(*It, RemapChan);
- DEBUG((*It).dump());
+ LLVM_DEBUG((*It).dump());
}
RSI->Instr->eraseFromParent();
@@ -353,7 +354,7 @@ bool R600VectorRegMerger::runOnMachineFunction(MachineFunction &Fn) {
for (MachineBasicBlock::iterator MII = MB->begin(), MIIE = MB->end();
MII != MIIE; ++MII) {
MachineInstr &MI = *MII;
- if (MI.getOpcode() != AMDGPU::REG_SEQUENCE) {
+ if (MI.getOpcode() != R600::REG_SEQUENCE) {
if (TII->get(MI.getOpcode()).TSFlags & R600_InstFlag::TEX_INST) {
unsigned Reg = MI.getOperand(1).getReg();
for (MachineRegisterInfo::def_instr_iterator
@@ -372,14 +373,14 @@ bool R600VectorRegMerger::runOnMachineFunction(MachineFunction &Fn) {
if (!areAllUsesSwizzeable(Reg))
continue;
- DEBUG({
+ LLVM_DEBUG({
dbgs() << "Trying to optimize ";
MI.dump();
});
RegSeqInfo CandidateRSI;
std::vector<std::pair<unsigned, unsigned>> RemapChan;
- DEBUG(dbgs() << "Using common slots...\n";);
+ LLVM_DEBUG(dbgs() << "Using common slots...\n";);
if (tryMergeUsingCommonSlot(RSI, CandidateRSI, RemapChan)) {
// Remove CandidateRSI mapping
RemoveMI(CandidateRSI.Instr);
@@ -387,7 +388,7 @@ bool R600VectorRegMerger::runOnMachineFunction(MachineFunction &Fn) {
trackRSI(RSI);
continue;
}
- DEBUG(dbgs() << "Using free slots...\n";);
+ LLVM_DEBUG(dbgs() << "Using free slots...\n";);
RemapChan.clear();
if (tryMergeUsingFreeSlot(RSI, CandidateRSI, RemapChan)) {
RemoveMI(CandidateRSI.Instr);
diff --git a/lib/Target/AMDGPU/R600Packetizer.cpp b/lib/Target/AMDGPU/R600Packetizer.cpp
index 7340318d2d88..612c62b514fd 100644
--- a/lib/Target/AMDGPU/R600Packetizer.cpp
+++ b/lib/Target/AMDGPU/R600Packetizer.cpp
@@ -17,6 +17,7 @@
#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "R600InstrInfo.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/CodeGen/DFAPacketizer.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
@@ -83,39 +84,39 @@ private:
LastDstChan = BISlot;
if (TII->isPredicated(*BI))
continue;
- int OperandIdx = TII->getOperandIdx(BI->getOpcode(), AMDGPU::OpName::write);
+ int OperandIdx = TII->getOperandIdx(BI->getOpcode(), R600::OpName::write);
if (OperandIdx > -1 && BI->getOperand(OperandIdx).getImm() == 0)
continue;
- int DstIdx = TII->getOperandIdx(BI->getOpcode(), AMDGPU::OpName::dst);
+ int DstIdx = TII->getOperandIdx(BI->getOpcode(), R600::OpName::dst);
if (DstIdx == -1) {
continue;
}
unsigned Dst = BI->getOperand(DstIdx).getReg();
if (isTrans || TII->isTransOnly(*BI)) {
- Result[Dst] = AMDGPU::PS;
+ Result[Dst] = R600::PS;
continue;
}
- if (BI->getOpcode() == AMDGPU::DOT4_r600 ||
- BI->getOpcode() == AMDGPU::DOT4_eg) {
- Result[Dst] = AMDGPU::PV_X;
+ if (BI->getOpcode() == R600::DOT4_r600 ||
+ BI->getOpcode() == R600::DOT4_eg) {
+ Result[Dst] = R600::PV_X;
continue;
}
- if (Dst == AMDGPU::OQAP) {
+ if (Dst == R600::OQAP) {
continue;
}
unsigned PVReg = 0;
switch (TRI.getHWRegChan(Dst)) {
case 0:
- PVReg = AMDGPU::PV_X;
+ PVReg = R600::PV_X;
break;
case 1:
- PVReg = AMDGPU::PV_Y;
+ PVReg = R600::PV_Y;
break;
case 2:
- PVReg = AMDGPU::PV_Z;
+ PVReg = R600::PV_Z;
break;
case 3:
- PVReg = AMDGPU::PV_W;
+ PVReg = R600::PV_W;
break;
default:
llvm_unreachable("Invalid Chan");
@@ -128,9 +129,9 @@ private:
void substitutePV(MachineInstr &MI, const DenseMap<unsigned, unsigned> &PVs)
const {
unsigned Ops[] = {
- AMDGPU::OpName::src0,
- AMDGPU::OpName::src1,
- AMDGPU::OpName::src2
+ R600::OpName::src0,
+ R600::OpName::src1,
+ R600::OpName::src2
};
for (unsigned i = 0; i < 3; i++) {
int OperandIdx = TII->getOperandIdx(MI.getOpcode(), Ops[i]);
@@ -170,7 +171,7 @@ public:
return true;
if (!TII->isALUInstr(MI.getOpcode()))
return true;
- if (MI.getOpcode() == AMDGPU::GROUP_BARRIER)
+ if (MI.getOpcode() == R600::GROUP_BARRIER)
return true;
// XXX: This can be removed once the packetizer properly handles all the
// LDS instruction group restrictions.
@@ -184,8 +185,8 @@ public:
if (getSlot(*MII) == getSlot(*MIJ))
ConsideredInstUsesAlreadyWrittenVectorElement = true;
// Do MII and MIJ share the same pred_sel ?
- int OpI = TII->getOperandIdx(MII->getOpcode(), AMDGPU::OpName::pred_sel),
- OpJ = TII->getOperandIdx(MIJ->getOpcode(), AMDGPU::OpName::pred_sel);
+ int OpI = TII->getOperandIdx(MII->getOpcode(), R600::OpName::pred_sel),
+ OpJ = TII->getOperandIdx(MIJ->getOpcode(), R600::OpName::pred_sel);
unsigned PredI = (OpI > -1)?MII->getOperand(OpI).getReg():0,
PredJ = (OpJ > -1)?MIJ->getOperand(OpJ).getReg():0;
if (PredI != PredJ)
@@ -219,7 +220,7 @@ public:
}
void setIsLastBit(MachineInstr *MI, unsigned Bit) const {
- unsigned LastOp = TII->getOperandIdx(MI->getOpcode(), AMDGPU::OpName::last);
+ unsigned LastOp = TII->getOperandIdx(MI->getOpcode(), R600::OpName::last);
MI->getOperand(LastOp).setImm(Bit);
}
@@ -236,7 +237,7 @@ public:
if (ConsideredInstUsesAlreadyWrittenVectorElement &&
!TII->isVectorOnly(MI) && VLIW5) {
isTransSlot = true;
- DEBUG({
+ LLVM_DEBUG({
dbgs() << "Considering as Trans Inst :";
MI.dump();
});
@@ -249,7 +250,7 @@ public:
// Are the Constants limitations met ?
CurrentPacketMIs.push_back(&MI);
if (!TII->fitsConstReadLimitations(CurrentPacketMIs)) {
- DEBUG({
+ LLVM_DEBUG({
dbgs() << "Couldn't pack :\n";
MI.dump();
dbgs() << "with the following packets :\n";
@@ -266,7 +267,7 @@ public:
// Is there a BankSwizzle set that meets Read Port limitations ?
if (!TII->fitsReadPortLimitations(CurrentPacketMIs,
PV, BS, isTransSlot)) {
- DEBUG({
+ LLVM_DEBUG({
dbgs() << "Couldn't pack :\n";
MI.dump();
dbgs() << "with the following packets :\n";
@@ -300,11 +301,11 @@ public:
for (unsigned i = 0, e = CurrentPacketMIs.size(); i < e; i++) {
MachineInstr *MI = CurrentPacketMIs[i];
unsigned Op = TII->getOperandIdx(MI->getOpcode(),
- AMDGPU::OpName::bank_swizzle);
+ R600::OpName::bank_swizzle);
MI->getOperand(Op).setImm(BS[i]);
}
unsigned Op =
- TII->getOperandIdx(MI.getOpcode(), AMDGPU::OpName::bank_swizzle);
+ TII->getOperandIdx(MI.getOpcode(), R600::OpName::bank_swizzle);
MI.getOperand(Op).setImm(BS.back());
if (!CurrentPacketMIs.empty())
setIsLastBit(CurrentPacketMIs.back(), 0);
@@ -333,6 +334,7 @@ bool R600Packetizer::runOnMachineFunction(MachineFunction &Fn) {
// DFA state table should not be empty.
assert(Packetizer.getResourceTracker() && "Empty DFA table!");
+ assert(Packetizer.getResourceTracker()->getInstrItins());
if (Packetizer.getResourceTracker()->getInstrItins()->isEmpty())
return false;
@@ -352,8 +354,8 @@ bool R600Packetizer::runOnMachineFunction(MachineFunction &Fn) {
MachineBasicBlock::iterator End = MBB->end();
MachineBasicBlock::iterator MI = MBB->begin();
while (MI != End) {
- if (MI->isKill() || MI->getOpcode() == AMDGPU::IMPLICIT_DEF ||
- (MI->getOpcode() == AMDGPU::CF_ALU && !MI->getOperand(8).getImm())) {
+ if (MI->isKill() || MI->getOpcode() == R600::IMPLICIT_DEF ||
+ (MI->getOpcode() == R600::CF_ALU && !MI->getOperand(8).getImm())) {
MachineBasicBlock::iterator DeleteMI = MI;
++MI;
MBB->erase(DeleteMI);
diff --git a/lib/Target/AMDGPU/R600Processors.td b/lib/Target/AMDGPU/R600Processors.td
index 89194dc1bdf6..f39b3dc1bfd4 100644
--- a/lib/Target/AMDGPU/R600Processors.td
+++ b/lib/Target/AMDGPU/R600Processors.td
@@ -7,6 +7,62 @@
//
//===----------------------------------------------------------------------===//
+class SubtargetFeatureFetchLimit <string Value> :
+ SubtargetFeature <"fetch"#Value,
+ "TexVTXClauseSize",
+ Value,
+ "Limit the maximum number of fetches in a clause to "#Value
+>;
+
+def FeatureR600ALUInst : SubtargetFeature<"R600ALUInst",
+ "R600ALUInst",
+ "false",
+ "Older version of ALU instructions encoding"
+>;
+
+def FeatureFetchLimit8 : SubtargetFeatureFetchLimit <"8">;
+def FeatureFetchLimit16 : SubtargetFeatureFetchLimit <"16">;
+
+def FeatureVertexCache : SubtargetFeature<"HasVertexCache",
+ "HasVertexCache",
+ "true",
+ "Specify use of dedicated vertex cache"
+>;
+
+def FeatureCaymanISA : SubtargetFeature<"caymanISA",
+ "CaymanISA",
+ "true",
+ "Use Cayman ISA"
+>;
+
+def FeatureCFALUBug : SubtargetFeature<"cfalubug",
+ "CFALUBug",
+ "true",
+ "GPU has CF_ALU bug"
+>;
+
+class R600SubtargetFeatureGeneration <string Value,
+ list<SubtargetFeature> Implies> :
+ SubtargetFeatureGeneration <Value, "R600Subtarget", Implies>;
+
+def FeatureR600 : R600SubtargetFeatureGeneration<"R600",
+ [FeatureR600ALUInst, FeatureFetchLimit8, FeatureLocalMemorySize0]
+>;
+
+def FeatureR700 : R600SubtargetFeatureGeneration<"R700",
+ [FeatureFetchLimit16, FeatureLocalMemorySize0]
+>;
+
+def FeatureEvergreen : R600SubtargetFeatureGeneration<"EVERGREEN",
+ [FeatureFetchLimit16, FeatureLocalMemorySize32768]
+>;
+
+def FeatureNorthernIslands : R600SubtargetFeatureGeneration<"NORTHERN_ISLANDS",
+ [FeatureFetchLimit16, FeatureWavefrontSize64,
+ FeatureLocalMemorySize32768]
+>;
+
+
//===----------------------------------------------------------------------===//
// Radeon HD 2000/3000 Series (R600).
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/AMDGPU/R600RegisterInfo.cpp b/lib/Target/AMDGPU/R600RegisterInfo.cpp
index 7501facb0cba..38933e7616a0 100644
--- a/lib/Target/AMDGPU/R600RegisterInfo.cpp
+++ b/lib/Target/AMDGPU/R600RegisterInfo.cpp
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
//
/// \file
-/// \brief R600 implementation of the TargetRegisterInfo class.
+/// R600 implementation of the TargetRegisterInfo class.
//
//===----------------------------------------------------------------------===//
@@ -17,47 +17,51 @@
#include "R600Defines.h"
#include "R600InstrInfo.h"
#include "R600MachineFunctionInfo.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
using namespace llvm;
-R600RegisterInfo::R600RegisterInfo() : AMDGPURegisterInfo() {
+R600RegisterInfo::R600RegisterInfo() : R600GenRegisterInfo(0) {
RCW.RegWeight = 0;
RCW.WeightLimit = 0;
}
+#define GET_REGINFO_TARGET_DESC
+#include "R600GenRegisterInfo.inc"
+
BitVector R600RegisterInfo::getReservedRegs(const MachineFunction &MF) const {
BitVector Reserved(getNumRegs());
const R600Subtarget &ST = MF.getSubtarget<R600Subtarget>();
const R600InstrInfo *TII = ST.getInstrInfo();
- Reserved.set(AMDGPU::ZERO);
- Reserved.set(AMDGPU::HALF);
- Reserved.set(AMDGPU::ONE);
- Reserved.set(AMDGPU::ONE_INT);
- Reserved.set(AMDGPU::NEG_HALF);
- Reserved.set(AMDGPU::NEG_ONE);
- Reserved.set(AMDGPU::PV_X);
- Reserved.set(AMDGPU::ALU_LITERAL_X);
- Reserved.set(AMDGPU::ALU_CONST);
- Reserved.set(AMDGPU::PREDICATE_BIT);
- Reserved.set(AMDGPU::PRED_SEL_OFF);
- Reserved.set(AMDGPU::PRED_SEL_ZERO);
- Reserved.set(AMDGPU::PRED_SEL_ONE);
- Reserved.set(AMDGPU::INDIRECT_BASE_ADDR);
-
- for (TargetRegisterClass::iterator I = AMDGPU::R600_AddrRegClass.begin(),
- E = AMDGPU::R600_AddrRegClass.end(); I != E; ++I) {
- Reserved.set(*I);
+ reserveRegisterTuples(Reserved, R600::ZERO);
+ reserveRegisterTuples(Reserved, R600::HALF);
+ reserveRegisterTuples(Reserved, R600::ONE);
+ reserveRegisterTuples(Reserved, R600::ONE_INT);
+ reserveRegisterTuples(Reserved, R600::NEG_HALF);
+ reserveRegisterTuples(Reserved, R600::NEG_ONE);
+ reserveRegisterTuples(Reserved, R600::PV_X);
+ reserveRegisterTuples(Reserved, R600::ALU_LITERAL_X);
+ reserveRegisterTuples(Reserved, R600::ALU_CONST);
+ reserveRegisterTuples(Reserved, R600::PREDICATE_BIT);
+ reserveRegisterTuples(Reserved, R600::PRED_SEL_OFF);
+ reserveRegisterTuples(Reserved, R600::PRED_SEL_ZERO);
+ reserveRegisterTuples(Reserved, R600::PRED_SEL_ONE);
+ reserveRegisterTuples(Reserved, R600::INDIRECT_BASE_ADDR);
+
+ for (TargetRegisterClass::iterator I = R600::R600_AddrRegClass.begin(),
+ E = R600::R600_AddrRegClass.end(); I != E; ++I) {
+ reserveRegisterTuples(Reserved, *I);
}
- TII->reserveIndirectRegisters(Reserved, MF);
+ TII->reserveIndirectRegisters(Reserved, MF, *this);
return Reserved;
}
// Dummy to not crash RegisterClassInfo.
-static const MCPhysReg CalleeSavedReg = AMDGPU::NoRegister;
+static const MCPhysReg CalleeSavedReg = R600::NoRegister;
const MCPhysReg *R600RegisterInfo::getCalleeSavedRegs(
const MachineFunction *) const {
@@ -65,7 +69,7 @@ const MCPhysReg *R600RegisterInfo::getCalleeSavedRegs(
}
unsigned R600RegisterInfo::getFrameRegister(const MachineFunction &MF) const {
- return AMDGPU::NoRegister;
+ return R600::NoRegister;
}
unsigned R600RegisterInfo::getHWRegChan(unsigned reg) const {
@@ -80,7 +84,7 @@ const TargetRegisterClass * R600RegisterInfo::getCFGStructurizerRegClass(
MVT VT) const {
switch(VT.SimpleTy) {
default:
- case MVT::i32: return &AMDGPU::R600_TReg32RegClass;
+ case MVT::i32: return &R600::R600_TReg32RegClass;
}
}
@@ -93,9 +97,9 @@ bool R600RegisterInfo::isPhysRegLiveAcrossClauses(unsigned Reg) const {
assert(!TargetRegisterInfo::isVirtualRegister(Reg));
switch (Reg) {
- case AMDGPU::OQAP:
- case AMDGPU::OQBP:
- case AMDGPU::AR_X:
+ case R600::OQAP:
+ case R600::OQBP:
+ case R600::AR_X:
return false;
default:
return true;
@@ -108,3 +112,10 @@ void R600RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
RegScavenger *RS) const {
llvm_unreachable("Subroutines not supported yet");
}
+
+void R600RegisterInfo::reserveRegisterTuples(BitVector &Reserved, unsigned Reg) const {
+ MCRegAliasIterator R(Reg, this, true);
+
+ for (; R.isValid(); ++R)
+ Reserved.set(*R);
+}
diff --git a/lib/Target/AMDGPU/R600RegisterInfo.h b/lib/Target/AMDGPU/R600RegisterInfo.h
index f0d9644b02f2..c4c77172b299 100644
--- a/lib/Target/AMDGPU/R600RegisterInfo.h
+++ b/lib/Target/AMDGPU/R600RegisterInfo.h
@@ -8,20 +8,19 @@
//===----------------------------------------------------------------------===//
//
/// \file
-/// \brief Interface definition for R600RegisterInfo
+/// Interface definition for R600RegisterInfo
//
//===----------------------------------------------------------------------===//
#ifndef LLVM_LIB_TARGET_AMDGPU_R600REGISTERINFO_H
#define LLVM_LIB_TARGET_AMDGPU_R600REGISTERINFO_H
-#include "AMDGPURegisterInfo.h"
+#define GET_REGINFO_HEADER
+#include "R600GenRegisterInfo.inc"
namespace llvm {
-class AMDGPUSubtarget;
-
-struct R600RegisterInfo final : public AMDGPURegisterInfo {
+struct R600RegisterInfo final : public R600GenRegisterInfo {
RegClassWeight RCW;
R600RegisterInfo();
@@ -30,12 +29,12 @@ struct R600RegisterInfo final : public AMDGPURegisterInfo {
const MCPhysReg *getCalleeSavedRegs(const MachineFunction *MF) const override;
unsigned getFrameRegister(const MachineFunction &MF) const override;
- /// \brief get the HW encoding for a register's channel.
+ /// get the HW encoding for a register's channel.
unsigned getHWRegChan(unsigned reg) const;
unsigned getHWRegIndex(unsigned Reg) const;
- /// \brief get the register class of the specified type to use in the
+ /// get the register class of the specified type to use in the
/// CFGStructurizer
const TargetRegisterClass *getCFGStructurizerRegClass(MVT VT) const;
@@ -49,6 +48,8 @@ struct R600RegisterInfo final : public AMDGPURegisterInfo {
void eliminateFrameIndex(MachineBasicBlock::iterator MI, int SPAdj,
unsigned FIOperandNum,
RegScavenger *RS = nullptr) const override;
+
+ void reserveRegisterTuples(BitVector &Reserved, unsigned Reg) const;
};
} // End namespace llvm
diff --git a/lib/Target/AMDGPU/R600RegisterInfo.td b/lib/Target/AMDGPU/R600RegisterInfo.td
index 84ab328bdb2b..02164b74a01b 100644
--- a/lib/Target/AMDGPU/R600RegisterInfo.td
+++ b/lib/Target/AMDGPU/R600RegisterInfo.td
@@ -245,7 +245,7 @@ def R600_Reg128Vertical : RegisterClass<"AMDGPU", [v4f32, v4i32], 128,
(add V0123_W, V0123_Z, V0123_Y, V0123_X)
>;
-def R600_Reg64 : RegisterClass<"AMDGPU", [v2f32, v2i32], 64,
+def R600_Reg64 : RegisterClass<"AMDGPU", [v2f32, v2i32, i64, f64], 64,
(add (sequence "T%u_XY", 0, 63))>;
def R600_Reg64Vertical : RegisterClass<"AMDGPU", [v2f32, v2i32], 64,
diff --git a/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp b/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp
index 150d8c3dc3d3..74f1bd8fb986 100644
--- a/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp
+++ b/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp
@@ -18,6 +18,7 @@
#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/DivergenceAnalysis.h"
#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/Constant.h"
@@ -37,7 +38,6 @@
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/Local.h"
#include <cassert>
#include <utility>
@@ -133,7 +133,7 @@ INITIALIZE_PASS_END(SIAnnotateControlFlow, DEBUG_TYPE,
char SIAnnotateControlFlow::ID = 0;
-/// \brief Initialize all the types and constants used in the pass
+/// Initialize all the types and constants used in the pass
bool SIAnnotateControlFlow::doInitialization(Module &M) {
LLVMContext &Context = M.getContext();
@@ -157,29 +157,29 @@ bool SIAnnotateControlFlow::doInitialization(Module &M) {
return false;
}
-/// \brief Is the branch condition uniform or did the StructurizeCFG pass
+/// Is the branch condition uniform or did the StructurizeCFG pass
/// consider it as such?
bool SIAnnotateControlFlow::isUniform(BranchInst *T) {
return DA->isUniform(T->getCondition()) ||
T->getMetadata("structurizecfg.uniform") != nullptr;
}
-/// \brief Is BB the last block saved on the stack ?
+/// Is BB the last block saved on the stack ?
bool SIAnnotateControlFlow::isTopOfStack(BasicBlock *BB) {
return !Stack.empty() && Stack.back().first == BB;
}
-/// \brief Pop the last saved value from the control flow stack
+/// Pop the last saved value from the control flow stack
Value *SIAnnotateControlFlow::popSaved() {
return Stack.pop_back_val().second;
}
-/// \brief Push a BB and saved value to the control flow stack
+/// Push a BB and saved value to the control flow stack
void SIAnnotateControlFlow::push(BasicBlock *BB, Value *Saved) {
Stack.push_back(std::make_pair(BB, Saved));
}
-/// \brief Can the condition represented by this PHI node treated like
+/// Can the condition represented by this PHI node be treated like
/// an "Else" block?
bool SIAnnotateControlFlow::isElse(PHINode *Phi) {
BasicBlock *IDom = DT->getNode(Phi->getParent())->getIDom()->getBlock();
@@ -198,14 +198,14 @@ bool SIAnnotateControlFlow::isElse(PHINode *Phi) {
return true;
}
-// \brief Erase "Phi" if it is not used any more
+// Erase "Phi" if it is not used any more
void SIAnnotateControlFlow::eraseIfUnused(PHINode *Phi) {
if (RecursivelyDeleteDeadPHINode(Phi)) {
- DEBUG(dbgs() << "Erased unused condition phi\n");
+ LLVM_DEBUG(dbgs() << "Erased unused condition phi\n");
}
}
-/// \brief Open a new "If" block
+/// Open a new "If" block
void SIAnnotateControlFlow::openIf(BranchInst *Term) {
if (isUniform(Term))
return;
@@ -215,7 +215,7 @@ void SIAnnotateControlFlow::openIf(BranchInst *Term) {
push(Term->getSuccessor(1), ExtractValueInst::Create(Ret, 1, "", Term));
}
-/// \brief Close the last "If" block and open a new "Else" block
+/// Close the last "If" block and open a new "Else" block
void SIAnnotateControlFlow::insertElse(BranchInst *Term) {
if (isUniform(Term)) {
return;
@@ -225,7 +225,7 @@ void SIAnnotateControlFlow::insertElse(BranchInst *Term) {
push(Term->getSuccessor(1), ExtractValueInst::Create(Ret, 1, "", Term));
}
-/// \brief Recursively handle the condition leading to a loop
+/// Recursively handle the condition leading to a loop
Value *SIAnnotateControlFlow::handleLoopCondition(
Value *Cond, PHINode *Broken, llvm::Loop *L, BranchInst *Term,
SmallVectorImpl<WeakTrackingVH> &LoopPhiConditions) {
@@ -322,7 +322,7 @@ Value *SIAnnotateControlFlow::handleLoopCondition(
llvm_unreachable("Unhandled loop condition!");
}
-/// \brief Handle a back edge (loop)
+/// Handle a back edge (loop)
void SIAnnotateControlFlow::handleLoop(BranchInst *Term) {
if (isUniform(Term))
return;
@@ -353,7 +353,7 @@ void SIAnnotateControlFlow::handleLoop(BranchInst *Term) {
push(Term->getSuccessor(0), Arg);
}
-/// \brief Close the last opened control flow
+/// Close the last opened control flow
void SIAnnotateControlFlow::closeControlFlow(BasicBlock *BB) {
llvm::Loop *L = LI->getLoopFor(BB);
@@ -381,7 +381,7 @@ void SIAnnotateControlFlow::closeControlFlow(BasicBlock *BB) {
CallInst::Create(EndCf, Exec, "", FirstInsertionPt);
}
-/// \brief Annotate the control flow with intrinsics so the backend can
+/// Annotate the control flow with intrinsics so the backend can
/// recognize if/then/else and loops.
bool SIAnnotateControlFlow::runOnFunction(Function &F) {
DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
@@ -422,11 +422,15 @@ bool SIAnnotateControlFlow::runOnFunction(Function &F) {
openIf(Term);
}
- assert(Stack.empty());
+ if (!Stack.empty()) {
+ // CFG was probably not structured.
+ report_fatal_error("failed to annotate CFG");
+ }
+
return true;
}
-/// \brief Create the annotation pass
+/// Create the annotation pass
FunctionPass *llvm::createSIAnnotateControlFlowPass() {
return new SIAnnotateControlFlow();
}
diff --git a/lib/Target/AMDGPU/SIDebuggerInsertNops.cpp b/lib/Target/AMDGPU/SIDebuggerInsertNops.cpp
index b5c439b21b89..7e884ad93a23 100644
--- a/lib/Target/AMDGPU/SIDebuggerInsertNops.cpp
+++ b/lib/Target/AMDGPU/SIDebuggerInsertNops.cpp
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
//
/// \file
-/// \brief Inserts one nop instruction for each high level source statement for
+/// Inserts one nop instruction for each high level source statement for
/// debugger usage.
///
/// Tools, such as a debugger, need to pause execution based on user input (i.e.
@@ -21,6 +21,7 @@
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
@@ -62,7 +63,7 @@ FunctionPass *llvm::createSIDebuggerInsertNopsPass() {
bool SIDebuggerInsertNops::runOnMachineFunction(MachineFunction &MF) {
// Skip this pass if "amdgpu-debugger-insert-nops" attribute was not
// specified.
- const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
if (!ST.debuggerInsertNops())
return false;
@@ -78,8 +79,8 @@ bool SIDebuggerInsertNops::runOnMachineFunction(MachineFunction &MF) {
for (auto &MBB : MF) {
for (auto MI = MBB.begin(); MI != MBB.end(); ++MI) {
- // Skip DBG_VALUE instructions and instructions without location.
- if (MI->isDebugValue() || !MI->getDebugLoc())
+ // Skip debug instructions and instructions without location.
+ if (MI->isDebugInstr() || !MI->getDebugLoc())
continue;
// Insert nop instruction if line number does not have nop inserted.
diff --git a/lib/Target/AMDGPU/SIDefines.h b/lib/Target/AMDGPU/SIDefines.h
index a9f6069e798a..a6d28d6999e5 100644
--- a/lib/Target/AMDGPU/SIDefines.h
+++ b/lib/Target/AMDGPU/SIDefines.h
@@ -85,7 +85,10 @@ enum : uint64_t {
ClampHi = UINT64_C(1) << 48,
// Is a packed VOP3P instruction.
- IsPacked = UINT64_C(1) << 49
+ IsPacked = UINT64_C(1) << 49,
+
+ // Is a D16 buffer instruction.
+ D16Buf = UINT64_C(1) << 50
};
// v_cmp_class_* etc. use a 10-bit mask for what operation is checked.
@@ -137,7 +140,6 @@ namespace AMDGPU {
OPERAND_INPUT_MODS,
// Operand for SDWA instructions
- OPERAND_SDWA_SRC,
OPERAND_SDWA_VOPC_DST,
/// Operand with 32-bit immediate that uses the constant bus.
@@ -146,6 +148,13 @@ namespace AMDGPU {
};
}
+namespace SIStackID {
+enum StackTypes : uint8_t {
+ SCRATCH = 0,
+ SGPR_SPILL = 1
+};
+}
+
// Input operand modifiers bit-masks
// NEG and SEXT share same bit-mask because they can't be set simultaneously.
namespace SISrcMods {
@@ -273,8 +282,9 @@ enum Id { // HwRegCode, (6) [5:0]
ID_GPR_ALLOC = 5,
ID_LDS_ALLOC = 6,
ID_IB_STS = 7,
- ID_SYMBOLIC_LAST_ = 8,
ID_MEM_BASES = 15,
+ ID_SYMBOLIC_FIRST_GFX9_ = ID_MEM_BASES,
+ ID_SYMBOLIC_LAST_ = 16,
ID_SHIFT_ = 0,
ID_WIDTH_ = 6,
ID_MASK_ = (((1 << ID_WIDTH_) - 1) << ID_SHIFT_)
@@ -375,6 +385,44 @@ enum SDWA9EncValues{
};
} // namespace SDWA
+
+namespace DPP {
+
+enum DppCtrl {
+ QUAD_PERM_FIRST = 0,
+ QUAD_PERM_LAST = 0xFF,
+ DPP_UNUSED1 = 0x100,
+ ROW_SHL0 = 0x100,
+ ROW_SHL_FIRST = 0x101,
+ ROW_SHL_LAST = 0x10F,
+ DPP_UNUSED2 = 0x110,
+ ROW_SHR0 = 0x110,
+ ROW_SHR_FIRST = 0x111,
+ ROW_SHR_LAST = 0x11F,
+ DPP_UNUSED3 = 0x120,
+ ROW_ROR0 = 0x120,
+ ROW_ROR_FIRST = 0x121,
+ ROW_ROR_LAST = 0x12F,
+ WAVE_SHL1 = 0x130,
+ DPP_UNUSED4_FIRST = 0x131,
+ DPP_UNUSED4_LAST = 0x133,
+ WAVE_ROL1 = 0x134,
+ DPP_UNUSED5_FIRST = 0x135,
+ DPP_UNUSED5_LAST = 0x137,
+ WAVE_SHR1 = 0x138,
+ DPP_UNUSED6_FIRST = 0x139,
+ DPP_UNUSED6_LAST = 0x13B,
+ WAVE_ROR1 = 0x13C,
+ DPP_UNUSED7_FIRST = 0x13D,
+ DPP_UNUSED7_LAST = 0x13F,
+ ROW_MIRROR = 0x140,
+ ROW_HALF_MIRROR = 0x141,
+ BCAST15 = 0x142,
+ BCAST31 = 0x143,
+ DPP_LAST = BCAST31
+};
+
+} // namespace DPP
} // namespace AMDGPU
#define R_00B028_SPI_SHADER_PGM_RSRC1_PS 0x00B028
diff --git a/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
index 8b155c2d2780..566e0d3febc7 100644
--- a/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
+++ b/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
@@ -69,6 +69,7 @@
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
#include "SIRegisterInfo.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallSet.h"
@@ -81,7 +82,6 @@
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/CodeGen/MachinePostDominators.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/Pass.h"
#include "llvm/Support/CodeGen.h"
@@ -110,12 +110,7 @@ namespace {
class SIFixSGPRCopies : public MachineFunctionPass {
MachineDominatorTree *MDT;
- MachinePostDominatorTree *MPDT;
- DenseMap<MachineBasicBlock *, SetVector<MachineBasicBlock*>> PDF;
- void computePDF(MachineFunction * MF);
-#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
- void printPDF();
-#endif
+
public:
static char ID;
@@ -128,8 +123,6 @@ public:
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<MachineDominatorTree>();
AU.addPreserved<MachineDominatorTree>();
- AU.addRequired<MachinePostDominatorTree>();
- AU.addPreserved<MachinePostDominatorTree>();
AU.setPreservesCFG();
MachineFunctionPass::getAnalysisUsage(AU);
}
@@ -417,6 +410,12 @@ bool searchPredecessors(const MachineBasicBlock *MBB,
return false;
}
+static bool predsHasDivergentTerminator(MachineBasicBlock *MBB,
+ const TargetRegisterInfo *TRI) {
+ return searchPredecessors(MBB, nullptr, [TRI](MachineBasicBlock *MBB) {
+ return hasTerminatorThatModifiesExec(*MBB, *TRI); });
+}
+
// Checks if there is potential path From instruction To instruction.
// If CutOff is specified and it sits in between of that path we ignore
// a higher portion of the path and report it is not reachable.
@@ -515,9 +514,9 @@ static bool hoistAndMergeSGPRInits(unsigned Reg,
if (MDT.dominates(MI1, MI2)) {
if (!intereferes(MI2, MI1)) {
- DEBUG(dbgs() << "Erasing from "
- << printMBBReference(*MI2->getParent()) << " "
- << *MI2);
+ LLVM_DEBUG(dbgs()
+ << "Erasing from "
+ << printMBBReference(*MI2->getParent()) << " " << *MI2);
MI2->eraseFromParent();
Defs.erase(I2++);
Changed = true;
@@ -525,9 +524,9 @@ static bool hoistAndMergeSGPRInits(unsigned Reg,
}
} else if (MDT.dominates(MI2, MI1)) {
if (!intereferes(MI1, MI2)) {
- DEBUG(dbgs() << "Erasing from "
- << printMBBReference(*MI1->getParent()) << " "
- << *MI1);
+ LLVM_DEBUG(dbgs()
+ << "Erasing from "
+ << printMBBReference(*MI1->getParent()) << " " << *MI1);
MI1->eraseFromParent();
Defs.erase(I1++);
Changed = true;
@@ -543,11 +542,12 @@ static bool hoistAndMergeSGPRInits(unsigned Reg,
MachineBasicBlock::iterator I = MBB->getFirstNonPHI();
if (!intereferes(MI1, I) && !intereferes(MI2, I)) {
- DEBUG(dbgs() << "Erasing from "
- << printMBBReference(*MI1->getParent()) << " " << *MI1
- << "and moving from "
- << printMBBReference(*MI2->getParent()) << " to "
- << printMBBReference(*I->getParent()) << " " << *MI2);
+ LLVM_DEBUG(dbgs()
+ << "Erasing from "
+ << printMBBReference(*MI1->getParent()) << " " << *MI1
+ << "and moving from "
+ << printMBBReference(*MI2->getParent()) << " to "
+ << printMBBReference(*I->getParent()) << " " << *MI2);
I->getParent()->splice(I, MI2->getParent(), MI2);
MI1->eraseFromParent();
Defs.erase(I1++);
@@ -567,47 +567,12 @@ static bool hoistAndMergeSGPRInits(unsigned Reg,
return Changed;
}
-void SIFixSGPRCopies::computePDF(MachineFunction *MF) {
- MachineFunction::iterator B = MF->begin();
- MachineFunction::iterator E = MF->end();
- for (; B != E; ++B) {
- if (B->succ_size() > 1) {
- for (auto S : B->successors()) {
- MachineDomTreeNode *runner = MPDT->getNode(&*S);
- MachineDomTreeNode *sentinel = MPDT->getNode(&*B)->getIDom();
- while (runner && runner != sentinel) {
- PDF[runner->getBlock()].insert(&*B);
- runner = runner->getIDom();
- }
- }
- }
- }
-}
-
-#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
-void SIFixSGPRCopies::printPDF() {
- dbgs() << "\n######## PostDominanceFrontiers set #########\n";
- for (auto &I : PDF) {
- dbgs() << "PDF[ " << I.first->getNumber() << "] : ";
- for (auto &J : I.second) {
- dbgs() << J->getNumber() << ' ';
- }
- dbgs() << '\n';
- }
- dbgs() << "\n##############################################\n";
-}
-#endif
-
bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {
- const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
MachineRegisterInfo &MRI = MF.getRegInfo();
const SIRegisterInfo *TRI = ST.getRegisterInfo();
const SIInstrInfo *TII = ST.getInstrInfo();
MDT = &getAnalysis<MachineDominatorTree>();
- MPDT = &getAnalysis<MachinePostDominatorTree>();
- PDF.clear();
- computePDF(&MF);
- DEBUG(printPDF());
SmallVector<MachineInstr *, 16> Worklist;
@@ -661,28 +626,17 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {
if (!TRI->isSGPRClass(MRI.getRegClass(Reg)))
break;
- // We don't need to fix the PHI if all the source blocks
- // have no divergent control dependecies
+ // We don't need to fix the PHI if the common dominator of the
+ // two incoming blocks terminates with a uniform branch.
bool HasVGPROperand = phiHasVGPROperands(MI, MRI, TRI, TII);
- if (!HasVGPROperand) {
- bool Uniform = true;
- MachineBasicBlock * Join = MI.getParent();
- for (auto &O : MI.explicit_operands()) {
- if (O.isMBB()) {
- MachineBasicBlock * Source = O.getMBB();
- SetVector<MachineBasicBlock*> &SourcePDF = PDF[Source];
- SetVector<MachineBasicBlock*> &JoinPDF = PDF[Join];
- SetVector<MachineBasicBlock*> CDList;
- for (auto &I : SourcePDF) {
- if (!JoinPDF.count(I) || /* back edge */MDT->dominates(Join, I)) {
- if (hasTerminatorThatModifiesExec(*I, *TRI))
- Uniform = false;
- }
- }
- }
- }
- if (Uniform) {
- DEBUG(dbgs() << "Not fixing PHI for uniform branch: " << MI << '\n');
+ if (MI.getNumExplicitOperands() == 5 && !HasVGPROperand) {
+ MachineBasicBlock *MBB0 = MI.getOperand(2).getMBB();
+ MachineBasicBlock *MBB1 = MI.getOperand(4).getMBB();
+
+ if (!predsHasDivergentTerminator(MBB0, TRI) &&
+ !predsHasDivergentTerminator(MBB1, TRI)) {
+ LLVM_DEBUG(dbgs()
+ << "Not fixing PHI for uniform branch: " << MI << '\n');
break;
}
}
@@ -722,7 +676,7 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {
SmallSet<unsigned, 8> Visited;
if (HasVGPROperand || !phiHasBreakDef(MI, MRI, Visited)) {
- DEBUG(dbgs() << "Fixing PHI: " << MI);
+ LLVM_DEBUG(dbgs() << "Fixing PHI: " << MI);
TII->moveToVALU(MI);
}
break;
@@ -734,7 +688,7 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {
continue;
}
- DEBUG(dbgs() << "Fixing REG_SEQUENCE: " << MI);
+ LLVM_DEBUG(dbgs() << "Fixing REG_SEQUENCE: " << MI);
TII->moveToVALU(MI);
break;
@@ -745,7 +699,7 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {
Src1RC = MRI.getRegClass(MI.getOperand(2).getReg());
if (TRI->isSGPRClass(DstRC) &&
(TRI->hasVGPRs(Src0RC) || TRI->hasVGPRs(Src1RC))) {
- DEBUG(dbgs() << " Fixing INSERT_SUBREG: " << MI);
+ LLVM_DEBUG(dbgs() << " Fixing INSERT_SUBREG: " << MI);
TII->moveToVALU(MI);
}
break;
diff --git a/lib/Target/AMDGPU/SIFixVGPRCopies.cpp b/lib/Target/AMDGPU/SIFixVGPRCopies.cpp
index 3d3121788b5e..15ba78edf919 100644
--- a/lib/Target/AMDGPU/SIFixVGPRCopies.cpp
+++ b/lib/Target/AMDGPU/SIFixVGPRCopies.cpp
@@ -8,13 +8,14 @@
//===----------------------------------------------------------------------===//
//
/// \file
-/// \brief Add implicit use of exec to vector register copies.
+/// Add implicit use of exec to vector register copies.
///
//===----------------------------------------------------------------------===//
#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
using namespace llvm;
@@ -46,7 +47,7 @@ char SIFixVGPRCopies::ID = 0;
char &llvm::SIFixVGPRCopiesID = SIFixVGPRCopies::ID;
bool SIFixVGPRCopies::runOnMachineFunction(MachineFunction &MF) {
- const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
const SIRegisterInfo *TRI = ST.getRegisterInfo();
const SIInstrInfo *TII = ST.getInstrInfo();
bool Changed = false;
@@ -58,7 +59,7 @@ bool SIFixVGPRCopies::runOnMachineFunction(MachineFunction &MF) {
if (TII->isVGPRCopy(MI) && !MI.readsRegister(AMDGPU::EXEC, TRI)) {
MI.addOperand(MF,
MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
- DEBUG(dbgs() << "Add exec use to " << MI);
+ LLVM_DEBUG(dbgs() << "Add exec use to " << MI);
Changed = true;
}
break;
diff --git a/lib/Target/AMDGPU/SIFixWWMLiveness.cpp b/lib/Target/AMDGPU/SIFixWWMLiveness.cpp
index 3493c7775f0c..5d613d8874fa 100644
--- a/lib/Target/AMDGPU/SIFixWWMLiveness.cpp
+++ b/lib/Target/AMDGPU/SIFixWWMLiveness.cpp
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
//
/// \file
-/// \brief Computations in WWM can overwrite values in inactive channels for
+/// Computations in WWM can overwrite values in inactive channels for
/// variables that the register allocator thinks are dead. This pass adds fake
/// uses of those variables to WWM instructions to make sure that they aren't
/// overwritten.
@@ -55,6 +55,7 @@
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
#include "SIRegisterInfo.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/DepthFirstIterator.h"
#include "llvm/ADT/SparseBitVector.h"
#include "llvm/CodeGen/LiveIntervals.h"
@@ -184,7 +185,7 @@ bool SIFixWWMLiveness::runOnMachineFunction(MachineFunction &MF) {
// This doesn't actually need LiveIntervals, but we can preserve them.
LIS = getAnalysisIfAvailable<LiveIntervals>();
- const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
const SIInstrInfo *TII = ST.getInstrInfo();
TRI = &TII->getRegisterInfo();
diff --git a/lib/Target/AMDGPU/SIFoldOperands.cpp b/lib/Target/AMDGPU/SIFoldOperands.cpp
index 783181980342..338cabcb906b 100644
--- a/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -13,6 +13,7 @@
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/DepthFirstIterator.h"
#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
@@ -75,7 +76,7 @@ public:
MachineRegisterInfo *MRI;
const SIInstrInfo *TII;
const SIRegisterInfo *TRI;
- const SISubtarget *ST;
+ const GCNSubtarget *ST;
void foldOperand(MachineOperand &OpToFold,
MachineInstr *UseMI,
@@ -127,14 +128,18 @@ static bool isInlineConstantIfFolded(const SIInstrInfo *TII,
unsigned Opc = UseMI.getOpcode();
switch (Opc) {
case AMDGPU::V_MAC_F32_e64:
- case AMDGPU::V_MAC_F16_e64: {
+ case AMDGPU::V_MAC_F16_e64:
+ case AMDGPU::V_FMAC_F32_e64: {
// Special case for mac. Since this is replaced with mad when folded into
// src2, we need to check the legality for the final instruction.
int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
if (static_cast<int>(OpNo) == Src2Idx) {
+ bool IsFMA = Opc == AMDGPU::V_FMAC_F32_e64;
bool IsF32 = Opc == AMDGPU::V_MAC_F32_e64;
- const MCInstrDesc &MadDesc
- = TII->get(IsF32 ? AMDGPU::V_MAD_F32 : AMDGPU::V_MAD_F16);
+
+ unsigned Opc = IsFMA ?
+ AMDGPU::V_FMA_F32 : (IsF32 ? AMDGPU::V_MAD_F32 : AMDGPU::V_MAD_F16);
+ const MCInstrDesc &MadDesc = TII->get(Opc);
return TII->isInlineConstant(OpToFold, MadDesc.OpInfo[OpNo].OperandType);
}
return false;
@@ -155,6 +160,35 @@ static bool updateOperand(FoldCandidate &Fold,
assert(Old.isReg());
if (Fold.isImm()) {
+ if (MI->getDesc().TSFlags & SIInstrFlags::IsPacked) {
+ // Set op_sel/op_sel_hi on this operand or bail out if op_sel is
+ // already set.
+ unsigned Opcode = MI->getOpcode();
+ int OpNo = MI->getOperandNo(&Old);
+ int ModIdx = -1;
+ if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0))
+ ModIdx = AMDGPU::OpName::src0_modifiers;
+ else if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1))
+ ModIdx = AMDGPU::OpName::src1_modifiers;
+ else if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2))
+ ModIdx = AMDGPU::OpName::src2_modifiers;
+ assert(ModIdx != -1);
+ ModIdx = AMDGPU::getNamedOperandIdx(Opcode, ModIdx);
+ MachineOperand &Mod = MI->getOperand(ModIdx);
+ unsigned Val = Mod.getImm();
+ if ((Val & SISrcMods::OP_SEL_0) || !(Val & SISrcMods::OP_SEL_1))
+ return false;
+        // If the upper part is all zero we do not need op_sel_hi.
+ if (!isUInt<16>(Fold.ImmToFold)) {
+ if (!(Fold.ImmToFold & 0xffff)) {
+ Mod.setImm(Mod.getImm() | SISrcMods::OP_SEL_0);
+ Mod.setImm(Mod.getImm() & ~SISrcMods::OP_SEL_1);
+ Old.ChangeToImmediate((Fold.ImmToFold >> 16) & 0xffff);
+ return true;
+ }
+ Mod.setImm(Mod.getImm() & ~SISrcMods::OP_SEL_1);
+ }
+ }
Old.ChangeToImmediate(Fold.ImmToFold);
return true;
}
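
The new packed-operand path above decides which 16-bit half of an immediate can be folded into a VOP3P source and how op_sel/op_sel_hi must be adjusted. Below is a minimal standalone sketch of that decision using plain integers in place of MachineOperands; the OP_SEL bit positions are assumed to mirror SISrcMods in SIDefines.h and the helper name is invented for the illustration, so treat it as a model of the hunk, not part of the patch.

#include <cstdint>
#include <cstdio>

// Bit values assumed to mirror SISrcMods in SIDefines.h.
constexpr unsigned OP_SEL_0 = 1u << 2;
constexpr unsigned OP_SEL_1 = 1u << 3;

// Decide how a 32-bit immediate would be folded into a packed (VOP3P) source
// whose modifier bits are Mods. Returns false when op_sel is already set or
// op_sel_hi is already cleared, mirroring the bail-out above.
bool foldPackedImm(uint32_t Imm, unsigned Mods, unsigned &NewMods,
                   uint32_t &Literal) {
  if ((Mods & OP_SEL_0) || !(Mods & OP_SEL_1))
    return false;
  NewMods = Mods;
  Literal = Imm;
  if (Imm > 0xffffu) {            // the upper 16 bits are used
    if ((Imm & 0xffffu) == 0) {   // ...and the lower 16 bits are zero:
      NewMods |= OP_SEL_0;        // select the high half via op_sel
      NewMods &= ~OP_SEL_1;
      Literal = (Imm >> 16) & 0xffffu;
      return true;
    }
    NewMods &= ~OP_SEL_1;         // both halves carry data: drop op_sel_hi
  }
  return true;
}

int main() {
  unsigned NewMods;
  uint32_t Lit;
  // 0x3c000000 is 1.0h in the upper half and zero in the lower half.
  if (foldPackedImm(0x3c000000u, OP_SEL_1, NewMods, Lit))
    std::printf("literal 0x%x, modifiers 0x%x\n", Lit, NewMods);
  return 0;
}

For that sample value the sketch selects the high half by setting op_sel and clearing op_sel_hi, which is exactly the case the hunk handles with ChangeToImmediate((Fold.ImmToFold >> 16) & 0xffff).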
@@ -195,13 +229,17 @@ static bool tryAddToFoldList(SmallVectorImpl<FoldCandidate> &FoldList,
// Special case for v_mac_{f16, f32}_e64 if we are trying to fold into src2
unsigned Opc = MI->getOpcode();
- if ((Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64) &&
+ if ((Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
+ Opc == AMDGPU::V_FMAC_F32_e64) &&
(int)OpNo == AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)) {
+ bool IsFMA = Opc == AMDGPU::V_FMAC_F32_e64;
bool IsF32 = Opc == AMDGPU::V_MAC_F32_e64;
+ unsigned NewOpc = IsFMA ?
+ AMDGPU::V_FMA_F32 : (IsF32 ? AMDGPU::V_MAD_F32 : AMDGPU::V_MAD_F16);
// Check if changing this to a v_mad_{f16, f32} instruction will allow us
// to fold the operand.
- MI->setDesc(TII->get(IsF32 ? AMDGPU::V_MAD_F32 : AMDGPU::V_MAD_F16));
+ MI->setDesc(TII->get(NewOpc));
bool FoldAsMAD = tryAddToFoldList(FoldList, MI, OpNo, OpToFold, TII);
if (FoldAsMAD) {
MI->untieRegOperand(OpNo);
@@ -345,6 +383,7 @@ void SIFoldOperands::foldOperand(
// Don't fold into target independent nodes. Target independent opcodes
// don't have defined register classes.
if (UseDesc.isVariadic() ||
+ UseOp.isImplicit() ||
UseDesc.OpInfo[UseOpIdx].RegClass == -1)
return;
}
@@ -470,7 +509,8 @@ static MachineOperand *getImmOrMaterializedImm(MachineRegisterInfo &MRI,
MachineOperand &Op) {
if (Op.isReg()) {
// If this has a subregister, it obviously is a register source.
- if (Op.getSubReg() != AMDGPU::NoSubRegister)
+ if (Op.getSubReg() != AMDGPU::NoSubRegister ||
+ !TargetRegisterInfo::isVirtualRegister(Op.getReg()))
return &Op;
MachineInstr *Def = MRI.getVRegDef(Op.getReg());
@@ -598,14 +638,14 @@ static bool tryFoldInst(const SIInstrInfo *TII,
const MachineOperand *Src0 = TII->getNamedOperand(*MI, AMDGPU::OpName::src0);
const MachineOperand *Src1 = TII->getNamedOperand(*MI, AMDGPU::OpName::src1);
if (Src1->isIdenticalTo(*Src0)) {
- DEBUG(dbgs() << "Folded " << *MI << " into ");
+ LLVM_DEBUG(dbgs() << "Folded " << *MI << " into ");
int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
if (Src2Idx != -1)
MI->RemoveOperand(Src2Idx);
MI->RemoveOperand(AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1));
mutateCopyOp(*MI, TII->get(Src0->isReg() ? (unsigned)AMDGPU::COPY
: getMovOpc(false)));
- DEBUG(dbgs() << *MI << '\n');
+ LLVM_DEBUG(dbgs() << *MI << '\n');
return true;
}
}
@@ -646,7 +686,7 @@ void SIFoldOperands::foldInstOperand(MachineInstr &MI,
// be folded due to multiple uses or operand constraints.
if (OpToFold.isImm() && tryConstantFoldOp(*MRI, TII, UseMI, &OpToFold)) {
- DEBUG(dbgs() << "Constant folded " << *UseMI <<'\n');
+ LLVM_DEBUG(dbgs() << "Constant folded " << *UseMI << '\n');
// Some constant folding cases change the same immediate's use to a new
// instruction, e.g. and x, 0 -> 0. Make sure we re-visit the user
@@ -713,8 +753,9 @@ void SIFoldOperands::foldInstOperand(MachineInstr &MI,
// copies.
MRI->clearKillFlags(Fold.OpToFold->getReg());
}
- DEBUG(dbgs() << "Folded source from " << MI << " into OpNo " <<
- static_cast<int>(Fold.UseOpNo) << " of " << *Fold.UseMI << '\n');
+ LLVM_DEBUG(dbgs() << "Folded source from " << MI << " into OpNo "
+ << static_cast<int>(Fold.UseOpNo) << " of "
+ << *Fold.UseMI << '\n');
tryFoldInst(TII, Fold.UseMI);
} else if (Fold.isCommuted()) {
// Restoring instruction's original operand order if fold has failed.
@@ -794,7 +835,8 @@ bool SIFoldOperands::tryFoldClamp(MachineInstr &MI) {
if (!DefClamp)
return false;
- DEBUG(dbgs() << "Folding clamp " << *DefClamp << " into " << *Def << '\n');
+ LLVM_DEBUG(dbgs() << "Folding clamp " << *DefClamp << " into " << *Def
+ << '\n');
// Clamp is applied after omod, so it is OK if omod is set.
DefClamp->setImm(1);
@@ -917,7 +959,7 @@ bool SIFoldOperands::tryFoldOMod(MachineInstr &MI) {
if (TII->hasModifiersSet(*Def, AMDGPU::OpName::clamp))
return false;
- DEBUG(dbgs() << "Folding omod " << MI << " into " << *Def << '\n');
+ LLVM_DEBUG(dbgs() << "Folding omod " << MI << " into " << *Def << '\n');
DefOMod->setImm(OMod);
MRI->replaceRegWith(MI.getOperand(0).getReg(), Def->getOperand(0).getReg());
@@ -930,7 +972,7 @@ bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) {
return false;
MRI = &MF.getRegInfo();
- ST = &MF.getSubtarget<SISubtarget>();
+ ST = &MF.getSubtarget<GCNSubtarget>();
TII = ST->getInstrInfo();
TRI = &TII->getRegisterInfo();
diff --git a/lib/Target/AMDGPU/SIFormMemoryClauses.cpp b/lib/Target/AMDGPU/SIFormMemoryClauses.cpp
new file mode 100644
index 000000000000..cd14239de822
--- /dev/null
+++ b/lib/Target/AMDGPU/SIFormMemoryClauses.cpp
@@ -0,0 +1,398 @@
+//===-- SIFormMemoryClauses.cpp -------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// This pass creates bundles of SMEM and VMEM instructions forming memory
+/// clauses if XNACK is enabled. Def operands of clauses are marked as early
+/// clobber to make sure we will not overwrite any source within a clause.
+///
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "AMDGPUSubtarget.h"
+#include "GCNRegPressure.h"
+#include "SIInstrInfo.h"
+#include "SIMachineFunctionInfo.h"
+#include "SIRegisterInfo.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/CodeGen/LiveIntervals.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "si-form-memory-clauses"
+
+// Clauses longer than 15 instructions would overflow one of the counters
+// and stall. They can stall even earlier if there are outstanding counters.
+static cl::opt<unsigned>
+MaxClause("amdgpu-max-memory-clause", cl::Hidden, cl::init(15),
+ cl::desc("Maximum length of a memory clause, instructions"));
+
+namespace {
+
+class SIFormMemoryClauses : public MachineFunctionPass {
+ typedef DenseMap<unsigned, std::pair<unsigned, LaneBitmask>> RegUse;
+
+public:
+ static char ID;
+
+public:
+ SIFormMemoryClauses() : MachineFunctionPass(ID) {
+ initializeSIFormMemoryClausesPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ StringRef getPassName() const override {
+ return "SI Form memory clauses";
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<LiveIntervals>();
+ AU.setPreservesAll();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+private:
+ template <typename Callable>
+ void forAllLanes(unsigned Reg, LaneBitmask LaneMask, Callable Func) const;
+
+ bool canBundle(const MachineInstr &MI, RegUse &Defs, RegUse &Uses) const;
+ bool checkPressure(const MachineInstr &MI, GCNDownwardRPTracker &RPT);
+ void collectRegUses(const MachineInstr &MI, RegUse &Defs, RegUse &Uses) const;
+ bool processRegUses(const MachineInstr &MI, RegUse &Defs, RegUse &Uses,
+ GCNDownwardRPTracker &RPT);
+
+ const GCNSubtarget *ST;
+ const SIRegisterInfo *TRI;
+ const MachineRegisterInfo *MRI;
+ SIMachineFunctionInfo *MFI;
+
+ unsigned LastRecordedOccupancy;
+ unsigned MaxVGPRs;
+ unsigned MaxSGPRs;
+};
+
+} // End anonymous namespace.
+
+INITIALIZE_PASS_BEGIN(SIFormMemoryClauses, DEBUG_TYPE,
+ "SI Form memory clauses", false, false)
+INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
+INITIALIZE_PASS_END(SIFormMemoryClauses, DEBUG_TYPE,
+ "SI Form memory clauses", false, false)
+
+
+char SIFormMemoryClauses::ID = 0;
+
+char &llvm::SIFormMemoryClausesID = SIFormMemoryClauses::ID;
+
+FunctionPass *llvm::createSIFormMemoryClausesPass() {
+ return new SIFormMemoryClauses();
+}
+
+static bool isVMEMClauseInst(const MachineInstr &MI) {
+ return SIInstrInfo::isFLAT(MI) || SIInstrInfo::isVMEM(MI);
+}
+
+static bool isSMEMClauseInst(const MachineInstr &MI) {
+ return SIInstrInfo::isSMRD(MI);
+}
+
+// There is no sense in creating store clauses: they do not define anything,
+// thus there is nothing to mark early-clobber.
+static bool isValidClauseInst(const MachineInstr &MI, bool IsVMEMClause) {
+ if (MI.isDebugValue() || MI.isBundled())
+ return false;
+ if (!MI.mayLoad() || MI.mayStore())
+ return false;
+ if (AMDGPU::getAtomicNoRetOp(MI.getOpcode()) != -1 ||
+ AMDGPU::getAtomicRetOp(MI.getOpcode()) != -1)
+ return false;
+ if (IsVMEMClause && !isVMEMClauseInst(MI))
+ return false;
+ if (!IsVMEMClause && !isSMEMClauseInst(MI))
+ return false;
+ return true;
+}
+
+static unsigned getMopState(const MachineOperand &MO) {
+ unsigned S = 0;
+ if (MO.isImplicit())
+ S |= RegState::Implicit;
+ if (MO.isDead())
+ S |= RegState::Dead;
+ if (MO.isUndef())
+ S |= RegState::Undef;
+ if (MO.isKill())
+ S |= RegState::Kill;
+ if (MO.isEarlyClobber())
+ S |= RegState::EarlyClobber;
+ if (TargetRegisterInfo::isPhysicalRegister(MO.getReg()) && MO.isRenamable())
+ S |= RegState::Renamable;
+ return S;
+}
+
+template <typename Callable>
+void SIFormMemoryClauses::forAllLanes(unsigned Reg, LaneBitmask LaneMask,
+ Callable Func) const {
+ if (LaneMask.all() || TargetRegisterInfo::isPhysicalRegister(Reg) ||
+ LaneMask == MRI->getMaxLaneMaskForVReg(Reg)) {
+ Func(0);
+ return;
+ }
+
+ const TargetRegisterClass *RC = MRI->getRegClass(Reg);
+ unsigned E = TRI->getNumSubRegIndices();
+ SmallVector<unsigned, AMDGPU::NUM_TARGET_SUBREGS> CoveringSubregs;
+ for (unsigned Idx = 1; Idx < E; ++Idx) {
+ // Is this index even compatible with the given class?
+ if (TRI->getSubClassWithSubReg(RC, Idx) != RC)
+ continue;
+ LaneBitmask SubRegMask = TRI->getSubRegIndexLaneMask(Idx);
+ // Early exit if we found a perfect match.
+ if (SubRegMask == LaneMask) {
+ Func(Idx);
+ return;
+ }
+
+ if ((SubRegMask & ~LaneMask).any() || (SubRegMask & LaneMask).none())
+ continue;
+
+ CoveringSubregs.push_back(Idx);
+ }
+
+ llvm::sort(CoveringSubregs.begin(), CoveringSubregs.end(),
+ [this](unsigned A, unsigned B) {
+ LaneBitmask MaskA = TRI->getSubRegIndexLaneMask(A);
+ LaneBitmask MaskB = TRI->getSubRegIndexLaneMask(B);
+ unsigned NA = MaskA.getNumLanes();
+ unsigned NB = MaskB.getNumLanes();
+ if (NA != NB)
+ return NA > NB;
+ return MaskA.getHighestLane() > MaskB.getHighestLane();
+ });
+
+ for (unsigned Idx : CoveringSubregs) {
+ LaneBitmask SubRegMask = TRI->getSubRegIndexLaneMask(Idx);
+ if ((SubRegMask & ~LaneMask).any() || (SubRegMask & LaneMask).none())
+ continue;
+
+ Func(Idx);
+ LaneMask &= ~SubRegMask;
+ if (LaneMask.none())
+ return;
+ }
+
+ llvm_unreachable("Failed to find all subregs to cover lane mask");
+}
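
forAllLanes() above covers an arbitrary lane mask with sub-register indices, preferring the largest covering pieces first. The following rough standalone model shows the same greedy cover over plain 64-bit masks; the secondary sort key on the highest lane is omitted and the candidate masks are invented for the example.

#include <algorithm>
#include <bitset>
#include <cstdint>
#include <cstdio>
#include <vector>

// Greedy cover of TargetMask by candidate sub-register lane masks: an exact
// match wins immediately, otherwise the largest candidates that fit inside
// the remaining mask are taken until nothing is left uncovered.
std::vector<unsigned> coverLaneMask(uint64_t TargetMask,
                                    const std::vector<uint64_t> &SubMasks) {
  std::vector<unsigned> Candidates, Picked;
  for (unsigned I = 0; I < SubMasks.size(); ++I) {
    uint64_t M = SubMasks[I];
    if (M == TargetMask)
      return {I};                          // perfect match, done
    if ((M & ~TargetMask) || !(M & TargetMask))
      continue;                            // spills outside or is disjoint
    Candidates.push_back(I);
  }
  std::sort(Candidates.begin(), Candidates.end(), [&](unsigned A, unsigned B) {
    return std::bitset<64>(SubMasks[A]).count() >
           std::bitset<64>(SubMasks[B]).count();
  });
  for (unsigned I : Candidates) {
    uint64_t M = SubMasks[I];
    if ((M & ~TargetMask) || !(M & TargetMask))
      continue;                            // no longer useful after earlier picks
    Picked.push_back(I);
    TargetMask &= ~M;
    if (!TargetMask)
      break;
  }
  return Picked;
}

int main() {
  // Two 2-lane "halves" followed by four 1-lane "quarters" of a 4-lane register.
  std::vector<uint64_t> Subs = {0b0011, 0b1100, 0b0001, 0b0010, 0b0100, 0b1000};
  for (unsigned I : coverLaneMask(0b0111, Subs)) // expect indices 0 and 4
    std::printf("picked sub-mask index %u\n", I);
  return 0;
}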
+
+// Returns false if there is a use of a def already in the map.
+// In this case we must break the clause.
+bool SIFormMemoryClauses::canBundle(const MachineInstr &MI,
+ RegUse &Defs, RegUse &Uses) const {
+ // Check interference with defs.
+ for (const MachineOperand &MO : MI.operands()) {
+ // TODO: Prologue/Epilogue Insertion pass does not process bundled
+ // instructions.
+ if (MO.isFI())
+ return false;
+
+ if (!MO.isReg())
+ continue;
+
+ unsigned Reg = MO.getReg();
+
+    // If it is tied we will need to write the same register as we read.
+ if (MO.isTied())
+ return false;
+
+ RegUse &Map = MO.isDef() ? Uses : Defs;
+ auto Conflict = Map.find(Reg);
+ if (Conflict == Map.end())
+ continue;
+
+ if (TargetRegisterInfo::isPhysicalRegister(Reg))
+ return false;
+
+ LaneBitmask Mask = TRI->getSubRegIndexLaneMask(MO.getSubReg());
+ if ((Conflict->second.second & Mask).any())
+ return false;
+ }
+
+ return true;
+}
+
+// Since all defs in the clause are early-clobber we can run out of registers.
+// Returns false if register pressure would hit the limit if the instruction
+// were bundled into a memory clause.
+bool SIFormMemoryClauses::checkPressure(const MachineInstr &MI,
+ GCNDownwardRPTracker &RPT) {
+ // NB: skip advanceBeforeNext() call. Since all defs will be marked
+ // early-clobber they will all stay alive at least to the end of the
+  // clause. Therefore we should not decrease pressure even if a load
+  // pointer becomes dead and could otherwise be reused for the destination.
+ RPT.advanceToNext();
+ GCNRegPressure MaxPressure = RPT.moveMaxPressure();
+ unsigned Occupancy = MaxPressure.getOccupancy(*ST);
+ if (Occupancy >= MFI->getMinAllowedOccupancy() &&
+ MaxPressure.getVGPRNum() <= MaxVGPRs &&
+ MaxPressure.getSGPRNum() <= MaxSGPRs) {
+ LastRecordedOccupancy = Occupancy;
+ return true;
+ }
+ return false;
+}
+
+// Collect register defs and uses along with their lane masks and states.
+void SIFormMemoryClauses::collectRegUses(const MachineInstr &MI,
+ RegUse &Defs, RegUse &Uses) const {
+ for (const MachineOperand &MO : MI.operands()) {
+ if (!MO.isReg())
+ continue;
+ unsigned Reg = MO.getReg();
+ if (!Reg)
+ continue;
+
+ LaneBitmask Mask = TargetRegisterInfo::isVirtualRegister(Reg) ?
+ TRI->getSubRegIndexLaneMask(MO.getSubReg()) :
+ LaneBitmask::getAll();
+ RegUse &Map = MO.isDef() ? Defs : Uses;
+
+ auto Loc = Map.find(Reg);
+ unsigned State = getMopState(MO);
+ if (Loc == Map.end()) {
+ Map[Reg] = std::make_pair(State, Mask);
+ } else {
+ Loc->second.first |= State;
+ Loc->second.second |= Mask;
+ }
+ }
+}
+
+// Check register def/use conflicts, occupancy limits and collect def/use maps.
+// Return true if the instruction can be bundled with the previous one. If it
+// cannot, the def/use maps are not updated.
+bool SIFormMemoryClauses::processRegUses(const MachineInstr &MI,
+ RegUse &Defs, RegUse &Uses,
+ GCNDownwardRPTracker &RPT) {
+ if (!canBundle(MI, Defs, Uses))
+ return false;
+
+ if (!checkPressure(MI, RPT))
+ return false;
+
+ collectRegUses(MI, Defs, Uses);
+ return true;
+}
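
canBundle() and collectRegUses() above enforce the invariant that, inside a clause, nothing reads a register that an earlier clause member defines (and vice versa), because every def becomes early-clobber once the bundle is built. A toy model of that bookkeeping with plain maps follows; the operand triples and virtual register numbers are invented for the example.

#include <cstdint>
#include <cstdio>
#include <map>
#include <vector>

struct Operand {
  unsigned Reg;
  uint64_t Mask;
  bool IsDef;
};
using RegUse = std::map<unsigned, uint64_t>;

// An instruction may join the clause only if none of its defs overlap a
// previously recorded use and none of its uses overlap a previously
// recorded def.
bool canBundle(const std::vector<Operand> &MI, const RegUse &Defs,
               const RegUse &Uses) {
  for (const Operand &MO : MI) {
    const RegUse &Map = MO.IsDef ? Uses : Defs;
    auto It = Map.find(MO.Reg);
    if (It != Map.end() && (It->second & MO.Mask))
      return false;
  }
  return true;
}

// Record the defs and uses of an accepted instruction.
void collect(const std::vector<Operand> &MI, RegUse &Defs, RegUse &Uses) {
  for (const Operand &MO : MI)
    (MO.IsDef ? Defs : Uses)[MO.Reg] |= MO.Mask;
}

int main() {
  RegUse Defs, Uses;
  std::vector<Operand> Load1 = {{1, ~0ull, true}, {2, ~0ull, false}}; // %1 = load %2
  std::vector<Operand> Load2 = {{3, ~0ull, true}, {1, ~0ull, false}}; // %3 = load %1
  collect(Load1, Defs, Uses);
  // Load2 reads %1, which Load1 defines inside the clause, so it must start
  // a new clause instead of being bundled.
  std::printf("can bundle second load: %s\n",
              canBundle(Load2, Defs, Uses) ? "yes" : "no");
  return 0;
}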
+
+bool SIFormMemoryClauses::runOnMachineFunction(MachineFunction &MF) {
+ if (skipFunction(MF.getFunction()))
+ return false;
+
+ ST = &MF.getSubtarget<GCNSubtarget>();
+ if (!ST->isXNACKEnabled())
+ return false;
+
+ const SIInstrInfo *TII = ST->getInstrInfo();
+ TRI = ST->getRegisterInfo();
+ MRI = &MF.getRegInfo();
+ MFI = MF.getInfo<SIMachineFunctionInfo>();
+ LiveIntervals *LIS = &getAnalysis<LiveIntervals>();
+ SlotIndexes *Ind = LIS->getSlotIndexes();
+ bool Changed = false;
+
+ MaxVGPRs = TRI->getAllocatableSet(MF, &AMDGPU::VGPR_32RegClass).count();
+ MaxSGPRs = TRI->getAllocatableSet(MF, &AMDGPU::SGPR_32RegClass).count();
+
+ for (MachineBasicBlock &MBB : MF) {
+ MachineBasicBlock::instr_iterator Next;
+ for (auto I = MBB.instr_begin(), E = MBB.instr_end(); I != E; I = Next) {
+ MachineInstr &MI = *I;
+ Next = std::next(I);
+
+ bool IsVMEM = isVMEMClauseInst(MI);
+
+ if (!isValidClauseInst(MI, IsVMEM))
+ continue;
+
+ RegUse Defs, Uses;
+ GCNDownwardRPTracker RPT(*LIS);
+ RPT.reset(MI);
+
+ if (!processRegUses(MI, Defs, Uses, RPT))
+ continue;
+
+ unsigned Length = 1;
+ for ( ; Next != E && Length < MaxClause; ++Next) {
+ if (!isValidClauseInst(*Next, IsVMEM))
+ break;
+
+      // A load from a pointer which was loaded inside the same bundle cannot
+      // form a clause because we would need to write and read the same
+      // register inside it. In this case processRegUses will return false.
+ if (!processRegUses(*Next, Defs, Uses, RPT))
+ break;
+
+ ++Length;
+ }
+ if (Length < 2)
+ continue;
+
+ Changed = true;
+ MFI->limitOccupancy(LastRecordedOccupancy);
+
+ auto B = BuildMI(MBB, I, DebugLoc(), TII->get(TargetOpcode::BUNDLE));
+ Ind->insertMachineInstrInMaps(*B);
+
+ for (auto BI = I; BI != Next; ++BI) {
+ BI->bundleWithPred();
+ Ind->removeSingleMachineInstrFromMaps(*BI);
+
+ for (MachineOperand &MO : BI->defs())
+ if (MO.readsReg())
+ MO.setIsInternalRead(true);
+ }
+
+ for (auto &&R : Defs) {
+ forAllLanes(R.first, R.second.second, [&R, &B](unsigned SubReg) {
+ unsigned S = R.second.first | RegState::EarlyClobber;
+ if (!SubReg)
+ S &= ~(RegState::Undef | RegState::Dead);
+ B.addDef(R.first, S, SubReg);
+ });
+ }
+
+ for (auto &&R : Uses) {
+ forAllLanes(R.first, R.second.second, [&R, &B](unsigned SubReg) {
+ B.addUse(R.first, R.second.first & ~RegState::Kill, SubReg);
+ });
+ }
+
+ for (auto &&R : Defs) {
+ unsigned Reg = R.first;
+ Uses.erase(Reg);
+ if (TargetRegisterInfo::isPhysicalRegister(Reg))
+ continue;
+ LIS->removeInterval(Reg);
+ LIS->createAndComputeVirtRegInterval(Reg);
+ }
+
+ for (auto &&R : Uses) {
+ unsigned Reg = R.first;
+ if (TargetRegisterInfo::isPhysicalRegister(Reg))
+ continue;
+ LIS->removeInterval(Reg);
+ LIS->createAndComputeVirtRegInterval(Reg);
+ }
+ }
+ }
+
+ return Changed;
+}
diff --git a/lib/Target/AMDGPU/SIFrameLowering.cpp b/lib/Target/AMDGPU/SIFrameLowering.cpp
index 89bb98dbd028..ac0ef90f25a4 100644
--- a/lib/Target/AMDGPU/SIFrameLowering.cpp
+++ b/lib/Target/AMDGPU/SIFrameLowering.cpp
@@ -12,7 +12,9 @@
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "llvm/CodeGen/LivePhysRegs.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
@@ -21,19 +23,19 @@
using namespace llvm;
-static ArrayRef<MCPhysReg> getAllSGPR128(const SISubtarget &ST,
+static ArrayRef<MCPhysReg> getAllSGPR128(const GCNSubtarget &ST,
const MachineFunction &MF) {
return makeArrayRef(AMDGPU::SGPR_128RegClass.begin(),
ST.getMaxNumSGPRs(MF) / 4);
}
-static ArrayRef<MCPhysReg> getAllSGPRs(const SISubtarget &ST,
+static ArrayRef<MCPhysReg> getAllSGPRs(const GCNSubtarget &ST,
const MachineFunction &MF) {
return makeArrayRef(AMDGPU::SGPR_32RegClass.begin(),
ST.getMaxNumSGPRs(MF));
}
-void SIFrameLowering::emitFlatScratchInit(const SISubtarget &ST,
+void SIFrameLowering::emitFlatScratchInit(const GCNSubtarget &ST,
MachineFunction &MF,
MachineBasicBlock &MBB) const {
const SIInstrInfo *TII = ST.getInstrInfo();
@@ -96,7 +98,7 @@ void SIFrameLowering::emitFlatScratchInit(const SISubtarget &ST,
}
unsigned SIFrameLowering::getReservedPrivateSegmentBufferReg(
- const SISubtarget &ST,
+ const GCNSubtarget &ST,
const SIInstrInfo *TII,
const SIRegisterInfo *TRI,
SIMachineFunctionInfo *MFI,
@@ -147,7 +149,7 @@ unsigned SIFrameLowering::getReservedPrivateSegmentBufferReg(
// SGPRs.
std::pair<unsigned, unsigned>
SIFrameLowering::getReservedPrivateSegmentWaveByteOffsetReg(
- const SISubtarget &ST,
+ const GCNSubtarget &ST,
const SIInstrInfo *TII,
const SIRegisterInfo *TRI,
SIMachineFunctionInfo *MFI,
@@ -218,7 +220,7 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
MachineBasicBlock &MBB) const {
// Emit debugger prologue if "amdgpu-debugger-emit-prologue" attribute was
// specified.
- const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
if (ST.debuggerEmitPrologue())
emitDebuggerPrologue(MF, MBB);
@@ -235,6 +237,7 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
const SIInstrInfo *TII = ST.getInstrInfo();
const SIRegisterInfo *TRI = &TII->getRegisterInfo();
MachineRegisterInfo &MRI = MF.getRegInfo();
+ const Function &F = MF.getFunction();
// We need to do the replacement of the private segment buffer and wave offset
// register even if there are no stack objects. There could be stores to undef
@@ -286,7 +289,7 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
unsigned PreloadedPrivateBufferReg = AMDGPU::NoRegister;
- if (ST.isAmdCodeObjectV2(MF)) {
+ if (ST.isAmdCodeObjectV2(F)) {
PreloadedPrivateBufferReg = MFI->getPreloadedReg(
AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER);
}
@@ -305,7 +308,7 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
}
if (ResourceRegUsed && PreloadedPrivateBufferReg != AMDGPU::NoRegister) {
- assert(ST.isAmdCodeObjectV2(MF) || ST.isMesaGfxShader(MF));
+ assert(ST.isAmdCodeObjectV2(F) || ST.isMesaGfxShader(F));
MRI.addLiveIn(PreloadedPrivateBufferReg);
MBB.addLiveIn(PreloadedPrivateBufferReg);
}
@@ -330,7 +333,7 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
bool CopyBuffer = ResourceRegUsed &&
PreloadedPrivateBufferReg != AMDGPU::NoRegister &&
- ST.isAmdCodeObjectV2(MF) &&
+ ST.isAmdCodeObjectV2(F) &&
ScratchRsrcReg != PreloadedPrivateBufferReg;
// This needs to be careful of the copying order to avoid overwriting one of
@@ -361,13 +364,14 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
}
// Emit scratch setup code for AMDPAL or Mesa, assuming ResourceRegUsed is set.
-void SIFrameLowering::emitEntryFunctionScratchSetup(const SISubtarget &ST,
+void SIFrameLowering::emitEntryFunctionScratchSetup(const GCNSubtarget &ST,
MachineFunction &MF, MachineBasicBlock &MBB, SIMachineFunctionInfo *MFI,
MachineBasicBlock::iterator I, unsigned PreloadedPrivateBufferReg,
unsigned ScratchRsrcReg) const {
const SIInstrInfo *TII = ST.getInstrInfo();
const SIRegisterInfo *TRI = &TII->getRegisterInfo();
+ const Function &Fn = MF.getFunction();
DebugLoc DL;
if (ST.isAmdPalOS()) {
@@ -387,12 +391,27 @@ void SIFrameLowering::emitEntryFunctionScratchSetup(const SISubtarget &ST,
const MCInstrDesc &GetPC64 = TII->get(AMDGPU::S_GETPC_B64);
BuildMI(MBB, I, DL, GetPC64, Rsrc01);
}
+ auto GitPtrLo = AMDGPU::SGPR0; // Low GIT address passed in
+ if (ST.hasMergedShaders()) {
+ switch (MF.getFunction().getCallingConv()) {
+ case CallingConv::AMDGPU_HS:
+ case CallingConv::AMDGPU_GS:
+ // Low GIT address is passed in s8 rather than s0 for an LS+HS or
+ // ES+GS merged shader on gfx9+.
+ GitPtrLo = AMDGPU::SGPR8;
+ break;
+ default:
+ break;
+ }
+ }
+ MF.getRegInfo().addLiveIn(GitPtrLo);
+ MF.front().addLiveIn(GitPtrLo);
BuildMI(MBB, I, DL, SMovB32, RsrcLo)
- .addReg(AMDGPU::SGPR0) // Low address passed in
+ .addReg(GitPtrLo)
.addReg(ScratchRsrcReg, RegState::ImplicitDefine);
// We now have the GIT ptr - now get the scratch descriptor from the entry
- // at offset 0.
+ // at offset 0 (or offset 16 for a compute shader).
PointerType *PtrTy =
PointerType::get(Type::getInt64Ty(MF.getFunction().getContext()),
AMDGPUAS::CONSTANT_ADDRESS);
@@ -403,17 +422,18 @@ void SIFrameLowering::emitEntryFunctionScratchSetup(const SISubtarget &ST,
MachineMemOperand::MOInvariant |
MachineMemOperand::MODereferenceable,
0, 0);
+ unsigned Offset = Fn.getCallingConv() == CallingConv::AMDGPU_CS ? 16 : 0;
BuildMI(MBB, I, DL, LoadDwordX4, ScratchRsrcReg)
.addReg(Rsrc01)
- .addImm(0) // offset
+ .addImm(Offset) // offset
.addImm(0) // glc
.addReg(ScratchRsrcReg, RegState::ImplicitDefine)
.addMemOperand(MMO);
return;
}
- if (ST.isMesaGfxShader(MF)
+ if (ST.isMesaGfxShader(Fn)
|| (PreloadedPrivateBufferReg == AMDGPU::NoRegister)) {
- assert(!ST.isAmdCodeObjectV2(MF));
+ assert(!ST.isAmdCodeObjectV2(Fn));
const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32);
unsigned Rsrc2 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub2);
@@ -474,17 +494,52 @@ void SIFrameLowering::emitEntryFunctionScratchSetup(const SISubtarget &ST,
}
}
+// Find a scratch register that we can use at the start of the prologue to
+// re-align the stack pointer. We avoid using callee-save registers since they
+// may appear to be free when this is called from canUseAsPrologue (during
+// shrink wrapping), but then no longer be free when this is called from
+// emitPrologue.
+//
+// FIXME: This is a bit conservative, since in the above case we could use one
+// of the callee-save registers as a scratch temp to re-align the stack pointer,
+// but we would then have to make sure that we were in fact saving at least one
+// callee-save register in the prologue, which is additional complexity that
+// doesn't seem worth the benefit.
+static unsigned findScratchNonCalleeSaveRegister(MachineBasicBlock &MBB) {
+ MachineFunction *MF = MBB.getParent();
+
+ const GCNSubtarget &Subtarget = MF->getSubtarget<GCNSubtarget>();
+ const SIRegisterInfo &TRI = *Subtarget.getRegisterInfo();
+ LivePhysRegs LiveRegs(TRI);
+ LiveRegs.addLiveIns(MBB);
+
+ // Mark callee saved registers as used so we will not choose them.
+ const MCPhysReg *CSRegs = TRI.getCalleeSavedRegs(MF);
+ for (unsigned i = 0; CSRegs[i]; ++i)
+ LiveRegs.addReg(CSRegs[i]);
+
+ MachineRegisterInfo &MRI = MF->getRegInfo();
+
+ for (unsigned Reg : AMDGPU::SReg_32_XM0RegClass) {
+ if (LiveRegs.available(MRI, Reg))
+ return Reg;
+ }
+
+ return AMDGPU::NoRegister;
+}
+
void SIFrameLowering::emitPrologue(MachineFunction &MF,
MachineBasicBlock &MBB) const {
- const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
+ SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
if (FuncInfo->isEntryFunction()) {
emitEntryFunctionPrologue(MF, MBB);
return;
}
const MachineFrameInfo &MFI = MF.getFrameInfo();
- const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
const SIInstrInfo *TII = ST.getInstrInfo();
+ const SIRegisterInfo &TRI = TII->getRegisterInfo();
unsigned StackPtrReg = FuncInfo->getStackPtrOffsetReg();
unsigned FramePtrReg = FuncInfo->getFrameOffsetReg();
@@ -492,8 +547,34 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF,
MachineBasicBlock::iterator MBBI = MBB.begin();
DebugLoc DL;
+ // XXX - Is this the right predicate?
+
bool NeedFP = hasFP(MF);
- if (NeedFP) {
+ uint32_t NumBytes = MFI.getStackSize();
+ uint32_t RoundedSize = NumBytes;
+ const bool NeedsRealignment = TRI.needsStackRealignment(MF);
+
+ if (NeedsRealignment) {
+ assert(NeedFP);
+ const unsigned Alignment = MFI.getMaxAlignment();
+
+ RoundedSize += Alignment;
+
+ unsigned ScratchSPReg = findScratchNonCalleeSaveRegister(MBB);
+ assert(ScratchSPReg != AMDGPU::NoRegister);
+
+ // s_add_u32 tmp_reg, s32, NumBytes
+ // s_and_b32 s32, tmp_reg, 0b111...0000
+ BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_U32), ScratchSPReg)
+ .addReg(StackPtrReg)
+ .addImm((Alignment - 1) * ST.getWavefrontSize())
+ .setMIFlag(MachineInstr::FrameSetup);
+ BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_AND_B32), FramePtrReg)
+ .addReg(ScratchSPReg, RegState::Kill)
+ .addImm(-Alignment * ST.getWavefrontSize())
+ .setMIFlag(MachineInstr::FrameSetup);
+ FuncInfo->setIsStackRealigned(true);
+ } else if (NeedFP) {
// If we need a base pointer, set it up here. It's whatever the value of
// the stack pointer is at this point. Any variable size objects will be
// allocated after this, so we can still use the base pointer to reference
@@ -503,11 +584,10 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF,
.setMIFlag(MachineInstr::FrameSetup);
}
- uint32_t NumBytes = MFI.getStackSize();
- if (NumBytes != 0 && hasSP(MF)) {
+ if (RoundedSize != 0 && hasSP(MF)) {
BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_U32), StackPtrReg)
.addReg(StackPtrReg)
- .addImm(NumBytes * ST.getWavefrontSize())
+ .addImm(RoundedSize * ST.getWavefrontSize())
.setMIFlag(MachineInstr::FrameSetup);
}
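
The realignment sequence above computes the frame pointer as the per-wave stack pointer rounded up to the requested alignment: add (Alignment - 1) scaled by the wavefront size, then mask with the negated scaled alignment. A small arithmetic sketch of that rounding follows; the wavefront size of 64 and the sample values are assumptions for the illustration.

#include <cassert>
#include <cstdint>
#include <cstdio>

// Round a per-wave scratch offset up to an AlignBytes-per-lane boundary.
// Per-lane sizes are multiplied by the wavefront size because the scratch
// offset registers count bytes for the whole wave.
uint32_t realignFramePointer(uint32_t StackPtr, uint32_t AlignBytes,
                             uint32_t WavefrontSize) {
  assert((AlignBytes & (AlignBytes - 1)) == 0 && "alignment must be a power of two");
  uint32_t Tmp = StackPtr + (AlignBytes - 1) * WavefrontSize; // s_add_u32
  // ~(A*W - 1) is the same mask as the -Alignment * WavefrontSize used above.
  return Tmp & ~(AlignBytes * WavefrontSize - 1u);            // s_and_b32
}

int main() {
  // A wave-relative offset of 0x1234 rounded up for a 16-byte-aligned object.
  std::printf("fp = 0x%x\n", realignFramePointer(0x1234, 16, 64)); // 0x1400
  return 0;
}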
@@ -527,7 +607,7 @@ void SIFrameLowering::emitEpilogue(MachineFunction &MF,
if (FuncInfo->isEntryFunction())
return;
- const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
const SIInstrInfo *TII = ST.getInstrInfo();
MachineBasicBlock::iterator MBBI = MBB.getFirstTerminator();
@@ -553,10 +633,12 @@ void SIFrameLowering::emitEpilogue(MachineFunction &MF,
// it's really whether we need SP to be accurate or not.
if (NumBytes != 0 && hasSP(MF)) {
+ uint32_t RoundedSize = FuncInfo->isStackRealigned() ?
+ NumBytes + MFI.getMaxAlignment() : NumBytes;
+
BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_SUB_U32), StackPtrReg)
.addReg(StackPtrReg)
- .addImm(NumBytes * ST.getWavefrontSize())
- .setMIFlag(MachineInstr::FrameDestroy);
+ .addImm(RoundedSize * ST.getWavefrontSize());
}
}
@@ -572,7 +654,7 @@ static bool allStackObjectsAreDead(const MachineFrameInfo &MFI) {
int SIFrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI,
unsigned &FrameReg) const {
- const SIRegisterInfo *RI = MF.getSubtarget<SISubtarget>().getRegisterInfo();
+ const SIRegisterInfo *RI = MF.getSubtarget<GCNSubtarget>().getRegisterInfo();
FrameReg = RI->getFrameRegister(MF);
return MF.getFrameInfo().getObjectOffset(FI);
@@ -586,7 +668,7 @@ void SIFrameLowering::processFunctionBeforeFrameFinalized(
if (!MFI.hasStackObjects())
return;
- const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
const SIInstrInfo *TII = ST.getInstrInfo();
const SIRegisterInfo &TRI = TII->getRegisterInfo();
SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
@@ -611,6 +693,7 @@ void SIFrameLowering::processFunctionBeforeFrameFinalized(
if (TII->isSGPRSpill(MI)) {
int FI = TII->getNamedOperand(MI, AMDGPU::OpName::addr)->getIndex();
+ assert(MFI.getStackID(FI) == SIStackID::SGPR_SPILL);
if (FuncInfo->allocateSGPRSpillToVGPR(MF, FI)) {
bool Spilled = TRI.eliminateSGPRToVGPRSpillFrameIndex(MI, FI, RS);
(void)Spilled;
@@ -667,7 +750,7 @@ MachineBasicBlock::iterator SIFrameLowering::eliminateCallFramePseudoInstr(
if (Amount == 0)
return MBB.erase(I);
- const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
const SIInstrInfo *TII = ST.getInstrInfo();
const DebugLoc &DL = I->getDebugLoc();
unsigned Opc = I->getOpcode();
@@ -696,7 +779,7 @@ MachineBasicBlock::iterator SIFrameLowering::eliminateCallFramePseudoInstr(
void SIFrameLowering::emitDebuggerPrologue(MachineFunction &MF,
MachineBasicBlock &MBB) const {
- const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
const SIInstrInfo *TII = ST.getInstrInfo();
const SIRegisterInfo *TRI = &TII->getRegisterInfo();
const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
@@ -746,7 +829,8 @@ bool SIFrameLowering::hasFP(const MachineFunction &MF) const {
}
bool SIFrameLowering::hasSP(const MachineFunction &MF) const {
+ const SIRegisterInfo *TRI = MF.getSubtarget<GCNSubtarget>().getRegisterInfo();
// All stack operations are relative to the frame offset SGPR.
const MachineFrameInfo &MFI = MF.getFrameInfo();
- return MFI.hasCalls() || MFI.hasVarSizedObjects();
+ return MFI.hasCalls() || MFI.hasVarSizedObjects() || TRI->needsStackRealignment(MF);
}
diff --git a/lib/Target/AMDGPU/SIFrameLowering.h b/lib/Target/AMDGPU/SIFrameLowering.h
index df6f1632a316..2f35b3631cdc 100644
--- a/lib/Target/AMDGPU/SIFrameLowering.h
+++ b/lib/Target/AMDGPU/SIFrameLowering.h
@@ -17,7 +17,7 @@ namespace llvm {
class SIInstrInfo;
class SIMachineFunctionInfo;
class SIRegisterInfo;
-class SISubtarget;
+class GCNSubtarget;
class SIFrameLowering final : public AMDGPUFrameLowering {
public:
@@ -48,29 +48,29 @@ public:
MachineBasicBlock::iterator MI) const override;
private:
- void emitFlatScratchInit(const SISubtarget &ST,
+ void emitFlatScratchInit(const GCNSubtarget &ST,
MachineFunction &MF,
MachineBasicBlock &MBB) const;
unsigned getReservedPrivateSegmentBufferReg(
- const SISubtarget &ST,
+ const GCNSubtarget &ST,
const SIInstrInfo *TII,
const SIRegisterInfo *TRI,
SIMachineFunctionInfo *MFI,
MachineFunction &MF) const;
std::pair<unsigned, unsigned> getReservedPrivateSegmentWaveByteOffsetReg(
- const SISubtarget &ST,
+ const GCNSubtarget &ST,
const SIInstrInfo *TII,
const SIRegisterInfo *TRI,
SIMachineFunctionInfo *MFI,
MachineFunction &MF) const;
- /// \brief Emits debugger prologue.
+ /// Emits debugger prologue.
void emitDebuggerPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const;
// Emit scratch setup code for AMDPAL or Mesa, assuming ResourceRegUsed is set.
- void emitEntryFunctionScratchSetup(const SISubtarget &ST, MachineFunction &MF,
+ void emitEntryFunctionScratchSetup(const GCNSubtarget &ST, MachineFunction &MF,
MachineBasicBlock &MBB, SIMachineFunctionInfo *MFI,
MachineBasicBlock::iterator I, unsigned PreloadedPrivateBufferReg,
unsigned ScratchRsrcReg) const;
diff --git a/lib/Target/AMDGPU/SIISelLowering.cpp b/lib/Target/AMDGPU/SIISelLowering.cpp
index 50ee88fa635a..5b7fc2656a20 100644
--- a/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
//
/// \file
-/// \brief Custom DAG lowering for SI
+/// Custom DAG lowering for SI
//
//===----------------------------------------------------------------------===//
@@ -26,6 +26,7 @@
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/APFloat.h"
#include "llvm/ADT/APInt.h"
@@ -49,7 +50,6 @@
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/CodeGen/MachineValueType.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
#include "llvm/CodeGen/TargetCallingConv.h"
@@ -73,6 +73,7 @@
#include "llvm/Support/Compiler.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/KnownBits.h"
+#include "llvm/Support/MachineValueType.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Target/TargetOptions.h"
#include <cassert>
@@ -111,8 +112,9 @@ static unsigned findFirstFreeSGPR(CCState &CCInfo) {
}
SITargetLowering::SITargetLowering(const TargetMachine &TM,
- const SISubtarget &STI)
- : AMDGPUTargetLowering(TM, STI) {
+ const GCNSubtarget &STI)
+ : AMDGPUTargetLowering(TM, STI),
+ Subtarget(&STI) {
addRegisterClass(MVT::i1, &AMDGPU::VReg_1RegClass);
addRegisterClass(MVT::i64, &AMDGPU::SReg_64RegClass);
@@ -138,14 +140,15 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
if (Subtarget->has16BitInsts()) {
addRegisterClass(MVT::i16, &AMDGPU::SReg_32_XM0RegClass);
addRegisterClass(MVT::f16, &AMDGPU::SReg_32_XM0RegClass);
- }
- if (Subtarget->hasVOP3PInsts()) {
+    // Unless there are also VOP3P operations, not all operations are really legal.
addRegisterClass(MVT::v2i16, &AMDGPU::SReg_32_XM0RegClass);
addRegisterClass(MVT::v2f16, &AMDGPU::SReg_32_XM0RegClass);
+ addRegisterClass(MVT::v4i16, &AMDGPU::SReg_64RegClass);
+ addRegisterClass(MVT::v4f16, &AMDGPU::SReg_64RegClass);
}
- computeRegisterProperties(STI.getRegisterInfo());
+ computeRegisterProperties(Subtarget->getRegisterInfo());
// We need to custom lower vector stores from local memory
setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
@@ -173,7 +176,6 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
- setOperationAction(ISD::ConstantPool, MVT::v2i64, Expand);
setOperationAction(ISD::SELECT, MVT::i1, Promote);
setOperationAction(ISD::SELECT, MVT::i64, Custom);
@@ -205,13 +207,17 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::f32, Custom);
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v4f32, Custom);
+ setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v2i16, Custom);
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v2f16, Custom);
+ setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v2f16, Custom);
+ setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v4f16, Custom);
setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
setOperationAction(ISD::INTRINSIC_VOID, MVT::v2i16, Custom);
setOperationAction(ISD::INTRINSIC_VOID, MVT::v2f16, Custom);
+ setOperationAction(ISD::INTRINSIC_VOID, MVT::v4f16, Custom);
setOperationAction(ISD::BRCOND, MVT::Other, Custom);
setOperationAction(ISD::BR_CC, MVT::i1, Expand);
@@ -231,13 +237,10 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction(ISD::SUBCARRY, MVT::i64, Legal);
#endif
- //setOperationAction(ISD::ADDC, MVT::i64, Expand);
- //setOperationAction(ISD::SUBC, MVT::i64, Expand);
-
// We only support LOAD/STORE and vector manipulation ops for vectors
// with > 4 elements.
for (MVT VT : {MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32,
- MVT::v2i64, MVT::v2f64}) {
+ MVT::v2i64, MVT::v2f64, MVT::v4i16, MVT::v4f16 }) {
for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
switch (Op) {
case ISD::LOAD:
@@ -260,6 +263,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
}
}
+ setOperationAction(ISD::FP_EXTEND, MVT::v4f32, Expand);
+
// TODO: For dynamic 64-bit vector inserts/extracts, should emit a pseudo that
// is expanded to avoid having two separate loops in case the index is a VGPR.
@@ -284,12 +289,30 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16i32, Expand);
setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16f32, Expand);
+ setOperationAction(ISD::BUILD_VECTOR, MVT::v4f16, Custom);
+ setOperationAction(ISD::BUILD_VECTOR, MVT::v4i16, Custom);
+
// Avoid stack access for these.
// TODO: Generalize to more vector types.
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i16, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f16, Custom);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i16, Custom);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f16, Custom);
+
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i16, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f16, Custom);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i8, Custom);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i8, Custom);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i8, Custom);
+
+ setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i8, Custom);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i8, Custom);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i8, Custom);
+
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i16, Custom);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f16, Custom);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i16, Custom);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f16, Custom);
// BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling,
// and output demarshalling
@@ -301,7 +324,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i32, Expand);
setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i64, Expand);
- if (getSubtarget()->hasFlatAddressSpace()) {
+ if (Subtarget->hasFlatAddressSpace()) {
setOperationAction(ISD::ADDRSPACECAST, MVT::i32, Custom);
setOperationAction(ISD::ADDRSPACECAST, MVT::i64, Custom);
}
@@ -314,13 +337,56 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction(ISD::TRAP, MVT::Other, Custom);
setOperationAction(ISD::DEBUGTRAP, MVT::Other, Custom);
+ if (Subtarget->has16BitInsts()) {
+ setOperationAction(ISD::FLOG, MVT::f16, Custom);
+ setOperationAction(ISD::FLOG10, MVT::f16, Custom);
+ }
+
+ // v_mad_f32 does not support denormals according to some sources.
+ if (!Subtarget->hasFP32Denormals())
+ setOperationAction(ISD::FMAD, MVT::f32, Legal);
+
+ if (!Subtarget->hasBFI()) {
+ // fcopysign can be done in a single instruction with BFI.
+ setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
+ setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
+ }
+
+ if (!Subtarget->hasBCNT(32))
+ setOperationAction(ISD::CTPOP, MVT::i32, Expand);
+
+ if (!Subtarget->hasBCNT(64))
+ setOperationAction(ISD::CTPOP, MVT::i64, Expand);
+
+ if (Subtarget->hasFFBH())
+ setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Custom);
+
+ if (Subtarget->hasFFBL())
+ setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Custom);
+
+ // We only really have 32-bit BFE instructions (and 16-bit on VI).
+ //
+ // On SI+ there are 64-bit BFEs, but they are scalar only and there isn't any
+ // effort to match them now. We want this to be false for i64 cases when the
+ // extraction isn't restricted to the upper or lower half. Ideally we would
+ // have some pass reduce 64-bit extracts to 32-bit if possible. Extracts that
+ // span the midpoint are probably relatively rare, so don't worry about them
+ // for now.
+ if (Subtarget->hasBFE())
+ setHasExtractBitsInsn(true);
+
setOperationAction(ISD::FMINNUM, MVT::f64, Legal);
setOperationAction(ISD::FMAXNUM, MVT::f64, Legal);
- if (Subtarget->getGeneration() >= SISubtarget::SEA_ISLANDS) {
+ if (Subtarget->getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
setOperationAction(ISD::FTRUNC, MVT::f64, Legal);
setOperationAction(ISD::FCEIL, MVT::f64, Legal);
setOperationAction(ISD::FRINT, MVT::f64, Legal);
+ } else {
+ setOperationAction(ISD::FCEIL, MVT::f64, Custom);
+ setOperationAction(ISD::FTRUNC, MVT::f64, Custom);
+ setOperationAction(ISD::FRINT, MVT::f64, Custom);
+ setOperationAction(ISD::FFLOOR, MVT::f64, Custom);
}
setOperationAction(ISD::FFLOOR, MVT::f64, Legal);
@@ -357,6 +423,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16, Promote);
setOperationAction(ISD::CTLZ, MVT::i16, Promote);
setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16, Promote);
+ setOperationAction(ISD::CTPOP, MVT::i16, Promote);
setOperationAction(ISD::SELECT_CC, MVT::i16, Expand);
@@ -406,10 +473,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction(ISD::FMA, MVT::f16, Legal);
if (!Subtarget->hasFP16Denormals())
setOperationAction(ISD::FMAD, MVT::f16, Legal);
- }
- if (Subtarget->hasVOP3PInsts()) {
- for (MVT VT : {MVT::v2i16, MVT::v2f16}) {
+ for (MVT VT : {MVT::v2i16, MVT::v2f16, MVT::v4i16, MVT::v4f16}) {
for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
switch (Op) {
case ISD::LOAD:
@@ -436,6 +501,9 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction(ISD::Constant, MVT::v2i16, Legal);
setOperationAction(ISD::ConstantFP, MVT::v2f16, Legal);
+ setOperationAction(ISD::UNDEF, MVT::v2i16, Legal);
+ setOperationAction(ISD::UNDEF, MVT::v2f16, Legal);
+
setOperationAction(ISD::STORE, MVT::v2i16, Promote);
AddPromotedToType(ISD::STORE, MVT::v2i16, MVT::i32);
setOperationAction(ISD::STORE, MVT::v2f16, Promote);
@@ -452,11 +520,38 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
AddPromotedToType(ISD::OR, MVT::v2i16, MVT::i32);
setOperationAction(ISD::XOR, MVT::v2i16, Promote);
AddPromotedToType(ISD::XOR, MVT::v2i16, MVT::i32);
- setOperationAction(ISD::SELECT, MVT::v2i16, Promote);
- AddPromotedToType(ISD::SELECT, MVT::v2i16, MVT::i32);
- setOperationAction(ISD::SELECT, MVT::v2f16, Promote);
- AddPromotedToType(ISD::SELECT, MVT::v2f16, MVT::i32);
+ setOperationAction(ISD::LOAD, MVT::v4i16, Promote);
+ AddPromotedToType(ISD::LOAD, MVT::v4i16, MVT::v2i32);
+ setOperationAction(ISD::LOAD, MVT::v4f16, Promote);
+ AddPromotedToType(ISD::LOAD, MVT::v4f16, MVT::v2i32);
+
+ setOperationAction(ISD::STORE, MVT::v4i16, Promote);
+ AddPromotedToType(ISD::STORE, MVT::v4i16, MVT::v2i32);
+ setOperationAction(ISD::STORE, MVT::v4f16, Promote);
+ AddPromotedToType(ISD::STORE, MVT::v4f16, MVT::v2i32);
+
+ setOperationAction(ISD::ANY_EXTEND, MVT::v2i32, Expand);
+ setOperationAction(ISD::ZERO_EXTEND, MVT::v2i32, Expand);
+ setOperationAction(ISD::SIGN_EXTEND, MVT::v2i32, Expand);
+ setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Expand);
+
+ setOperationAction(ISD::ANY_EXTEND, MVT::v4i32, Expand);
+ setOperationAction(ISD::ZERO_EXTEND, MVT::v4i32, Expand);
+ setOperationAction(ISD::SIGN_EXTEND, MVT::v4i32, Expand);
+
+ if (!Subtarget->hasVOP3PInsts()) {
+ setOperationAction(ISD::BUILD_VECTOR, MVT::v2i16, Custom);
+ setOperationAction(ISD::BUILD_VECTOR, MVT::v2f16, Custom);
+ }
+
+ setOperationAction(ISD::FNEG, MVT::v2f16, Legal);
+ // This isn't really legal, but this avoids the legalizer unrolling it (and
+ // allows matching fneg (fabs x) patterns)
+ setOperationAction(ISD::FABS, MVT::v2f16, Legal);
+ }
+
+ if (Subtarget->hasVOP3PInsts()) {
setOperationAction(ISD::ADD, MVT::v2i16, Legal);
setOperationAction(ISD::SUB, MVT::v2i16, Legal);
setOperationAction(ISD::MUL, MVT::v2i16, Legal);
@@ -469,26 +564,51 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction(ISD::UMAX, MVT::v2i16, Legal);
setOperationAction(ISD::FADD, MVT::v2f16, Legal);
- setOperationAction(ISD::FNEG, MVT::v2f16, Legal);
setOperationAction(ISD::FMUL, MVT::v2f16, Legal);
setOperationAction(ISD::FMA, MVT::v2f16, Legal);
setOperationAction(ISD::FMINNUM, MVT::v2f16, Legal);
setOperationAction(ISD::FMAXNUM, MVT::v2f16, Legal);
-
- // This isn't really legal, but this avoids the legalizer unrolling it (and
- // allows matching fneg (fabs x) patterns)
- setOperationAction(ISD::FABS, MVT::v2f16, Legal);
+ setOperationAction(ISD::FCANONICALIZE, MVT::v2f16, Legal);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i16, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f16, Custom);
- setOperationAction(ISD::ANY_EXTEND, MVT::v2i32, Expand);
- setOperationAction(ISD::ZERO_EXTEND, MVT::v2i32, Expand);
- setOperationAction(ISD::SIGN_EXTEND, MVT::v2i32, Expand);
- setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Expand);
+ setOperationAction(ISD::SHL, MVT::v4i16, Custom);
+ setOperationAction(ISD::SRA, MVT::v4i16, Custom);
+ setOperationAction(ISD::SRL, MVT::v4i16, Custom);
+ setOperationAction(ISD::ADD, MVT::v4i16, Custom);
+ setOperationAction(ISD::SUB, MVT::v4i16, Custom);
+ setOperationAction(ISD::MUL, MVT::v4i16, Custom);
+
+ setOperationAction(ISD::SMIN, MVT::v4i16, Custom);
+ setOperationAction(ISD::SMAX, MVT::v4i16, Custom);
+ setOperationAction(ISD::UMIN, MVT::v4i16, Custom);
+ setOperationAction(ISD::UMAX, MVT::v4i16, Custom);
+
+ setOperationAction(ISD::FADD, MVT::v4f16, Custom);
+ setOperationAction(ISD::FMUL, MVT::v4f16, Custom);
+ setOperationAction(ISD::FMINNUM, MVT::v4f16, Custom);
+ setOperationAction(ISD::FMAXNUM, MVT::v4f16, Custom);
+
+ setOperationAction(ISD::SELECT, MVT::v4i16, Custom);
+ setOperationAction(ISD::SELECT, MVT::v4f16, Custom);
+ }
+
+ setOperationAction(ISD::FNEG, MVT::v4f16, Custom);
+ setOperationAction(ISD::FABS, MVT::v4f16, Custom);
+
+ if (Subtarget->has16BitInsts()) {
+ setOperationAction(ISD::SELECT, MVT::v2i16, Promote);
+ AddPromotedToType(ISD::SELECT, MVT::v2i16, MVT::i32);
+ setOperationAction(ISD::SELECT, MVT::v2f16, Promote);
+ AddPromotedToType(ISD::SELECT, MVT::v2f16, MVT::i32);
} else {
+ // Legalization hack.
setOperationAction(ISD::SELECT, MVT::v2i16, Custom);
setOperationAction(ISD::SELECT, MVT::v2f16, Custom);
+
+ setOperationAction(ISD::FNEG, MVT::v2f16, Custom);
+ setOperationAction(ISD::FABS, MVT::v2f16, Custom);
}
for (MVT VT : { MVT::v4i16, MVT::v4f16, MVT::v2i8, MVT::v4i8, MVT::v8i8 }) {
@@ -503,6 +623,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setTargetDAGCombine(ISD::FSUB);
setTargetDAGCombine(ISD::FMINNUM);
setTargetDAGCombine(ISD::FMAXNUM);
+ setTargetDAGCombine(ISD::FMA);
setTargetDAGCombine(ISD::SMIN);
setTargetDAGCombine(ISD::SMAX);
setTargetDAGCombine(ISD::UMIN);
@@ -540,16 +661,33 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setTargetDAGCombine(ISD::ATOMIC_LOAD_UMAX);
setSchedulingPreference(Sched::RegPressure);
+
+ // SI at least has hardware support for floating point exceptions, but no way
+ // of using or handling them is implemented. They are also optional in OpenCL
+ // (Section 7.3)
+ setHasFloatingPointExceptions(Subtarget->hasFPExceptions());
}
-const SISubtarget *SITargetLowering::getSubtarget() const {
- return static_cast<const SISubtarget *>(Subtarget);
+const GCNSubtarget *SITargetLowering::getSubtarget() const {
+ return Subtarget;
}
//===----------------------------------------------------------------------===//
// TargetLowering queries
//===----------------------------------------------------------------------===//
+// v_mad_mix* support a conversion from f16 to f32.
+//
+// There is only one special case where this is OK to use when denormals are
+// enabled, and we don't currently handle it.
+bool SITargetLowering::isFPExtFoldable(unsigned Opcode,
+ EVT DestVT, EVT SrcVT) const {
+ return ((Opcode == ISD::FMAD && Subtarget->hasMadMixInsts()) ||
+ (Opcode == ISD::FMA && Subtarget->hasFmaMixInsts())) &&
+ DestVT.getScalarType() == MVT::f32 && !Subtarget->hasFP32Denormals() &&
+ SrcVT.getScalarType() == MVT::f16;
+}
+
bool SITargetLowering::isShuffleMaskLegal(ArrayRef<int>, EVT) const {
// SI has some legal vector types, but no legal vector operations. Say no
// shuffles are legal in order to prefer scalarizing some vector operations.
@@ -560,9 +698,55 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
const CallInst &CI,
MachineFunction &MF,
unsigned IntrID) const {
+ if (const AMDGPU::RsrcIntrinsic *RsrcIntr =
+ AMDGPU::lookupRsrcIntrinsic(IntrID)) {
+ AttributeList Attr = Intrinsic::getAttributes(CI.getContext(),
+ (Intrinsic::ID)IntrID);
+ if (Attr.hasFnAttribute(Attribute::ReadNone))
+ return false;
+
+ SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+
+ if (RsrcIntr->IsImage) {
+ Info.ptrVal = MFI->getImagePSV(
+ *MF.getSubtarget<GCNSubtarget>().getInstrInfo(),
+ CI.getArgOperand(RsrcIntr->RsrcArg));
+ Info.align = 0;
+ } else {
+ Info.ptrVal = MFI->getBufferPSV(
+ *MF.getSubtarget<GCNSubtarget>().getInstrInfo(),
+ CI.getArgOperand(RsrcIntr->RsrcArg));
+ }
+
+ Info.flags = MachineMemOperand::MODereferenceable;
+ if (Attr.hasFnAttribute(Attribute::ReadOnly)) {
+ Info.opc = ISD::INTRINSIC_W_CHAIN;
+ Info.memVT = MVT::getVT(CI.getType());
+ Info.flags |= MachineMemOperand::MOLoad;
+ } else if (Attr.hasFnAttribute(Attribute::WriteOnly)) {
+ Info.opc = ISD::INTRINSIC_VOID;
+ Info.memVT = MVT::getVT(CI.getArgOperand(0)->getType());
+ Info.flags |= MachineMemOperand::MOStore;
+ } else {
+ // Atomic
+ Info.opc = ISD::INTRINSIC_W_CHAIN;
+ Info.memVT = MVT::getVT(CI.getType());
+ Info.flags = MachineMemOperand::MOLoad |
+ MachineMemOperand::MOStore |
+ MachineMemOperand::MODereferenceable;
+
+ // XXX - Should this be volatile without known ordering?
+ Info.flags |= MachineMemOperand::MOVolatile;
+ }
+ return true;
+ }
+
switch (IntrID) {
case Intrinsic::amdgcn_atomic_inc:
- case Intrinsic::amdgcn_atomic_dec: {
+ case Intrinsic::amdgcn_atomic_dec:
+ case Intrinsic::amdgcn_ds_fadd:
+ case Intrinsic::amdgcn_ds_fmin:
+ case Intrinsic::amdgcn_ds_fmax: {
Info.opc = ISD::INTRINSIC_W_CHAIN;
Info.memVT = MVT::getVT(CI.getType());
Info.ptrVal = CI.getOperand(0);
@@ -575,6 +759,7 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
return true;
}
+
default:
return false;
}
@@ -585,7 +770,10 @@ bool SITargetLowering::getAddrModeArguments(IntrinsicInst *II,
Type *&AccessTy) const {
switch (II->getIntrinsicID()) {
case Intrinsic::amdgcn_atomic_inc:
- case Intrinsic::amdgcn_atomic_dec: {
+ case Intrinsic::amdgcn_atomic_dec:
+ case Intrinsic::amdgcn_ds_fadd:
+ case Intrinsic::amdgcn_ds_fmin:
+ case Intrinsic::amdgcn_ds_fmax: {
Value *Ptr = II->getArgOperand(0);
AccessTy = II->getType();
Ops.push_back(Ptr);
@@ -675,7 +863,8 @@ bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL,
if (AS == AMDGPUASI.GLOBAL_ADDRESS)
return isLegalGlobalAddressingMode(AM);
- if (AS == AMDGPUASI.CONSTANT_ADDRESS) {
+ if (AS == AMDGPUASI.CONSTANT_ADDRESS ||
+ AS == AMDGPUASI.CONSTANT_ADDRESS_32BIT) {
// If the offset isn't a multiple of 4, it probably isn't going to be
// correctly aligned.
// FIXME: Can we get the real alignment here?
@@ -686,19 +875,19 @@ bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL,
// will use a MUBUF load.
// FIXME?: We also need to do this if unaligned, but we don't know the
// alignment here.
- if (DL.getTypeStoreSize(Ty) < 4)
+ if (Ty->isSized() && DL.getTypeStoreSize(Ty) < 4)
return isLegalGlobalAddressingMode(AM);
- if (Subtarget->getGeneration() == SISubtarget::SOUTHERN_ISLANDS) {
+ if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS) {
// SMRD instructions have an 8-bit, dword offset on SI.
if (!isUInt<8>(AM.BaseOffs / 4))
return false;
- } else if (Subtarget->getGeneration() == SISubtarget::SEA_ISLANDS) {
+ } else if (Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS) {
// On CI+, this can also be a 32-bit literal constant offset. If it fits
// in 8-bits, it can use a smaller encoding.
if (!isUInt<32>(AM.BaseOffs / 4))
return false;
- } else if (Subtarget->getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
+ } else if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
// On VI, these use the SMEM format and the offset is 20-bit in bytes.
if (!isUInt<20>(AM.BaseOffs))
return false;
@@ -798,7 +987,8 @@ bool SITargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
// If we have an uniform constant load, it still requires using a slow
// buffer instruction if unaligned.
if (IsFast) {
- *IsFast = (AddrSpace == AMDGPUASI.CONSTANT_ADDRESS) ?
+ *IsFast = (AddrSpace == AMDGPUASI.CONSTANT_ADDRESS ||
+ AddrSpace == AMDGPUASI.CONSTANT_ADDRESS_32BIT) ?
(Align % 4 == 0) : true;
}
@@ -841,7 +1031,8 @@ EVT SITargetLowering::getOptimalMemOpType(uint64_t Size, unsigned DstAlign,
static bool isFlatGlobalAddrSpace(unsigned AS, AMDGPUAS AMDGPUASI) {
return AS == AMDGPUASI.GLOBAL_ADDRESS ||
AS == AMDGPUASI.FLAT_ADDRESS ||
- AS == AMDGPUASI.CONSTANT_ADDRESS;
+ AS == AMDGPUASI.CONSTANT_ADDRESS ||
+ AS == AMDGPUASI.CONSTANT_ADDRESS_32BIT;
}
bool SITargetLowering::isNoopAddrSpaceCast(unsigned SrcAS,
@@ -853,7 +1044,7 @@ bool SITargetLowering::isNoopAddrSpaceCast(unsigned SrcAS,
bool SITargetLowering::isMemOpHasNoClobberedMemOperand(const SDNode *N) const {
const MemSDNode *MemNode = cast<MemSDNode>(N);
const Value *Ptr = MemNode->getMemOperand()->getValue();
- const Instruction *I = dyn_cast<Instruction>(Ptr);
+ const Instruction *I = dyn_cast_or_null<Instruction>(Ptr);
return I && I->getMetadata("amdgpu.noclobber");
}
@@ -870,7 +1061,7 @@ bool SITargetLowering::isCheapAddrSpaceCast(unsigned SrcAS,
bool SITargetLowering::isMemOpUniform(const SDNode *N) const {
const MemSDNode *MemNode = cast<MemSDNode>(N);
- return AMDGPU::isUniformMMO(MemNode->getMemOperand());
+ return AMDGPUInstrInfo::isUniformMMO(MemNode->getMemOperand());
}
TargetLoweringBase::LegalizeTypeAction
@@ -932,14 +1123,13 @@ SDValue SITargetLowering::lowerKernArgParameterPtr(SelectionDAG &DAG,
SDValue BasePtr = DAG.getCopyFromReg(Chain, SL,
MRI.getLiveInVirtReg(InputPtrReg->getRegister()), PtrVT);
- return DAG.getNode(ISD::ADD, SL, PtrVT, BasePtr,
- DAG.getConstant(Offset, SL, PtrVT));
+ return DAG.getObjectPtrOffset(SL, BasePtr, Offset);
}
SDValue SITargetLowering::getImplicitArgPtr(SelectionDAG &DAG,
const SDLoc &SL) const {
- auto MFI = DAG.getMachineFunction().getInfo<SIMachineFunctionInfo>();
- uint64_t Offset = getImplicitParameterOffset(MFI, FIRST_IMPLICIT);
+ uint64_t Offset = getImplicitParameterOffset(DAG.getMachineFunction(),
+ FIRST_IMPLICIT);
return lowerKernArgParameterPtr(DAG, SL, DAG.getEntryNode(), Offset);
}
@@ -966,18 +1156,42 @@ SDValue SITargetLowering::convertArgType(SelectionDAG &DAG, EVT VT, EVT MemVT,
SDValue SITargetLowering::lowerKernargMemParameter(
SelectionDAG &DAG, EVT VT, EVT MemVT,
const SDLoc &SL, SDValue Chain,
- uint64_t Offset, bool Signed,
+ uint64_t Offset, unsigned Align, bool Signed,
const ISD::InputArg *Arg) const {
- const DataLayout &DL = DAG.getDataLayout();
Type *Ty = MemVT.getTypeForEVT(*DAG.getContext());
PointerType *PtrTy = PointerType::get(Ty, AMDGPUASI.CONSTANT_ADDRESS);
MachinePointerInfo PtrInfo(UndefValue::get(PtrTy));
- unsigned Align = DL.getABITypeAlignment(Ty);
+ // Try to avoid using an extload by loading earlier than the argument address,
+ // and extracting the relevant bits. The load should hopefully be merged with
+ // the previous argument.
+ if (MemVT.getStoreSize() < 4 && Align < 4) {
+ // TODO: Handle align < 4 and size >= 4 (can happen with packed structs).
+ int64_t AlignDownOffset = alignDown(Offset, 4);
+ int64_t OffsetDiff = Offset - AlignDownOffset;
+
+ EVT IntVT = MemVT.changeTypeToInteger();
+
+ // TODO: If we passed in the base kernel offset we could have a better
+ // alignment than 4, but we don't really need it.
+ SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, AlignDownOffset);
+ SDValue Load = DAG.getLoad(MVT::i32, SL, Chain, Ptr, PtrInfo, 4,
+ MachineMemOperand::MODereferenceable |
+ MachineMemOperand::MOInvariant);
+
+ SDValue ShiftAmt = DAG.getConstant(OffsetDiff * 8, SL, MVT::i32);
+ SDValue Extract = DAG.getNode(ISD::SRL, SL, MVT::i32, Load, ShiftAmt);
+
+ SDValue ArgVal = DAG.getNode(ISD::TRUNCATE, SL, IntVT, Extract);
+ ArgVal = DAG.getNode(ISD::BITCAST, SL, MemVT, ArgVal);
+ ArgVal = convertArgType(DAG, VT, MemVT, SL, ArgVal, Signed, Arg);
+
+
+ return DAG.getMergeValues({ ArgVal, Load.getValue(1) }, SL);
+ }
SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, Offset);
SDValue Load = DAG.getLoad(MemVT, SL, Chain, Ptr, PtrInfo, Align,
- MachineMemOperand::MONonTemporal |
MachineMemOperand::MODereferenceable |
MachineMemOperand::MOInvariant);
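The small-argument path added above performs, in DAG form, a simple piece of byte arithmetic: round the offset down to a dword boundary, load the whole dword, shift the wanted bytes down, and truncate. A minimal standalone sketch of that arithmetic, assuming a little-endian kernarg buffer (the buffer and helper names are illustrative, not LLVM API):

#include <cstdint>
#include <cstring>

// Fetch a 1- or 2-byte kernel argument by loading the dword that contains it,
// mirroring the aligned i32 load + SRL + TRUNCATE sequence built above.
uint32_t loadNarrowKernArg(const uint8_t *KernArgBuf, uint64_t Offset,
                           unsigned SizeInBytes /* 1 or 2 */) {
  uint64_t AlignDownOffset = Offset & ~uint64_t(3); // alignDown(Offset, 4)
  uint64_t OffsetDiff = Offset - AlignDownOffset;   // byte position in dword

  uint32_t Dword;
  std::memcpy(&Dword, KernArgBuf + AlignDownOffset, sizeof(Dword));

  uint32_t Shifted = Dword >> (OffsetDiff * 8);     // shift by OffsetDiff * 8
  return SizeInBytes == 1 ? (Shifted & 0xff)        // truncate to the argument
                          : (Shifted & 0xffff);     // width
}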
@@ -1052,36 +1266,51 @@ static void processShaderInputArgs(SmallVectorImpl<ISD::InputArg> &Splits,
FunctionType *FType,
SIMachineFunctionInfo *Info) {
for (unsigned I = 0, E = Ins.size(), PSInputNum = 0; I != E; ++I) {
- const ISD::InputArg &Arg = Ins[I];
+ const ISD::InputArg *Arg = &Ins[I];
// First check if it's a PS input addr.
- if (CallConv == CallingConv::AMDGPU_PS && !Arg.Flags.isInReg() &&
- !Arg.Flags.isByVal() && PSInputNum <= 15) {
+ if (CallConv == CallingConv::AMDGPU_PS &&
+ !Arg->Flags.isInReg() && !Arg->Flags.isByVal() && PSInputNum <= 15) {
+
+ bool SkipArg = !Arg->Used && !Info->isPSInputAllocated(PSInputNum);
+
+ // Inconveniently, only the first part of the split is marked as isSplit,
+ // so skip to the end. We only want to increment PSInputNum once for the
+ // entire split argument.
+ if (Arg->Flags.isSplit()) {
+ while (!Arg->Flags.isSplitEnd()) {
+ assert(!Arg->VT.isVector() &&
+ "unexpected vector split in ps argument type");
+ if (!SkipArg)
+ Splits.push_back(*Arg);
+ Arg = &Ins[++I];
+ }
+ }
- if (!Arg.Used && !Info->isPSInputAllocated(PSInputNum)) {
+ if (SkipArg) {
// We can safely skip PS inputs.
- Skipped.set(I);
+ Skipped.set(Arg->getOrigArgIndex());
++PSInputNum;
continue;
}
Info->markPSInputAllocated(PSInputNum);
- if (Arg.Used)
+ if (Arg->Used)
Info->markPSInputEnabled(PSInputNum);
++PSInputNum;
}
// Second split vertices into their elements.
- if (Arg.VT.isVector()) {
- ISD::InputArg NewArg = Arg;
+ if (Arg->VT.isVector()) {
+ ISD::InputArg NewArg = *Arg;
NewArg.Flags.setSplit();
- NewArg.VT = Arg.VT.getVectorElementType();
+ NewArg.VT = Arg->VT.getVectorElementType();
// We REALLY want the ORIGINAL number of vertex elements here, e.g. a
// three or five element vertex only needs three or five registers,
// NOT four or eight.
- Type *ParamType = FType->getParamType(Arg.getOrigArgIndex());
+ Type *ParamType = FType->getParamType(Arg->getOrigArgIndex());
unsigned NumElements = ParamType->getVectorNumElements();
for (unsigned J = 0; J != NumElements; ++J) {
@@ -1089,7 +1318,7 @@ static void processShaderInputArgs(SmallVectorImpl<ISD::InputArg> &Splits,
NewArg.PartOffset += NewArg.VT.getStoreSize();
}
} else {
- Splits.push_back(Arg);
+ Splits.push_back(*Arg);
}
}
}
@@ -1347,8 +1576,8 @@ static void reservePrivateMemoryRegs(const TargetMachine &TM,
// the scratch registers to pass in.
bool RequiresStackAccess = HasStackObjects || MFI.hasCalls();
- const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
- if (ST.isAmdCodeObjectV2(MF)) {
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+ if (ST.isAmdCodeObjectV2(MF.getFunction())) {
if (RequiresStackAccess) {
// If we have stack objects, we unquestionably need the private buffer
// resource. For the Code Object V2 ABI, this will be the first 4 user
@@ -1460,12 +1689,12 @@ SDValue SITargetLowering::LowerFormalArguments(
const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
MachineFunction &MF = DAG.getMachineFunction();
+ const Function &Fn = MF.getFunction();
FunctionType *FType = MF.getFunction().getFunctionType();
SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
- const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
if (Subtarget->isAmdHsaOS() && AMDGPU::isShader(CallConv)) {
- const Function &Fn = MF.getFunction();
DiagnosticInfoUnsupported NoGraphicsHSA(
Fn, "unsupported non-compute shaders with HSA", DL.getDebugLoc());
DAG.getContext()->diagnose(NoGraphicsHSA);
@@ -1562,9 +1791,16 @@ SDValue SITargetLowering::LowerFormalArguments(
SmallVector<SDValue, 16> Chains;
- for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) {
+ // FIXME: This is the minimum kernel argument alignment. We should improve
+ // this to the maximum alignment of the arguments.
+ //
+ // FIXME: Alignment of explicit arguments totally broken with non-0 explicit
+ // kern arg offset.
+ const unsigned KernelArgBaseAlign = 16;
+
+ for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) {
const ISD::InputArg &Arg = Ins[i];
- if (Skipped[i]) {
+ if (Arg.isOrigArg() && Skipped[Arg.getOrigArgIndex()]) {
InVals.push_back(DAG.getUNDEF(Arg.VT));
continue;
}
@@ -1576,19 +1812,16 @@ SDValue SITargetLowering::LowerFormalArguments(
VT = Ins[i].VT;
EVT MemVT = VA.getLocVT();
- const uint64_t Offset = Subtarget->getExplicitKernelArgOffset(MF) +
- VA.getLocMemOffset();
- Info->setABIArgOffset(Offset + MemVT.getStoreSize());
+ const uint64_t Offset = VA.getLocMemOffset();
+ unsigned Align = MinAlign(KernelArgBaseAlign, Offset);
- // The first 36 bytes of the input buffer contains information about
- // thread group and global sizes.
SDValue Arg = lowerKernargMemParameter(
- DAG, VT, MemVT, DL, Chain, Offset, Ins[i].Flags.isSExt(), &Ins[i]);
+ DAG, VT, MemVT, DL, Chain, Offset, Align, Ins[i].Flags.isSExt(), &Ins[i]);
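The alignment passed to lowerKernargMemParameter is the largest power of two that divides both the 16-byte kernarg base alignment and the argument's byte offset, which is what MinAlign returns. A small plain-integer illustration of that computation (the helper name is illustrative):

#include <cstdint>

// Largest power of two dividing both A and B; e.g. (16, 6) -> 2, (16, 8) -> 8,
// (16, 0) -> 16. Equivalent to isolating the lowest set bit of A | B.
uint64_t minAlignOf(uint64_t A, uint64_t B) {
  return (A | B) & (~(A | B) + 1);
}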
Chains.push_back(Arg.getValue(1));
auto *ParamTy =
dyn_cast<PointerType>(FType->getParamType(Ins[i].getOrigArgIndex()));
- if (Subtarget->getGeneration() == SISubtarget::SOUTHERN_ISLANDS &&
+ if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS &&
ParamTy && ParamTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
// On SI local pointers are just offsets into LDS, so they are always
// less than 16-bits. On CI and newer they could potentially be
@@ -1696,7 +1929,7 @@ SDValue SITargetLowering::LowerFormalArguments(
auto &ArgUsageInfo =
DAG.getPass()->getAnalysis<AMDGPUArgumentUsageInfo>();
- ArgUsageInfo.setFuncArgInfo(MF.getFunction(), Info->getArgInfo());
+ ArgUsageInfo.setFuncArgInfo(Fn, Info->getArgInfo());
unsigned StackArgSize = CCInfo.getNextStackOffset();
Info->setBytesInStackArgArea(StackArgSize);
@@ -1841,8 +2074,7 @@ SITargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
// FIXME: Does sret work properly?
if (!Info->isEntryFunction()) {
- const SIRegisterInfo *TRI
- = static_cast<const SISubtarget *>(Subtarget)->getRegisterInfo();
+ const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
const MCPhysReg *I =
TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
if (I) {
@@ -1944,8 +2176,7 @@ void SITargetLowering::passSpecialInputs(
SelectionDAG &DAG = CLI.DAG;
const SDLoc &DL = CLI.DL;
- const SISubtarget *ST = getSubtarget();
- const SIRegisterInfo *TRI = ST->getRegisterInfo();
+ const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
auto &ArgUsageInfo =
DAG.getPass()->getAnalysis<AMDGPUArgumentUsageInfo>();
@@ -2138,6 +2369,13 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
"unsupported required tail call to function ");
}
+ if (AMDGPU::isShader(MF.getFunction().getCallingConv())) {
+ // Note the issue is with the CC of the calling function, not of the call
+ // itself.
+ return lowerUnhandledCall(CLI, InVals,
+ "unsupported call from graphics shader of function ");
+ }
+
// The first 4 bytes are reserved for the callee's emergency stack slot.
const unsigned CalleeUsableStackOffset = 4;
@@ -2383,7 +2621,7 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
// Add a register mask operand representing the call-preserved registers.
- const AMDGPURegisterInfo *TRI = Subtarget->getRegisterInfo();
+ auto *TRI = static_cast<const SIRegisterInfo*>(Subtarget->getRegisterInfo());
const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv);
assert(Mask && "Missing call preserved mask for calling convention");
Ops.push_back(DAG.getRegisterMask(Mask));
@@ -2443,7 +2681,7 @@ unsigned SITargetLowering::getRegisterByName(const char* RegName, EVT VT,
}
- if (Subtarget->getGeneration() == SISubtarget::SOUTHERN_ISLANDS &&
+ if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS &&
Subtarget->getRegisterInfo()->regsOverlap(Reg, AMDGPU::FLAT_SCR)) {
report_fatal_error(Twine("invalid register \""
+ StringRef(RegName) + "\" for subtarget."));
@@ -2517,7 +2755,8 @@ static MachineBasicBlock::iterator emitLoadM0FromVGPRLoop(
unsigned PhiReg,
unsigned InitSaveExecReg,
int Offset,
- bool UseGPRIdxMode) {
+ bool UseGPRIdxMode,
+ bool IsIndirectSrc) {
MachineBasicBlock::iterator I = LoopBB.begin();
unsigned PhiExec = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
@@ -2546,6 +2785,12 @@ static MachineBasicBlock::iterator emitLoadM0FromVGPRLoop(
.addReg(CurrentIdxReg)
.addReg(IdxReg.getReg(), 0, IdxReg.getSubReg());
+ // Update EXEC, save the original EXEC value to VCC.
+ BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), NewExec)
+ .addReg(CondReg, RegState::Kill);
+
+ MRI.setSimpleHint(NewExec, CondReg);
+
if (UseGPRIdxMode) {
unsigned IdxReg;
if (Offset == 0) {
@@ -2556,11 +2801,13 @@ static MachineBasicBlock::iterator emitLoadM0FromVGPRLoop(
.addReg(CurrentIdxReg, RegState::Kill)
.addImm(Offset);
}
-
- MachineInstr *SetIdx =
- BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_IDX))
- .addReg(IdxReg, RegState::Kill);
- SetIdx->getOperand(2).setIsUndef();
+ unsigned IdxMode = IsIndirectSrc ?
+ VGPRIndexMode::SRC0_ENABLE : VGPRIndexMode::DST_ENABLE;
+ MachineInstr *SetOn =
+ BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_ON))
+ .addReg(IdxReg, RegState::Kill)
+ .addImm(IdxMode);
+ SetOn->getOperand(3).setIsUndef();
} else {
// Move index from VCC into M0
if (Offset == 0) {
@@ -2573,12 +2820,6 @@ static MachineBasicBlock::iterator emitLoadM0FromVGPRLoop(
}
}
- // Update EXEC, save the original EXEC value to VCC.
- BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), NewExec)
- .addReg(CondReg, RegState::Kill);
-
- MRI.setSimpleHint(NewExec, CondReg);
-
// Update EXEC, switch all done bits to 0 and all todo bits to 1.
MachineInstr *InsertPt =
BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_XOR_B64), AMDGPU::EXEC)
@@ -2606,7 +2847,8 @@ static MachineBasicBlock::iterator loadM0FromVGPR(const SIInstrInfo *TII,
unsigned InitResultReg,
unsigned PhiReg,
int Offset,
- bool UseGPRIdxMode) {
+ bool UseGPRIdxMode,
+ bool IsIndirectSrc) {
MachineFunction *MF = MBB.getParent();
MachineRegisterInfo &MRI = MF->getRegInfo();
const DebugLoc &DL = MI.getDebugLoc();
@@ -2645,7 +2887,7 @@ static MachineBasicBlock::iterator loadM0FromVGPR(const SIInstrInfo *TII,
auto InsPt = emitLoadM0FromVGPRLoop(TII, MRI, MBB, *LoopBB, DL, *Idx,
InitResultReg, DstReg, PhiReg, TmpExec,
- Offset, UseGPRIdxMode);
+ Offset, UseGPRIdxMode, IsIndirectSrc);
MachineBasicBlock::iterator First = RemainderBB->begin();
BuildMI(*RemainderBB, First, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC)
@@ -2730,7 +2972,7 @@ static bool setM0ToIndexFromSGPR(const SIInstrInfo *TII,
// Control flow needs to be inserted if indexing with a VGPR.
static MachineBasicBlock *emitIndirectSrc(MachineInstr &MI,
MachineBasicBlock &MBB,
- const SISubtarget &ST) {
+ const GCNSubtarget &ST) {
const SIInstrInfo *TII = ST.getInstrInfo();
const SIRegisterInfo &TRI = TII->getRegisterInfo();
MachineFunction *MF = MBB.getParent();
@@ -2780,17 +3022,8 @@ static MachineBasicBlock *emitIndirectSrc(MachineInstr &MI,
BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), InitReg);
- if (UseGPRIdxMode) {
- MachineInstr *SetOn = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_ON))
- .addImm(0) // Reset inside loop.
- .addImm(VGPRIndexMode::SRC0_ENABLE);
- SetOn->getOperand(3).setIsUndef();
-
- // Disable again after the loop.
- BuildMI(MBB, std::next(I), DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF));
- }
-
- auto InsPt = loadM0FromVGPR(TII, MBB, MI, InitReg, PhiReg, Offset, UseGPRIdxMode);
+ auto InsPt = loadM0FromVGPR(TII, MBB, MI, InitReg, PhiReg,
+ Offset, UseGPRIdxMode, true);
MachineBasicBlock *LoopBB = InsPt->getParent();
if (UseGPRIdxMode) {
@@ -2798,6 +3031,7 @@ static MachineBasicBlock *emitIndirectSrc(MachineInstr &MI,
.addReg(SrcReg, RegState::Undef, SubReg)
.addReg(SrcReg, RegState::Implicit)
.addReg(AMDGPU::M0, RegState::Implicit);
+ BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF));
} else {
BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
.addReg(SrcReg, RegState::Undef, SubReg)
@@ -2829,7 +3063,7 @@ static unsigned getMOVRELDPseudo(const SIRegisterInfo &TRI,
static MachineBasicBlock *emitIndirectDst(MachineInstr &MI,
MachineBasicBlock &MBB,
- const SISubtarget &ST) {
+ const GCNSubtarget &ST) {
const SIInstrInfo *TII = ST.getInstrInfo();
const SIRegisterInfo &TRI = TII->getRegisterInfo();
MachineFunction *MF = MBB.getParent();
@@ -2898,22 +3132,10 @@ static MachineBasicBlock *emitIndirectDst(MachineInstr &MI,
const DebugLoc &DL = MI.getDebugLoc();
- if (UseGPRIdxMode) {
- MachineBasicBlock::iterator I(&MI);
-
- MachineInstr *SetOn = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_ON))
- .addImm(0) // Reset inside loop.
- .addImm(VGPRIndexMode::DST_ENABLE);
- SetOn->getOperand(3).setIsUndef();
-
- // Disable again after the loop.
- BuildMI(MBB, std::next(I), DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF));
- }
-
unsigned PhiReg = MRI.createVirtualRegister(VecRC);
auto InsPt = loadM0FromVGPR(TII, MBB, MI, SrcVec->getReg(), PhiReg,
- Offset, UseGPRIdxMode);
+ Offset, UseGPRIdxMode, false);
MachineBasicBlock *LoopBB = InsPt->getParent();
if (UseGPRIdxMode) {
@@ -2923,6 +3145,7 @@ static MachineBasicBlock *emitIndirectDst(MachineInstr &MI,
.addReg(Dst, RegState::ImplicitDefine)
.addReg(PhiReg, RegState::Implicit)
.addReg(AMDGPU::M0, RegState::Implicit);
+ BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF));
} else {
const MCInstrDesc &MovRelDesc = TII->get(getMOVRELDPseudo(TRI, VecRC));
@@ -2946,24 +3169,12 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
if (TII->isMIMG(MI)) {
- if (!MI.memoperands_empty())
- return BB;
+ if (MI.memoperands_empty() && MI.mayLoadOrStore()) {
+ report_fatal_error("missing mem operand from MIMG instruction");
+ }
// Add a memoperand for mimg instructions so that they aren't assumed to
// be ordered memory instructions.
- MachinePointerInfo PtrInfo(MFI->getImagePSV());
- MachineMemOperand::Flags Flags = MachineMemOperand::MODereferenceable;
- if (MI.mayStore())
- Flags |= MachineMemOperand::MOStore;
-
- if (MI.mayLoad())
- Flags |= MachineMemOperand::MOLoad;
-
- if (Flags != MachineMemOperand::MODereferenceable) {
- auto MMO = MF->getMachineMemOperand(PtrInfo, Flags, 0, 0);
- MI.addMemOperand(*MF, MMO);
- }
-
return BB;
}
@@ -3145,8 +3356,13 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
case AMDGPU::ADJCALLSTACKDOWN: {
const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
MachineInstrBuilder MIB(*MF, &MI);
+
+ // Add an implicit use of the frame offset reg to prevent the restore copy
+ // inserted after the call from being reordered after stack operations in
+ // the caller's frame.
MIB.addReg(Info->getStackPtrOffsetReg(), RegState::ImplicitDefine)
- .addReg(Info->getStackPtrOffsetReg(), RegState::Implicit);
+ .addReg(Info->getStackPtrOffsetReg(), RegState::Implicit)
+ .addReg(Info->getFrameOffsetReg(), RegState::Implicit);
return BB;
}
case AMDGPU::SI_CALL_ISEL:
@@ -3236,12 +3452,17 @@ bool SITargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
VT = VT.getScalarType();
switch (VT.getSimpleVT().SimpleTy) {
- case MVT::f32:
+ case MVT::f32: {
// This is as fast on some subtargets. However, we always have full rate f32
// mad available which returns the same result as the separate operations
// which we should prefer over fma. We can't use this if we want to support
// denormals, so only report this in these cases.
- return Subtarget->hasFP32Denormals() && Subtarget->hasFastFMAF32();
+ if (Subtarget->hasFP32Denormals())
+ return Subtarget->hasFastFMAF32() || Subtarget->hasDLInsts();
+
+ // If the subtarget has v_fmac_f32, that's just as good as v_mac_f32.
+ return Subtarget->hasFastFMAF32() && Subtarget->hasDLInsts();
+ }
case MVT::f64:
return true;
case MVT::f16:
@@ -3257,6 +3478,49 @@ bool SITargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
// Custom DAG Lowering Operations
//===----------------------------------------------------------------------===//
+// Work around LegalizeDAG doing the wrong thing and fully scalarizing if the
+// wider vector type is legal.
+SDValue SITargetLowering::splitUnaryVectorOp(SDValue Op,
+ SelectionDAG &DAG) const {
+ unsigned Opc = Op.getOpcode();
+ EVT VT = Op.getValueType();
+ assert(VT == MVT::v4f16);
+
+ SDValue Lo, Hi;
+ std::tie(Lo, Hi) = DAG.SplitVectorOperand(Op.getNode(), 0);
+
+ SDLoc SL(Op);
+ SDValue OpLo = DAG.getNode(Opc, SL, Lo.getValueType(), Lo,
+ Op->getFlags());
+ SDValue OpHi = DAG.getNode(Opc, SL, Hi.getValueType(), Hi,
+ Op->getFlags());
+
+ return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
+}
+
+// Work around LegalizeDAG doing the wrong thing and fully scalarizing if the
+// wider vector type is legal.
+SDValue SITargetLowering::splitBinaryVectorOp(SDValue Op,
+ SelectionDAG &DAG) const {
+ unsigned Opc = Op.getOpcode();
+ EVT VT = Op.getValueType();
+ assert(VT == MVT::v4i16 || VT == MVT::v4f16);
+
+ SDValue Lo0, Hi0;
+ std::tie(Lo0, Hi0) = DAG.SplitVectorOperand(Op.getNode(), 0);
+ SDValue Lo1, Hi1;
+ std::tie(Lo1, Hi1) = DAG.SplitVectorOperand(Op.getNode(), 1);
+
+ SDLoc SL(Op);
+
+ SDValue OpLo = DAG.getNode(Opc, SL, Lo0.getValueType(), Lo0, Lo1,
+ Op->getFlags());
+ SDValue OpHi = DAG.getNode(Opc, SL, Hi0.getValueType(), Hi0, Hi1,
+ Op->getFlags());
+
+ return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
+}
+
SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
switch (Op.getOpcode()) {
default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
@@ -3289,15 +3553,105 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
return lowerINSERT_VECTOR_ELT(Op, DAG);
case ISD::EXTRACT_VECTOR_ELT:
return lowerEXTRACT_VECTOR_ELT(Op, DAG);
+ case ISD::BUILD_VECTOR:
+ return lowerBUILD_VECTOR(Op, DAG);
case ISD::FP_ROUND:
return lowerFP_ROUND(Op, DAG);
case ISD::TRAP:
- case ISD::DEBUGTRAP:
return lowerTRAP(Op, DAG);
+ case ISD::DEBUGTRAP:
+ return lowerDEBUGTRAP(Op, DAG);
+ case ISD::FABS:
+ case ISD::FNEG:
+ return splitUnaryVectorOp(Op, DAG);
+ case ISD::SHL:
+ case ISD::SRA:
+ case ISD::SRL:
+ case ISD::ADD:
+ case ISD::SUB:
+ case ISD::MUL:
+ case ISD::SMIN:
+ case ISD::SMAX:
+ case ISD::UMIN:
+ case ISD::UMAX:
+ case ISD::FMINNUM:
+ case ISD::FMAXNUM:
+ case ISD::FADD:
+ case ISD::FMUL:
+ return splitBinaryVectorOp(Op, DAG);
}
return SDValue();
}
+static SDValue adjustLoadValueTypeImpl(SDValue Result, EVT LoadVT,
+ const SDLoc &DL,
+ SelectionDAG &DAG, bool Unpacked) {
+ if (!LoadVT.isVector())
+ return Result;
+
+ if (Unpacked) { // From v2i32/v4i32 back to v2f16/v4f16.
+ // Truncate to v2i16/v4i16.
+ EVT IntLoadVT = LoadVT.changeTypeToInteger();
+
+ // Work around the legalizer not scalarizing the truncate after vector op
+ // legalization by not creating an intermediate vector trunc.
+ SmallVector<SDValue, 4> Elts;
+ DAG.ExtractVectorElements(Result, Elts);
+ for (SDValue &Elt : Elts)
+ Elt = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Elt);
+
+ Result = DAG.getBuildVector(IntLoadVT, DL, Elts);
+
+ // Bitcast to original type (v2f16/v4f16).
+ return DAG.getNode(ISD::BITCAST, DL, LoadVT, Result);
+ }
+
+ // Cast back to the original packed type.
+ return DAG.getNode(ISD::BITCAST, DL, LoadVT, Result);
+}
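On subtargets with unpacked D16 memory operations, the loaded value arrives as one dword per component, so the code above truncates each dword to 16 bits and rebuilds the packed vector. A bit-level sketch of that repacking (plain containers stand in for the v2i32/v4i32 result; not LLVM API):

#include <cstdint>
#include <vector>

// Repack an unpacked D16 result (one half-precision value per 32-bit dword)
// into the packed v2f16/v4f16 layout: keep only the low 16 bits of each dword.
std::vector<uint16_t> repackUnpackedD16(const std::vector<uint32_t> &Dwords) {
  std::vector<uint16_t> Packed;
  Packed.reserve(Dwords.size());
  for (uint32_t D : Dwords)
    Packed.push_back(static_cast<uint16_t>(D)); // per-element truncate
  return Packed;
}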
+
+SDValue SITargetLowering::adjustLoadValueType(unsigned Opcode,
+ MemSDNode *M,
+ SelectionDAG &DAG,
+ bool IsIntrinsic) const {
+ SDLoc DL(M);
+ SmallVector<SDValue, 10> Ops;
+ Ops.reserve(M->getNumOperands());
+
+ Ops.push_back(M->getOperand(0));
+ if (IsIntrinsic)
+ Ops.push_back(DAG.getConstant(Opcode, DL, MVT::i32));
+
+ // Skip 1, as it is the intrinsic ID.
+ for (unsigned I = 2, E = M->getNumOperands(); I != E; ++I)
+ Ops.push_back(M->getOperand(I));
+
+ bool Unpacked = Subtarget->hasUnpackedD16VMem();
+ EVT LoadVT = M->getValueType(0);
+
+ EVT EquivLoadVT = LoadVT;
+ if (Unpacked && LoadVT.isVector()) {
+ EquivLoadVT = LoadVT.isVector() ?
+ EVT::getVectorVT(*DAG.getContext(), MVT::i32,
+ LoadVT.getVectorNumElements()) : LoadVT;
+ }
+
+ // Change from v4f16/v2f16 to EquivLoadVT.
+ SDVTList VTList = DAG.getVTList(EquivLoadVT, MVT::Other);
+
+ SDValue Load
+ = DAG.getMemIntrinsicNode(
+ IsIntrinsic ? (unsigned)ISD::INTRINSIC_W_CHAIN : Opcode, DL,
+ VTList, Ops, M->getMemoryVT(),
+ M->getMemOperand());
+ if (!Unpacked) // Just adjusted the opcode.
+ return Load;
+
+ SDValue Adjusted = adjustLoadValueTypeImpl(Load, LoadVT, DL, DAG, Unpacked);
+
+ return DAG.getMergeValues({ Adjusted, Load.getValue(1) }, DL);
+}
+
void SITargetLowering::ReplaceNodeResults(SDNode *N,
SmallVectorImpl<SDValue> &Results,
SelectionDAG &DAG) const {
@@ -3314,7 +3668,8 @@ void SITargetLowering::ReplaceNodeResults(SDNode *N,
}
case ISD::INTRINSIC_WO_CHAIN: {
unsigned IID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
- if (IID == Intrinsic::amdgcn_cvt_pkrtz) {
+ switch (IID) {
+ case Intrinsic::amdgcn_cvt_pkrtz: {
SDValue Src0 = N->getOperand(1);
SDValue Src1 = N->getOperand(2);
SDLoc SL(N);
@@ -3323,6 +3678,38 @@ void SITargetLowering::ReplaceNodeResults(SDNode *N,
Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Cvt));
return;
}
+ case Intrinsic::amdgcn_cvt_pknorm_i16:
+ case Intrinsic::amdgcn_cvt_pknorm_u16:
+ case Intrinsic::amdgcn_cvt_pk_i16:
+ case Intrinsic::amdgcn_cvt_pk_u16: {
+ SDValue Src0 = N->getOperand(1);
+ SDValue Src1 = N->getOperand(2);
+ SDLoc SL(N);
+ unsigned Opcode;
+
+ if (IID == Intrinsic::amdgcn_cvt_pknorm_i16)
+ Opcode = AMDGPUISD::CVT_PKNORM_I16_F32;
+ else if (IID == Intrinsic::amdgcn_cvt_pknorm_u16)
+ Opcode = AMDGPUISD::CVT_PKNORM_U16_F32;
+ else if (IID == Intrinsic::amdgcn_cvt_pk_i16)
+ Opcode = AMDGPUISD::CVT_PK_I16_I32;
+ else
+ Opcode = AMDGPUISD::CVT_PK_U16_U32;
+
+ SDValue Cvt = DAG.getNode(Opcode, SL, MVT::i32, Src0, Src1);
+ Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, Cvt));
+ return;
+ }
+ }
+ break;
+ }
+ case ISD::INTRINSIC_W_CHAIN: {
+ if (SDValue Res = LowerINTRINSIC_W_CHAIN(SDValue(N, 0), DAG)) {
+ Results.push_back(Res);
+ Results.push_back(Res.getValue(1));
+ return;
+ }
+
break;
}
case ISD::SELECT: {
@@ -3347,12 +3734,38 @@ void SITargetLowering::ReplaceNodeResults(SDNode *N,
Results.push_back(DAG.getNode(ISD::BITCAST, SL, VT, NewSelect));
return;
}
+ case ISD::FNEG: {
+ if (N->getValueType(0) != MVT::v2f16)
+ break;
+
+ SDLoc SL(N);
+ SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, N->getOperand(0));
+
+ SDValue Op = DAG.getNode(ISD::XOR, SL, MVT::i32,
+ BC,
+ DAG.getConstant(0x80008000, SL, MVT::i32));
+ Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Op));
+ return;
+ }
+ case ISD::FABS: {
+ if (N->getValueType(0) != MVT::v2f16)
+ break;
+
+ SDLoc SL(N);
+ SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, N->getOperand(0));
+
+ SDValue Op = DAG.getNode(ISD::AND, SL, MVT::i32,
+ BC,
+ DAG.getConstant(0x7fff7fff, SL, MVT::i32));
+ Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Op));
+ return;
+ }
default:
break;
}
}
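The new v2f16 FNEG/FABS cases reduce to single bitwise operations on the packed 32-bit value, since each f16 lane keeps its sign in bit 15. A standalone sketch of those two masks, operating on the raw bit pattern rather than LLVM types:

#include <cstdint>

// Negate / take the absolute value of both f16 lanes of a packed v2f16,
// exactly as the XOR 0x80008000 / AND 0x7fff7fff nodes above do.
uint32_t fnegV2F16Bits(uint32_t Packed) { return Packed ^ 0x80008000u; }
uint32_t fabsV2F16Bits(uint32_t Packed) { return Packed & 0x7fff7fffu; }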
-/// \brief Helper function for LowerBRCOND
+/// Helper function for LowerBRCOND
static SDNode *findUser(SDValue Value, unsigned Opcode) {
SDNode *Parent = Value.getNode();
@@ -3417,13 +3830,15 @@ void SITargetLowering::createDebuggerPrologueStackObjects(
bool SITargetLowering::shouldEmitFixup(const GlobalValue *GV) const {
const Triple &TT = getTargetMachine().getTargetTriple();
- return GV->getType()->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS &&
+ return (GV->getType()->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS ||
+ GV->getType()->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS_32BIT) &&
AMDGPU::shouldEmitConstantsToTextSection(TT);
}
bool SITargetLowering::shouldEmitGOTReloc(const GlobalValue *GV) const {
return (GV->getType()->getAddressSpace() == AMDGPUASI.GLOBAL_ADDRESS ||
- GV->getType()->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS) &&
+ GV->getType()->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS ||
+ GV->getType()->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS_32BIT) &&
!shouldEmitFixup(GV) &&
!getTargetMachine().shouldAssumeDSOLocal(*GV->getParent(), GV);
}
@@ -3560,40 +3975,37 @@ SDValue SITargetLowering::lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
SDValue SITargetLowering::lowerTRAP(SDValue Op, SelectionDAG &DAG) const {
SDLoc SL(Op);
- MachineFunction &MF = DAG.getMachineFunction();
SDValue Chain = Op.getOperand(0);
- unsigned TrapID = Op.getOpcode() == ISD::DEBUGTRAP ?
- SISubtarget::TrapIDLLVMDebugTrap : SISubtarget::TrapIDLLVMTrap;
-
- if (Subtarget->getTrapHandlerAbi() == SISubtarget::TrapHandlerAbiHsa &&
- Subtarget->isTrapHandlerEnabled()) {
- SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
- unsigned UserSGPR = Info->getQueuePtrUserSGPR();
- assert(UserSGPR != AMDGPU::NoRegister);
-
- SDValue QueuePtr = CreateLiveInRegister(
- DAG, &AMDGPU::SReg_64RegClass, UserSGPR, MVT::i64);
-
- SDValue SGPR01 = DAG.getRegister(AMDGPU::SGPR0_SGPR1, MVT::i64);
-
- SDValue ToReg = DAG.getCopyToReg(Chain, SL, SGPR01,
- QueuePtr, SDValue());
+ if (Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
+ !Subtarget->isTrapHandlerEnabled())
+ return DAG.getNode(AMDGPUISD::ENDPGM, SL, MVT::Other, Chain);
- SDValue Ops[] = {
- ToReg,
- DAG.getTargetConstant(TrapID, SL, MVT::i16),
- SGPR01,
- ToReg.getValue(1)
- };
+ MachineFunction &MF = DAG.getMachineFunction();
+ SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
+ unsigned UserSGPR = Info->getQueuePtrUserSGPR();
+ assert(UserSGPR != AMDGPU::NoRegister);
+ SDValue QueuePtr = CreateLiveInRegister(
+ DAG, &AMDGPU::SReg_64RegClass, UserSGPR, MVT::i64);
+ SDValue SGPR01 = DAG.getRegister(AMDGPU::SGPR0_SGPR1, MVT::i64);
+ SDValue ToReg = DAG.getCopyToReg(Chain, SL, SGPR01,
+ QueuePtr, SDValue());
+ SDValue Ops[] = {
+ ToReg,
+ DAG.getTargetConstant(GCNSubtarget::TrapIDLLVMTrap, SL, MVT::i16),
+ SGPR01,
+ ToReg.getValue(1)
+ };
+ return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
+}
- return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
- }
+SDValue SITargetLowering::lowerDEBUGTRAP(SDValue Op, SelectionDAG &DAG) const {
+ SDLoc SL(Op);
+ SDValue Chain = Op.getOperand(0);
+ MachineFunction &MF = DAG.getMachineFunction();
- switch (TrapID) {
- case SISubtarget::TrapIDLLVMTrap:
- return DAG.getNode(AMDGPUISD::ENDPGM, SL, MVT::Other, Chain);
- case SISubtarget::TrapIDLLVMDebugTrap: {
+ if (Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
+ !Subtarget->isTrapHandlerEnabled()) {
DiagnosticInfoUnsupported NoTrap(MF.getFunction(),
"debugtrap handler not supported",
Op.getDebugLoc(),
@@ -3602,11 +4014,12 @@ SDValue SITargetLowering::lowerTRAP(SDValue Op, SelectionDAG &DAG) const {
Ctx.diagnose(NoTrap);
return Chain;
}
- default:
- llvm_unreachable("unsupported trap handler type!");
- }
- return Chain;
+ SDValue Ops[] = {
+ Chain,
+ DAG.getTargetConstant(GCNSubtarget::TrapIDLLVMDebugTrap, SL, MVT::i16)
+ };
+ return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
}
SDValue SITargetLowering::getSegmentAperture(unsigned AS, const SDLoc &DL,
@@ -3719,34 +4132,78 @@ SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op,
SDValue SITargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op,
SelectionDAG &DAG) const {
+ SDValue Vec = Op.getOperand(0);
+ SDValue InsVal = Op.getOperand(1);
SDValue Idx = Op.getOperand(2);
+ EVT VecVT = Vec.getValueType();
+ EVT EltVT = VecVT.getVectorElementType();
+ unsigned VecSize = VecVT.getSizeInBits();
+ unsigned EltSize = EltVT.getSizeInBits();
+
+ assert(VecSize <= 64);
+
+ unsigned NumElts = VecVT.getVectorNumElements();
+ SDLoc SL(Op);
+ auto KIdx = dyn_cast<ConstantSDNode>(Idx);
+
+ if (NumElts == 4 && EltSize == 16 && KIdx) {
+ SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Vec);
+
+ SDValue LoHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BCVec,
+ DAG.getConstant(0, SL, MVT::i32));
+ SDValue HiHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BCVec,
+ DAG.getConstant(1, SL, MVT::i32));
+
+ SDValue LoVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, LoHalf);
+ SDValue HiVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, HiHalf);
+
+ unsigned Idx = KIdx->getZExtValue();
+ bool InsertLo = Idx < 2;
+ SDValue InsHalf = DAG.getNode(ISD::INSERT_VECTOR_ELT, SL, MVT::v2i16,
+ InsertLo ? LoVec : HiVec,
+ DAG.getNode(ISD::BITCAST, SL, MVT::i16, InsVal),
+ DAG.getConstant(InsertLo ? Idx : (Idx - 2), SL, MVT::i32));
+
+ InsHalf = DAG.getNode(ISD::BITCAST, SL, MVT::i32, InsHalf);
+
+ SDValue Concat = InsertLo ?
+ DAG.getBuildVector(MVT::v2i32, SL, { InsHalf, HiHalf }) :
+ DAG.getBuildVector(MVT::v2i32, SL, { LoHalf, InsHalf });
+
+ return DAG.getNode(ISD::BITCAST, SL, VecVT, Concat);
+ }
+
if (isa<ConstantSDNode>(Idx))
return SDValue();
+ MVT IntVT = MVT::getIntegerVT(VecSize);
+
// Avoid stack access for dynamic indexing.
- SDLoc SL(Op);
- SDValue Vec = Op.getOperand(0);
- SDValue Val = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Op.getOperand(1));
+ SDValue Val = InsVal;
+ if (InsVal.getValueType() == MVT::f16)
+ Val = DAG.getNode(ISD::BITCAST, SL, MVT::i16, InsVal);
// v_bfi_b32 (v_bfm_b32 16, (shl idx, 16)), val, vec
- SDValue ExtVal = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Val);
+ SDValue ExtVal = DAG.getNode(ISD::ZERO_EXTEND, SL, IntVT, Val);
- // Convert vector index to bit-index.
- SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx,
- DAG.getConstant(16, SL, MVT::i32));
+ assert(isPowerOf2_32(EltSize));
+ SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32);
- SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Vec);
+ // Convert vector index to bit-index.
+ SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, ScaleFactor);
- SDValue BFM = DAG.getNode(ISD::SHL, SL, MVT::i32,
- DAG.getConstant(0xffff, SL, MVT::i32),
+ SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, IntVT, Vec);
+ SDValue BFM = DAG.getNode(ISD::SHL, SL, IntVT,
+ DAG.getConstant(0xffff, SL, IntVT),
ScaledIdx);
- SDValue LHS = DAG.getNode(ISD::AND, SL, MVT::i32, BFM, ExtVal);
- SDValue RHS = DAG.getNode(ISD::AND, SL, MVT::i32,
- DAG.getNOT(SL, BFM, MVT::i32), BCVec);
+ SDValue LHS = DAG.getNode(ISD::AND, SL, IntVT, BFM, ExtVal);
+ SDValue RHS = DAG.getNode(ISD::AND, SL, IntVT,
+ DAG.getNOT(SL, BFM, IntVT), BCVec);
- SDValue BFI = DAG.getNode(ISD::OR, SL, MVT::i32, LHS, RHS);
- return DAG.getNode(ISD::BITCAST, SL, Op.getValueType(), BFI);
+ SDValue BFI = DAG.getNode(ISD::OR, SL, IntVT, LHS, RHS);
+ return DAG.getNode(ISD::BITCAST, SL, VecVT, BFI);
}
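For a non-constant index, the lowering keeps the whole vector in a scalar register and blends the new element in under a mask, the v_bfm_b32/v_bfi_b32 pattern named in the comment. A generic plain-integer illustration of that bit-field insert (not a line-for-line transcription of the DAG nodes above; the value is explicitly shifted to the element's bit position here for clarity):

#include <cstdint>

// Insert a 16-bit element into a packed vector held in a scalar register:
// build a mask over the selected lane (v_bfm_b32), then blend (v_bfi_b32).
uint64_t insertElt16(uint64_t PackedVec, uint16_t Val, unsigned Idx) {
  unsigned BitIdx = Idx * 16;                 // vector index -> bit index
  uint64_t BFM = uint64_t(0xffff) << BitIdx;  // mask selecting the lane
  uint64_t Placed = uint64_t(Val) << BitIdx;  // value moved into the lane
  return (BFM & Placed) | (~BFM & PackedVec); // bit-field insert
}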
SDValue SITargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
@@ -3756,51 +4213,87 @@ SDValue SITargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
EVT ResultVT = Op.getValueType();
SDValue Vec = Op.getOperand(0);
SDValue Idx = Op.getOperand(1);
+ EVT VecVT = Vec.getValueType();
+ unsigned VecSize = VecVT.getSizeInBits();
+ EVT EltVT = VecVT.getVectorElementType();
+ assert(VecSize <= 64);
DAGCombinerInfo DCI(DAG, AfterLegalizeVectorOps, true, nullptr);
- // Make sure we we do any optimizations that will make it easier to fold
+ // Make sure we do any optimizations that will make it easier to fold
// source modifiers before obscuring it with bit operations.
// XXX - Why doesn't this get called when vector_shuffle is expanded?
if (SDValue Combined = performExtractVectorEltCombine(Op.getNode(), DCI))
return Combined;
- if (const ConstantSDNode *CIdx = dyn_cast<ConstantSDNode>(Idx)) {
- SDValue Result = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Vec);
+ unsigned EltSize = EltVT.getSizeInBits();
+ assert(isPowerOf2_32(EltSize));
- if (CIdx->getZExtValue() == 1) {
- Result = DAG.getNode(ISD::SRL, SL, MVT::i32, Result,
- DAG.getConstant(16, SL, MVT::i32));
- } else {
- assert(CIdx->getZExtValue() == 0);
- }
+ MVT IntVT = MVT::getIntegerVT(VecSize);
+ SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32);
+
+ // Convert vector index to bit-index (* EltSize)
+ SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, ScaleFactor);
- if (ResultVT.bitsLT(MVT::i32))
- Result = DAG.getNode(ISD::TRUNCATE, SL, MVT::i16, Result);
+ SDValue BC = DAG.getNode(ISD::BITCAST, SL, IntVT, Vec);
+ SDValue Elt = DAG.getNode(ISD::SRL, SL, IntVT, BC, ScaledIdx);
+
+ if (ResultVT == MVT::f16) {
+ SDValue Result = DAG.getNode(ISD::TRUNCATE, SL, MVT::i16, Elt);
return DAG.getNode(ISD::BITCAST, SL, ResultVT, Result);
}
- SDValue Sixteen = DAG.getConstant(16, SL, MVT::i32);
+ return DAG.getAnyExtOrTrunc(Elt, SL, ResultVT);
+}
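The dynamic extract case is the mirror image: bitcast the vector to a scalar integer, shift the selected element down to bit 0, then truncate or any-extend to the result type. A minimal sketch of that arithmetic for 16-bit elements:

#include <cstdint>

// Extract a 16-bit element from a packed vector held in a scalar register,
// mirroring the SRL-by-(Idx * EltSize) + truncate sequence above.
uint16_t extractElt16(uint64_t PackedVec, unsigned Idx) {
  unsigned BitIdx = Idx * 16;                 // vector index -> bit index
  return static_cast<uint16_t>(PackedVec >> BitIdx);
}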
- // Convert vector index to bit-index.
- SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, Sixteen);
+SDValue SITargetLowering::lowerBUILD_VECTOR(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDLoc SL(Op);
+ EVT VT = Op.getValueType();
+
+ if (VT == MVT::v4i16 || VT == MVT::v4f16) {
+ EVT HalfVT = MVT::getVectorVT(VT.getVectorElementType().getSimpleVT(), 2);
+
+ // Turn into pair of packed build_vectors.
+ // TODO: Special case for constants that can be materialized with s_mov_b64.
+ SDValue Lo = DAG.getBuildVector(HalfVT, SL,
+ { Op.getOperand(0), Op.getOperand(1) });
+ SDValue Hi = DAG.getBuildVector(HalfVT, SL,
+ { Op.getOperand(2), Op.getOperand(3) });
+
+ SDValue CastLo = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Lo);
+ SDValue CastHi = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Hi);
+
+ SDValue Blend = DAG.getBuildVector(MVT::v2i32, SL, { CastLo, CastHi });
+ return DAG.getNode(ISD::BITCAST, SL, VT, Blend);
+ }
- SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Vec);
- SDValue Elt = DAG.getNode(ISD::SRL, SL, MVT::i32, BC, ScaledIdx);
+ assert(VT == MVT::v2f16 || VT == MVT::v2i16);
- SDValue Result = Elt;
- if (ResultVT.bitsLT(MVT::i32))
- Result = DAG.getNode(ISD::TRUNCATE, SL, MVT::i16, Result);
+ SDValue Lo = Op.getOperand(0);
+ SDValue Hi = Op.getOperand(1);
- return DAG.getNode(ISD::BITCAST, SL, ResultVT, Result);
+ Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Lo);
+ Hi = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Hi);
+
+ Lo = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Lo);
+ Hi = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Hi);
+
+ SDValue ShlHi = DAG.getNode(ISD::SHL, SL, MVT::i32, Hi,
+ DAG.getConstant(16, SL, MVT::i32));
+
+ SDValue Or = DAG.getNode(ISD::OR, SL, MVT::i32, Lo, ShlHi);
+
+ return DAG.getNode(ISD::BITCAST, SL, VT, Or);
}
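lowerBUILD_VECTOR packs two 16-bit elements into one 32-bit register with a zero-extend, shift, and OR (the four-element case is handled as two such pairs). The equivalent scalar computation:

#include <cstdint>

// Pack two 16-bit lanes into a 32-bit word: lane 0 in the low half, lane 1 in
// the high half, matching the zext + shl 16 + or sequence above.
uint32_t packV2x16(uint16_t Lo, uint16_t Hi) {
  return static_cast<uint32_t>(Lo) | (static_cast<uint32_t>(Hi) << 16);
}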
bool
SITargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
// We can fold offsets for anything that doesn't require a GOT relocation.
return (GA->getAddressSpace() == AMDGPUASI.GLOBAL_ADDRESS ||
- GA->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS) &&
+ GA->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS ||
+ GA->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS_32BIT) &&
!shouldEmitGOTReloc(GA->getGlobal());
}
@@ -3853,6 +4346,7 @@ SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
const GlobalValue *GV = GSD->getGlobal();
if (GSD->getAddressSpace() != AMDGPUASI.CONSTANT_ADDRESS &&
+ GSD->getAddressSpace() != AMDGPUASI.CONSTANT_ADDRESS_32BIT &&
GSD->getAddressSpace() != AMDGPUASI.GLOBAL_ADDRESS &&
// FIXME: It isn't correct to rely on the type of the pointer. This should
// be removed when address space 0 is 64-bit.
@@ -3905,7 +4399,7 @@ SDValue SITargetLowering::lowerImplicitZextParam(SelectionDAG &DAG,
unsigned Offset) const {
SDLoc SL(Op);
SDValue Param = lowerKernargMemParameter(DAG, MVT::i32, MVT::i32, SL,
- DAG.getEntryNode(), Offset, false);
+ DAG.getEntryNode(), Offset, 4, false);
// The local size values will have the hi 16-bits as zero.
return DAG.getNode(ISD::AssertZext, SL, MVT::i32, Param,
DAG.getValueType(VT));
@@ -3929,6 +4423,245 @@ static SDValue emitRemovedIntrinsicError(SelectionDAG &DAG, const SDLoc &DL,
return DAG.getUNDEF(VT);
}
+static SDValue getBuildDwordsVector(SelectionDAG &DAG, SDLoc DL,
+ ArrayRef<SDValue> Elts) {
+ assert(!Elts.empty());
+ MVT Type;
+ unsigned NumElts;
+
+ if (Elts.size() == 1) {
+ Type = MVT::f32;
+ NumElts = 1;
+ } else if (Elts.size() == 2) {
+ Type = MVT::v2f32;
+ NumElts = 2;
+ } else if (Elts.size() <= 4) {
+ Type = MVT::v4f32;
+ NumElts = 4;
+ } else if (Elts.size() <= 8) {
+ Type = MVT::v8f32;
+ NumElts = 8;
+ } else {
+ assert(Elts.size() <= 16);
+ Type = MVT::v16f32;
+ NumElts = 16;
+ }
+
+ SmallVector<SDValue, 16> VecElts(NumElts);
+ for (unsigned i = 0; i < Elts.size(); ++i) {
+ SDValue Elt = Elts[i];
+ if (Elt.getValueType() != MVT::f32)
+ Elt = DAG.getBitcast(MVT::f32, Elt);
+ VecElts[i] = Elt;
+ }
+ for (unsigned i = Elts.size(); i < NumElts; ++i)
+ VecElts[i] = DAG.getUNDEF(MVT::f32);
+
+ if (NumElts == 1)
+ return VecElts[0];
+ return DAG.getBuildVector(Type, DL, VecElts);
+}
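getBuildDwordsVector rounds the number of address dwords up to one of the vector widths it supports (1, 2, 4, 8, or 16 f32s) and pads the tail with undef. A sketch of just the rounding step (the helper name is illustrative):

// Round an address-dword count up to the nearest supported vector width.
unsigned roundUpAddrDwords(unsigned N) {
  if (N <= 1) return 1;
  if (N <= 2) return 2;
  if (N <= 4) return 4;
  if (N <= 8) return 8;
  return 16; // the caller asserts N <= 16
}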
+
+static bool parseCachePolicy(SDValue CachePolicy, SelectionDAG &DAG,
+ SDValue *GLC, SDValue *SLC) {
+ auto CachePolicyConst = dyn_cast<ConstantSDNode>(CachePolicy.getNode());
+ if (!CachePolicyConst)
+ return false;
+
+ uint64_t Value = CachePolicyConst->getZExtValue();
+ SDLoc DL(CachePolicy);
+ if (GLC) {
+ *GLC = DAG.getTargetConstant((Value & 0x1) ? 1 : 0, DL, MVT::i32);
+ Value &= ~(uint64_t)0x1;
+ }
+ if (SLC) {
+ *SLC = DAG.getTargetConstant((Value & 0x2) ? 1 : 0, DL, MVT::i32);
+ Value &= ~(uint64_t)0x2;
+ }
+
+ return Value == 0;
+}
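parseCachePolicy treats the cachepolicy immediate as a small bitfield: bit 0 is glc, bit 1 is slc, and any other set bit makes it return false, after which the caller leaves the node untouched. A plain sketch of that decoding (the struct and names are illustrative, not part of the patch):

#include <cstdint>

struct CachePolicyBits {
  bool GLC;   // bit 0
  bool SLC;   // bit 1
  bool Valid; // no unknown bits set
};

CachePolicyBits decodeCachePolicy(uint64_t Value) {
  CachePolicyBits R;
  R.GLC = (Value & 0x1) != 0;
  R.SLC = (Value & 0x2) != 0;
  R.Valid = (Value & ~uint64_t(0x3)) == 0;
  return R;
}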
+
+SDValue SITargetLowering::lowerImage(SDValue Op,
+ const AMDGPU::ImageDimIntrinsicInfo *Intr,
+ SelectionDAG &DAG) const {
+ SDLoc DL(Op);
+ MachineFunction &MF = DAG.getMachineFunction();
+ const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
+ AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
+ const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim);
+
+ SmallVector<EVT, 2> ResultTypes(Op->value_begin(), Op->value_end());
+ bool IsD16 = false;
+ SDValue VData;
+ int NumVDataDwords;
+ unsigned AddrIdx; // Index of first address argument
+ unsigned DMask;
+
+ if (BaseOpcode->Atomic) {
+ VData = Op.getOperand(2);
+
+ bool Is64Bit = VData.getValueType() == MVT::i64;
+ if (BaseOpcode->AtomicX2) {
+ SDValue VData2 = Op.getOperand(3);
+ VData = DAG.getBuildVector(Is64Bit ? MVT::v2i64 : MVT::v2i32, DL,
+ {VData, VData2});
+ if (Is64Bit)
+ VData = DAG.getBitcast(MVT::v4i32, VData);
+
+ ResultTypes[0] = Is64Bit ? MVT::v2i64 : MVT::v2i32;
+ DMask = Is64Bit ? 0xf : 0x3;
+ NumVDataDwords = Is64Bit ? 4 : 2;
+ AddrIdx = 4;
+ } else {
+ DMask = Is64Bit ? 0x3 : 0x1;
+ NumVDataDwords = Is64Bit ? 2 : 1;
+ AddrIdx = 3;
+ }
+ } else {
+ unsigned DMaskIdx;
+
+ if (BaseOpcode->Store) {
+ VData = Op.getOperand(2);
+
+ MVT StoreVT = VData.getSimpleValueType();
+ if (StoreVT.getScalarType() == MVT::f16) {
+ if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS ||
+ !BaseOpcode->HasD16)
+ return Op; // D16 is unsupported for this instruction
+
+ IsD16 = true;
+ VData = handleD16VData(VData, DAG);
+ }
+
+ NumVDataDwords = (VData.getValueType().getSizeInBits() + 31) / 32;
+ DMaskIdx = 3;
+ } else {
+ MVT LoadVT = Op.getSimpleValueType();
+ if (LoadVT.getScalarType() == MVT::f16) {
+ if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS ||
+ !BaseOpcode->HasD16)
+ return Op; // D16 is unsupported for this instruction
+
+ IsD16 = true;
+ if (LoadVT.isVector() && Subtarget->hasUnpackedD16VMem())
+ ResultTypes[0] = (LoadVT == MVT::v2f16) ? MVT::v2i32 : MVT::v4i32;
+ }
+
+ NumVDataDwords = (ResultTypes[0].getSizeInBits() + 31) / 32;
+ DMaskIdx = isa<MemSDNode>(Op) ? 2 : 1;
+ }
+
+ auto DMaskConst = dyn_cast<ConstantSDNode>(Op.getOperand(DMaskIdx));
+ if (!DMaskConst)
+ return Op;
+
+ AddrIdx = DMaskIdx + 1;
+ DMask = DMaskConst->getZExtValue();
+ if (!DMask && !BaseOpcode->Store) {
+ // Eliminate no-op loads. Stores with dmask == 0 are *not* no-op: they
+ // store the channels' default values.
+ SDValue Undef = DAG.getUNDEF(Op.getValueType());
+ if (isa<MemSDNode>(Op))
+ return DAG.getMergeValues({Undef, Op.getOperand(0)}, DL);
+ return Undef;
+ }
+ }
+
+ unsigned NumVAddrs = BaseOpcode->NumExtraArgs +
+ (BaseOpcode->Gradients ? DimInfo->NumGradients : 0) +
+ (BaseOpcode->Coordinates ? DimInfo->NumCoords : 0) +
+ (BaseOpcode->LodOrClampOrMip ? 1 : 0);
+ SmallVector<SDValue, 4> VAddrs;
+ for (unsigned i = 0; i < NumVAddrs; ++i)
+ VAddrs.push_back(Op.getOperand(AddrIdx + i));
+ SDValue VAddr = getBuildDwordsVector(DAG, DL, VAddrs);
+
+ SDValue True = DAG.getTargetConstant(1, DL, MVT::i1);
+ SDValue False = DAG.getTargetConstant(0, DL, MVT::i1);
+ unsigned CtrlIdx; // Index of texfailctrl argument
+ SDValue Unorm;
+ if (!BaseOpcode->Sampler) {
+ Unorm = True;
+ CtrlIdx = AddrIdx + NumVAddrs + 1;
+ } else {
+ auto UnormConst =
+ dyn_cast<ConstantSDNode>(Op.getOperand(AddrIdx + NumVAddrs + 2));
+ if (!UnormConst)
+ return Op;
+
+ Unorm = UnormConst->getZExtValue() ? True : False;
+ CtrlIdx = AddrIdx + NumVAddrs + 3;
+ }
+
+ SDValue TexFail = Op.getOperand(CtrlIdx);
+ auto TexFailConst = dyn_cast<ConstantSDNode>(TexFail.getNode());
+ if (!TexFailConst || TexFailConst->getZExtValue() != 0)
+ return Op;
+
+ SDValue GLC;
+ SDValue SLC;
+ if (BaseOpcode->Atomic) {
+ GLC = True; // TODO no-return optimization
+ if (!parseCachePolicy(Op.getOperand(CtrlIdx + 1), DAG, nullptr, &SLC))
+ return Op;
+ } else {
+ if (!parseCachePolicy(Op.getOperand(CtrlIdx + 1), DAG, &GLC, &SLC))
+ return Op;
+ }
+
+ SmallVector<SDValue, 14> Ops;
+ if (BaseOpcode->Store || BaseOpcode->Atomic)
+ Ops.push_back(VData); // vdata
+ Ops.push_back(VAddr);
+ Ops.push_back(Op.getOperand(AddrIdx + NumVAddrs)); // rsrc
+ if (BaseOpcode->Sampler)
+ Ops.push_back(Op.getOperand(AddrIdx + NumVAddrs + 1)); // sampler
+ Ops.push_back(DAG.getTargetConstant(DMask, DL, MVT::i32));
+ Ops.push_back(Unorm);
+ Ops.push_back(GLC);
+ Ops.push_back(SLC);
+ Ops.push_back(False); // r128
+ Ops.push_back(False); // tfe
+ Ops.push_back(False); // lwe
+ Ops.push_back(DimInfo->DA ? True : False);
+ if (BaseOpcode->HasD16)
+ Ops.push_back(IsD16 ? True : False);
+ if (isa<MemSDNode>(Op))
+ Ops.push_back(Op.getOperand(0)); // chain
+
+ int NumVAddrDwords = VAddr.getValueType().getSizeInBits() / 32;
+ int Opcode = -1;
+
+ if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
+ Opcode = AMDGPU::getMIMGOpcode(Intr->BaseOpcode, AMDGPU::MIMGEncGfx8,
+ NumVDataDwords, NumVAddrDwords);
+ if (Opcode == -1)
+ Opcode = AMDGPU::getMIMGOpcode(Intr->BaseOpcode, AMDGPU::MIMGEncGfx6,
+ NumVDataDwords, NumVAddrDwords);
+ assert(Opcode != -1);
+
+ MachineSDNode *NewNode = DAG.getMachineNode(Opcode, DL, ResultTypes, Ops);
+ if (auto MemOp = dyn_cast<MemSDNode>(Op)) {
+ MachineInstr::mmo_iterator MemRefs = MF.allocateMemRefsArray(1);
+ *MemRefs = MemOp->getMemOperand();
+ NewNode->setMemRefs(MemRefs, MemRefs + 1);
+ }
+
+ if (BaseOpcode->AtomicX2) {
+ SmallVector<SDValue, 1> Elt;
+ DAG.ExtractVectorElements(SDValue(NewNode, 0), Elt, 0, 1);
+ return DAG.getMergeValues({Elt[0], SDValue(NewNode, 1)}, DL);
+ } else if (IsD16 && !BaseOpcode->Store) {
+ MVT LoadVT = Op.getSimpleValueType();
+ SDValue Adjusted = adjustLoadValueTypeImpl(
+ SDValue(NewNode, 0), LoadVT, DL, DAG, Subtarget->hasUnpackedD16VMem());
+ return DAG.getMergeValues({Adjusted, SDValue(NewNode, 1)}, DL);
+ }
+
+ return SDValue(NewNode, 0);
+}
+
SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
SelectionDAG &DAG) const {
MachineFunction &MF = DAG.getMachineFunction();
@@ -3942,14 +4675,14 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
switch (IntrinsicID) {
case Intrinsic::amdgcn_implicit_buffer_ptr: {
- if (getSubtarget()->isAmdCodeObjectV2(MF))
+ if (getSubtarget()->isAmdCodeObjectV2(MF.getFunction()))
return emitNonHSAIntrinsicError(DAG, DL, VT);
return getPreloadedValue(DAG, *MFI, VT,
AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
}
case Intrinsic::amdgcn_dispatch_ptr:
case Intrinsic::amdgcn_queue_ptr: {
- if (!Subtarget->isAmdCodeObjectV2(MF)) {
+ if (!Subtarget->isAmdCodeObjectV2(MF.getFunction())) {
DiagnosticInfoUnsupported BadIntrin(
MF.getFunction(), "unsupported hsa intrinsic without hsa target",
DL.getDebugLoc());
@@ -3979,16 +4712,16 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
case Intrinsic::amdgcn_rsq:
return DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
case Intrinsic::amdgcn_rsq_legacy:
- if (Subtarget->getGeneration() >= SISubtarget::VOLCANIC_ISLANDS)
+ if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
return emitRemovedIntrinsicError(DAG, DL, VT);
return DAG.getNode(AMDGPUISD::RSQ_LEGACY, DL, VT, Op.getOperand(1));
case Intrinsic::amdgcn_rcp_legacy:
- if (Subtarget->getGeneration() >= SISubtarget::VOLCANIC_ISLANDS)
+ if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
return emitRemovedIntrinsicError(DAG, DL, VT);
return DAG.getNode(AMDGPUISD::RCP_LEGACY, DL, VT, Op.getOperand(1));
case Intrinsic::amdgcn_rsq_clamp: {
- if (Subtarget->getGeneration() < SISubtarget::VOLCANIC_ISLANDS)
+ if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
return DAG.getNode(AMDGPUISD::RSQ_CLAMP, DL, VT, Op.getOperand(1));
Type *Type = VT.getTypeForEVT(*DAG.getContext());
@@ -4006,37 +4739,37 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
return emitNonHSAIntrinsicError(DAG, DL, VT);
return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
- SI::KernelInputOffsets::NGROUPS_X, false);
+ SI::KernelInputOffsets::NGROUPS_X, 4, false);
case Intrinsic::r600_read_ngroups_y:
if (Subtarget->isAmdHsaOS())
return emitNonHSAIntrinsicError(DAG, DL, VT);
return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
- SI::KernelInputOffsets::NGROUPS_Y, false);
+ SI::KernelInputOffsets::NGROUPS_Y, 4, false);
case Intrinsic::r600_read_ngroups_z:
if (Subtarget->isAmdHsaOS())
return emitNonHSAIntrinsicError(DAG, DL, VT);
return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
- SI::KernelInputOffsets::NGROUPS_Z, false);
+ SI::KernelInputOffsets::NGROUPS_Z, 4, false);
case Intrinsic::r600_read_global_size_x:
if (Subtarget->isAmdHsaOS())
return emitNonHSAIntrinsicError(DAG, DL, VT);
return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
- SI::KernelInputOffsets::GLOBAL_SIZE_X, false);
+ SI::KernelInputOffsets::GLOBAL_SIZE_X, 4, false);
case Intrinsic::r600_read_global_size_y:
if (Subtarget->isAmdHsaOS())
return emitNonHSAIntrinsicError(DAG, DL, VT);
return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
- SI::KernelInputOffsets::GLOBAL_SIZE_Y, false);
+ SI::KernelInputOffsets::GLOBAL_SIZE_Y, 4, false);
case Intrinsic::r600_read_global_size_z:
if (Subtarget->isAmdHsaOS())
return emitNonHSAIntrinsicError(DAG, DL, VT);
return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
- SI::KernelInputOffsets::GLOBAL_SIZE_Z, false);
+ SI::KernelInputOffsets::GLOBAL_SIZE_Z, 4, false);
case Intrinsic::r600_read_local_size_x:
if (Subtarget->isAmdHsaOS())
return emitNonHSAIntrinsicError(DAG, DL, VT);
@@ -4125,7 +4858,7 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
return DAG.getNode(AMDGPUISD::COS_HW, DL, VT, Op.getOperand(1));
case Intrinsic::amdgcn_log_clamp: {
- if (Subtarget->getGeneration() < SISubtarget::VOLCANIC_ISLANDS)
+ if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
return SDValue();
DiagnosticInfoUnsupported BadIntrin(
@@ -4210,6 +4943,9 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
case Intrinsic::amdgcn_fmed3:
return DAG.getNode(AMDGPUISD::FMED3, DL, VT,
Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
+ case Intrinsic::amdgcn_fdot2:
+ return DAG.getNode(AMDGPUISD::FDOT2, DL, VT,
+ Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
case Intrinsic::amdgcn_fmul_legacy:
return DAG.getNode(AMDGPUISD::FMUL_LEGACY, DL, VT,
Op.getOperand(1), Op.getOperand(2));
@@ -4221,10 +4957,27 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
case Intrinsic::amdgcn_ubfe:
return DAG.getNode(AMDGPUISD::BFE_U32, DL, VT,
Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
- case Intrinsic::amdgcn_cvt_pkrtz: {
- // FIXME: Stop adding cast if v2f16 legal.
+ case Intrinsic::amdgcn_cvt_pkrtz:
+ case Intrinsic::amdgcn_cvt_pknorm_i16:
+ case Intrinsic::amdgcn_cvt_pknorm_u16:
+ case Intrinsic::amdgcn_cvt_pk_i16:
+ case Intrinsic::amdgcn_cvt_pk_u16: {
+ // FIXME: Stop adding cast if v2f16/v2i16 are legal.
EVT VT = Op.getValueType();
- SDValue Node = DAG.getNode(AMDGPUISD::CVT_PKRTZ_F16_F32, DL, MVT::i32,
+ unsigned Opcode;
+
+ if (IntrinsicID == Intrinsic::amdgcn_cvt_pkrtz)
+ Opcode = AMDGPUISD::CVT_PKRTZ_F16_F32;
+ else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_i16)
+ Opcode = AMDGPUISD::CVT_PKNORM_I16_F32;
+ else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_u16)
+ Opcode = AMDGPUISD::CVT_PKNORM_U16_F32;
+ else if (IntrinsicID == Intrinsic::amdgcn_cvt_pk_i16)
+ Opcode = AMDGPUISD::CVT_PK_I16_I32;
+ else
+ Opcode = AMDGPUISD::CVT_PK_U16_U32;
+
+ SDValue Node = DAG.getNode(Opcode, DL, MVT::i32,
Op.getOperand(1), Op.getOperand(2));
return DAG.getNode(ISD::BITCAST, DL, VT, Node);
}
@@ -4238,17 +4991,14 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
return SDValue(DAG.getMachineNode(AMDGPU::WWM, DL, Src.getValueType(), Src),
0);
}
- case Intrinsic::amdgcn_image_getlod:
- case Intrinsic::amdgcn_image_getresinfo: {
- unsigned Idx = (IntrinsicID == Intrinsic::amdgcn_image_getresinfo) ? 3 : 4;
-
- // Replace dmask with everything disabled with undef.
- const ConstantSDNode *DMask = dyn_cast<ConstantSDNode>(Op.getOperand(Idx));
- if (!DMask || DMask->isNullValue())
- return DAG.getUNDEF(Op.getValueType());
- return SDValue();
- }
+ case Intrinsic::amdgcn_fmad_ftz:
+ return DAG.getNode(AMDGPUISD::FMAD_FTZ, DL, VT, Op.getOperand(1),
+ Op.getOperand(2), Op.getOperand(3));
default:
+ if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
+ AMDGPU::getImageDimIntrinsicInfo(IntrinsicID))
+ return lowerImage(Op, ImageDimIntr, DAG);
+
return Op;
}
}
@@ -4257,14 +5007,34 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
SelectionDAG &DAG) const {
unsigned IntrID = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
SDLoc DL(Op);
- MachineFunction &MF = DAG.getMachineFunction();
switch (IntrID) {
case Intrinsic::amdgcn_atomic_inc:
- case Intrinsic::amdgcn_atomic_dec: {
+ case Intrinsic::amdgcn_atomic_dec:
+ case Intrinsic::amdgcn_ds_fadd:
+ case Intrinsic::amdgcn_ds_fmin:
+ case Intrinsic::amdgcn_ds_fmax: {
MemSDNode *M = cast<MemSDNode>(Op);
- unsigned Opc = (IntrID == Intrinsic::amdgcn_atomic_inc) ?
- AMDGPUISD::ATOMIC_INC : AMDGPUISD::ATOMIC_DEC;
+ unsigned Opc;
+ switch (IntrID) {
+ case Intrinsic::amdgcn_atomic_inc:
+ Opc = AMDGPUISD::ATOMIC_INC;
+ break;
+ case Intrinsic::amdgcn_atomic_dec:
+ Opc = AMDGPUISD::ATOMIC_DEC;
+ break;
+ case Intrinsic::amdgcn_ds_fadd:
+ Opc = AMDGPUISD::ATOMIC_LOAD_FADD;
+ break;
+ case Intrinsic::amdgcn_ds_fmin:
+ Opc = AMDGPUISD::ATOMIC_LOAD_FMIN;
+ break;
+ case Intrinsic::amdgcn_ds_fmax:
+ Opc = AMDGPUISD::ATOMIC_LOAD_FMAX;
+ break;
+ default:
+ llvm_unreachable("Unknown intrinsic!");
+ }
SDValue Ops[] = {
M->getOperand(0), // Chain
M->getOperand(2), // Ptr
@@ -4284,21 +5054,28 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
Op.getOperand(5), // glc
Op.getOperand(6) // slc
};
- SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
unsigned Opc = (IntrID == Intrinsic::amdgcn_buffer_load) ?
AMDGPUISD::BUFFER_LOAD : AMDGPUISD::BUFFER_LOAD_FORMAT;
EVT VT = Op.getValueType();
EVT IntVT = VT.changeTypeToInteger();
+ auto *M = cast<MemSDNode>(Op);
+ EVT LoadVT = Op.getValueType();
+ bool IsD16 = LoadVT.getScalarType() == MVT::f16;
+ if (IsD16)
+ return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16, M, DAG);
- MachineMemOperand *MMO = MF.getMachineMemOperand(
- MachinePointerInfo(MFI->getBufferPSV()),
- MachineMemOperand::MOLoad,
- VT.getStoreSize(), VT.getStoreSize());
-
- return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, IntVT, MMO);
+ return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, IntVT,
+ M->getMemOperand());
}
case Intrinsic::amdgcn_tbuffer_load: {
+ MemSDNode *M = cast<MemSDNode>(Op);
+ EVT LoadVT = Op.getValueType();
+ bool IsD16 = LoadVT.getScalarType() == MVT::f16;
+ if (IsD16) {
+ return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16, M, DAG);
+ }
+
SDValue Ops[] = {
Op.getOperand(0), // Chain
Op.getOperand(2), // rsrc
@@ -4312,14 +5089,9 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
Op.getOperand(10) // slc
};
- EVT VT = Op.getOperand(2).getValueType();
-
- MachineMemOperand *MMO = MF.getMachineMemOperand(
- MachinePointerInfo(),
- MachineMemOperand::MOLoad,
- VT.getStoreSize(), VT.getStoreSize());
return DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
- Op->getVTList(), Ops, VT, MMO);
+ Op->getVTList(), Ops, LoadVT,
+ M->getMemOperand());
}
case Intrinsic::amdgcn_buffer_atomic_swap:
case Intrinsic::amdgcn_buffer_atomic_add:
@@ -4339,14 +5111,9 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
Op.getOperand(5), // offset
Op.getOperand(6) // slc
};
- EVT VT = Op.getOperand(3).getValueType();
- MachineMemOperand *MMO = MF.getMachineMemOperand(
- MachinePointerInfo(),
- MachineMemOperand::MOLoad |
- MachineMemOperand::MOStore |
- MachineMemOperand::MODereferenceable |
- MachineMemOperand::MOVolatile,
- VT.getStoreSize(), 4);
+ EVT VT = Op.getValueType();
+
+ auto *M = cast<MemSDNode>(Op);
unsigned Opcode = 0;
switch (IntrID) {
@@ -4384,7 +5151,8 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
llvm_unreachable("unhandled atomic opcode");
}
- return DAG.getMemIntrinsicNode(Opcode, DL, Op->getVTList(), Ops, VT, MMO);
+ return DAG.getMemIntrinsicNode(Opcode, DL, Op->getVTList(), Ops, VT,
+ M->getMemOperand());
}
case Intrinsic::amdgcn_buffer_atomic_cmpswap: {
@@ -4397,78 +5165,46 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
Op.getOperand(6), // offset
Op.getOperand(7) // slc
};
- EVT VT = Op.getOperand(4).getValueType();
- MachineMemOperand *MMO = MF.getMachineMemOperand(
- MachinePointerInfo(),
- MachineMemOperand::MOLoad |
- MachineMemOperand::MOStore |
- MachineMemOperand::MODereferenceable |
- MachineMemOperand::MOVolatile,
- VT.getStoreSize(), 4);
+ EVT VT = Op.getValueType();
+ auto *M = cast<MemSDNode>(Op);
return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
- Op->getVTList(), Ops, VT, MMO);
+ Op->getVTList(), Ops, VT, M->getMemOperand());
}
- // Basic sample.
- case Intrinsic::amdgcn_image_sample:
- case Intrinsic::amdgcn_image_sample_cl:
- case Intrinsic::amdgcn_image_sample_d:
- case Intrinsic::amdgcn_image_sample_d_cl:
- case Intrinsic::amdgcn_image_sample_l:
- case Intrinsic::amdgcn_image_sample_b:
- case Intrinsic::amdgcn_image_sample_b_cl:
- case Intrinsic::amdgcn_image_sample_lz:
- case Intrinsic::amdgcn_image_sample_cd:
- case Intrinsic::amdgcn_image_sample_cd_cl:
-
- // Sample with comparison.
- case Intrinsic::amdgcn_image_sample_c:
- case Intrinsic::amdgcn_image_sample_c_cl:
- case Intrinsic::amdgcn_image_sample_c_d:
- case Intrinsic::amdgcn_image_sample_c_d_cl:
- case Intrinsic::amdgcn_image_sample_c_l:
- case Intrinsic::amdgcn_image_sample_c_b:
- case Intrinsic::amdgcn_image_sample_c_b_cl:
- case Intrinsic::amdgcn_image_sample_c_lz:
- case Intrinsic::amdgcn_image_sample_c_cd:
- case Intrinsic::amdgcn_image_sample_c_cd_cl:
-
- // Sample with offsets.
- case Intrinsic::amdgcn_image_sample_o:
- case Intrinsic::amdgcn_image_sample_cl_o:
- case Intrinsic::amdgcn_image_sample_d_o:
- case Intrinsic::amdgcn_image_sample_d_cl_o:
- case Intrinsic::amdgcn_image_sample_l_o:
- case Intrinsic::amdgcn_image_sample_b_o:
- case Intrinsic::amdgcn_image_sample_b_cl_o:
- case Intrinsic::amdgcn_image_sample_lz_o:
- case Intrinsic::amdgcn_image_sample_cd_o:
- case Intrinsic::amdgcn_image_sample_cd_cl_o:
-
- // Sample with comparison and offsets.
- case Intrinsic::amdgcn_image_sample_c_o:
- case Intrinsic::amdgcn_image_sample_c_cl_o:
- case Intrinsic::amdgcn_image_sample_c_d_o:
- case Intrinsic::amdgcn_image_sample_c_d_cl_o:
- case Intrinsic::amdgcn_image_sample_c_l_o:
- case Intrinsic::amdgcn_image_sample_c_b_o:
- case Intrinsic::amdgcn_image_sample_c_b_cl_o:
- case Intrinsic::amdgcn_image_sample_c_lz_o:
- case Intrinsic::amdgcn_image_sample_c_cd_o:
- case Intrinsic::amdgcn_image_sample_c_cd_cl_o: {
- // Replace dmask with everything disabled with undef.
- const ConstantSDNode *DMask = dyn_cast<ConstantSDNode>(Op.getOperand(5));
- if (!DMask || DMask->isNullValue()) {
- SDValue Undef = DAG.getUNDEF(Op.getValueType());
- return DAG.getMergeValues({ Undef, Op.getOperand(0) }, SDLoc(Op));
- }
+ default:
+ if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
+ AMDGPU::getImageDimIntrinsicInfo(IntrID))
+ return lowerImage(Op, ImageDimIntr, DAG);
return SDValue();
}
- default:
- return SDValue();
+}
+
+SDValue SITargetLowering::handleD16VData(SDValue VData,
+ SelectionDAG &DAG) const {
+ EVT StoreVT = VData.getValueType();
+
+ // No change for f16 and legal vector D16 types.
+ if (!StoreVT.isVector())
+ return VData;
+
+ SDLoc DL(VData);
+ assert((StoreVT.getVectorNumElements() != 3) && "Handle v3f16");
+
+ if (Subtarget->hasUnpackedD16VMem()) {
+ // We need to unpack the packed data to store.
+ EVT IntStoreVT = StoreVT.changeTypeToInteger();
+ SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);
+
+ EVT EquivStoreVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
+ StoreVT.getVectorNumElements());
+ SDValue ZExt = DAG.getNode(ISD::ZERO_EXTEND, DL, EquivStoreVT, IntVData);
+ return DAG.UnrollVectorOp(ZExt.getNode());
}
+
+ assert(isTypeLegal(StoreVT));
+ return VData;
}
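
The unpacked-D16 path above bitcasts the packed f16 payload to integers and zero-extends each element into its own 32-bit lane before unrolling. A minimal standalone sketch of just that data movement (plain C++ with made-up bit patterns, not the DAG code):

// Sketch only: store payload layout for unpacked-D16 targets, where each
// packed 16-bit element occupies its own 32-bit lane.
#include <cstdint>
#include <cstdio>

int main() {
  uint16_t Packed[2] = {0x3C00, 0xC000}; // f16 bit patterns for 1.0 and -2.0
  uint32_t Unpacked[2];
  for (int I = 0; I != 2; ++I)
    Unpacked[I] = Packed[I];             // zero-extend each half to a dword
  std::printf("lane0=0x%08x lane1=0x%08x\n", Unpacked[0], Unpacked[1]);
  return 0;
}
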
SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
@@ -4558,7 +5294,7 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
}
case Intrinsic::amdgcn_s_barrier: {
if (getTargetMachine().getOptLevel() > CodeGenOpt::None) {
- const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
unsigned WGSize = ST.getFlatWorkGroupSizes(MF.getFunction()).second;
if (WGSize <= ST.getWavefrontSize())
return SDValue(DAG.getMachineNode(AMDGPU::WAVE_BARRIER, DL, MVT::Other,
@@ -4613,9 +5349,13 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
}
case Intrinsic::amdgcn_tbuffer_store: {
+ SDValue VData = Op.getOperand(2);
+ bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
+ if (IsD16)
+ VData = handleD16VData(VData, DAG);
SDValue Ops[] = {
Chain,
- Op.getOperand(2), // vdata
+ VData, // vdata
Op.getOperand(3), // rsrc
Op.getOperand(4), // vindex
Op.getOperand(5), // voffset
@@ -4626,42 +5366,133 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
Op.getOperand(10), // glc
Op.getOperand(11) // slc
};
- EVT VT = Op.getOperand(3).getValueType();
- MachineMemOperand *MMO = MF.getMachineMemOperand(
- MachinePointerInfo(),
- MachineMemOperand::MOStore,
- VT.getStoreSize(), 4);
- return DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_STORE_FORMAT, DL,
- Op->getVTList(), Ops, VT, MMO);
+ unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16 :
+ AMDGPUISD::TBUFFER_STORE_FORMAT;
+ MemSDNode *M = cast<MemSDNode>(Op);
+ return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
+ M->getMemoryVT(), M->getMemOperand());
}
case Intrinsic::amdgcn_buffer_store:
case Intrinsic::amdgcn_buffer_store_format: {
+ SDValue VData = Op.getOperand(2);
+ bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
+ if (IsD16)
+ VData = handleD16VData(VData, DAG);
SDValue Ops[] = {
Chain,
- Op.getOperand(2), // vdata
+ VData, // vdata
Op.getOperand(3), // rsrc
Op.getOperand(4), // vindex
Op.getOperand(5), // offset
Op.getOperand(6), // glc
Op.getOperand(7) // slc
};
- EVT VT = Op.getOperand(3).getValueType();
- MachineMemOperand *MMO = MF.getMachineMemOperand(
- MachinePointerInfo(),
- MachineMemOperand::MOStore |
- MachineMemOperand::MODereferenceable,
- VT.getStoreSize(), 4);
+ unsigned Opc = IntrinsicID == Intrinsic::amdgcn_buffer_store ?
+ AMDGPUISD::BUFFER_STORE : AMDGPUISD::BUFFER_STORE_FORMAT;
+ Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
+ MemSDNode *M = cast<MemSDNode>(Op);
+ return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
+ M->getMemoryVT(), M->getMemOperand());
+ }
+ default: {
+ if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
+ AMDGPU::getImageDimIntrinsicInfo(IntrinsicID))
+ return lowerImage(Op, ImageDimIntr, DAG);
- unsigned Opcode = IntrinsicID == Intrinsic::amdgcn_buffer_store ?
- AMDGPUISD::BUFFER_STORE :
- AMDGPUISD::BUFFER_STORE_FORMAT;
- return DAG.getMemIntrinsicNode(Opcode, DL, Op->getVTList(), Ops, VT, MMO);
+ return Op;
}
+ }
+}
- default:
+static SDValue getLoadExtOrTrunc(SelectionDAG &DAG,
+ ISD::LoadExtType ExtType, SDValue Op,
+ const SDLoc &SL, EVT VT) {
+ if (VT.bitsLT(Op.getValueType()))
+ return DAG.getNode(ISD::TRUNCATE, SL, VT, Op);
+
+ switch (ExtType) {
+ case ISD::SEXTLOAD:
+ return DAG.getNode(ISD::SIGN_EXTEND, SL, VT, Op);
+ case ISD::ZEXTLOAD:
+ return DAG.getNode(ISD::ZERO_EXTEND, SL, VT, Op);
+ case ISD::EXTLOAD:
+ return DAG.getNode(ISD::ANY_EXTEND, SL, VT, Op);
+ case ISD::NON_EXTLOAD:
return Op;
}
+
+ llvm_unreachable("invalid ext type");
+}
+
+SDValue SITargetLowering::widenLoad(LoadSDNode *Ld, DAGCombinerInfo &DCI) const {
+ SelectionDAG &DAG = DCI.DAG;
+ if (Ld->getAlignment() < 4 || Ld->isDivergent())
+ return SDValue();
+
+ // FIXME: Constant loads should all be marked invariant.
+ unsigned AS = Ld->getAddressSpace();
+ if (AS != AMDGPUASI.CONSTANT_ADDRESS &&
+ AS != AMDGPUASI.CONSTANT_ADDRESS_32BIT &&
+ (AS != AMDGPUAS::GLOBAL_ADDRESS || !Ld->isInvariant()))
+ return SDValue();
+
+ // Don't do this early, since it may interfere with adjacent load merging for
+ // illegal types. We can avoid losing alignment information for exotic types
+ // pre-legalize.
+ EVT MemVT = Ld->getMemoryVT();
+ if ((MemVT.isSimple() && !DCI.isAfterLegalizeDAG()) ||
+ MemVT.getSizeInBits() >= 32)
+ return SDValue();
+
+ SDLoc SL(Ld);
+
+ assert((!MemVT.isVector() || Ld->getExtensionType() == ISD::NON_EXTLOAD) &&
+ "unexpected vector extload");
+
+ // TODO: Drop only high part of range.
+ SDValue Ptr = Ld->getBasePtr();
+ SDValue NewLoad = DAG.getLoad(ISD::UNINDEXED, ISD::NON_EXTLOAD,
+ MVT::i32, SL, Ld->getChain(), Ptr,
+ Ld->getOffset(),
+ Ld->getPointerInfo(), MVT::i32,
+ Ld->getAlignment(),
+ Ld->getMemOperand()->getFlags(),
+ Ld->getAAInfo(),
+ nullptr); // Drop ranges
+
+ EVT TruncVT = EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits());
+ if (MemVT.isFloatingPoint()) {
+ assert(Ld->getExtensionType() == ISD::NON_EXTLOAD &&
+ "unexpected fp extload");
+ TruncVT = MemVT.changeTypeToInteger();
+ }
+
+ SDValue Cvt = NewLoad;
+ if (Ld->getExtensionType() == ISD::SEXTLOAD) {
+ Cvt = DAG.getNode(ISD::SIGN_EXTEND_INREG, SL, MVT::i32, NewLoad,
+ DAG.getValueType(TruncVT));
+ } else if (Ld->getExtensionType() == ISD::ZEXTLOAD ||
+ Ld->getExtensionType() == ISD::NON_EXTLOAD) {
+ Cvt = DAG.getZeroExtendInReg(NewLoad, SL, TruncVT);
+ } else {
+ assert(Ld->getExtensionType() == ISD::EXTLOAD);
+ }
+
+ EVT VT = Ld->getValueType(0);
+ EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
+
+ DCI.AddToWorklist(Cvt.getNode());
+
+ // We may need to handle exotic cases, such as i16->i64 extloads, so insert
+ // the appropriate extension from the 32-bit load.
+ Cvt = getLoadExtOrTrunc(DAG, Ld->getExtensionType(), Cvt, SL, IntVT);
+ DCI.AddToWorklist(Cvt.getNode());
+
+ // Handle conversion back to floating point if necessary.
+ Cvt = DAG.getNode(ISD::BITCAST, SL, VT, Cvt);
+
+ return DAG.getMergeValues({ Cvt, NewLoad.getValue(1) }, SL);
}
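
widenLoad turns a uniform, 4-byte-aligned sub-dword constant load into a single 32-bit scalar load and then re-creates the original extload semantics in registers. A rough scalar model of that recovery step, using arbitrary values:

// Scalar model: the widened dword load plus in-register recovery of the
// original i16 extload semantics.
#include <cstdint>
#include <cstdio>

int main() {
  uint32_t Word = 0xabcd9000u;                      // result of the i32 load
  uint32_t ZExt = Word & 0xffffu;                   // zextload i16 -> 0x00009000
  int32_t SExt = int32_t(int16_t(Word & 0xffffu));  // sextload i16 -> 0xffff9000
  std::printf("zext=0x%08x sext=0x%08x\n", ZExt, uint32_t(SExt));
  return 0;
}
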
SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
@@ -4700,9 +5531,10 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
assert(Op.getValueType().getVectorElementType() == MVT::i32 &&
"Custom lowering for non-i32 vectors hasn't been implemented.");
+ unsigned Alignment = Load->getAlignment();
unsigned AS = Load->getAddressSpace();
if (!allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), MemVT,
- AS, Load->getAlignment())) {
+ AS, Alignment)) {
SDValue Ops[2];
std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(Load, DAG);
return DAG.getMergeValues(Ops, DL);
@@ -4717,24 +5549,32 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
AMDGPUASI.PRIVATE_ADDRESS : AMDGPUASI.GLOBAL_ADDRESS;
unsigned NumElements = MemVT.getVectorNumElements();
- if (AS == AMDGPUASI.CONSTANT_ADDRESS) {
- if (isMemOpUniform(Load))
+
+ if (AS == AMDGPUASI.CONSTANT_ADDRESS ||
+ AS == AMDGPUASI.CONSTANT_ADDRESS_32BIT) {
+ if (!Op->isDivergent() && Alignment >= 4)
return SDValue();
// Non-uniform loads will be selected to MUBUF instructions, so they
// have the same legalization requirements as global and private
// loads.
//
}
- if (AS == AMDGPUASI.CONSTANT_ADDRESS || AS == AMDGPUASI.GLOBAL_ADDRESS) {
- if (Subtarget->getScalarizeGlobalBehavior() && isMemOpUniform(Load) &&
- !Load->isVolatile() && isMemOpHasNoClobberedMemOperand(Load))
+
+ if (AS == AMDGPUASI.CONSTANT_ADDRESS ||
+ AS == AMDGPUASI.CONSTANT_ADDRESS_32BIT ||
+ AS == AMDGPUASI.GLOBAL_ADDRESS) {
+ if (Subtarget->getScalarizeGlobalBehavior() && !Op->isDivergent() &&
+ !Load->isVolatile() && isMemOpHasNoClobberedMemOperand(Load) &&
+ Alignment >= 4)
return SDValue();
// Non-uniform loads will be selected to MUBUF instructions, so they
// have the same legalization requirements as global and private
// loads.
//
}
- if (AS == AMDGPUASI.CONSTANT_ADDRESS || AS == AMDGPUASI.GLOBAL_ADDRESS ||
+ if (AS == AMDGPUASI.CONSTANT_ADDRESS ||
+ AS == AMDGPUASI.CONSTANT_ADDRESS_32BIT ||
+ AS == AMDGPUASI.GLOBAL_ADDRESS ||
AS == AMDGPUASI.FLAT_ADDRESS) {
if (NumElements > 4)
return SplitVectorLoad(Op, DAG);
@@ -4761,21 +5601,20 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
llvm_unreachable("unsupported private_element_size");
}
} else if (AS == AMDGPUASI.LOCAL_ADDRESS) {
- if (NumElements > 2)
- return SplitVectorLoad(Op, DAG);
-
- if (NumElements == 2)
+ // Use ds_read_b128 if possible.
+ if (Subtarget->useDS128() && Load->getAlignment() >= 16 &&
+ MemVT.getStoreSize() == 16)
return SDValue();
- // If properly aligned, if we split we might be able to use ds_read_b64.
- return SplitVectorLoad(Op, DAG);
+ if (NumElements > 2)
+ return SplitVectorLoad(Op, DAG);
}
return SDValue();
}
SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
- if (Op.getValueType() != MVT::i64)
- return SDValue();
+ EVT VT = Op.getValueType();
+ assert(VT.getSizeInBits() == 64);
SDLoc DL(Op);
SDValue Cond = Op.getOperand(0);
@@ -4797,7 +5636,7 @@ SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
SDValue Hi = DAG.getSelect(DL, MVT::i32, Cond, Hi0, Hi1);
SDValue Res = DAG.getBuildVector(MVT::v2i32, DL, {Lo, Hi});
- return DAG.getNode(ISD::BITCAST, DL, MVT::i64, Res);
+ return DAG.getNode(ISD::BITCAST, DL, VT, Res);
}
// Catch division cases where we can use shortcuts with rcp and rsq
@@ -4809,8 +5648,7 @@ SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op,
SDValue RHS = Op.getOperand(1);
EVT VT = Op.getValueType();
const SDNodeFlags Flags = Op->getFlags();
- bool Unsafe = DAG.getTarget().Options.UnsafeFPMath ||
- Flags.hasUnsafeAlgebra() || Flags.hasAllowReciprocal();
+ bool Unsafe = DAG.getTarget().Options.UnsafeFPMath || Flags.hasAllowReciprocal();
if (!Unsafe && VT == MVT::f32 && Subtarget->hasFP32Denormals())
return SDValue();
@@ -5067,7 +5905,7 @@ SDValue SITargetLowering::LowerFDIV64(SDValue Op, SelectionDAG &DAG) const {
SDValue Scale;
- if (Subtarget->getGeneration() == SISubtarget::SOUTHERN_ISLANDS) {
+ if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS) {
// Workaround a hardware bug on SI where the condition output from div_scale
// is not usable.
@@ -5165,14 +6003,14 @@ SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
llvm_unreachable("unsupported private_element_size");
}
} else if (AS == AMDGPUASI.LOCAL_ADDRESS) {
+ // Use ds_write_b128 if possible.
+ if (Subtarget->useDS128() && Store->getAlignment() >= 16 &&
+ VT.getStoreSize() == 16)
+ return SDValue();
+
if (NumElements > 2)
return SplitVectorStore(Op, DAG);
-
- if (NumElements == 2)
- return Op;
-
- // If properly aligned, if we split we might be able to use ds_write_b64.
- return SplitVectorStore(Op, DAG);
+ return SDValue();
} else {
llvm_unreachable("unhandled address space");
}
@@ -5246,7 +6084,7 @@ SDValue SITargetLowering::performUCharToFloatCombine(SDNode *N,
// easier if i8 vectors weren't promoted to i32 vectors, particularly after
// types are legalized. v4i8 -> v4f32 is probably the only case to worry
// about in practice.
- if (DCI.isAfterLegalizeVectorOps() && SrcVT == MVT::i32) {
+ if (DCI.isAfterLegalizeDAG() && SrcVT == MVT::i32) {
if (DAG.MaskedValueIsZero(Src, APInt::getHighBitsSet(32, 24))) {
SDValue Cvt = DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0, DL, VT, Src);
DCI.AddToWorklist(Cvt.getNode());
@@ -5389,6 +6227,71 @@ static bool isBoolSGPR(SDValue V) {
return false;
}
+// If every byte of a constant is either all zeros or all ones, return the
+// constant. Otherwise return 0.
+static uint32_t getConstantPermuteMask(uint32_t C) {
+ // 0xff for any zero byte in the mask
+ uint32_t ZeroByteMask = 0;
+ if (!(C & 0x000000ff)) ZeroByteMask |= 0x000000ff;
+ if (!(C & 0x0000ff00)) ZeroByteMask |= 0x0000ff00;
+ if (!(C & 0x00ff0000)) ZeroByteMask |= 0x00ff0000;
+ if (!(C & 0xff000000)) ZeroByteMask |= 0xff000000;
+ uint32_t NonZeroByteMask = ~ZeroByteMask; // 0xff for any non-zero byte
+ if ((NonZeroByteMask & C) != NonZeroByteMask)
+ return 0; // Partial bytes selected.
+ return C;
+}
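
A self-contained replay of the byte test above; constantPermuteMask is a local stand-in for the static helper, not an exported API:

// A constant qualifies only if every one of its bytes is 0x00 or 0xff.
#include <cassert>
#include <cstdint>

static uint32_t constantPermuteMask(uint32_t C) {
  uint32_t ZeroByteMask = 0;
  for (unsigned I = 0; I < 32; I += 8)
    if (!(C & (0xffu << I)))
      ZeroByteMask |= 0xffu << I;
  uint32_t NonZeroByteMask = ~ZeroByteMask;   // 0xff for any non-zero byte
  return ((NonZeroByteMask & C) == NonZeroByteMask) ? C : 0;
}

int main() {
  assert(constantPermuteMask(0x00ff00ffu) == 0x00ff00ffu); // whole bytes only
  assert(constantPermuteMask(0xff0000ffu) == 0xff0000ffu);
  assert(constantPermuteMask(0x00ff0f00u) == 0);           // 0x0f is partial
  return 0;
}
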
+
+// Check if a node selects whole bytes from its operand 0 starting at a byte
+// boundary while masking the rest. Returns the select mask as used by
+// v_perm_b32, or ~0 (all ones) if the node does not match.
+// Note byte select encoding:
+// value 0-3 selects corresponding source byte;
+// value 0xc selects zero;
+// value 0xff selects 0xff.
+static uint32_t getPermuteMask(SelectionDAG &DAG, SDValue V) {
+ assert(V.getValueSizeInBits() == 32);
+
+ if (V.getNumOperands() != 2)
+ return ~0;
+
+ ConstantSDNode *N1 = dyn_cast<ConstantSDNode>(V.getOperand(1));
+ if (!N1)
+ return ~0;
+
+ uint32_t C = N1->getZExtValue();
+
+ switch (V.getOpcode()) {
+ default:
+ break;
+ case ISD::AND:
+ if (uint32_t ConstMask = getConstantPermuteMask(C)) {
+ return (0x03020100 & ConstMask) | (0x0c0c0c0c & ~ConstMask);
+ }
+ break;
+
+ case ISD::OR:
+ if (uint32_t ConstMask = getConstantPermuteMask(C)) {
+ return (0x03020100 & ~ConstMask) | ConstMask;
+ }
+ break;
+
+ case ISD::SHL:
+ if (C % 8)
+ return ~0;
+
+ return uint32_t((0x030201000c0c0c0cull << C) >> 32);
+
+ case ISD::SRL:
+ if (C % 8)
+ return ~0;
+
+ return uint32_t(0x0c0c0c0c03020100ull >> C);
+ }
+
+ return ~0;
+}
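
To see what those selector tables produce, here is a quick standalone computation for an AND mask, a left shift by 16, and a right shift by 8 (constants copied from the function above):

// Replays the selector constants computed above for three sample nodes.
#include <cstdint>
#include <cstdio>

int main() {
  uint32_t AndMask = 0x0000ffffu;                                  // and x, 0x0000ffff
  uint32_t SelAnd = (0x03020100u & AndMask) | (0x0c0c0c0cu & ~AndMask);
  uint32_t SelShl = uint32_t((0x030201000c0c0c0cull << 16) >> 32); // shl x, 16
  uint32_t SelSrl = uint32_t(0x0c0c0c0c03020100ull >> 8);          // srl x, 8
  // Expected: and -> 0x0c0c0100, shl -> 0x01000c0c, srl -> 0x0c030201.
  std::printf("and=%08x shl=%08x srl=%08x\n", SelAnd, SelShl, SelSrl);
  return 0;
}
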
+
SDValue SITargetLowering::performAndCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
if (DCI.isBeforeLegalize())
@@ -5435,6 +6338,20 @@ SDValue SITargetLowering::performAndCombine(SDNode *N,
}
}
}
+
+ // and (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2)
+ if (LHS.hasOneUse() && LHS.getOpcode() == AMDGPUISD::PERM &&
+ isa<ConstantSDNode>(LHS.getOperand(2))) {
+ uint32_t Sel = getConstantPermuteMask(Mask);
+ if (!Sel)
+ return SDValue();
+
+ // Select 0xc for all zero bytes
+ Sel = (LHS.getConstantOperandVal(2) & Sel) | (~Sel & 0x0c0c0c0c);
+ SDLoc DL(N);
+ return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
+ LHS.getOperand(1), DAG.getConstant(Sel, DL, MVT::i32));
+ }
}
// (and (fcmp ord x, x), (fcmp une (fabs x), inf)) ->
@@ -5487,6 +6404,54 @@ SDValue SITargetLowering::performAndCombine(SDNode *N,
LHS, DAG.getConstant(0, SDLoc(N), MVT::i32));
}
+ // and (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2)
+ const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
+ if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() &&
+ N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32) != -1) {
+ uint32_t LHSMask = getPermuteMask(DAG, LHS);
+ uint32_t RHSMask = getPermuteMask(DAG, RHS);
+ if (LHSMask != ~0u && RHSMask != ~0u) {
+ // Canonicalize the expression in an attempt to have fewer unique masks
+ // and therefore fewer registers used to hold the masks.
+ if (LHSMask > RHSMask) {
+ std::swap(LHSMask, RHSMask);
+ std::swap(LHS, RHS);
+ }
+
+      // Select 0xc for each lane used from the source operand. Zero has the 0xc
+      // mask set, 0xff has 0xff in the mask, and actual lanes are in the 0-3 range.
+ uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
+ uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
+
+      // Check if we need to combine values from two sources within a byte.
+ if (!(LHSUsedLanes & RHSUsedLanes) &&
+          // If we select the high and low words, keep it for SDWA.
+ // TODO: teach SDWA to work with v_perm_b32 and remove the check.
+ !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
+        // Each byte of each mask is either a selector value 0-3, or has its
+        // high bits set: 0xff selects 0xff and 0x0c selects zero. If either
+        // mask has 0x0c in a byte, the result byte must be 0x0c; otherwise
+        // the mask that is not 0xff wins. ANDing the two masks gives the
+        // correct result, except that zero-selecting bytes must be forced
+        // back to exactly 0x0c.
+ uint32_t Mask = LHSMask & RHSMask;
+ for (unsigned I = 0; I < 32; I += 8) {
+ uint32_t ByteSel = 0xff << I;
+ if ((LHSMask & ByteSel) == 0x0c || (RHSMask & ByteSel) == 0x0c)
+ Mask &= (0x0c << I) & 0xffffffff;
+ }
+
+ // Add 4 to each active LHS lane. It will not affect any existing 0xff
+ // or 0x0c.
+ uint32_t Sel = Mask | (LHSUsedLanes & 0x04040404);
+ SDLoc DL(N);
+
+ return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32,
+ LHS.getOperand(0), RHS.getOperand(0),
+ DAG.getConstant(Sel, DL, MVT::i32));
+ }
+ }
+ }
+
return SDValue();
}
@@ -5522,6 +6487,60 @@ SDValue SITargetLowering::performOrCombine(SDNode *N,
return SDValue();
}
+ // or (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2)
+ if (isa<ConstantSDNode>(RHS) && LHS.hasOneUse() &&
+ LHS.getOpcode() == AMDGPUISD::PERM &&
+ isa<ConstantSDNode>(LHS.getOperand(2))) {
+ uint32_t Sel = getConstantPermuteMask(N->getConstantOperandVal(1));
+ if (!Sel)
+ return SDValue();
+
+ Sel |= LHS.getConstantOperandVal(2);
+ SDLoc DL(N);
+ return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
+ LHS.getOperand(1), DAG.getConstant(Sel, DL, MVT::i32));
+ }
+
+ // or (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2)
+ const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
+ if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() &&
+ N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32) != -1) {
+ uint32_t LHSMask = getPermuteMask(DAG, LHS);
+ uint32_t RHSMask = getPermuteMask(DAG, RHS);
+ if (LHSMask != ~0u && RHSMask != ~0u) {
+ // Canonicalize the expression in an attempt to have fewer unique masks
+ // and therefore fewer registers used to hold the masks.
+ if (LHSMask > RHSMask) {
+ std::swap(LHSMask, RHSMask);
+ std::swap(LHS, RHS);
+ }
+
+      // Select 0xc for each lane used from the source operand. Zero has the 0xc
+      // mask set, 0xff has 0xff in the mask, and actual lanes are in the 0-3 range.
+ uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
+ uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
+
+      // Check if we need to combine values from two sources within a byte.
+ if (!(LHSUsedLanes & RHSUsedLanes) &&
+          // If we select the high and low words, keep it for SDWA.
+ // TODO: teach SDWA to work with v_perm_b32 and remove the check.
+ !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
+        // Kill zero bytes selected by the other mask. Zero value is 0xc.
+ LHSMask &= ~RHSUsedLanes;
+ RHSMask &= ~LHSUsedLanes;
+ // Add 4 to each active LHS lane
+ LHSMask |= LHSUsedLanes & 0x04040404;
+ // Combine masks
+ uint32_t Sel = LHSMask | RHSMask;
+ SDLoc DL(N);
+
+ return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32,
+ LHS.getOperand(0), RHS.getOperand(0),
+ DAG.getConstant(Sel, DL, MVT::i32));
+ }
+ }
+ }
+
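
Putting the pieces together, the selector the OR combine would build for, say, or(and(x, 0xff000000), srl(y, 8)) can be replayed outside the DAG; the steps mirror the code above and the expected selector is 0x07030201 (byte 3 from x, bytes 2..0 from y >> 8):

// Selectors 4-7 pick bytes of the first v_perm_b32 source, 0-3 of the second,
// and 0x0c produces zero.
#include <cstdint>
#include <cstdio>

int main() {
  uint32_t LHSMask = (0x03020100u & 0xff000000u) | (0x0c0c0c0cu & ~0xff000000u);
  uint32_t RHSMask = uint32_t(0x0c0c0c0c03020100ull >> 8);
  uint32_t LHSUsed = ~(LHSMask & 0x0c0c0c0cu) & 0x0c0c0c0cu;
  uint32_t RHSUsed = ~(RHSMask & 0x0c0c0c0cu) & 0x0c0c0c0cu;
  LHSMask &= ~RHSUsed;              // kill bytes supplied by the other operand
  RHSMask &= ~LHSUsed;
  LHSMask |= LHSUsed & 0x04040404u; // move LHS lanes into the 4-7 range
  std::printf("sel=0x%08x\n", LHSMask | RHSMask); // prints sel=0x07030201
  return 0;
}
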
if (VT != MVT::i64)
return SDValue();
@@ -5628,6 +6647,7 @@ static bool fp16SrcZerosHighBits(unsigned Opc) {
case AMDGPUISD::FMAD_FTZ:
case AMDGPUISD::RCP:
case AMDGPUISD::RSQ:
+ case AMDGPUISD::RCP_IFLAG:
case AMDGPUISD::LDEXP:
return true;
default:
@@ -5680,6 +6700,23 @@ SDValue SITargetLowering::performClassCombine(SDNode *N,
return SDValue();
}
+SDValue SITargetLowering::performRcpCombine(SDNode *N,
+ DAGCombinerInfo &DCI) const {
+ EVT VT = N->getValueType(0);
+ SDValue N0 = N->getOperand(0);
+
+ if (N0.isUndef())
+ return N0;
+
+ if (VT == MVT::f32 && (N0.getOpcode() == ISD::UINT_TO_FP ||
+ N0.getOpcode() == ISD::SINT_TO_FP)) {
+ return DCI.DAG.getNode(AMDGPUISD::RCP_IFLAG, SDLoc(N), VT, N0,
+ N->getFlags());
+ }
+
+ return AMDGPUTargetLowering::performRcpCombine(N, DCI);
+}
+
static bool isKnownNeverSNan(SelectionDAG &DAG, SDValue Op) {
if (!DAG.getTargetLoweringInfo().hasFloatingPointExceptions())
return true;
@@ -5688,7 +6725,7 @@ static bool isKnownNeverSNan(SelectionDAG &DAG, SDValue Op) {
}
static bool isCanonicalized(SelectionDAG &DAG, SDValue Op,
- const SISubtarget *ST, unsigned MaxDepth=5) {
+ const GCNSubtarget *ST, unsigned MaxDepth=5) {
// If source is a result of another standard FP operation it is already in
// canonical form.
@@ -5946,7 +6983,7 @@ SDValue SITargetLowering::performMinMaxCombine(SDNode *N,
if (Opc != AMDGPUISD::FMIN_LEGACY && Opc != AMDGPUISD::FMAX_LEGACY &&
- VT != MVT::f64 &&
+ !VT.isVector() && VT != MVT::f64 &&
((VT != MVT::f16 && VT != MVT::i16) || Subtarget->hasMin3Max3_16())) {
// max(max(a, b), c) -> max3(a, b, c)
// min(min(a, b), c) -> min3(a, b, c)
@@ -6066,15 +7103,87 @@ SDValue SITargetLowering::performCvtPkRTZCombine(SDNode *N,
SDValue SITargetLowering::performExtractVectorEltCombine(
SDNode *N, DAGCombinerInfo &DCI) const {
SDValue Vec = N->getOperand(0);
-
SelectionDAG &DAG = DCI.DAG;
- if (Vec.getOpcode() == ISD::FNEG && allUsesHaveSourceMods(N)) {
+
+ EVT VecVT = Vec.getValueType();
+ EVT EltVT = VecVT.getVectorElementType();
+
+ if ((Vec.getOpcode() == ISD::FNEG ||
+ Vec.getOpcode() == ISD::FABS) && allUsesHaveSourceMods(N)) {
SDLoc SL(N);
EVT EltVT = N->getValueType(0);
SDValue Idx = N->getOperand(1);
SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT,
Vec.getOperand(0), Idx);
- return DAG.getNode(ISD::FNEG, SL, EltVT, Elt);
+ return DAG.getNode(Vec.getOpcode(), SL, EltVT, Elt);
+ }
+
+ // ScalarRes = EXTRACT_VECTOR_ELT ((vector-BINOP Vec1, Vec2), Idx)
+ // =>
+ // Vec1Elt = EXTRACT_VECTOR_ELT(Vec1, Idx)
+ // Vec2Elt = EXTRACT_VECTOR_ELT(Vec2, Idx)
+ // ScalarRes = scalar-BINOP Vec1Elt, Vec2Elt
+ if (Vec.hasOneUse() && DCI.isBeforeLegalize()) {
+ SDLoc SL(N);
+ EVT EltVT = N->getValueType(0);
+ SDValue Idx = N->getOperand(1);
+ unsigned Opc = Vec.getOpcode();
+
+ switch(Opc) {
+ default:
+ return SDValue();
+ // TODO: Support other binary operations.
+ case ISD::FADD:
+ case ISD::ADD:
+ case ISD::UMIN:
+ case ISD::UMAX:
+ case ISD::SMIN:
+ case ISD::SMAX:
+ case ISD::FMAXNUM:
+ case ISD::FMINNUM:
+ return DAG.getNode(Opc, SL, EltVT,
+ DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT,
+ Vec.getOperand(0), Idx),
+ DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT,
+ Vec.getOperand(1), Idx));
+ }
+ }
+
+ if (!DCI.isBeforeLegalize())
+ return SDValue();
+
+ unsigned VecSize = VecVT.getSizeInBits();
+ unsigned EltSize = EltVT.getSizeInBits();
+
+ // Try to turn sub-dword accesses of vectors into accesses of the same 32-bit
+ // elements. This exposes more load reduction opportunities by replacing
+ // multiple small extract_vector_elements with a single 32-bit extract.
+ auto *Idx = dyn_cast<ConstantSDNode>(N->getOperand(1));
+ if (EltSize <= 16 &&
+ EltVT.isByteSized() &&
+ VecSize > 32 &&
+ VecSize % 32 == 0 &&
+ Idx) {
+ EVT NewVT = getEquivalentMemType(*DAG.getContext(), VecVT);
+
+ unsigned BitIndex = Idx->getZExtValue() * EltSize;
+ unsigned EltIdx = BitIndex / 32;
+ unsigned LeftoverBitIdx = BitIndex % 32;
+ SDLoc SL(N);
+
+ SDValue Cast = DAG.getNode(ISD::BITCAST, SL, NewVT, Vec);
+ DCI.AddToWorklist(Cast.getNode());
+
+ SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Cast,
+ DAG.getConstant(EltIdx, SL, MVT::i32));
+ DCI.AddToWorklist(Elt.getNode());
+ SDValue Srl = DAG.getNode(ISD::SRL, SL, MVT::i32, Elt,
+ DAG.getConstant(LeftoverBitIdx, SL, MVT::i32));
+ DCI.AddToWorklist(Srl.getNode());
+
+ SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, EltVT.changeTypeToInteger(), Srl);
+ DCI.AddToWorklist(Trunc.getNode());
+ return DAG.getNode(ISD::BITCAST, SL, EltVT, Trunc);
}
return SDValue();
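
As a concrete instance of the sub-dword rewrite above, extracting element 5 of a v8i8 becomes a read of 32-bit lane 1 followed by a shift right by 8 and a truncate. A host-side sketch of the index arithmetic, assuming the little-endian in-register byte layout the combine relies on:

// Index arithmetic for extracting element 5 of a v8i8 through 32-bit lanes.
#include <cstdint>
#include <cstdio>
#include <cstring>

int main() {
  uint8_t Vec[8] = {0, 1, 2, 3, 4, 5, 6, 7};
  unsigned EltSize = 8, Index = 5;
  unsigned BitIndex = Index * EltSize;       // 40
  unsigned EltIdx = BitIndex / 32;           // 32-bit lane 1
  unsigned LeftoverBitIdx = BitIndex % 32;   // shift amount 8
  uint32_t Dword;
  std::memcpy(&Dword, Vec + 4 * EltIdx, sizeof(Dword));
  uint8_t Elt = uint8_t(Dword >> LeftoverBitIdx);
  std::printf("elt[%u] = %u\n", Index, Elt); // prints elt[5] = 5
  return 0;
}
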
@@ -6135,8 +7244,8 @@ unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG,
const TargetOptions &Options = DAG.getTarget().Options;
if ((Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath ||
- (N0->getFlags().hasUnsafeAlgebra() &&
- N1->getFlags().hasUnsafeAlgebra())) &&
+ (N0->getFlags().hasAllowContract() &&
+ N1->getFlags().hasAllowContract())) &&
isFMAFasterThanFMulAndFAdd(VT)) {
return ISD::FMA;
}
@@ -6192,7 +7301,7 @@ SDValue SITargetLowering::performAddCombine(SDNode *N,
return SDValue();
}
- if (VT != MVT::i32)
+ if (VT != MVT::i32 || !DCI.isAfterLegalizeDAG())
return SDValue();
// add x, zext (setcc) => addcarry x, 0, setcc
@@ -6368,6 +7477,79 @@ SDValue SITargetLowering::performFSubCombine(SDNode *N,
return SDValue();
}
+SDValue SITargetLowering::performFMACombine(SDNode *N,
+ DAGCombinerInfo &DCI) const {
+ SelectionDAG &DAG = DCI.DAG;
+ EVT VT = N->getValueType(0);
+ SDLoc SL(N);
+
+ if (!Subtarget->hasDLInsts() || VT != MVT::f32)
+ return SDValue();
+
+  // FMA((F32)S0.x, (F32)S1.x, FMA((F32)S0.y, (F32)S1.y, (F32)z)) ->
+  // FDOT2((V2F16)S0, (V2F16)S1, (F32)z)
+ SDValue Op1 = N->getOperand(0);
+ SDValue Op2 = N->getOperand(1);
+ SDValue FMA = N->getOperand(2);
+
+ if (FMA.getOpcode() != ISD::FMA ||
+ Op1.getOpcode() != ISD::FP_EXTEND ||
+ Op2.getOpcode() != ISD::FP_EXTEND)
+ return SDValue();
+
+  // fdot2_f32_f16 always flushes fp32 denormal operands and the output to
+  // zero, regardless of the denorm mode setting. Therefore,
+  // unsafe-fp-math/fp-contract is sufficient to allow generating fdot2.
+ const TargetOptions &Options = DAG.getTarget().Options;
+ if (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath ||
+ (N->getFlags().hasAllowContract() &&
+ FMA->getFlags().hasAllowContract())) {
+ Op1 = Op1.getOperand(0);
+ Op2 = Op2.getOperand(0);
+ if (Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
+ Op2.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
+ return SDValue();
+
+ SDValue Vec1 = Op1.getOperand(0);
+ SDValue Idx1 = Op1.getOperand(1);
+ SDValue Vec2 = Op2.getOperand(0);
+
+ SDValue FMAOp1 = FMA.getOperand(0);
+ SDValue FMAOp2 = FMA.getOperand(1);
+ SDValue FMAAcc = FMA.getOperand(2);
+
+ if (FMAOp1.getOpcode() != ISD::FP_EXTEND ||
+ FMAOp2.getOpcode() != ISD::FP_EXTEND)
+ return SDValue();
+
+ FMAOp1 = FMAOp1.getOperand(0);
+ FMAOp2 = FMAOp2.getOperand(0);
+ if (FMAOp1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
+ FMAOp2.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
+ return SDValue();
+
+ SDValue Vec3 = FMAOp1.getOperand(0);
+ SDValue Vec4 = FMAOp2.getOperand(0);
+ SDValue Idx2 = FMAOp1.getOperand(1);
+
+ if (Idx1 != Op2.getOperand(1) || Idx2 != FMAOp2.getOperand(1) ||
+ // Idx1 and Idx2 cannot be the same.
+ Idx1 == Idx2)
+ return SDValue();
+
+ if (Vec1 == Vec2 || Vec3 == Vec4)
+ return SDValue();
+
+ if (Vec1.getValueType() != MVT::v2f16 || Vec2.getValueType() != MVT::v2f16)
+ return SDValue();
+
+ if ((Vec1 == Vec3 && Vec2 == Vec4) ||
+ (Vec1 == Vec4 && Vec2 == Vec3))
+ return DAG.getNode(AMDGPUISD::FDOT2, SL, MVT::f32, Vec1, Vec2, FMAAcc);
+ }
+ return SDValue();
+}
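
Numerically, the FMA chain matched above is a two-element dot product with an accumulator; a scalar restatement of the identity the combine relies on:

// Both expressions compute a 2-element dot product plus accumulator
// (equal up to rounding/contraction).
#include <cstdio>

int main() {
  float Ax = 1.5f, Ay = -2.0f, Bx = 4.0f, By = 0.25f, C = 3.0f;
  float FmaChain = Ax * Bx + (Ay * By + C); // fma(a.x, b.x, fma(a.y, b.y, c))
  float Dot2 = (Ax * Bx + Ay * By) + C;     // fdot2(a, b, c)
  std::printf("%f %f\n", FmaChain, Dot2);
  return 0;
}
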
+
SDValue SITargetLowering::performSetCCCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
SelectionDAG &DAG = DCI.DAG;
@@ -6387,23 +7569,49 @@ SDValue SITargetLowering::performSetCCCombine(SDNode *N,
}
}
- if (CRHS && VT == MVT::i32 && LHS.getOpcode() == ISD::SIGN_EXTEND &&
- isBoolSGPR(LHS.getOperand(0))) {
- // setcc (sext from i1 cc), -1, ne|sgt|ult) => not cc => xor cc, -1
- // setcc (sext from i1 cc), -1, eq|sle|uge) => cc
- // setcc (sext from i1 cc), 0, eq|sge|ule) => not cc => xor cc, -1
- // setcc (sext from i1 cc), 0, ne|ugt|slt) => cc
- if ((CRHS->isAllOnesValue() &&
- (CC == ISD::SETNE || CC == ISD::SETGT || CC == ISD::SETULT)) ||
- (CRHS->isNullValue() &&
- (CC == ISD::SETEQ || CC == ISD::SETGE || CC == ISD::SETULE)))
- return DAG.getNode(ISD::XOR, SL, MVT::i1, LHS.getOperand(0),
- DAG.getConstant(-1, SL, MVT::i1));
- if ((CRHS->isAllOnesValue() &&
- (CC == ISD::SETEQ || CC == ISD::SETLE || CC == ISD::SETUGE)) ||
- (CRHS->isNullValue() &&
- (CC == ISD::SETNE || CC == ISD::SETUGT || CC == ISD::SETLT)))
- return LHS.getOperand(0);
+ if (CRHS) {
+ if (VT == MVT::i32 && LHS.getOpcode() == ISD::SIGN_EXTEND &&
+ isBoolSGPR(LHS.getOperand(0))) {
+ // setcc (sext from i1 cc), -1, ne|sgt|ult) => not cc => xor cc, -1
+ // setcc (sext from i1 cc), -1, eq|sle|uge) => cc
+ // setcc (sext from i1 cc), 0, eq|sge|ule) => not cc => xor cc, -1
+ // setcc (sext from i1 cc), 0, ne|ugt|slt) => cc
+ if ((CRHS->isAllOnesValue() &&
+ (CC == ISD::SETNE || CC == ISD::SETGT || CC == ISD::SETULT)) ||
+ (CRHS->isNullValue() &&
+ (CC == ISD::SETEQ || CC == ISD::SETGE || CC == ISD::SETULE)))
+ return DAG.getNode(ISD::XOR, SL, MVT::i1, LHS.getOperand(0),
+ DAG.getConstant(-1, SL, MVT::i1));
+ if ((CRHS->isAllOnesValue() &&
+ (CC == ISD::SETEQ || CC == ISD::SETLE || CC == ISD::SETUGE)) ||
+ (CRHS->isNullValue() &&
+ (CC == ISD::SETNE || CC == ISD::SETUGT || CC == ISD::SETLT)))
+ return LHS.getOperand(0);
+ }
+
+ uint64_t CRHSVal = CRHS->getZExtValue();
+ if ((CC == ISD::SETEQ || CC == ISD::SETNE) &&
+ LHS.getOpcode() == ISD::SELECT &&
+ isa<ConstantSDNode>(LHS.getOperand(1)) &&
+ isa<ConstantSDNode>(LHS.getOperand(2)) &&
+ LHS.getConstantOperandVal(1) != LHS.getConstantOperandVal(2) &&
+ isBoolSGPR(LHS.getOperand(0))) {
+      // Given CT != CF:
+ // setcc (select cc, CT, CF), CF, eq => xor cc, -1
+ // setcc (select cc, CT, CF), CF, ne => cc
+ // setcc (select cc, CT, CF), CT, ne => xor cc, -1
+ // setcc (select cc, CT, CF), CT, eq => cc
+ uint64_t CT = LHS.getConstantOperandVal(1);
+ uint64_t CF = LHS.getConstantOperandVal(2);
+
+ if ((CF == CRHSVal && CC == ISD::SETEQ) ||
+ (CT == CRHSVal && CC == ISD::SETNE))
+ return DAG.getNode(ISD::XOR, SL, MVT::i1, LHS.getOperand(0),
+ DAG.getConstant(-1, SL, MVT::i1));
+ if ((CF == CRHSVal && CC == ISD::SETNE) ||
+ (CT == CRHSVal && CC == ISD::SETEQ))
+ return LHS.getOperand(0);
+ }
}
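
The select/compare folds above rely only on CT != CF; a tiny host-side check of the eq cases (the ne cases are their complements):

#include <cassert>

int main() {
  const int CT = 7, CF = 3;       // arbitrary distinct constants
  for (bool CC : {false, true}) {
    int Sel = CC ? CT : CF;       // select cc, CT, CF
    assert((Sel == CF) == !CC);   // setcc eq CF  =>  xor cc, -1
    assert((Sel == CT) == CC);    // setcc eq CT  =>  cc
  }
  return 0;
}
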
if (VT != MVT::f32 && VT != MVT::f64 && (Subtarget->has16BitInsts() &&
@@ -6472,6 +7680,29 @@ SDValue SITargetLowering::performCvtF32UByteNCombine(SDNode *N,
return SDValue();
}
+SDValue SITargetLowering::performClampCombine(SDNode *N,
+ DAGCombinerInfo &DCI) const {
+ ConstantFPSDNode *CSrc = dyn_cast<ConstantFPSDNode>(N->getOperand(0));
+ if (!CSrc)
+ return SDValue();
+
+ const APFloat &F = CSrc->getValueAPF();
+ APFloat Zero = APFloat::getZero(F.getSemantics());
+ APFloat::cmpResult Cmp0 = F.compare(Zero);
+ if (Cmp0 == APFloat::cmpLessThan ||
+ (Cmp0 == APFloat::cmpUnordered && Subtarget->enableDX10Clamp())) {
+ return DCI.DAG.getConstantFP(Zero, SDLoc(N), N->getValueType(0));
+ }
+
+ APFloat One(F.getSemantics(), "1.0");
+ APFloat::cmpResult Cmp1 = F.compare(One);
+ if (Cmp1 == APFloat::cmpGreaterThan)
+ return DCI.DAG.getConstantFP(One, SDLoc(N), N->getValueType(0));
+
+ return SDValue(CSrc, 0);
+}
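
A plain scalar model of the constant fold performed here, with the DX10 NaN-to-zero behavior exposed as a flag; foldClamp is a hypothetical stand-in, not an LLVM helper:

// Below 0.0 (and NaN when DX10 clamp is enabled) folds to 0.0, above 1.0
// folds to 1.0, anything else is returned unchanged.
#include <cmath>
#include <cstdio>

static float foldClamp(float X, bool DX10ClampNaN) {
  if (X < 0.0f || (std::isnan(X) && DX10ClampNaN))
    return 0.0f;
  if (X > 1.0f)
    return 1.0f;
  return X;
}

int main() {
  std::printf("%g %g %g\n", foldClamp(-2.5f, true), foldClamp(0.25f, true),
              foldClamp(7.0f, true)); // prints 0 0.25 1
  return 0;
}
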
+
+
SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
switch (N->getOpcode()) {
@@ -6503,7 +7734,13 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
return performMinMaxCombine(N, DCI);
break;
}
- case ISD::LOAD:
+ case ISD::FMA:
+ return performFMACombine(N, DCI);
+ case ISD::LOAD: {
+ if (SDValue Widended = widenLoad(cast<LoadSDNode>(N), DCI))
+ return Widended;
+ LLVM_FALLTHROUGH;
+ }
case ISD::STORE:
case ISD::ATOMIC_LOAD:
case ISD::ATOMIC_STORE:
@@ -6521,7 +7758,10 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
case ISD::ATOMIC_LOAD_UMIN:
case ISD::ATOMIC_LOAD_UMAX:
case AMDGPUISD::ATOMIC_INC:
- case AMDGPUISD::ATOMIC_DEC: // TODO: Target mem intrinsics.
+ case AMDGPUISD::ATOMIC_DEC:
+ case AMDGPUISD::ATOMIC_LOAD_FADD:
+ case AMDGPUISD::ATOMIC_LOAD_FMIN:
+ case AMDGPUISD::ATOMIC_LOAD_FMAX: // TODO: Target mem intrinsics.
if (DCI.isBeforeLegalize())
break;
return performMemSDNodeCombine(cast<MemSDNode>(N), DCI);
@@ -6537,11 +7777,13 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
return performClassCombine(N, DCI);
case ISD::FCANONICALIZE:
return performFCanonicalizeCombine(N, DCI);
- case AMDGPUISD::FRACT:
case AMDGPUISD::RCP:
+ return performRcpCombine(N, DCI);
+ case AMDGPUISD::FRACT:
case AMDGPUISD::RSQ:
case AMDGPUISD::RCP_LEGACY:
case AMDGPUISD::RSQ_LEGACY:
+ case AMDGPUISD::RCP_IFLAG:
case AMDGPUISD::RSQ_CLAMP:
case AMDGPUISD::LDEXP: {
SDValue Src = N->getOperand(0);
@@ -6561,6 +7803,8 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
return performFMed3Combine(N, DCI);
case AMDGPUISD::CVT_PKRTZ_F16_F32:
return performCvtPkRTZCombine(N, DCI);
+ case AMDGPUISD::CLAMP:
+ return performClampCombine(N, DCI);
case ISD::SCALAR_TO_VECTOR: {
SelectionDAG &DAG = DCI.DAG;
EVT VT = N->getValueType(0);
@@ -6587,7 +7831,7 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
}
-/// \brief Helper function for adjustWritemask
+/// Helper function for adjustWritemask
static unsigned SubIdx2Lane(unsigned Idx) {
switch (Idx) {
default: return 0;
@@ -6598,12 +7842,19 @@ static unsigned SubIdx2Lane(unsigned Idx) {
}
}
-/// \brief Adjust the writemask of MIMG instructions
+/// Adjust the writemask of MIMG instructions
SDNode *SITargetLowering::adjustWritemask(MachineSDNode *&Node,
SelectionDAG &DAG) const {
+ unsigned Opcode = Node->getMachineOpcode();
+
+ // Subtract 1 because the vdata output is not a MachineSDNode operand.
+ int D16Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::d16) - 1;
+ if (D16Idx >= 0 && Node->getConstantOperandVal(D16Idx))
+ return Node; // not implemented for D16
+
SDNode *Users[4] = { nullptr };
unsigned Lane = 0;
- unsigned DmaskIdx = (Node->getNumOperands() - Node->getNumValues() == 9) ? 2 : 3;
+ unsigned DmaskIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::dmask) - 1;
unsigned OldDmask = Node->getConstantOperandVal(DmaskIdx);
unsigned NewDmask = 0;
bool HasChain = Node->getNumValues() > 1;
@@ -6653,9 +7904,7 @@ SDNode *SITargetLowering::adjustWritemask(MachineSDNode *&Node,
unsigned BitsSet = countPopulation(NewDmask);
- const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
- int NewOpcode = AMDGPU::getMaskedMIMGOp(*TII,
- Node->getMachineOpcode(), BitsSet);
+ int NewOpcode = AMDGPU::getMaskedMIMGOp(Node->getMachineOpcode(), BitsSet);
assert(NewOpcode != -1 &&
NewOpcode != static_cast<int>(Node->getMachineOpcode()) &&
"failed to find equivalent MIMG op");
@@ -6720,7 +7969,7 @@ static bool isFrameIndexOp(SDValue Op) {
return isa<FrameIndexSDNode>(Op);
}
-/// \brief Legalize target independent instructions (e.g. INSERT_SUBREG)
+/// Legalize target independent instructions (e.g. INSERT_SUBREG)
/// with frame index operands.
/// LLVM assumes that inputs are to these instructions are registers.
SDNode *SITargetLowering::legalizeTargetIndependentNode(SDNode *Node,
@@ -6767,7 +8016,7 @@ SDNode *SITargetLowering::legalizeTargetIndependentNode(SDNode *Node,
return DAG.UpdateNodeOperands(Node, Ops);
}
-/// \brief Fold the instructions after selecting them.
+/// Fold the instructions after selecting them.
/// Returns null if users were already updated.
SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node,
SelectionDAG &DAG) const {
@@ -6841,7 +8090,7 @@ SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node,
return Node;
}
-/// \brief Assign the register class depending on the number of
+/// Assign the register class depending on the number of
/// bits set in the writemask
void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
SDNode *Node) const {
@@ -6928,7 +8177,7 @@ MachineSDNode *SITargetLowering::wrapAddr64Rsrc(SelectionDAG &DAG,
return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops1);
}
-/// \brief Return a resource descriptor with the 'Add TID' bit enabled
+/// Return a resource descriptor with the 'Add TID' bit enabled
/// The TID (Thread ID) is multiplied by the stride value (bits [61:48]
/// of the resource descriptor) to create an offset, which is added to
/// the resource pointer.
@@ -6970,11 +8219,11 @@ std::pair<unsigned, const TargetRegisterClass *>
SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
StringRef Constraint,
MVT VT) const {
- if (!isTypeLegal(VT))
- return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
-
+ const TargetRegisterClass *RC = nullptr;
if (Constraint.size() == 1) {
switch (Constraint[0]) {
+ default:
+ return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
case 's':
case 'r':
switch (VT.getSizeInBits()) {
@@ -6982,40 +8231,56 @@ SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
return std::make_pair(0U, nullptr);
case 32:
case 16:
- return std::make_pair(0U, &AMDGPU::SReg_32_XM0RegClass);
+ RC = &AMDGPU::SReg_32_XM0RegClass;
+ break;
case 64:
- return std::make_pair(0U, &AMDGPU::SGPR_64RegClass);
+ RC = &AMDGPU::SGPR_64RegClass;
+ break;
case 128:
- return std::make_pair(0U, &AMDGPU::SReg_128RegClass);
+ RC = &AMDGPU::SReg_128RegClass;
+ break;
case 256:
- return std::make_pair(0U, &AMDGPU::SReg_256RegClass);
+ RC = &AMDGPU::SReg_256RegClass;
+ break;
case 512:
- return std::make_pair(0U, &AMDGPU::SReg_512RegClass);
+ RC = &AMDGPU::SReg_512RegClass;
+ break;
}
-
+ break;
case 'v':
switch (VT.getSizeInBits()) {
default:
return std::make_pair(0U, nullptr);
case 32:
case 16:
- return std::make_pair(0U, &AMDGPU::VGPR_32RegClass);
+ RC = &AMDGPU::VGPR_32RegClass;
+ break;
case 64:
- return std::make_pair(0U, &AMDGPU::VReg_64RegClass);
+ RC = &AMDGPU::VReg_64RegClass;
+ break;
case 96:
- return std::make_pair(0U, &AMDGPU::VReg_96RegClass);
+ RC = &AMDGPU::VReg_96RegClass;
+ break;
case 128:
- return std::make_pair(0U, &AMDGPU::VReg_128RegClass);
+ RC = &AMDGPU::VReg_128RegClass;
+ break;
case 256:
- return std::make_pair(0U, &AMDGPU::VReg_256RegClass);
+ RC = &AMDGPU::VReg_256RegClass;
+ break;
case 512:
- return std::make_pair(0U, &AMDGPU::VReg_512RegClass);
+ RC = &AMDGPU::VReg_512RegClass;
+ break;
}
+ break;
}
+ // We actually support i128, i16 and f16 as inline parameters
+ // even if they are not reported as legal
+ if (RC && (isTypeLegal(VT) || VT.SimpleTy == MVT::i128 ||
+ VT.SimpleTy == MVT::i16 || VT.SimpleTy == MVT::f16))
+ return std::make_pair(0U, RC);
}
if (Constraint.size() > 1) {
- const TargetRegisterClass *RC = nullptr;
if (Constraint[1] == 'v') {
RC = &AMDGPU::VGPR_32RegClass;
} else if (Constraint[1] == 's') {
@@ -7052,8 +8317,7 @@ void SITargetLowering::finalizeLowering(MachineFunction &MF) const {
MachineRegisterInfo &MRI = MF.getRegInfo();
SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
const MachineFrameInfo &MFI = MF.getFrameInfo();
- const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
- const SIRegisterInfo *TRI = ST.getRegisterInfo();
+ const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
if (Info->isEntryFunction()) {
// Callable functions have fixed registers used for stack access.
@@ -7083,6 +8347,8 @@ void SITargetLowering::finalizeLowering(MachineFunction &MF) const {
MRI.replaceRegWith(AMDGPU::SCRATCH_WAVE_OFFSET_REG,
Info->getScratchWaveOffsetReg());
+ Info->limitOccupancy(MF);
+
TargetLoweringBase::finalizeLowering(MF);
}
@@ -7103,3 +8369,69 @@ void SITargetLowering::computeKnownBitsForFrameIndex(const SDValue Op,
// calculation won't overflow, so assume the sign bit is never set.
Known.Zero.setHighBits(AssumeFrameIndexHighZeroBits);
}
+
+bool SITargetLowering::isSDNodeSourceOfDivergence(const SDNode * N,
+ FunctionLoweringInfo * FLI, DivergenceAnalysis * DA) const
+{
+ switch (N->getOpcode()) {
+ case ISD::Register:
+ case ISD::CopyFromReg:
+ {
+ const RegisterSDNode *R = nullptr;
+ if (N->getOpcode() == ISD::Register) {
+ R = dyn_cast<RegisterSDNode>(N);
+ }
+ else {
+ R = dyn_cast<RegisterSDNode>(N->getOperand(1));
+ }
+ if (R)
+ {
+ const MachineFunction * MF = FLI->MF;
+ const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
+ const MachineRegisterInfo &MRI = MF->getRegInfo();
+ const SIRegisterInfo &TRI = ST.getInstrInfo()->getRegisterInfo();
+ unsigned Reg = R->getReg();
+ if (TRI.isPhysicalRegister(Reg))
+ return TRI.isVGPR(MRI, Reg);
+
+ if (MRI.isLiveIn(Reg)) {
+ // workitem.id.x workitem.id.y workitem.id.z
+ // Any VGPR formal argument is also considered divergent
+ if (TRI.isVGPR(MRI, Reg))
+ return true;
+ // Formal arguments of non-entry functions
+ // are conservatively considered divergent
+ else if (!AMDGPU::isEntryFunctionCC(FLI->Fn->getCallingConv()))
+ return true;
+ }
+ return !DA || DA->isDivergent(FLI->getValueFromVirtualReg(Reg));
+ }
+ }
+ break;
+ case ISD::LOAD: {
+ const LoadSDNode *L = dyn_cast<LoadSDNode>(N);
+ if (L->getMemOperand()->getAddrSpace() ==
+ Subtarget->getAMDGPUAS().PRIVATE_ADDRESS)
+ return true;
+ } break;
+ case ISD::CALLSEQ_END:
+ return true;
+  case ISD::INTRINSIC_WO_CHAIN:
+    return AMDGPU::isIntrinsicSourceOfDivergence(
+ cast<ConstantSDNode>(N->getOperand(0))->getZExtValue());
+ case ISD::INTRINSIC_W_CHAIN:
+ return AMDGPU::isIntrinsicSourceOfDivergence(
+ cast<ConstantSDNode>(N->getOperand(1))->getZExtValue());
+  // In some cases intrinsics that are a source of divergence have been
+  // lowered to AMDGPUISD, so we need to check those as well.
+ case AMDGPUISD::INTERP_MOV:
+ case AMDGPUISD::INTERP_P1:
+ case AMDGPUISD::INTERP_P2:
+ return true;
+ }
+ return false;
+}
diff --git a/lib/Target/AMDGPU/SIISelLowering.h b/lib/Target/AMDGPU/SIISelLowering.h
index b48e67f7563a..ad049f2a71c3 100644
--- a/lib/Target/AMDGPU/SIISelLowering.h
+++ b/lib/Target/AMDGPU/SIISelLowering.h
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
//
/// \file
-/// \brief SI DAG Lowering interface definition
+/// SI DAG Lowering interface definition
//
//===----------------------------------------------------------------------===//
@@ -22,12 +22,15 @@
namespace llvm {
class SITargetLowering final : public AMDGPUTargetLowering {
+private:
+ const GCNSubtarget *Subtarget;
+
SDValue lowerKernArgParameterPtr(SelectionDAG &DAG, const SDLoc &SL,
SDValue Chain, uint64_t Offset) const;
SDValue getImplicitArgPtr(SelectionDAG &DAG, const SDLoc &SL) const;
SDValue lowerKernargMemParameter(SelectionDAG &DAG, EVT VT, EVT MemVT,
const SDLoc &SL, SDValue Chain,
- uint64_t Offset, bool Signed,
+ uint64_t Offset, unsigned Align, bool Signed,
const ISD::InputArg *Arg = nullptr) const;
SDValue lowerStackParameter(SelectionDAG &DAG, CCValAssign &VA,
@@ -42,10 +45,14 @@ class SITargetLowering final : public AMDGPUTargetLowering {
SelectionDAG &DAG) const override;
SDValue lowerImplicitZextParam(SelectionDAG &DAG, SDValue Op,
MVT VT, unsigned Offset) const;
+ SDValue lowerImage(SDValue Op, const AMDGPU::ImageDimIntrinsicInfo *Intr,
+ SelectionDAG &DAG) const;
SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerINTRINSIC_VOID(SDValue Op, SelectionDAG &DAG) const;
+
+ SDValue widenLoad(LoadSDNode *Ld, DAGCombinerInfo &DCI) const;
SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerFastUnsafeFDIV(SDValue Op, SelectionDAG &DAG) const;
@@ -60,7 +67,13 @@ class SITargetLowering final : public AMDGPUTargetLowering {
SDValue LowerATOMIC_CMP_SWAP(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const;
- /// \brief Converts \p Op, which must be of floating point type, to the
+ SDValue adjustLoadValueType(unsigned Opcode, MemSDNode *M,
+ SelectionDAG &DAG,
+ bool IsIntrinsic = false) const;
+
+ SDValue handleD16VData(SDValue VData, SelectionDAG &DAG) const;
+
+ /// Converts \p Op, which must be of floating point type, to the
/// floating point type \p VT, by either extending or truncating it.
SDValue getFPExtOrFPTrunc(SelectionDAG &DAG,
SDValue Op,
@@ -71,7 +84,7 @@ class SITargetLowering final : public AMDGPUTargetLowering {
SelectionDAG &DAG, EVT VT, EVT MemVT, const SDLoc &SL, SDValue Val,
bool Signed, const ISD::InputArg *Arg = nullptr) const;
- /// \brief Custom lowering for ISD::FP_ROUND for MVT::f16.
+ /// Custom lowering for ISD::FP_ROUND for MVT::f16.
SDValue lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const;
SDValue getSegmentAperture(unsigned AS, const SDLoc &DL,
@@ -80,7 +93,9 @@ class SITargetLowering final : public AMDGPUTargetLowering {
SDValue lowerADDRSPACECAST(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerTRAP(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerDEBUGTRAP(SDValue Op, SelectionDAG &DAG) const;
SDNode *adjustWritemask(MachineSDNode *&N, SelectionDAG &DAG) const;
@@ -121,8 +136,11 @@ class SITargetLowering final : public AMDGPUTargetLowering {
SDValue performSubCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performFAddCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performFSubCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+ SDValue performFMACombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performSetCCCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performCvtF32UByteNCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+ SDValue performClampCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+ SDValue performRcpCombine(SDNode *N, DAGCombinerInfo &DCI) const;
bool isLegalFlatAddressingMode(const AddrMode &AM) const;
bool isLegalGlobalAddressingMode(const AddrMode &AM) const;
@@ -145,9 +163,11 @@ class SITargetLowering final : public AMDGPUTargetLowering {
bool shouldEmitPCReloc(const GlobalValue *GV) const;
public:
- SITargetLowering(const TargetMachine &tm, const SISubtarget &STI);
+ SITargetLowering(const TargetMachine &tm, const GCNSubtarget &STI);
- const SISubtarget *getSubtarget() const;
+ const GCNSubtarget *getSubtarget() const;
+
+ bool isFPExtFoldable(unsigned Opcode, EVT DestVT, EVT SrcVT) const override;
bool isShuffleMaskLegal(ArrayRef<int> /*Mask*/, EVT /*VT*/) const override;
@@ -255,7 +275,10 @@ public:
EVT VT) const override;
MVT getScalarShiftAmountTy(const DataLayout &, EVT) const override;
bool isFMAFasterThanFMulAndFAdd(EVT VT) const override;
+ SDValue splitUnaryVectorOp(SDValue Op, SelectionDAG &DAG) const;
+ SDValue splitBinaryVectorOp(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;
+
void ReplaceNodeResults(SDNode *N, SmallVectorImpl<SDValue> &Results,
SelectionDAG &DAG) const override;
@@ -284,6 +307,9 @@ public:
const APInt &DemandedElts,
const SelectionDAG &DAG,
unsigned Depth = 0) const override;
+
+ bool isSDNodeSourceOfDivergence(const SDNode *N,
+ FunctionLoweringInfo *FLI, DivergenceAnalysis *DA) const override;
};
} // End namespace llvm
diff --git a/lib/Target/AMDGPU/SIInsertSkips.cpp b/lib/Target/AMDGPU/SIInsertSkips.cpp
index a2f844d7854e..61c8f359e168 100644
--- a/lib/Target/AMDGPU/SIInsertSkips.cpp
+++ b/lib/Target/AMDGPU/SIInsertSkips.cpp
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
//
/// \file
-/// \brief This pass inserts branches on the 0 exec mask over divergent branches
+/// This pass inserts branches on the 0 exec mask over divergent
/// branches when it's expected that jumping over the untaken control flow will
/// be cheaper than having every workitem no-op through it.
//
@@ -18,6 +18,7 @@
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
@@ -210,65 +211,73 @@ void SIInsertSkips::kill(MachineInstr &MI) {
switch (MI.getOperand(2).getImm()) {
case ISD::SETOEQ:
case ISD::SETEQ:
- Opcode = AMDGPU::V_CMPX_EQ_F32_e32;
+ Opcode = AMDGPU::V_CMPX_EQ_F32_e64;
break;
case ISD::SETOGT:
case ISD::SETGT:
- Opcode = AMDGPU::V_CMPX_LT_F32_e32;
+ Opcode = AMDGPU::V_CMPX_LT_F32_e64;
break;
case ISD::SETOGE:
case ISD::SETGE:
- Opcode = AMDGPU::V_CMPX_LE_F32_e32;
+ Opcode = AMDGPU::V_CMPX_LE_F32_e64;
break;
case ISD::SETOLT:
case ISD::SETLT:
- Opcode = AMDGPU::V_CMPX_GT_F32_e32;
+ Opcode = AMDGPU::V_CMPX_GT_F32_e64;
break;
case ISD::SETOLE:
case ISD::SETLE:
- Opcode = AMDGPU::V_CMPX_GE_F32_e32;
+ Opcode = AMDGPU::V_CMPX_GE_F32_e64;
break;
case ISD::SETONE:
case ISD::SETNE:
- Opcode = AMDGPU::V_CMPX_LG_F32_e32;
+ Opcode = AMDGPU::V_CMPX_LG_F32_e64;
break;
case ISD::SETO:
- Opcode = AMDGPU::V_CMPX_O_F32_e32;
+ Opcode = AMDGPU::V_CMPX_O_F32_e64;
break;
case ISD::SETUO:
- Opcode = AMDGPU::V_CMPX_U_F32_e32;
+ Opcode = AMDGPU::V_CMPX_U_F32_e64;
break;
case ISD::SETUEQ:
- Opcode = AMDGPU::V_CMPX_NLG_F32_e32;
+ Opcode = AMDGPU::V_CMPX_NLG_F32_e64;
break;
case ISD::SETUGT:
- Opcode = AMDGPU::V_CMPX_NGE_F32_e32;
+ Opcode = AMDGPU::V_CMPX_NGE_F32_e64;
break;
case ISD::SETUGE:
- Opcode = AMDGPU::V_CMPX_NGT_F32_e32;
+ Opcode = AMDGPU::V_CMPX_NGT_F32_e64;
break;
case ISD::SETULT:
- Opcode = AMDGPU::V_CMPX_NLE_F32_e32;
+ Opcode = AMDGPU::V_CMPX_NLE_F32_e64;
break;
case ISD::SETULE:
- Opcode = AMDGPU::V_CMPX_NLT_F32_e32;
+ Opcode = AMDGPU::V_CMPX_NLT_F32_e64;
break;
case ISD::SETUNE:
- Opcode = AMDGPU::V_CMPX_NEQ_F32_e32;
+ Opcode = AMDGPU::V_CMPX_NEQ_F32_e64;
break;
default:
llvm_unreachable("invalid ISD:SET cond code");
}
- // TODO: Allow this:
- if (!MI.getOperand(0).isReg() ||
- !TRI->isVGPR(MBB.getParent()->getRegInfo(),
- MI.getOperand(0).getReg()))
- llvm_unreachable("SI_KILL operand should be a VGPR");
-
- BuildMI(MBB, &MI, DL, TII->get(Opcode))
- .add(MI.getOperand(1))
- .add(MI.getOperand(0));
+ assert(MI.getOperand(0).isReg());
+
+ if (TRI->isVGPR(MBB.getParent()->getRegInfo(),
+ MI.getOperand(0).getReg())) {
+ Opcode = AMDGPU::getVOPe32(Opcode);
+ BuildMI(MBB, &MI, DL, TII->get(Opcode))
+ .add(MI.getOperand(1))
+ .add(MI.getOperand(0));
+ } else {
+ BuildMI(MBB, &MI, DL, TII->get(Opcode))
+ .addReg(AMDGPU::VCC, RegState::Define)
+ .addImm(0) // src0 modifiers
+ .add(MI.getOperand(1))
+ .addImm(0) // src1 modifiers
+ .add(MI.getOperand(0))
+ .addImm(0); // omod
+ }
break;
}
case AMDGPU::SI_KILL_I1_TERMINATOR: {
@@ -330,7 +339,7 @@ bool SIInsertSkips::skipMaskBranch(MachineInstr &MI,
}
bool SIInsertSkips::runOnMachineFunction(MachineFunction &MF) {
- const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
TII = ST.getInstrInfo();
TRI = &TII->getRegisterInfo();
SkipThreshold = SkipThresholdFlag;
diff --git a/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index 6bbe5979316d..d456e3d9b94d 100644
--- a/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
//
/// \file
-/// \brief Insert wait instructions for memory reads and writes.
+/// Insert wait instructions for memory reads and writes.
///
/// Memory reads and writes are issued asynchronously, so we need to insert
/// S_WAITCNT instructions when we want to access any of their results or
@@ -40,6 +40,7 @@
#include "llvm/IR/DebugLoc.h"
#include "llvm/Pass.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Support/DebugCounter.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
#include <algorithm>
@@ -50,9 +51,21 @@
#include <utility>
#include <vector>
+using namespace llvm;
+
#define DEBUG_TYPE "si-insert-waitcnts"
-using namespace llvm;
+DEBUG_COUNTER(ForceExpCounter, DEBUG_TYPE"-forceexp",
+ "Force emit s_waitcnt expcnt(0) instrs");
+DEBUG_COUNTER(ForceLgkmCounter, DEBUG_TYPE"-forcelgkm",
+ "Force emit s_waitcnt lgkmcnt(0) instrs");
+DEBUG_COUNTER(ForceVMCounter, DEBUG_TYPE"-forcevm",
+ "Force emit s_waitcnt vmcnt(0) instrs");
+
+static cl::opt<unsigned> ForceEmitZeroFlag(
+ "amdgpu-waitcnt-forcezero",
+ cl::desc("Force all waitcnt instrs to be emitted as s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)"),
+ cl::init(0), cl::Hidden);
namespace {
@@ -115,15 +128,15 @@ enum RegisterMapping {
(w) = (enum WaitEventType)((w) + 1))
// This is a per-basic-block object that maintains current score brackets
-// of each wait-counter, and a per-register scoreboard for each wait-couner.
+// of each wait counter, and a per-register scoreboard for each wait counter.
// We also maintain the latest score for every event type that can change the
// waitcnt in order to know if there are multiple types of events within
// the brackets. When multiple types of event happen in the bracket,
-// wait-count may get decreased out of order, therefore we need to put in
+// wait count may get decreased out of order, therefore we need to put in
// "s_waitcnt 0" before use.
class BlockWaitcntBrackets {
public:
- BlockWaitcntBrackets() {
+ BlockWaitcntBrackets(const GCNSubtarget *SubTarget) : ST(SubTarget) {
for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
T = (enum InstCounterType)(T + 1)) {
memset(VgprScores[T], 0, sizeof(VgprScores[T]));
@@ -301,6 +314,7 @@ public:
void dump() { print(dbgs()); }
private:
+ const GCNSubtarget *ST = nullptr;
bool WaitAtBeginning = false;
bool RevisitLoop = false;
bool MixedExpTypes = false;
@@ -332,14 +346,12 @@ public:
void incIterCnt() { IterCnt++; }
void resetIterCnt() { IterCnt = 0; }
- int32_t getIterCnt() { return IterCnt; }
+ unsigned getIterCnt() { return IterCnt; }
void setWaitcnt(MachineInstr *WaitcntIn) { LfWaitcnt = WaitcntIn; }
MachineInstr *getWaitcnt() const { return LfWaitcnt; }
- void print() {
- DEBUG(dbgs() << " iteration " << IterCnt << '\n';);
- }
+ void print() { LLVM_DEBUG(dbgs() << " iteration " << IterCnt << '\n';); }
private:
  // s_waitcnt added at the end of the loop footer to stabilize wait scores
@@ -352,7 +364,7 @@ private:
class SIInsertWaitcnts : public MachineFunctionPass {
private:
- const SISubtarget *ST = nullptr;
+ const GCNSubtarget *ST = nullptr;
const SIInstrInfo *TII = nullptr;
const SIRegisterInfo *TRI = nullptr;
const MachineRegisterInfo *MRI = nullptr;
@@ -361,22 +373,31 @@ private:
AMDGPUAS AMDGPUASI;
DenseSet<MachineBasicBlock *> BlockVisitedSet;
- DenseSet<MachineInstr *> CompilerGeneratedWaitcntSet;
+ DenseSet<MachineInstr *> TrackedWaitcntSet;
DenseSet<MachineInstr *> VCCZBugHandledSet;
DenseMap<MachineBasicBlock *, std::unique_ptr<BlockWaitcntBrackets>>
BlockWaitcntBracketsMap;
- DenseSet<MachineBasicBlock *> BlockWaitcntProcessedSet;
+ std::vector<MachineBasicBlock *> BlockWaitcntProcessedSet;
DenseMap<MachineLoop *, std::unique_ptr<LoopWaitcntData>> LoopWaitcntDataMap;
std::vector<std::unique_ptr<BlockWaitcntBrackets>> KillWaitBrackets;
+  // ForceEmitZeroWaitcnts: force all waitcnt instrs to be s_waitcnt 0
+  // because of the amdgpu-waitcnt-forcezero flag
+ bool ForceEmitZeroWaitcnts;
+ bool ForceEmitWaitcnt[NUM_INST_CNTS];
+
public:
static char ID;
- SIInsertWaitcnts() : MachineFunctionPass(ID) {}
+ SIInsertWaitcnts() : MachineFunctionPass(ID) {
+ (void)ForceExpCounter;
+ (void)ForceLgkmCounter;
+ (void)ForceVMCounter;
+ }
bool runOnMachineFunction(MachineFunction &MF) override;
@@ -397,15 +418,53 @@ public:
llvm::make_unique<BlockWaitcntBrackets>(*Bracket));
}
+ bool isForceEmitWaitcnt() const {
+ for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
+ T = (enum InstCounterType)(T + 1))
+ if (ForceEmitWaitcnt[T])
+ return true;
+ return false;
+ }
+
+ void setForceEmitWaitcnt() {
+// For non-debug builds, ForceEmitWaitcnt has been initialized to false;
+// For debug builds, get the debug counter info and adjust if need be
+#ifndef NDEBUG
+ if (DebugCounter::isCounterSet(ForceExpCounter) &&
+ DebugCounter::shouldExecute(ForceExpCounter)) {
+ ForceEmitWaitcnt[EXP_CNT] = true;
+ } else {
+ ForceEmitWaitcnt[EXP_CNT] = false;
+ }
+
+ if (DebugCounter::isCounterSet(ForceLgkmCounter) &&
+ DebugCounter::shouldExecute(ForceLgkmCounter)) {
+ ForceEmitWaitcnt[LGKM_CNT] = true;
+ } else {
+ ForceEmitWaitcnt[LGKM_CNT] = false;
+ }
+
+ if (DebugCounter::isCounterSet(ForceVMCounter) &&
+ DebugCounter::shouldExecute(ForceVMCounter)) {
+ ForceEmitWaitcnt[VM_CNT] = true;
+ } else {
+ ForceEmitWaitcnt[VM_CNT] = false;
+ }
+#endif // NDEBUG
+ }
+
bool mayAccessLDSThroughFlat(const MachineInstr &MI) const;
- MachineInstr *generateSWaitCntInstBefore(MachineInstr &MI,
- BlockWaitcntBrackets *ScoreBrackets);
- void updateEventWaitCntAfter(MachineInstr &Inst,
+ void generateWaitcntInstBefore(MachineInstr &MI,
+ BlockWaitcntBrackets *ScoreBrackets);
+ void updateEventWaitcntAfter(MachineInstr &Inst,
BlockWaitcntBrackets *ScoreBrackets);
void mergeInputScoreBrackets(MachineBasicBlock &Block);
- MachineBasicBlock *loopBottom(const MachineLoop *Loop);
+ bool isLoopBottom(const MachineLoop *Loop, const MachineBasicBlock *Block);
+ unsigned countNumBottomBlocks(const MachineLoop *Loop);
void insertWaitcntInBlock(MachineFunction &MF, MachineBasicBlock &Block);
void insertWaitcntBeforeCF(MachineBasicBlock &Block, MachineInstr *Inst);
+ bool isWaitcntStronger(unsigned LHS, unsigned RHS);
+ unsigned combineWaitcnt(unsigned LHS, unsigned RHS);
};
} // end anonymous namespace
@@ -459,7 +518,7 @@ void BlockWaitcntBrackets::setExpScore(const MachineInstr *MI,
const MachineRegisterInfo *MRI,
unsigned OpNo, int32_t Val) {
RegInterval Interval = getRegInterval(MI, TII, MRI, TRI, OpNo, false);
- DEBUG({
+ LLVM_DEBUG({
const MachineOperand &Opnd = MI->getOperand(OpNo);
assert(TRI->isVGPR(*MRI, Opnd.getReg()));
});
@@ -681,14 +740,17 @@ unsigned int BlockWaitcntBrackets::updateByWait(InstCounterType T,
const int32_t LB = getScoreLB(T);
const int32_t UB = getScoreUB(T);
if ((UB >= ScoreToWait) && (ScoreToWait > LB)) {
- if (T == VM_CNT && hasPendingFlat()) {
- // If there is a pending FLAT operation, and this is a VM waitcnt,
- // then we need to force a waitcnt 0 for VM.
+ if ((T == VM_CNT || T == LGKM_CNT) &&
+ hasPendingFlat() &&
+ !ST->hasFlatLgkmVMemCountInOrder()) {
+ // If there is a pending FLAT operation, and this is a VMem or LGKM
+ // waitcnt and the target can report early completion, then we need
+ // to force a waitcnt 0.
NeedWait = CNT_MASK(T);
setScoreLB(T, getScoreUB(T));
} else if (counterOutOfOrder(T)) {
// Counter can get decremented out-of-order when there
- // are multiple types event in the brack. Also emit an s_wait counter
+ // are multiple event types in the bracket. Also emit an s_wait counter
// with a conservative value of 0 for the counter.
NeedWait = CNT_MASK(T);
setScoreLB(T, getScoreUB(T));
@@ -789,7 +851,30 @@ static bool readsVCCZ(const MachineInstr &MI) {
!MI.getOperand(1).isUndef();
}
-/// \brief Generate s_waitcnt instruction to be placed before cur_Inst.
+/// Given two wait count encodings, check whether LHS is stronger than RHS.
+bool SIInsertWaitcnts::isWaitcntStronger(unsigned LHS, unsigned RHS) {
+ if (AMDGPU::decodeVmcnt(IV, LHS) > AMDGPU::decodeVmcnt(IV, RHS))
+ return false;
+ if (AMDGPU::decodeLgkmcnt(IV, LHS) > AMDGPU::decodeLgkmcnt(IV, RHS))
+ return false;
+ if (AMDGPU::decodeExpcnt(IV, LHS) > AMDGPU::decodeExpcnt(IV, RHS))
+ return false;
+ return true;
+}
+
+/// Given two wait count encodings, create a new encoding that is stronger
+/// than or equal to both.
+unsigned SIInsertWaitcnts::combineWaitcnt(unsigned LHS, unsigned RHS) {
+ unsigned VmCnt = std::min(AMDGPU::decodeVmcnt(IV, LHS),
+ AMDGPU::decodeVmcnt(IV, RHS));
+ unsigned LgkmCnt = std::min(AMDGPU::decodeLgkmcnt(IV, LHS),
+ AMDGPU::decodeLgkmcnt(IV, RHS));
+ unsigned ExpCnt = std::min(AMDGPU::decodeExpcnt(IV, LHS),
+ AMDGPU::decodeExpcnt(IV, RHS));
+ return AMDGPU::encodeWaitcnt(IV, VmCnt, ExpCnt, LgkmCnt);
+}
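
To make the two helpers concrete: smaller field values wait for more outstanding operations, so LHS is stronger when every decoded field is less than or equal to the corresponding field of RHS, and the per-field minimum yields a wait that subsumes both. The self-contained sketch below models this with a made-up fixed bit layout instead of the ISA-dependent AMDGPU::encodeWaitcnt/decode* helpers.

#include <algorithm>
#include <cassert>
#include <cstdint>

// Stand-in packing: 6 bits vmcnt | 3 bits expcnt | 4 bits lgkmcnt.
// The real layout varies by ISA version and lives in AMDGPUBaseInfo.
struct Waitcnt { uint32_t Vm, Exp, Lgkm; };

static uint32_t encode(const Waitcnt &W) {
  return (W.Vm & 0x3f) | ((W.Exp & 0x7) << 6) | ((W.Lgkm & 0xf) << 9);
}
static Waitcnt decode(uint32_t Enc) {
  return {Enc & 0x3f, (Enc >> 6) & 0x7, (Enc >> 9) & 0xf};
}

// LHS is stronger if it waits at least as hard on every counter.
static bool isStronger(uint32_t LHS, uint32_t RHS) {
  Waitcnt L = decode(LHS), R = decode(RHS);
  return L.Vm <= R.Vm && L.Exp <= R.Exp && L.Lgkm <= R.Lgkm;
}

// Per-field minimum: the result is stronger than or equal to both inputs.
static uint32_t combine(uint32_t LHS, uint32_t RHS) {
  Waitcnt L = decode(LHS), R = decode(RHS);
  return encode({std::min(L.Vm, R.Vm), std::min(L.Exp, R.Exp),
                 std::min(L.Lgkm, R.Lgkm)});
}

int main() {
  uint32_t A = encode({0, 7, 15}); // vmcnt(0): drain all VMEM ops
  uint32_t B = encode({63, 7, 0}); // lgkmcnt(0): drain all LGKM ops
  uint32_t C = combine(A, B);      // waits on both: vmcnt(0) lgkmcnt(0)
  assert(!isStronger(A, B) && !isStronger(B, A));
  assert(isStronger(C, A) && isStronger(C, B));
  return 0;
}

With the real helpers the same compare/min logic is applied field by field, which is exactly what the two functions above do.
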
+
+/// Generate s_waitcnt instruction to be placed before cur_Inst.
/// Instructions of a given type are returned in order,
/// but instructions of different types can complete out of order.
/// We rely on this in-order completion
@@ -799,23 +884,29 @@ static bool readsVCCZ(const MachineInstr &MI) {
/// and if so what the value of each counter is.
/// The "score bracket" is bound by the lower bound and upper bound
/// scores (*_score_LB and *_score_ub respectively).
-MachineInstr *SIInsertWaitcnts::generateSWaitCntInstBefore(
+void SIInsertWaitcnts::generateWaitcntInstBefore(
MachineInstr &MI, BlockWaitcntBrackets *ScoreBrackets) {
// To emit, or not to emit - that's the question!
// Start with an assumption that there is no need to emit.
- unsigned int EmitSwaitcnt = 0;
- // s_waitcnt instruction to return; default is NULL.
- MachineInstr *SWaitInst = nullptr;
+ unsigned int EmitWaitcnt = 0;
+
// No need to wait before a phi. If a phi-move exists, then the wait should
// have been inserted before the move. If a phi-move does not exist, then the
// wait should be inserted before the real use. The same is true for
// sc-merge. It is not a coincidence that all these cases correspond to the
// instructions that are skipped in the assembling loop.
bool NeedLineMapping = false; // TODO: Check on this.
- if (MI.isDebugValue() &&
+
+ // ForceEmitZeroWaitcnt: force a single s_waitcnt 0 due to hw bug
+ bool ForceEmitZeroWaitcnt = false;
+
+ setForceEmitWaitcnt();
+ bool IsForceEmitWaitcnt = isForceEmitWaitcnt();
+
+ if (MI.isDebugInstr() &&
// TODO: any other opcode?
!NeedLineMapping) {
- return SWaitInst;
+ return;
}
// See if an s_waitcnt is forced at block entry, or is needed at
@@ -826,7 +917,7 @@ MachineInstr *SIInsertWaitcnts::generateSWaitCntInstBefore(
ScoreBrackets->clearWaitAtBeginning();
for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
T = (enum InstCounterType)(T + 1)) {
- EmitSwaitcnt |= CNT_MASK(T);
+ EmitWaitcnt |= CNT_MASK(T);
ScoreBrackets->setScoreLB(T, ScoreBrackets->getScoreUB(T));
}
}
@@ -836,21 +927,20 @@ MachineInstr *SIInsertWaitcnts::generateSWaitCntInstBefore(
else if (MI.getOpcode() == AMDGPU::BUFFER_WBINVL1 ||
MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_SC ||
MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_VOL) {
- EmitSwaitcnt |=
+ EmitWaitcnt |=
ScoreBrackets->updateByWait(VM_CNT, ScoreBrackets->getScoreUB(VM_CNT));
}
// All waits must be resolved at call return.
// NOTE: this could be improved with knowledge of all call sites or
// with knowledge of the called routines.
- if (MI.getOpcode() == AMDGPU::RETURN ||
- MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG ||
+ if (MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG ||
MI.getOpcode() == AMDGPU::S_SETPC_B64_return) {
for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
T = (enum InstCounterType)(T + 1)) {
if (ScoreBrackets->getScoreUB(T) > ScoreBrackets->getScoreLB(T)) {
ScoreBrackets->setScoreLB(T, ScoreBrackets->getScoreUB(T));
- EmitSwaitcnt |= CNT_MASK(T);
+ EmitWaitcnt |= CNT_MASK(T);
}
}
}
@@ -861,7 +951,7 @@ MachineInstr *SIInsertWaitcnts::generateSWaitCntInstBefore(
AMDGPU::SendMsg::ID_GS_DONE)) {
if (ScoreBrackets->getScoreUB(VM_CNT) > ScoreBrackets->getScoreLB(VM_CNT)) {
ScoreBrackets->setScoreLB(VM_CNT, ScoreBrackets->getScoreUB(VM_CNT));
- EmitSwaitcnt |= CNT_MASK(VM_CNT);
+ EmitWaitcnt |= CNT_MASK(VM_CNT);
}
}
#if 0 // TODO: the following blocks of logic when we have fence.
@@ -879,11 +969,11 @@ MachineInstr *SIInsertWaitcnts::generateSWaitCntInstBefore(
case SCMEM_LDS:
if (group_is_multi_wave ||
context->OptFlagIsOn(OPT_R1100_LDSMEM_FENCE_CHICKEN_BIT)) {
- EmitSwaitcnt |= ScoreBrackets->updateByWait(LGKM_CNT,
+ EmitWaitcnt |= ScoreBrackets->updateByWait(LGKM_CNT,
ScoreBrackets->getScoreUB(LGKM_CNT));
// LDS may have to wait for VM_CNT after buffer load to LDS
if (target_info->HasBufferLoadToLDS()) {
- EmitSwaitcnt |= ScoreBrackets->updateByWait(VM_CNT,
+ EmitWaitcnt |= ScoreBrackets->updateByWait(VM_CNT,
ScoreBrackets->getScoreUB(VM_CNT));
}
}
@@ -891,9 +981,9 @@ MachineInstr *SIInsertWaitcnts::generateSWaitCntInstBefore(
case SCMEM_GDS:
if (group_is_multi_wave || fence_is_global) {
- EmitSwaitcnt |= ScoreBrackets->updateByWait(EXP_CNT,
+ EmitWaitcnt |= ScoreBrackets->updateByWait(EXP_CNT,
ScoreBrackets->getScoreUB(EXP_CNT));
- EmitSwaitcnt |= ScoreBrackets->updateByWait(LGKM_CNT,
+ EmitWaitcnt |= ScoreBrackets->updateByWait(LGKM_CNT,
ScoreBrackets->getScoreUB(LGKM_CNT));
}
break;
@@ -903,9 +993,9 @@ MachineInstr *SIInsertWaitcnts::generateSWaitCntInstBefore(
case SCMEM_RING:
case SCMEM_SCATTER:
if (group_is_multi_wave || fence_is_global) {
- EmitSwaitcnt |= ScoreBrackets->updateByWait(EXP_CNT,
+ EmitWaitcnt |= ScoreBrackets->updateByWait(EXP_CNT,
ScoreBrackets->getScoreUB(EXP_CNT));
- EmitSwaitcnt |= ScoreBrackets->updateByWait(VM_CNT,
+ EmitWaitcnt |= ScoreBrackets->updateByWait(VM_CNT,
ScoreBrackets->getScoreUB(VM_CNT));
}
break;
@@ -926,13 +1016,13 @@ MachineInstr *SIInsertWaitcnts::generateSWaitCntInstBefore(
if (MI.modifiesRegister(AMDGPU::EXEC, TRI)) {
// Export and GDS are tracked individually, either may trigger a waitcnt
// for EXEC.
- EmitSwaitcnt |= ScoreBrackets->updateByWait(
+ EmitWaitcnt |= ScoreBrackets->updateByWait(
EXP_CNT, ScoreBrackets->getEventUB(EXP_GPR_LOCK));
- EmitSwaitcnt |= ScoreBrackets->updateByWait(
+ EmitWaitcnt |= ScoreBrackets->updateByWait(
EXP_CNT, ScoreBrackets->getEventUB(EXP_PARAM_ACCESS));
- EmitSwaitcnt |= ScoreBrackets->updateByWait(
+ EmitWaitcnt |= ScoreBrackets->updateByWait(
EXP_CNT, ScoreBrackets->getEventUB(EXP_POS_ACCESS));
- EmitSwaitcnt |= ScoreBrackets->updateByWait(
+ EmitWaitcnt |= ScoreBrackets->updateByWait(
EXP_CNT, ScoreBrackets->getEventUB(GDS_GPR_LOCK));
}
@@ -947,7 +1037,7 @@ MachineInstr *SIInsertWaitcnts::generateSWaitCntInstBefore(
if (ScoreBrackets->getScoreUB(EXP_CNT) >
ScoreBrackets->getScoreLB(EXP_CNT)) {
ScoreBrackets->setScoreLB(EXP_CNT, ScoreBrackets->getScoreUB(EXP_CNT));
- EmitSwaitcnt |= CNT_MASK(EXP_CNT);
+ EmitWaitcnt |= CNT_MASK(EXP_CNT);
}
}
#endif
@@ -965,7 +1055,7 @@ MachineInstr *SIInsertWaitcnts::generateSWaitCntInstBefore(
continue;
unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS;
// VM_CNT is only relevant to vgpr or LDS.
- EmitSwaitcnt |= ScoreBrackets->updateByWait(
+ EmitWaitcnt |= ScoreBrackets->updateByWait(
VM_CNT, ScoreBrackets->getRegScore(RegNo, VM_CNT));
}
@@ -977,10 +1067,10 @@ MachineInstr *SIInsertWaitcnts::generateSWaitCntInstBefore(
for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
if (TRI->isVGPR(MRIA, Op.getReg())) {
// VM_CNT is only relevant to vgpr or LDS.
- EmitSwaitcnt |= ScoreBrackets->updateByWait(
+ EmitWaitcnt |= ScoreBrackets->updateByWait(
VM_CNT, ScoreBrackets->getRegScore(RegNo, VM_CNT));
}
- EmitSwaitcnt |= ScoreBrackets->updateByWait(
+ EmitWaitcnt |= ScoreBrackets->updateByWait(
LGKM_CNT, ScoreBrackets->getRegScore(RegNo, LGKM_CNT));
}
}
@@ -999,9 +1089,9 @@ MachineInstr *SIInsertWaitcnts::generateSWaitCntInstBefore(
if (AS != AMDGPUASI.LOCAL_ADDRESS)
continue;
unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS;
- EmitSwaitcnt |= ScoreBrackets->updateByWait(
+ EmitWaitcnt |= ScoreBrackets->updateByWait(
VM_CNT, ScoreBrackets->getRegScore(RegNo, VM_CNT));
- EmitSwaitcnt |= ScoreBrackets->updateByWait(
+ EmitWaitcnt |= ScoreBrackets->updateByWait(
EXP_CNT, ScoreBrackets->getRegScore(RegNo, EXP_CNT));
}
}
@@ -1012,38 +1102,35 @@ MachineInstr *SIInsertWaitcnts::generateSWaitCntInstBefore(
ScoreBrackets->getRegInterval(&MI, TII, MRI, TRI, I, true);
for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
if (TRI->isVGPR(MRIA, Def.getReg())) {
- EmitSwaitcnt |= ScoreBrackets->updateByWait(
+ EmitWaitcnt |= ScoreBrackets->updateByWait(
VM_CNT, ScoreBrackets->getRegScore(RegNo, VM_CNT));
- EmitSwaitcnt |= ScoreBrackets->updateByWait(
+ EmitWaitcnt |= ScoreBrackets->updateByWait(
EXP_CNT, ScoreBrackets->getRegScore(RegNo, EXP_CNT));
}
- EmitSwaitcnt |= ScoreBrackets->updateByWait(
+ EmitWaitcnt |= ScoreBrackets->updateByWait(
LGKM_CNT, ScoreBrackets->getRegScore(RegNo, LGKM_CNT));
}
} // End of for loop that looks at all dest operands.
}
- // TODO: Tie force zero to a compiler triage option.
- bool ForceZero = false;
-
// Check to see if this is an S_BARRIER, and if an implicit S_WAITCNT 0
// occurs before the instruction. Doing it here prevents any additional
// S_WAITCNTs from being emitted if the instruction was marked as
// requiring a WAITCNT beforehand.
if (MI.getOpcode() == AMDGPU::S_BARRIER &&
!ST->hasAutoWaitcntBeforeBarrier()) {
- EmitSwaitcnt |=
+ EmitWaitcnt |=
ScoreBrackets->updateByWait(VM_CNT, ScoreBrackets->getScoreUB(VM_CNT));
- EmitSwaitcnt |= ScoreBrackets->updateByWait(
+ EmitWaitcnt |= ScoreBrackets->updateByWait(
EXP_CNT, ScoreBrackets->getScoreUB(EXP_CNT));
- EmitSwaitcnt |= ScoreBrackets->updateByWait(
+ EmitWaitcnt |= ScoreBrackets->updateByWait(
LGKM_CNT, ScoreBrackets->getScoreUB(LGKM_CNT));
}
// TODO: Remove this work-around, enable the assert for Bug 457939
// after fixing the scheduler. Also, the Shader Compiler code is
// independent of target.
- if (readsVCCZ(MI) && ST->getGeneration() <= SISubtarget::SEA_ISLANDS) {
+ if (readsVCCZ(MI) && ST->getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS) {
if (ScoreBrackets->getScoreLB(LGKM_CNT) <
ScoreBrackets->getScoreUB(LGKM_CNT) &&
ScoreBrackets->hasPendingSMEM()) {
@@ -1052,17 +1139,20 @@ MachineInstr *SIInsertWaitcnts::generateSWaitCntInstBefore(
// block, so if we only wait on LGKM here, we might end up with
// another s_waitcnt inserted right after this if there are non-LGKM
// instructions still outstanding.
- ForceZero = true;
- EmitSwaitcnt = true;
+ // FIXME: this is too conservative / the comment is wrong.
+ // We don't wait on everything at the end of the block and we combine
+ // waitcnts so we should never have back-to-back waitcnts.
+ ForceEmitZeroWaitcnt = true;
+ EmitWaitcnt = true;
}
}
// Does this operand processing indicate s_wait counter update?
- if (EmitSwaitcnt) {
+ if (EmitWaitcnt || IsForceEmitWaitcnt) {
int CntVal[NUM_INST_CNTS];
bool UseDefaultWaitcntStrategy = true;
- if (ForceZero) {
+ if (ForceEmitZeroWaitcnt || ForceEmitZeroWaitcnts) {
// Force all waitcnts to 0.
for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
T = (enum InstCounterType)(T + 1)) {
@@ -1077,7 +1167,7 @@ MachineInstr *SIInsertWaitcnts::generateSWaitCntInstBefore(
if (UseDefaultWaitcntStrategy) {
for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
T = (enum InstCounterType)(T + 1)) {
- if (EmitSwaitcnt & CNT_MASK(T)) {
+ if (EmitWaitcnt & CNT_MASK(T)) {
int Delta =
ScoreBrackets->getScoreUB(T) - ScoreBrackets->getScoreLB(T);
int MaxDelta = ScoreBrackets->getWaitCountMax(T);
@@ -1087,7 +1177,7 @@ MachineInstr *SIInsertWaitcnts::generateSWaitCntInstBefore(
ScoreBrackets->setScoreLB(
T, ScoreBrackets->getScoreUB(T) - MaxDelta);
}
- EmitSwaitcnt &= ~CNT_MASK(T);
+ EmitWaitcnt &= ~CNT_MASK(T);
}
CntVal[T] = Delta;
} else {
@@ -1099,10 +1189,11 @@ MachineInstr *SIInsertWaitcnts::generateSWaitCntInstBefore(
}
// If we are not waiting on any counter we can skip the wait altogether.
- if (EmitSwaitcnt != 0) {
+ if (EmitWaitcnt != 0 || IsForceEmitWaitcnt) {
MachineInstr *OldWaitcnt = ScoreBrackets->getWaitcnt();
int Imm = (!OldWaitcnt) ? 0 : OldWaitcnt->getOperand(0).getImm();
- if (!OldWaitcnt || (AMDGPU::decodeVmcnt(IV, Imm) !=
+ if (!OldWaitcnt ||
+ (AMDGPU::decodeVmcnt(IV, Imm) !=
(CntVal[VM_CNT] & AMDGPU::getVmcntBitMask(IV))) ||
(AMDGPU::decodeExpcnt(IV, Imm) !=
(CntVal[EXP_CNT] & AMDGPU::getExpcntBitMask(IV))) ||
@@ -1114,39 +1205,80 @@ MachineInstr *SIInsertWaitcnts::generateSWaitCntInstBefore(
BlockWaitcntBrackets *ScoreBracket =
BlockWaitcntBracketsMap[TBB].get();
if (!ScoreBracket) {
- assert(BlockVisitedSet.find(TBB) == BlockVisitedSet.end());
+ assert(!BlockVisitedSet.count(TBB));
BlockWaitcntBracketsMap[TBB] =
- llvm::make_unique<BlockWaitcntBrackets>();
+ llvm::make_unique<BlockWaitcntBrackets>(ST);
ScoreBracket = BlockWaitcntBracketsMap[TBB].get();
}
ScoreBracket->setRevisitLoop(true);
- DEBUG(dbgs() << "set-revisit: block"
- << ContainingLoop->getHeader()->getNumber() << '\n';);
+ LLVM_DEBUG(dbgs()
+ << "set-revisit2: Block"
+ << ContainingLoop->getHeader()->getNumber() << '\n';);
}
}
// Update an existing waitcount, or make a new one.
- MachineFunction &MF = *MI.getParent()->getParent();
- if (OldWaitcnt && OldWaitcnt->getOpcode() != AMDGPU::S_WAITCNT) {
- SWaitInst = OldWaitcnt;
- } else {
- SWaitInst = MF.CreateMachineInstr(TII->get(AMDGPU::S_WAITCNT),
- MI.getDebugLoc());
- CompilerGeneratedWaitcntSet.insert(SWaitInst);
- }
+ unsigned Enc = AMDGPU::encodeWaitcnt(IV,
+ ForceEmitWaitcnt[VM_CNT] ? 0 : CntVal[VM_CNT],
+ ForceEmitWaitcnt[EXP_CNT] ? 0 : CntVal[EXP_CNT],
+ ForceEmitWaitcnt[LGKM_CNT] ? 0 : CntVal[LGKM_CNT]);
+ // We don't remove waitcnts that existed prior to the waitcnt
+ // pass. Check if the waitcnt to-be-inserted can be avoided
+ // or if the prev waitcnt can be updated.
+ bool insertSWaitInst = true;
+ for (MachineBasicBlock::iterator I = MI.getIterator(),
+ B = MI.getParent()->begin();
+ insertSWaitInst && I != B; --I) {
+ if (I == MI.getIterator())
+ continue;
- const MachineOperand &Op =
- MachineOperand::CreateImm(AMDGPU::encodeWaitcnt(
- IV, CntVal[VM_CNT], CntVal[EXP_CNT], CntVal[LGKM_CNT]));
- SWaitInst->addOperand(MF, Op);
+ switch (I->getOpcode()) {
+ case AMDGPU::S_WAITCNT:
+ if (isWaitcntStronger(I->getOperand(0).getImm(), Enc))
+ insertSWaitInst = false;
+ else if (!OldWaitcnt) {
+ OldWaitcnt = &*I;
+ Enc = combineWaitcnt(I->getOperand(0).getImm(), Enc);
+ }
+ break;
+ // TODO: skip over instructions which never require wait.
+ }
+ break;
+ }
+ if (insertSWaitInst) {
+ if (OldWaitcnt && OldWaitcnt->getOpcode() == AMDGPU::S_WAITCNT) {
+ if (ForceEmitZeroWaitcnts)
+ LLVM_DEBUG(
+ dbgs()
+ << "Force emit s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)\n");
+ if (IsForceEmitWaitcnt)
+ LLVM_DEBUG(dbgs()
+ << "Force emit a s_waitcnt due to debug counter\n");
+
+ OldWaitcnt->getOperand(0).setImm(Enc);
+ if (!OldWaitcnt->getParent())
+ MI.getParent()->insert(MI, OldWaitcnt);
+
+ LLVM_DEBUG(dbgs() << "updateWaitcntInBlock\n"
+ << "Old Instr: " << MI << '\n'
+ << "New Instr: " << *OldWaitcnt << '\n');
+ } else {
+ auto SWaitInst = BuildMI(*MI.getParent(), MI.getIterator(),
+ MI.getDebugLoc(), TII->get(AMDGPU::S_WAITCNT))
+ .addImm(Enc);
+ TrackedWaitcntSet.insert(SWaitInst);
+
+ LLVM_DEBUG(dbgs() << "insertWaitcntInBlock\n"
+ << "Old Instr: " << MI << '\n'
+ << "New Instr: " << *SWaitInst << '\n');
+ }
+ }
if (CntVal[EXP_CNT] == 0) {
ScoreBrackets->setMixedExpTypes(false);
}
}
}
-
- return SWaitInst;
}
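
The insertion decision at the end of generateWaitcntInstBefore reduces to: if the instruction immediately before the insertion point is already an s_waitcnt, either skip the new wait (when the existing one is at least as strong) or strengthen it in place, and only otherwise build a fresh s_waitcnt. Below is a standalone sketch of that decision over a toy instruction list, using the same "every field is <=" notion of strength as isWaitcntStronger.

#include <algorithm>
#include <cstddef>
#include <vector>

struct Inst {
  bool IsWaitcnt = false;
  unsigned Vm = ~0u, Exp = ~0u, Lgkm = ~0u; // decoded fields when IsWaitcnt
};

// True if A waits at least as hard as B on every counter.
static bool stronger(const Inst &A, const Inst &B) {
  return A.Vm <= B.Vm && A.Exp <= B.Exp && A.Lgkm <= B.Lgkm;
}

// Place the wait Want before position Pos: reuse or strengthen an s_waitcnt
// that already sits there, otherwise insert a new one.
static void placeWait(std::vector<Inst> &Block, std::size_t Pos, Inst Want) {
  if (Pos > 0 && Block[Pos - 1].IsWaitcnt) {
    Inst &Prev = Block[Pos - 1];
    if (stronger(Prev, Want))
      return;                               // existing wait already covers us
    Prev.Vm = std::min(Prev.Vm, Want.Vm);   // strengthen the old wait in place
    Prev.Exp = std::min(Prev.Exp, Want.Exp);
    Prev.Lgkm = std::min(Prev.Lgkm, Want.Lgkm);
    return;
  }
  Want.IsWaitcnt = true;
  Block.insert(Block.begin() + static_cast<std::ptrdiff_t>(Pos), Want);
}

In the pass itself, freshly built waits are also recorded in TrackedWaitcntSet so a later walk over the block may delete and re-derive them, while waits that predate the pass are only ever updated, never removed.
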
void SIInsertWaitcnts::insertWaitcntBeforeCF(MachineBasicBlock &MBB,
@@ -1180,7 +1312,7 @@ bool SIInsertWaitcnts::mayAccessLDSThroughFlat(const MachineInstr &MI) const {
return false;
}
-void SIInsertWaitcnts::updateEventWaitCntAfter(
+void SIInsertWaitcnts::updateEventWaitcntAfter(
MachineInstr &Inst, BlockWaitcntBrackets *ScoreBrackets) {
// Now look at the instruction opcode. If it is a memory access
// instruction, update the upper-bound of the appropriate counter's
@@ -1214,7 +1346,7 @@ void SIInsertWaitcnts::updateEventWaitCntAfter(
Inst.getOpcode() != AMDGPU::BUFFER_WBINVL1_SC &&
Inst.getOpcode() != AMDGPU::BUFFER_WBINVL1_VOL) {
ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_ACCESS, Inst);
- if ( // TODO: assumed yes -- target_info->MemWriteNeedsExpWait() &&
+ if (ST->vmemWriteNeedsExpWaitcnt() &&
(Inst.mayStore() || AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1)) {
ScoreBrackets->updateByEvent(TII, TRI, MRI, VMW_GPR_LOCK, Inst);
}
@@ -1247,27 +1379,37 @@ void SIInsertWaitcnts::updateEventWaitCntAfter(
}
}
+// Merge the score brackets of the Block's predecessors;
+// this merged score bracket is used when adding waitcnts to the Block
void SIInsertWaitcnts::mergeInputScoreBrackets(MachineBasicBlock &Block) {
BlockWaitcntBrackets *ScoreBrackets = BlockWaitcntBracketsMap[&Block].get();
int32_t MaxPending[NUM_INST_CNTS] = {0};
int32_t MaxFlat[NUM_INST_CNTS] = {0};
bool MixedExpTypes = false;
- // Clear the score bracket state.
- ScoreBrackets->clear();
-
- // Compute the number of pending elements on block entry.
+ // For single basic block loops, we need to retain the Block's
+ // score bracket to have accurate Pred info. So, make a copy of Block's
+ // score bracket, clear() it (which retains several important bits of info),
+ // populate, and then replace en masse. For non-single basic block loops,
+ // just clear Block's current score bracket and repopulate in-place.
+ bool IsSelfPred;
+ std::unique_ptr<BlockWaitcntBrackets> S;
+
+ IsSelfPred = (std::find(Block.pred_begin(), Block.pred_end(), &Block))
+ != Block.pred_end();
+ if (IsSelfPred) {
+ S = llvm::make_unique<BlockWaitcntBrackets>(*ScoreBrackets);
+ ScoreBrackets = S.get();
+ }
- // IMPORTANT NOTE: If iterative handling of loops is added, the code will
- // need to handle single BBs with backedges to themselves. This means that
- // they will need to retain and not clear their initial state.
+ ScoreBrackets->clear();
// See if there are any uninitialized predecessors. If so, emit an
// s_waitcnt 0 at the beginning of the block.
- for (MachineBasicBlock *pred : Block.predecessors()) {
+ for (MachineBasicBlock *Pred : Block.predecessors()) {
BlockWaitcntBrackets *PredScoreBrackets =
- BlockWaitcntBracketsMap[pred].get();
- bool Visited = BlockVisitedSet.find(pred) != BlockVisitedSet.end();
+ BlockWaitcntBracketsMap[Pred].get();
+ bool Visited = BlockVisitedSet.count(Pred);
if (!Visited || PredScoreBrackets->getWaitAtBeginning()) {
continue;
}
@@ -1306,7 +1448,7 @@ void SIInsertWaitcnts::mergeInputScoreBrackets(MachineBasicBlock &Block) {
for (MachineBasicBlock *Pred : Block.predecessors()) {
BlockWaitcntBrackets *PredScoreBrackets =
BlockWaitcntBracketsMap[Pred].get();
- bool Visited = BlockVisitedSet.find(Pred) != BlockVisitedSet.end();
+ bool Visited = BlockVisitedSet.count(Pred);
if (!Visited || PredScoreBrackets->getWaitAtBeginning()) {
continue;
}
@@ -1354,7 +1496,7 @@ void SIInsertWaitcnts::mergeInputScoreBrackets(MachineBasicBlock &Block) {
// Set the register scoreboard.
for (MachineBasicBlock *Pred : Block.predecessors()) {
- if (BlockVisitedSet.find(Pred) == BlockVisitedSet.end()) {
+ if (!BlockVisitedSet.count(Pred)) {
continue;
}
@@ -1468,7 +1610,7 @@ void SIInsertWaitcnts::mergeInputScoreBrackets(MachineBasicBlock &Block) {
// sequencing predecessors, because changes to EXEC require waitcnts due to
// the delayed nature of these operations.
for (MachineBasicBlock *Pred : Block.predecessors()) {
- if (BlockVisitedSet.find(Pred) == BlockVisitedSet.end()) {
+ if (!BlockVisitedSet.count(Pred)) {
continue;
}
@@ -1496,17 +1638,36 @@ void SIInsertWaitcnts::mergeInputScoreBrackets(MachineBasicBlock &Block) {
}
}
}
+
+ // If this is a single-block loop, update the score brackets. Not needed for
+ // other blocks, as we did this in-place.
+ if (IsSelfPred) {
+ BlockWaitcntBracketsMap[&Block] = llvm::make_unique<BlockWaitcntBrackets>(*ScoreBrackets);
+ }
}
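
The self-predecessor case above is a copy-merge-replace pattern: when a block is one of its own predecessors (a single-block loop), the merge is built in a scratch copy so the block's previous-iteration state can still be read as a predecessor input, and the map entry is swapped at the very end. Here is a rough standalone sketch of the pattern, with a stand-in Brackets type and the stated assumption that every block already has a map entry.

#include <algorithm>
#include <map>
#include <memory>
#include <vector>

struct Brackets { int PendingVm = 0; /* ...per-counter state... */ };

struct Block { std::vector<Block *> Preds; };

// Assumes State already holds a Brackets entry for B and all of its preds.
static void mergeInputs(Block &B,
                        std::map<Block *, std::unique_ptr<Brackets>> &State) {
  Brackets *Out = State[&B].get();

  // If B feeds back into itself, build the merge in a scratch copy so the
  // old entry can still be read as one of the predecessor inputs.
  bool IsSelfPred =
      std::find(B.Preds.begin(), B.Preds.end(), &B) != B.Preds.end();
  std::unique_ptr<Brackets> Scratch;
  if (IsSelfPred) {
    Scratch = std::make_unique<Brackets>(*Out);
    Out = Scratch.get();
  }

  *Out = Brackets();                          // clear, then repopulate
  for (Block *P : B.Preds)
    Out->PendingVm = std::max(Out->PendingVm, State[P]->PendingVm);

  if (IsSelfPred)
    State[&B] = std::move(Scratch);           // replace en masse at the end
}
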
-/// Return the "bottom" block of a loop. This differs from
-/// MachineLoop::getBottomBlock in that it works even if the loop is
-/// discontiguous.
-MachineBasicBlock *SIInsertWaitcnts::loopBottom(const MachineLoop *Loop) {
- MachineBasicBlock *Bottom = Loop->getHeader();
- for (MachineBasicBlock *MBB : Loop->blocks())
- if (MBB->getNumber() > Bottom->getNumber())
- Bottom = MBB;
- return Bottom;
+/// Return true if the given basic block is a "bottom" block of a loop.
+/// This works even if the loop is discontiguous. This also handles
+/// multiple back-edges for the same "header" block of a loop.
+bool SIInsertWaitcnts::isLoopBottom(const MachineLoop *Loop,
+ const MachineBasicBlock *Block) {
+ for (MachineBasicBlock *MBB : Loop->blocks()) {
+ if (MBB == Block && MBB->isSuccessor(Loop->getHeader())) {
+ return true;
+ }
+ }
+ return false;
+}
+
+/// Count the number of "bottom" basic blocks of a loop.
+unsigned SIInsertWaitcnts::countNumBottomBlocks(const MachineLoop *Loop) {
+ unsigned Count = 0;
+ for (MachineBasicBlock *MBB : Loop->blocks()) {
+ if (MBB->isSuccessor(Loop->getHeader())) {
+ Count++;
+ }
+ }
+ return Count;
}
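
In other words, a bottom block is any block of the loop with a back-edge to the header, so a loop with several back-edges has several bottoms; the revisit budget later in the pass is sized from this count. Here is a toy CFG version of the two helpers, with plain successor lists standing in for MachineBasicBlock.

#include <algorithm>
#include <vector>

struct MBB {
  int Number;
  std::vector<MBB *> Succs;
  bool isSuccessor(const MBB *B) const {
    return std::find(Succs.begin(), Succs.end(), B) != Succs.end();
  }
};

struct Loop {
  MBB *Header;
  std::vector<MBB *> Blocks;   // all blocks of the loop, header included
};

// A block is a "bottom" of the loop if it branches back to the header.
static bool isLoopBottom(const Loop &L, const MBB *Block) {
  for (const MBB *B : L.Blocks)
    if (B == Block && B->isSuccessor(L.Header))
      return true;
  return false;
}

static unsigned countBottomBlocks(const Loop &L) {
  unsigned Count = 0;
  for (const MBB *B : L.Blocks)
    if (B->isSuccessor(L.Header))
      ++Count;
  return Count;
}
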
// Generate s_waitcnt instructions where needed.
@@ -1517,8 +1678,8 @@ void SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
BlockWaitcntBrackets *ScoreBrackets = BlockWaitcntBracketsMap[&Block].get();
- DEBUG({
- dbgs() << "Block" << Block.getNumber();
+ LLVM_DEBUG({
+ dbgs() << "*** Block" << Block.getNumber() << " ***";
ScoreBrackets->dump();
});
@@ -1528,16 +1689,16 @@ void SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
MachineInstr &Inst = *Iter;
// Remove any previously existing waitcnts.
if (Inst.getOpcode() == AMDGPU::S_WAITCNT) {
- // TODO: Register the old waitcnt and optimize the following waitcnts.
- // Leaving the previously existing waitcnts is conservatively correct.
- if (CompilerGeneratedWaitcntSet.find(&Inst) ==
- CompilerGeneratedWaitcntSet.end())
+ // Leave pre-existing waitcnts, but note their existence via setWaitcnt.
+ // Remove the waitcnt-pass-generated waitcnts; the pass will add them back
+ // as needed.
+ if (!TrackedWaitcntSet.count(&Inst))
++Iter;
else {
- ScoreBrackets->setWaitcnt(&Inst);
++Iter;
Inst.removeFromParent();
}
+ ScoreBrackets->setWaitcnt(&Inst);
continue;
}
@@ -1550,29 +1711,20 @@ void SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
bool VCCZBugWorkAround = false;
if (readsVCCZ(Inst) &&
- (VCCZBugHandledSet.find(&Inst) == VCCZBugHandledSet.end())) {
+ (!VCCZBugHandledSet.count(&Inst))) {
if (ScoreBrackets->getScoreLB(LGKM_CNT) <
ScoreBrackets->getScoreUB(LGKM_CNT) &&
ScoreBrackets->hasPendingSMEM()) {
- if (ST->getGeneration() <= SISubtarget::SEA_ISLANDS)
+ if (ST->getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS)
VCCZBugWorkAround = true;
}
}
// Generate an s_waitcnt instruction to be placed before
// cur_Inst, if needed.
- MachineInstr *SWaitInst = generateSWaitCntInstBefore(Inst, ScoreBrackets);
-
- if (SWaitInst) {
- Block.insert(Inst, SWaitInst);
- if (ScoreBrackets->getWaitcnt() != SWaitInst) {
- DEBUG(dbgs() << "insertWaitcntInBlock\n"
- << "Old Instr: " << Inst << '\n'
- << "New Instr: " << *SWaitInst << '\n';);
- }
- }
+ generateWaitcntInstBefore(Inst, ScoreBrackets);
- updateEventWaitCntAfter(Inst, ScoreBrackets);
+ updateEventWaitcntAfter(Inst, ScoreBrackets);
#if 0 // TODO: implement resource type check controlled by options with ub = LB.
// If this instruction generates a S_SETVSKIP because it is an
@@ -1587,10 +1739,7 @@ void SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
ScoreBrackets->clearWaitcnt();
- if (SWaitInst) {
- DEBUG({ SWaitInst->print(dbgs() << '\n'); });
- }
- DEBUG({
+ LLVM_DEBUG({
Inst.print(dbgs());
ScoreBrackets->dump();
});
@@ -1627,21 +1776,22 @@ void SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
// Check if we need to force convergence at loop footer.
MachineLoop *ContainingLoop = MLI->getLoopFor(&Block);
- if (ContainingLoop && loopBottom(ContainingLoop) == &Block) {
+ if (ContainingLoop && isLoopBottom(ContainingLoop, &Block)) {
LoopWaitcntData *WaitcntData = LoopWaitcntDataMap[ContainingLoop].get();
WaitcntData->print();
- DEBUG(dbgs() << '\n';);
+ LLVM_DEBUG(dbgs() << '\n';);
// The iterative waitcnt insertion algorithm aims for optimal waitcnt
- // placement and doesn't always guarantee convergence for a loop. Each
- // loop should take at most 2 iterations for it to converge naturally.
- // When this max is reached and result doesn't converge, we force
- // convergence by inserting a s_waitcnt at the end of loop footer.
- if (WaitcntData->getIterCnt() > 2) {
+ // placement, but doesn't guarantee convergence for a loop. Each
+ // loop should take at most (n+1) iterations for it to converge naturally,
+ // where n is the number of bottom blocks. If this threshold is reached and
+ // the result hasn't converged, then we force convergence by inserting
+ // an s_waitcnt at the end of the loop footer.
+ if (WaitcntData->getIterCnt() > (countNumBottomBlocks(ContainingLoop) + 1)) {
// To ensure convergence, need to make wait events at loop footer be no
// more than those from the previous iteration.
- // As a simplification, Instead of tracking individual scores and
- // generate the precise wait count, just wait on 0.
+ // As a simplification, instead of tracking individual scores and
+ // generating the precise wait count, just wait on 0.
bool HasPending = false;
MachineInstr *SWaitInst = WaitcntData->getWaitcnt();
for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
@@ -1649,16 +1799,16 @@ void SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
if (ScoreBrackets->getScoreUB(T) > ScoreBrackets->getScoreLB(T)) {
ScoreBrackets->setScoreLB(T, ScoreBrackets->getScoreUB(T));
HasPending = true;
+ break;
}
}
if (HasPending) {
if (!SWaitInst) {
- SWaitInst = Block.getParent()->CreateMachineInstr(
- TII->get(AMDGPU::S_WAITCNT), DebugLoc());
- CompilerGeneratedWaitcntSet.insert(SWaitInst);
- const MachineOperand &Op = MachineOperand::CreateImm(0);
- SWaitInst->addOperand(MF, Op);
+ SWaitInst = BuildMI(Block, Block.getFirstNonPHI(),
+ DebugLoc(), TII->get(AMDGPU::S_WAITCNT))
+ .addImm(0);
+ TrackedWaitcntSet.insert(SWaitInst);
#if 0 // TODO: Format the debug output
OutputTransformBanner("insertWaitcntInBlock",0,"Create:",context);
OutputTransformAdd(SWaitInst, context);
@@ -1670,7 +1820,7 @@ void SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
}
if (SWaitInst) {
- DEBUG({
+ LLVM_DEBUG({
SWaitInst->print(dbgs());
dbgs() << "\nAdjusted score board:";
ScoreBrackets->dump();
@@ -1678,7 +1828,7 @@ void SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
// Add this waitcnt to the block. It is either newly created or
// created in previous iterations and added back since block traversal
- // always remove waitcnt.
+ // always removes waitcnts.
insertWaitcntBeforeCF(Block, SWaitInst);
WaitcntData->setWaitcnt(SWaitInst);
}
@@ -1687,7 +1837,7 @@ void SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
}
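
The footer handling above amounts to a budgeted fixed-point iteration: a loop may be re-walked once per bottom block plus one, and if its merged wait-count state still changes after that, the pass stops iterating and pins a conservative s_waitcnt 0 at the footer. The toy model below captures only that control flow; the convergence test is a stand-in, not the real score-bracket comparison.

#include <cstdio>

// Re-walk a loop until its state stops changing, but never more than
// NumBottomBlocks + 1 times; past that, force the conservative answer.
static unsigned processLoop(unsigned NumBottomBlocks,
                            unsigned WalksUntilStable) {
  const unsigned Budget = NumBottomBlocks + 1;
  unsigned Iter = 0;
  while (true) {
    ++Iter;
    if (Iter >= WalksUntilStable) {          // state stopped changing
      std::printf("converged after %u walks\n", Iter);
      return Iter;
    }
    if (Iter > Budget) {                     // give up: emit s_waitcnt 0
      std::printf("forced convergence after %u walks\n", Iter);
      return Iter;
    }
  }
}

int main() {
  processLoop(/*NumBottomBlocks=*/1, /*WalksUntilStable=*/2); // converges
  processLoop(/*NumBottomBlocks=*/1, /*WalksUntilStable=*/9); // gets forced
  return 0;
}
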
bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
- ST = &MF.getSubtarget<SISubtarget>();
+ ST = &MF.getSubtarget<GCNSubtarget>();
TII = ST->getInstrInfo();
TRI = &TII->getRegisterInfo();
MRI = &MF.getRegInfo();
@@ -1696,6 +1846,11 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
AMDGPUASI = ST->getAMDGPUAS();
+ ForceEmitZeroWaitcnts = ForceEmitZeroFlag;
+ for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
+ T = (enum InstCounterType)(T + 1))
+ ForceEmitWaitcnt[T] = false;
+
HardwareLimits.VmcntMax = AMDGPU::getVmcntBitMask(IV);
HardwareLimits.ExpcntMax = AMDGPU::getExpcntBitMask(IV);
HardwareLimits.LgkmcntMax = AMDGPU::getLgkmcntBitMask(IV);
@@ -1712,6 +1867,12 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
RegisterEncoding.SGPRL =
RegisterEncoding.SGPR0 + HardwareLimits.NumSGPRsMax - 1;
+ TrackedWaitcntSet.clear();
+ BlockVisitedSet.clear();
+ VCCZBugHandledSet.clear();
+ LoopWaitcntDataMap.clear();
+ BlockWaitcntProcessedSet.clear();
+
// Walk over the blocks in reverse post-dominator order, inserting
// s_waitcnt where needed.
ReversePostOrderTraversal<MachineFunction *> RPOT(&MF);
@@ -1726,7 +1887,7 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
BlockWaitcntBrackets *ScoreBrackets = BlockWaitcntBracketsMap[&MBB].get();
if (!ScoreBrackets) {
- BlockWaitcntBracketsMap[&MBB] = llvm::make_unique<BlockWaitcntBrackets>();
+ BlockWaitcntBracketsMap[&MBB] = llvm::make_unique<BlockWaitcntBrackets>(ST);
ScoreBrackets = BlockWaitcntBracketsMap[&MBB].get();
}
ScoreBrackets->setPostOrder(MBB.getNumber());
@@ -1737,22 +1898,30 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
// If we are walking into the block from before the loop, then guarantee
// at least 1 re-walk over the loop to propagate the information, even if
// no S_WAITCNT instructions were generated.
- if (ContainingLoop && ContainingLoop->getHeader() == &MBB && J < I &&
- (BlockWaitcntProcessedSet.find(&MBB) ==
- BlockWaitcntProcessedSet.end())) {
- BlockWaitcntBracketsMap[&MBB]->setRevisitLoop(true);
- DEBUG(dbgs() << "set-revisit: block"
- << ContainingLoop->getHeader()->getNumber() << '\n';);
+ if (ContainingLoop && ContainingLoop->getHeader() == &MBB) {
+ unsigned Count = countNumBottomBlocks(ContainingLoop);
+
+ // If the loop has multiple back-edges, and so more than one "bottom"
+ // basic block, we have to guarantee a re-walk over every block.
+ if ((std::count(BlockWaitcntProcessedSet.begin(),
+ BlockWaitcntProcessedSet.end(), &MBB) < (int)Count)) {
+ BlockWaitcntBracketsMap[&MBB]->setRevisitLoop(true);
+ LLVM_DEBUG(dbgs() << "set-revisit1: Block"
+ << ContainingLoop->getHeader()->getNumber() << '\n';);
+ }
}
// Walk over the instructions.
insertWaitcntInBlock(MF, MBB);
- // Flag that waitcnts have been processed at least once.
- BlockWaitcntProcessedSet.insert(&MBB);
+ // Record that waitcnts have been processed at least once for this block.
+ BlockWaitcntProcessedSet.push_back(&MBB);
- // See if we want to revisit the loop.
- if (ContainingLoop && loopBottom(ContainingLoop) == &MBB) {
+ // See if we want to revisit the loop. If a loop has multiple back-edges,
+ // we shouldn't revisit the same "bottom" basic block.
+ if (ContainingLoop && isLoopBottom(ContainingLoop, &MBB) &&
+ std::count(BlockWaitcntProcessedSet.begin(),
+ BlockWaitcntProcessedSet.end(), &MBB) == 1) {
MachineBasicBlock *EntryBB = ContainingLoop->getHeader();
BlockWaitcntBrackets *EntrySB = BlockWaitcntBracketsMap[EntryBB].get();
if (EntrySB && EntrySB->getRevisitLoop()) {
@@ -1772,7 +1941,7 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
}
LoopWaitcntData *WaitcntData = LoopWaitcntDataMap[ContainingLoop].get();
WaitcntData->incIterCnt();
- DEBUG(dbgs() << "revisit: block" << EntryBB->getNumber() << '\n';);
+ LLVM_DEBUG(dbgs() << "revisit: Block" << EntryBB->getNumber() << '\n';);
continue;
} else {
LoopWaitcntData *WaitcntData = LoopWaitcntDataMap[ContainingLoop].get();
@@ -1837,7 +2006,7 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
if (!MFI->isEntryFunction()) {
// Wait for any outstanding memory operations that the input registers may
- // depend on. We can't track them and it's better to to the wait after the
+ // depend on. We can't track them and it's better to do the wait after the
// costly call sequence.
// TODO: Could insert earlier and schedule more liberally with operations
diff --git a/lib/Target/AMDGPU/SIInsertWaits.cpp b/lib/Target/AMDGPU/SIInsertWaits.cpp
deleted file mode 100644
index b074b95c2d3c..000000000000
--- a/lib/Target/AMDGPU/SIInsertWaits.cpp
+++ /dev/null
@@ -1,703 +0,0 @@
-//===- SILowerControlFlow.cpp - Use predicates for control flow -----------===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-/// \file
-/// \brief Insert wait instructions for memory reads and writes.
-///
-/// Memory reads and writes are issued asynchronously, so we need to insert
-/// S_WAITCNT instructions when we want to access any of their results or
-/// overwrite any register that's used asynchronously.
-//
-//===----------------------------------------------------------------------===//
-
-#include "AMDGPU.h"
-#include "AMDGPUSubtarget.h"
-#include "SIDefines.h"
-#include "SIInstrInfo.h"
-#include "SIMachineFunctionInfo.h"
-#include "SIRegisterInfo.h"
-#include "Utils/AMDGPUBaseInfo.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/StringRef.h"
-#include "llvm/CodeGen/MachineBasicBlock.h"
-#include "llvm/CodeGen/MachineFunction.h"
-#include "llvm/CodeGen/MachineFunctionPass.h"
-#include "llvm/CodeGen/MachineInstr.h"
-#include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/CodeGen/MachineOperand.h"
-#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/IR/DebugLoc.h"
-#include "llvm/MC/MCInstrDesc.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
-#include <algorithm>
-#include <cassert>
-#include <cstdint>
-#include <cstring>
-#include <utility>
-
-#define DEBUG_TYPE "si-insert-waits"
-
-using namespace llvm;
-
-namespace {
-
-/// \brief One variable for each of the hardware counters
-using Counters = union {
- struct {
- unsigned VM;
- unsigned EXP;
- unsigned LGKM;
- } Named;
- unsigned Array[3];
-};
-
-using InstType = enum {
- OTHER,
- SMEM,
- VMEM
-};
-
-using RegCounters = Counters[512];
-using RegInterval = std::pair<unsigned, unsigned>;
-
-class SIInsertWaits : public MachineFunctionPass {
-private:
- const SISubtarget *ST = nullptr;
- const SIInstrInfo *TII = nullptr;
- const SIRegisterInfo *TRI = nullptr;
- const MachineRegisterInfo *MRI;
- AMDGPU::IsaInfo::IsaVersion ISA;
-
- /// \brief Constant zero value
- static const Counters ZeroCounts;
-
- /// \brief Hardware limits
- Counters HardwareLimits;
-
- /// \brief Counter values we have already waited on.
- Counters WaitedOn;
-
- /// \brief Counter values that we must wait on before the next counter
- /// increase.
- Counters DelayedWaitOn;
-
- /// \brief Counter values for last instruction issued.
- Counters LastIssued;
-
- /// \brief Registers used by async instructions.
- RegCounters UsedRegs;
-
- /// \brief Registers defined by async instructions.
- RegCounters DefinedRegs;
-
- /// \brief Different export instruction types seen since last wait.
- unsigned ExpInstrTypesSeen = 0;
-
- /// \brief Type of the last opcode.
- InstType LastOpcodeType;
-
- bool LastInstWritesM0;
-
- /// Whether or not we have flat operations outstanding.
- bool IsFlatOutstanding;
-
- /// \brief Whether the machine function returns void
- bool ReturnsVoid;
-
- /// Whether the VCCZ bit is possibly corrupt
- bool VCCZCorrupt = false;
-
- /// \brief Get increment/decrement amount for this instruction.
- Counters getHwCounts(MachineInstr &MI);
-
- /// \brief Is operand relevant for async execution?
- bool isOpRelevant(MachineOperand &Op);
-
- /// \brief Get register interval an operand affects.
- RegInterval getRegInterval(const TargetRegisterClass *RC,
- const MachineOperand &Reg) const;
-
- /// \brief Handle instructions async components
- void pushInstruction(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator I,
- const Counters& Increment);
-
- /// \brief Insert the actual wait instruction
- bool insertWait(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator I,
- const Counters &Counts);
-
- /// \brief Handle existing wait instructions (from intrinsics)
- void handleExistingWait(MachineBasicBlock::iterator I);
-
- /// \brief Do we need def2def checks?
- bool unorderedDefines(MachineInstr &MI);
-
- /// \brief Resolve all operand dependencies to counter requirements
- Counters handleOperands(MachineInstr &MI);
-
- /// \brief Insert S_NOP between an instruction writing M0 and S_SENDMSG.
- void handleSendMsg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I);
-
- /// Return true if there are LGKM instrucitons that haven't been waited on
- /// yet.
- bool hasOutstandingLGKM() const;
-
-public:
- static char ID;
-
- SIInsertWaits() : MachineFunctionPass(ID) {}
-
- bool runOnMachineFunction(MachineFunction &MF) override;
-
- StringRef getPassName() const override {
- return "SI insert wait instructions";
- }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.setPreservesCFG();
- MachineFunctionPass::getAnalysisUsage(AU);
- }
-};
-
-} // end anonymous namespace
-
-INITIALIZE_PASS_BEGIN(SIInsertWaits, DEBUG_TYPE,
- "SI Insert Waits", false, false)
-INITIALIZE_PASS_END(SIInsertWaits, DEBUG_TYPE,
- "SI Insert Waits", false, false)
-
-char SIInsertWaits::ID = 0;
-
-char &llvm::SIInsertWaitsID = SIInsertWaits::ID;
-
-FunctionPass *llvm::createSIInsertWaitsPass() {
- return new SIInsertWaits();
-}
-
-const Counters SIInsertWaits::ZeroCounts = { { 0, 0, 0 } };
-
-static bool readsVCCZ(const MachineInstr &MI) {
- unsigned Opc = MI.getOpcode();
- return (Opc == AMDGPU::S_CBRANCH_VCCNZ || Opc == AMDGPU::S_CBRANCH_VCCZ) &&
- !MI.getOperand(1).isUndef();
-}
-
-bool SIInsertWaits::hasOutstandingLGKM() const {
- return WaitedOn.Named.LGKM != LastIssued.Named.LGKM;
-}
-
-Counters SIInsertWaits::getHwCounts(MachineInstr &MI) {
- uint64_t TSFlags = MI.getDesc().TSFlags;
- Counters Result = { { 0, 0, 0 } };
-
- Result.Named.VM = !!(TSFlags & SIInstrFlags::VM_CNT);
-
- // Only consider stores or EXP for EXP_CNT
- Result.Named.EXP = !!(TSFlags & SIInstrFlags::EXP_CNT) && MI.mayStore();
-
- // LGKM may uses larger values
- if (TSFlags & SIInstrFlags::LGKM_CNT) {
-
- if (TII->isSMRD(MI)) {
-
- if (MI.getNumOperands() != 0) {
- assert(MI.getOperand(0).isReg() &&
- "First LGKM operand must be a register!");
-
- // XXX - What if this is a write into a super register?
- const TargetRegisterClass *RC = TII->getOpRegClass(MI, 0);
- unsigned Size = TRI->getRegSizeInBits(*RC);
- Result.Named.LGKM = Size > 32 ? 2 : 1;
- } else {
- // s_dcache_inv etc. do not have a a destination register. Assume we
- // want a wait on these.
- // XXX - What is the right value?
- Result.Named.LGKM = 1;
- }
- } else {
- // DS
- Result.Named.LGKM = 1;
- }
-
- } else {
- Result.Named.LGKM = 0;
- }
-
- return Result;
-}
-
-bool SIInsertWaits::isOpRelevant(MachineOperand &Op) {
- // Constants are always irrelevant
- if (!Op.isReg() || !TRI->isInAllocatableClass(Op.getReg()))
- return false;
-
- // Defines are always relevant
- if (Op.isDef())
- return true;
-
- // For exports all registers are relevant.
- // TODO: Skip undef/disabled registers.
- MachineInstr &MI = *Op.getParent();
- if (TII->isEXP(MI))
- return true;
-
- // For stores the stored value is also relevant
- if (!MI.getDesc().mayStore())
- return false;
-
- // Check if this operand is the value being stored.
- // Special case for DS/FLAT instructions, since the address
- // operand comes before the value operand and it may have
- // multiple data operands.
-
- if (TII->isDS(MI)) {
- MachineOperand *Data0 = TII->getNamedOperand(MI, AMDGPU::OpName::data0);
- if (Data0 && Op.isIdenticalTo(*Data0))
- return true;
-
- MachineOperand *Data1 = TII->getNamedOperand(MI, AMDGPU::OpName::data1);
- return Data1 && Op.isIdenticalTo(*Data1);
- }
-
- if (TII->isFLAT(MI)) {
- MachineOperand *Data = TII->getNamedOperand(MI, AMDGPU::OpName::vdata);
- if (Data && Op.isIdenticalTo(*Data))
- return true;
- }
-
- // NOTE: This assumes that the value operand is before the
- // address operand, and that there is only one value operand.
- for (MachineInstr::mop_iterator I = MI.operands_begin(),
- E = MI.operands_end(); I != E; ++I) {
-
- if (I->isReg() && I->isUse())
- return Op.isIdenticalTo(*I);
- }
-
- return false;
-}
-
-RegInterval SIInsertWaits::getRegInterval(const TargetRegisterClass *RC,
- const MachineOperand &Reg) const {
- unsigned Size = TRI->getRegSizeInBits(*RC);
- assert(Size >= 32);
-
- RegInterval Result;
- Result.first = TRI->getEncodingValue(Reg.getReg());
- Result.second = Result.first + Size / 32;
-
- return Result;
-}
-
-void SIInsertWaits::pushInstruction(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator I,
- const Counters &Increment) {
- // Get the hardware counter increments and sum them up
- Counters Limit = ZeroCounts;
- unsigned Sum = 0;
-
- if (TII->mayAccessFlatAddressSpace(*I))
- IsFlatOutstanding = true;
-
- for (unsigned i = 0; i < 3; ++i) {
- LastIssued.Array[i] += Increment.Array[i];
- if (Increment.Array[i])
- Limit.Array[i] = LastIssued.Array[i];
- Sum += Increment.Array[i];
- }
-
- // If we don't increase anything then that's it
- if (Sum == 0) {
- LastOpcodeType = OTHER;
- return;
- }
-
- if (ST->getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
- // Any occurrence of consecutive VMEM or SMEM instructions forms a VMEM
- // or SMEM clause, respectively.
- //
- // The temporary workaround is to break the clauses with S_NOP.
- //
- // The proper solution would be to allocate registers such that all source
- // and destination registers don't overlap, e.g. this is illegal:
- // r0 = load r2
- // r2 = load r0
- if (LastOpcodeType == VMEM && Increment.Named.VM) {
- // Insert a NOP to break the clause.
- BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::S_NOP))
- .addImm(0);
- LastInstWritesM0 = false;
- }
-
- if (TII->isSMRD(*I))
- LastOpcodeType = SMEM;
- else if (Increment.Named.VM)
- LastOpcodeType = VMEM;
- }
-
- // Remember which export instructions we have seen
- if (Increment.Named.EXP) {
- ExpInstrTypesSeen |= TII->isEXP(*I) ? 1 : 2;
- }
-
- for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) {
- MachineOperand &Op = I->getOperand(i);
- if (!isOpRelevant(Op))
- continue;
-
- const TargetRegisterClass *RC = TII->getOpRegClass(*I, i);
- RegInterval Interval = getRegInterval(RC, Op);
- for (unsigned j = Interval.first; j < Interval.second; ++j) {
-
- // Remember which registers we define
- if (Op.isDef())
- DefinedRegs[j] = Limit;
-
- // and which one we are using
- if (Op.isUse())
- UsedRegs[j] = Limit;
- }
- }
-}
-
-bool SIInsertWaits::insertWait(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator I,
- const Counters &Required) {
- // End of program? No need to wait on anything
- // A function not returning void needs to wait, because other bytecode will
- // be appended after it and we don't know what it will be.
- if (I != MBB.end() && I->getOpcode() == AMDGPU::S_ENDPGM && ReturnsVoid)
- return false;
-
- // Figure out if the async instructions execute in order
- bool Ordered[3];
-
- // VM_CNT is always ordered except when there are flat instructions, which
- // can return out of order.
- Ordered[0] = !IsFlatOutstanding;
-
- // EXP_CNT is unordered if we have both EXP & VM-writes
- Ordered[1] = ExpInstrTypesSeen == 3;
-
- // LGKM_CNT is handled as always unordered. TODO: Handle LDS and GDS
- Ordered[2] = false;
-
- // The values we are going to put into the S_WAITCNT instruction
- Counters Counts = HardwareLimits;
-
- // Do we really need to wait?
- bool NeedWait = false;
-
- for (unsigned i = 0; i < 3; ++i) {
- if (Required.Array[i] <= WaitedOn.Array[i])
- continue;
-
- NeedWait = true;
-
- if (Ordered[i]) {
- unsigned Value = LastIssued.Array[i] - Required.Array[i];
-
- // Adjust the value to the real hardware possibilities.
- Counts.Array[i] = std::min(Value, HardwareLimits.Array[i]);
- } else
- Counts.Array[i] = 0;
-
- // Remember on what we have waited on.
- WaitedOn.Array[i] = LastIssued.Array[i] - Counts.Array[i];
- }
-
- if (!NeedWait)
- return false;
-
- // Reset EXP_CNT instruction types
- if (Counts.Named.EXP == 0)
- ExpInstrTypesSeen = 0;
-
- // Build the wait instruction
- BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT))
- .addImm(AMDGPU::encodeWaitcnt(ISA,
- Counts.Named.VM,
- Counts.Named.EXP,
- Counts.Named.LGKM));
-
- LastOpcodeType = OTHER;
- LastInstWritesM0 = false;
- IsFlatOutstanding = false;
- return true;
-}
-
-/// \brief helper function for handleOperands
-static void increaseCounters(Counters &Dst, const Counters &Src) {
- for (unsigned i = 0; i < 3; ++i)
- Dst.Array[i] = std::max(Dst.Array[i], Src.Array[i]);
-}
-
-/// \brief check whether any of the counters is non-zero
-static bool countersNonZero(const Counters &Counter) {
- for (unsigned i = 0; i < 3; ++i)
- if (Counter.Array[i])
- return true;
- return false;
-}
-
-void SIInsertWaits::handleExistingWait(MachineBasicBlock::iterator I) {
- assert(I->getOpcode() == AMDGPU::S_WAITCNT);
-
- unsigned Imm = I->getOperand(0).getImm();
- Counters Counts, WaitOn;
-
- Counts.Named.VM = AMDGPU::decodeVmcnt(ISA, Imm);
- Counts.Named.EXP = AMDGPU::decodeExpcnt(ISA, Imm);
- Counts.Named.LGKM = AMDGPU::decodeLgkmcnt(ISA, Imm);
-
- for (unsigned i = 0; i < 3; ++i) {
- if (Counts.Array[i] <= LastIssued.Array[i])
- WaitOn.Array[i] = LastIssued.Array[i] - Counts.Array[i];
- else
- WaitOn.Array[i] = 0;
- }
-
- increaseCounters(DelayedWaitOn, WaitOn);
-}
-
-Counters SIInsertWaits::handleOperands(MachineInstr &MI) {
- Counters Result = ZeroCounts;
-
- // For each register affected by this instruction increase the result
- // sequence.
- //
- // TODO: We could probably just look at explicit operands if we removed VCC /
- // EXEC from SMRD dest reg classes.
- for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
- MachineOperand &Op = MI.getOperand(i);
- if (!Op.isReg() || !TRI->isInAllocatableClass(Op.getReg()))
- continue;
-
- const TargetRegisterClass *RC = TII->getOpRegClass(MI, i);
- RegInterval Interval = getRegInterval(RC, Op);
- for (unsigned j = Interval.first; j < Interval.second; ++j) {
- if (Op.isDef()) {
- increaseCounters(Result, UsedRegs[j]);
- increaseCounters(Result, DefinedRegs[j]);
- }
-
- if (Op.isUse())
- increaseCounters(Result, DefinedRegs[j]);
- }
- }
-
- return Result;
-}
-
-void SIInsertWaits::handleSendMsg(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator I) {
- if (ST->getGeneration() < SISubtarget::VOLCANIC_ISLANDS)
- return;
-
- // There must be "S_NOP 0" between an instruction writing M0 and S_SENDMSG.
- if (LastInstWritesM0 && (I->getOpcode() == AMDGPU::S_SENDMSG || I->getOpcode() == AMDGPU::S_SENDMSGHALT)) {
- BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::S_NOP)).addImm(0);
- LastInstWritesM0 = false;
- return;
- }
-
- // Set whether this instruction sets M0
- LastInstWritesM0 = false;
-
- unsigned NumOperands = I->getNumOperands();
- for (unsigned i = 0; i < NumOperands; i++) {
- const MachineOperand &Op = I->getOperand(i);
-
- if (Op.isReg() && Op.isDef() && Op.getReg() == AMDGPU::M0)
- LastInstWritesM0 = true;
- }
-}
-
-/// Return true if \p MBB has one successor immediately following, and is its
-/// only predecessor
-static bool hasTrivialSuccessor(const MachineBasicBlock &MBB) {
- if (MBB.succ_size() != 1)
- return false;
-
- const MachineBasicBlock *Succ = *MBB.succ_begin();
- return (Succ->pred_size() == 1) && MBB.isLayoutSuccessor(Succ);
-}
-
-// FIXME: Insert waits listed in Table 4.2 "Required User-Inserted Wait States"
-// around other non-memory instructions.
-bool SIInsertWaits::runOnMachineFunction(MachineFunction &MF) {
- bool Changes = false;
-
- ST = &MF.getSubtarget<SISubtarget>();
- TII = ST->getInstrInfo();
- TRI = &TII->getRegisterInfo();
- MRI = &MF.getRegInfo();
- ISA = AMDGPU::IsaInfo::getIsaVersion(ST->getFeatureBits());
- const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
-
- HardwareLimits.Named.VM = AMDGPU::getVmcntBitMask(ISA);
- HardwareLimits.Named.EXP = AMDGPU::getExpcntBitMask(ISA);
- HardwareLimits.Named.LGKM = AMDGPU::getLgkmcntBitMask(ISA);
-
- WaitedOn = ZeroCounts;
- DelayedWaitOn = ZeroCounts;
- LastIssued = ZeroCounts;
- LastOpcodeType = OTHER;
- LastInstWritesM0 = false;
- IsFlatOutstanding = false;
- ReturnsVoid = MFI->returnsVoid();
-
- memset(&UsedRegs, 0, sizeof(UsedRegs));
- memset(&DefinedRegs, 0, sizeof(DefinedRegs));
-
- SmallVector<MachineInstr *, 4> RemoveMI;
- SmallVector<MachineBasicBlock *, 4> EndPgmBlocks;
-
- bool HaveScalarStores = false;
-
- for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
- BI != BE; ++BI) {
- MachineBasicBlock &MBB = *BI;
-
- for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
- I != E; ++I) {
- if (!HaveScalarStores && TII->isScalarStore(*I))
- HaveScalarStores = true;
-
- if (ST->getGeneration() <= SISubtarget::SEA_ISLANDS) {
- // There is a hardware bug on CI/SI where SMRD instruction may corrupt
- // vccz bit, so when we detect that an instruction may read from a
- // corrupt vccz bit, we need to:
- // 1. Insert s_waitcnt lgkm(0) to wait for all outstanding SMRD operations to
- // complete.
- // 2. Restore the correct value of vccz by writing the current value
- // of vcc back to vcc.
-
- if (TII->isSMRD(I->getOpcode())) {
- VCCZCorrupt = true;
- } else if (!hasOutstandingLGKM() && I->modifiesRegister(AMDGPU::VCC, TRI)) {
- // FIXME: We only care about SMRD instructions here, not LDS or GDS.
- // Whenever we store a value in vcc, the correct value of vccz is
- // restored.
- VCCZCorrupt = false;
- }
-
- // Check if we need to apply the bug work-around
- if (VCCZCorrupt && readsVCCZ(*I)) {
- DEBUG(dbgs() << "Inserting vccz bug work-around before: " << *I << '\n');
-
- // Wait on everything, not just LGKM. vccz reads usually come from
- // terminators, and we always wait on everything at the end of the
- // block, so if we only wait on LGKM here, we might end up with
- // another s_waitcnt inserted right after this if there are non-LGKM
- // instructions still outstanding.
- insertWait(MBB, I, LastIssued);
-
- // Restore the vccz bit. Any time a value is written to vcc, the vcc
- // bit is updated, so we can restore the bit by reading the value of
- // vcc and then writing it back to the register.
- BuildMI(MBB, I, I->getDebugLoc(), TII->get(AMDGPU::S_MOV_B64),
- AMDGPU::VCC)
- .addReg(AMDGPU::VCC);
- }
- }
-
- // Record pre-existing, explicitly requested waits
- if (I->getOpcode() == AMDGPU::S_WAITCNT) {
- handleExistingWait(*I);
- RemoveMI.push_back(&*I);
- continue;
- }
-
- Counters Required;
-
- // Wait for everything before a barrier.
- //
- // S_SENDMSG implicitly waits for all outstanding LGKM transfers to finish,
- // but we also want to wait for any other outstanding transfers before
- // signalling other hardware blocks
- if ((I->getOpcode() == AMDGPU::S_BARRIER &&
- !ST->hasAutoWaitcntBeforeBarrier()) ||
- I->getOpcode() == AMDGPU::S_SENDMSG ||
- I->getOpcode() == AMDGPU::S_SENDMSGHALT)
- Required = LastIssued;
- else
- Required = handleOperands(*I);
-
- Counters Increment = getHwCounts(*I);
-
- if (countersNonZero(Required) || countersNonZero(Increment))
- increaseCounters(Required, DelayedWaitOn);
-
- Changes |= insertWait(MBB, I, Required);
-
- pushInstruction(MBB, I, Increment);
- handleSendMsg(MBB, I);
-
- if (I->getOpcode() == AMDGPU::S_ENDPGM ||
- I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG)
- EndPgmBlocks.push_back(&MBB);
- }
-
- // Wait for everything at the end of the MBB. If there is only one
- // successor, we can defer this until the uses there.
- if (!hasTrivialSuccessor(MBB))
- Changes |= insertWait(MBB, MBB.getFirstTerminator(), LastIssued);
- }
-
- if (HaveScalarStores) {
- // If scalar writes are used, the cache must be flushed or else the next
- // wave to reuse the same scratch memory can be clobbered.
- //
- // Insert s_dcache_wb at wave termination points if there were any scalar
- // stores, and only if the cache hasn't already been flushed. This could be
- // improved by looking across blocks for flushes in postdominating blocks
- // from the stores but an explicitly requested flush is probably very rare.
- for (MachineBasicBlock *MBB : EndPgmBlocks) {
- bool SeenDCacheWB = false;
-
- for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end();
- I != E; ++I) {
- if (I->getOpcode() == AMDGPU::S_DCACHE_WB)
- SeenDCacheWB = true;
- else if (TII->isScalarStore(*I))
- SeenDCacheWB = false;
-
- // FIXME: It would be better to insert this before a waitcnt if any.
- if ((I->getOpcode() == AMDGPU::S_ENDPGM ||
- I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) && !SeenDCacheWB) {
- Changes = true;
- BuildMI(*MBB, I, I->getDebugLoc(), TII->get(AMDGPU::S_DCACHE_WB));
- }
- }
- }
- }
-
- for (MachineInstr *I : RemoveMI)
- I->eraseFromParent();
-
- if (!MFI->isEntryFunction()) {
- // Wait for any outstanding memory operations that the input registers may
- // depend on. We can't track them and it's better to to the wait after the
- // costly call sequence.
-
- // TODO: Could insert earlier and schedule more liberally with operations
- // that only use caller preserved registers.
- MachineBasicBlock &EntryBB = MF.front();
- BuildMI(EntryBB, EntryBB.getFirstNonPHI(), DebugLoc(), TII->get(AMDGPU::S_WAITCNT))
- .addImm(0);
-
- Changes = true;
- }
-
- return Changes;
-}
diff --git a/lib/Target/AMDGPU/SIInstrFormats.td b/lib/Target/AMDGPU/SIInstrFormats.td
index 25917cc06e6a..b73d30940fc3 100644
--- a/lib/Target/AMDGPU/SIInstrFormats.td
+++ b/lib/Target/AMDGPU/SIInstrFormats.td
@@ -12,16 +12,16 @@
//===----------------------------------------------------------------------===//
def isGCN : Predicate<"Subtarget->getGeneration() "
- ">= SISubtarget::SOUTHERN_ISLANDS">,
+ ">= AMDGPUSubtarget::SOUTHERN_ISLANDS">,
AssemblerPredicate<"FeatureGCN">;
def isSI : Predicate<"Subtarget->getGeneration() "
- "== SISubtarget::SOUTHERN_ISLANDS">,
+ "== AMDGPUSubtarget::SOUTHERN_ISLANDS">,
AssemblerPredicate<"FeatureSouthernIslands">;
class InstSI <dag outs, dag ins, string asm = "",
list<dag> pattern = []> :
- AMDGPUInst<outs, ins, asm, pattern>, PredicateControl {
+ AMDGPUInst<outs, ins, asm, pattern>, GCNPredicateControl {
let SubtargetPredicate = isGCN;
// Low bits - basic encoding information.
@@ -118,6 +118,9 @@ class InstSI <dag outs, dag ins, string asm = "",
// This bit indicates that this is a packed VOP3P instruction
field bit IsPacked = 0;
+ // This bit indicates that this is a D16 buffer instruction.
+ field bit D16Buf = 0;
+
// These need to be kept in sync with the enum in SIInstrFlags.
let TSFlags{0} = SALU;
let TSFlags{1} = VALU;
@@ -173,6 +176,8 @@ class InstSI <dag outs, dag ins, string asm = "",
let TSFlags{49} = IsPacked;
+ let TSFlags{50} = D16Buf;
+
let SchedRW = [Write32Bit];
field bits<1> DisableSIDecoder = 0;
@@ -181,6 +186,9 @@ class InstSI <dag outs, dag ins, string asm = "",
let isAsmParserOnly = !if(!eq(DisableDecoder{0}, {0}), 0, 1);
let AsmVariantName = AMDGPUAsmVariants.Default;
+
+ // Avoid changing source registers in a way that violates constant bus read limitations.
+ let hasExtraSrcRegAllocReq = !if(VOP1,1,!if(VOP2,1,!if(VOP3,1,!if(VOPC,1,!if(SDWA,1, !if(VALU,1,0))))));
}
class PseudoInstSI<dag outs, dag ins, list<dag> pattern = [], string asm = "">
@@ -247,6 +255,7 @@ class MIMGe <bits<7> op> : Enc64 {
bits<1> tfe;
bits<1> lwe;
bits<1> slc;
+ bit d16;
bits<8> vaddr;
bits<7> srsrc;
bits<7> ssamp;
@@ -265,6 +274,7 @@ class MIMGe <bits<7> op> : Enc64 {
let Inst{47-40} = vdata;
let Inst{52-48} = srsrc{6-2};
let Inst{57-53} = ssamp{6-2};
+ let Inst{63} = d16;
}
class EXPe : Enc64 {
@@ -309,6 +319,7 @@ class VINTRPCommon <dag outs, dag ins, string asm, list<dag> pattern> :
let mayLoad = 0;
let mayStore = 0;
let hasSideEffects = 0;
+ let VALU = 1;
}
class EXPCommon<dag outs, dag ins, string asm, list<dag> pattern> :
@@ -323,15 +334,3 @@ class EXPCommon<dag outs, dag ins, string asm, list<dag> pattern> :
}
} // End Uses = [EXEC]
-
-class MIMG <dag outs, dag ins, string asm, list<dag> pattern> :
- InstSI <outs, ins, asm, pattern> {
-
- let VM_CNT = 1;
- let EXP_CNT = 1;
- let MIMG = 1;
- let Uses = [EXEC];
-
- let UseNamedOperandTable = 1;
- let hasSideEffects = 0; // XXX ????
-}
diff --git a/lib/Target/AMDGPU/SIInstrInfo.cpp b/lib/Target/AMDGPU/SIInstrInfo.cpp
index 61967605432e..6c85c92454c3 100644
--- a/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -8,17 +8,19 @@
//===----------------------------------------------------------------------===//
//
/// \file
-/// \brief SI Implementation of TargetInstrInfo.
+/// SI Implementation of TargetInstrInfo.
//
//===----------------------------------------------------------------------===//
#include "SIInstrInfo.h"
#include "AMDGPU.h"
+#include "AMDGPUIntrinsicInfo.h"
#include "AMDGPUSubtarget.h"
#include "GCNHazardRecognizer.h"
#include "SIDefines.h"
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
@@ -37,7 +39,6 @@
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/CodeGen/MachineValueType.h"
#include "llvm/CodeGen/RegisterScavenging.h"
#include "llvm/CodeGen/ScheduleDAG.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
@@ -53,6 +54,7 @@
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MachineValueType.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Target/TargetMachine.h"
#include <cassert>
@@ -62,6 +64,19 @@
using namespace llvm;
+#define GET_INSTRINFO_CTOR_DTOR
+#include "AMDGPUGenInstrInfo.inc"
+
+namespace llvm {
+namespace AMDGPU {
+#define GET_D16ImageDimIntrinsics_IMPL
+#define GET_ImageDimIntrinsicTable_IMPL
+#define GET_RsrcIntrinsics_IMPL
+#include "AMDGPUGenSearchableTables.inc"
+}
+}
+
+
// Must be at least 4 to be able to branch over minimum unconditional branch
// code. This is only for making it possible to write reasonably small tests for
// long branches.
@@ -69,8 +84,9 @@ static cl::opt<unsigned>
BranchOffsetBits("amdgpu-s-branch-bits", cl::ReallyHidden, cl::init(16),
cl::desc("Restrict range of branch instructions (DEBUG)"));
-SIInstrInfo::SIInstrInfo(const SISubtarget &ST)
- : AMDGPUInstrInfo(ST), RI(ST), ST(ST) {}
+SIInstrInfo::SIInstrInfo(const GCNSubtarget &ST)
+ : AMDGPUGenInstrInfo(AMDGPU::ADJCALLSTACKUP, AMDGPU::ADJCALLSTACKDOWN),
+ RI(ST), ST(ST) {}
//===----------------------------------------------------------------------===//
// TargetInstrInfo callbacks
@@ -89,7 +105,7 @@ static SDValue findChainOperand(SDNode *Load) {
return LastOp;
}
-/// \brief Returns true if both nodes have the same value for the given
+/// Returns true if both nodes have the same value for the given
/// operand \p Op, or if both nodes do not have this operand.
static bool nodesHaveSameOperandValue(SDNode *N0, SDNode* N1, unsigned OpName) {
unsigned Opc0 = N0->getMachineOpcode();
@@ -437,6 +453,28 @@ bool SIInstrInfo::shouldClusterMemOps(MachineInstr &FirstLdSt,
return (NumLoads * (RI.getRegSizeInBits(*DstRC) / 8)) <= LoadClusterThreshold;
}
+// FIXME: This behaves strangely. If, for example, you have 32 loads + stores,
+// the first 16 loads will be interleaved with the stores, and the next 16 will
+// be clustered as expected. It should really split them into two 16-store batches.
+//
+// Loads are clustered until this returns false, rather than trying to schedule
+// groups of stores. This also means we have to decide whether loads from
+// different address spaces should be clustered, and how to handle ones that
+// might cause bank conflicts.
+//
+// This might be deprecated so it might not be worth that much effort to fix.
+bool SIInstrInfo::shouldScheduleLoadsNear(SDNode *Load0, SDNode *Load1,
+ int64_t Offset0, int64_t Offset1,
+ unsigned NumLoads) const {
+ assert(Offset1 > Offset0 &&
+ "Second offset should be larger than first offset!");
+ // If we have less than 16 loads in a row, and the offsets are within 64
+ // bytes, then schedule together.
+
+ // A cacheline is 64 bytes (for global memory).
+ return (NumLoads <= 16 && (Offset1 - Offset0) < 64);
+}
+
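
As a reading aid only (not part of the patch): the new shouldScheduleLoadsNear hook above reduces to a simple predicate — cluster when there are at most 16 loads and the two offsets fall within one 64-byte global-memory cacheline. A standalone sketch of that predicate, under those assumptions:

#include <cassert>
#include <cstdint>

// Standalone restatement of the predicate used above (illustrative only).
static bool wouldScheduleLoadsNear(int64_t Offset0, int64_t Offset1,
                                   unsigned NumLoads) {
  assert(Offset1 > Offset0 && "second offset should be larger than first");
  return NumLoads <= 16 && (Offset1 - Offset0) < 64;
}

int main() {
  assert(wouldScheduleLoadsNear(0, 48, 4));    // same cacheline, few loads
  assert(!wouldScheduleLoadsNear(0, 128, 4));  // offsets span cachelines
  assert(!wouldScheduleLoadsNear(0, 48, 32));  // too many loads in a row
}
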
static void reportIllegalCopy(const SIInstrInfo *TII, MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI,
const DebugLoc &DL, unsigned DestReg,
@@ -827,10 +865,6 @@ void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
MachineFrameInfo &FrameInfo = MF->getFrameInfo();
DebugLoc DL = MBB.findDebugLoc(MI);
- assert(SrcReg != MFI->getStackPtrOffsetReg() &&
- SrcReg != MFI->getFrameOffsetReg() &&
- SrcReg != MFI->getScratchWaveOffsetReg());
-
unsigned Size = FrameInfo.getObjectSize(FrameIndex);
unsigned Align = FrameInfo.getObjectAlignment(FrameIndex);
MachinePointerInfo PtrInfo
@@ -864,7 +898,7 @@ void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
// needing them, and need to ensure that the reserved registers are
// correctly handled.
- FrameInfo.setStackID(FrameIndex, 1);
+ FrameInfo.setStackID(FrameIndex, SIStackID::SGPR_SPILL);
if (ST.hasScalarStores()) {
// m0 is used for offset to scalar stores if used to spill.
Spill.addReg(AMDGPU::M0, RegState::ImplicitDefine | RegState::Dead);
@@ -960,7 +994,7 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
MRI.constrainRegClass(DestReg, &AMDGPU::SReg_32_XM0RegClass);
}
- FrameInfo.setStackID(FrameIndex, 1);
+ FrameInfo.setStackID(FrameIndex, SIStackID::SGPR_SPILL);
MachineInstrBuilder Spill = BuildMI(MBB, MI, DL, OpDesc, DestReg)
.addFrameIndex(FrameIndex) // addr
.addMemOperand(MMO)
@@ -1001,7 +1035,7 @@ unsigned SIInstrInfo::calculateLDSSpillAddress(
unsigned FrameOffset, unsigned Size) const {
MachineFunction *MF = MBB.getParent();
SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
- const SISubtarget &ST = MF->getSubtarget<SISubtarget>();
+ const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
DebugLoc DL = MBB.findDebugLoc(MI);
unsigned WorkGroupSize = MFI->getMaxFlatWorkGroupSize();
unsigned WavefrontSize = ST.getWavefrontSize();
@@ -1137,7 +1171,7 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
MachineBasicBlock &MBB = *MI.getParent();
DebugLoc DL = MBB.findDebugLoc(MI);
switch (MI.getOpcode()) {
- default: return AMDGPUInstrInfo::expandPostRAPseudo(MI);
+ default: return TargetInstrInfo::expandPostRAPseudo(MI);
case AMDGPU::S_MOV_B64_term:
// This is only a terminator to get the correct spill code placement during
// register allocation.
@@ -1269,6 +1303,22 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
MI.setDesc(get(AMDGPU::S_MOV_B64));
break;
}
+ case TargetOpcode::BUNDLE: {
+ if (!MI.mayLoad())
+ return false;
+
+ // If it is a load it must be a memory clause
+ for (MachineBasicBlock::instr_iterator I = MI.getIterator();
+ I->isBundledWithSucc(); ++I) {
+ I->unbundleFromSucc();
+ for (MachineOperand &MO : I->operands())
+ if (MO.isReg())
+ MO.setIsInternalRead(false);
+ }
+
+ MI.eraseFromParent();
+ break;
+ }
}
return true;
}
@@ -1887,16 +1937,16 @@ unsigned SIInstrInfo::getAddressSpaceForPseudoSourceKind(
switch(Kind) {
case PseudoSourceValue::Stack:
case PseudoSourceValue::FixedStack:
- return AMDGPUASI.PRIVATE_ADDRESS;
+ return ST.getAMDGPUAS().PRIVATE_ADDRESS;
case PseudoSourceValue::ConstantPool:
case PseudoSourceValue::GOT:
case PseudoSourceValue::JumpTable:
case PseudoSourceValue::GlobalValueCallEntry:
case PseudoSourceValue::ExternalSymbolCallEntry:
case PseudoSourceValue::TargetCustom:
- return AMDGPUASI.CONSTANT_ADDRESS;
+ return ST.getAMDGPUAS().CONSTANT_ADDRESS;
}
- return AMDGPUASI.FLAT_ADDRESS;
+ return ST.getAMDGPUAS().FLAT_ADDRESS;
}
static void removeModOperands(MachineInstr &MI) {
@@ -2165,20 +2215,24 @@ static int64_t getFoldableImm(const MachineOperand* MO) {
MachineInstr *SIInstrInfo::convertToThreeAddress(MachineFunction::iterator &MBB,
MachineInstr &MI,
LiveVariables *LV) const {
+ unsigned Opc = MI.getOpcode();
bool IsF16 = false;
+ bool IsFMA = Opc == AMDGPU::V_FMAC_F32_e32 || Opc == AMDGPU::V_FMAC_F32_e64;
- switch (MI.getOpcode()) {
+ switch (Opc) {
default:
return nullptr;
case AMDGPU::V_MAC_F16_e64:
IsF16 = true;
LLVM_FALLTHROUGH;
case AMDGPU::V_MAC_F32_e64:
+ case AMDGPU::V_FMAC_F32_e64:
break;
case AMDGPU::V_MAC_F16_e32:
IsF16 = true;
LLVM_FALLTHROUGH;
- case AMDGPU::V_MAC_F32_e32: {
+ case AMDGPU::V_MAC_F32_e32:
+ case AMDGPU::V_FMAC_F32_e32: {
int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
AMDGPU::OpName::src0);
const MachineOperand *Src0 = &MI.getOperand(Src0Idx);
@@ -2203,7 +2257,7 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineFunction::iterator &MBB,
const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp);
const MachineOperand *Omod = getNamedOperand(MI, AMDGPU::OpName::omod);
- if (!Src0Mods && !Src1Mods && !Clamp && !Omod &&
+ if (!IsFMA && !Src0Mods && !Src1Mods && !Clamp && !Omod &&
// If we have an SGPR input, we will violate the constant bus restriction.
(!Src0->isReg() || !RI.isSGPRReg(MBB->getParent()->getRegInfo(), Src0->getReg()))) {
if (auto Imm = getFoldableImm(Src2)) {
@@ -2234,8 +2288,10 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineFunction::iterator &MBB,
}
}
- return BuildMI(*MBB, MI, MI.getDebugLoc(),
- get(IsF16 ? AMDGPU::V_MAD_F16 : AMDGPU::V_MAD_F32))
+ assert((!IsFMA || !IsF16) && "fmac only expected with f32");
+ unsigned NewOpc = IsFMA ? AMDGPU::V_FMA_F32 :
+ (IsF16 ? AMDGPU::V_MAD_F16 : AMDGPU::V_MAD_F32);
+ return BuildMI(*MBB, MI, MI.getDebugLoc(), get(NewOpc))
.add(*Dst)
.addImm(Src0Mods ? Src0Mods->getImm() : 0)
.add(*Src0)
@@ -2339,6 +2395,15 @@ bool SIInstrInfo::isInlineConstant(const MachineOperand &MO,
}
case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:
case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: {
+ if (isUInt<16>(Imm)) {
+ int16_t Trunc = static_cast<int16_t>(Imm);
+ return ST.has16BitInsts() &&
+ AMDGPU::isInlinableLiteral16(Trunc, ST.hasInv2PiInlineImm());
+ }
+ if (!(Imm & 0xffff)) {
+ return ST.has16BitInsts() &&
+ AMDGPU::isInlinableLiteral16(Imm >> 16, ST.hasInv2PiInlineImm());
+ }
uint32_t Trunc = static_cast<uint32_t>(Imm);
return AMDGPU::isInlinableLiteralV216(Trunc, ST.hasInv2PiInlineImm());
}
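
A reading aid, not part of the patch: the checks added above split a packed 32-bit V2INT16/V2FP16 immediate by halves — if only the low 16 bits are populated, the low half is tested as a 16-bit inline literal; if only the high 16 bits are populated, the high half is tested; otherwise the existing V216 check runs. A sketch of that dispatch, with isInlinable16 as a stubbed stand-in for AMDGPU::isInlinableLiteral16:

#include <cassert>
#include <cstdint>

static bool isInlinable16(int16_t) { return true; }  // stub for illustration

// Mirrors the dispatch above: which half of a packed literal gets the
// 16-bit inline-constant test (illustrative only).
static bool checkPackedImm(int64_t Imm) {
  if (Imm >= 0 && Imm <= 0xffff)           // only the low half is populated
    return isInlinable16(static_cast<int16_t>(Imm));
  if ((Imm & 0xffff) == 0)                 // only the high half is populated
    return isInlinable16(static_cast<int16_t>(Imm >> 16));
  return false;                            // would fall through to the V216 check
}

int main() {
  assert(checkPackedImm(0x3c00));          // low half only
  assert(checkPackedImm(0x3c000000));      // high half only
  assert(!checkPackedImm(0x3c003c00));     // both halves: not handled here
}
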
@@ -2711,14 +2776,16 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
}
}
- // Verify VOP*
- if (isVOP1(MI) || isVOP2(MI) || isVOP3(MI) || isVOPC(MI) || isSDWA(MI)) {
+ // Verify VOP*. Ignore multiple sgpr operands on writelane.
+ if (Desc.getOpcode() != AMDGPU::V_WRITELANE_B32
+ && (isVOP1(MI) || isVOP2(MI) || isVOP3(MI) || isVOPC(MI) || isSDWA(MI))) {
// Only look at the true operands. Only a real operand can use the constant
// bus, and we don't want to check pseudo-operands like the source modifier
// flags.
const int OpIndices[] = { Src0Idx, Src1Idx, Src2Idx };
unsigned ConstantBusCount = 0;
+ unsigned LiteralCount = 0;
if (AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::imm) != -1)
++ConstantBusCount;
@@ -2738,6 +2805,7 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
SGPRUsed = MO.getReg();
} else {
++ConstantBusCount;
+ ++LiteralCount;
}
}
}
@@ -2745,6 +2813,11 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
ErrInfo = "VOP* instruction uses the constant bus more than once";
return false;
}
+
+ if (isVOP3(MI) && LiteralCount) {
+ ErrInfo = "VOP3 instruction uses literal";
+ return false;
+ }
}
// Verify misc. restrictions on specific instructions.
@@ -2842,7 +2915,7 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
}
}
- if (isFLAT(MI) && !MF->getSubtarget<SISubtarget>().hasFlatInstOffsets()) {
+ if (isFLAT(MI) && !MF->getSubtarget<GCNSubtarget>().hasFlatInstOffsets()) {
const MachineOperand *Offset = getNamedOperand(MI, AMDGPU::OpName::offset);
if (Offset->getImm() != 0) {
ErrInfo = "subtarget does not support offsets in flat instructions";
@@ -2850,6 +2923,22 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
}
}
+ const MachineOperand *DppCt = getNamedOperand(MI, AMDGPU::OpName::dpp_ctrl);
+ if (DppCt) {
+ using namespace AMDGPU::DPP;
+
+ unsigned DC = DppCt->getImm();
+ if (DC == DppCtrl::DPP_UNUSED1 || DC == DppCtrl::DPP_UNUSED2 ||
+ DC == DppCtrl::DPP_UNUSED3 || DC > DppCtrl::DPP_LAST ||
+ (DC >= DppCtrl::DPP_UNUSED4_FIRST && DC <= DppCtrl::DPP_UNUSED4_LAST) ||
+ (DC >= DppCtrl::DPP_UNUSED5_FIRST && DC <= DppCtrl::DPP_UNUSED5_LAST) ||
+ (DC >= DppCtrl::DPP_UNUSED6_FIRST && DC <= DppCtrl::DPP_UNUSED6_LAST) ||
+ (DC >= DppCtrl::DPP_UNUSED7_FIRST && DC <= DppCtrl::DPP_UNUSED7_LAST)) {
+ ErrInfo = "Invalid dpp_ctrl value";
+ return false;
+ }
+ }
+
return true;
}
@@ -3147,6 +3236,29 @@ void SIInstrInfo::legalizeOperandsVOP2(MachineRegisterInfo &MRI,
legalizeOpWithMove(MI, Src0Idx);
}
+ // Special case: V_WRITELANE_B32 accepts only immediate or SGPR operands for
+ // both the value to write (src0) and lane select (src1). Fix up non-SGPR
+ // src0/src1 with V_READFIRSTLANE.
+ if (Opc == AMDGPU::V_WRITELANE_B32) {
+ int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
+ MachineOperand &Src0 = MI.getOperand(Src0Idx);
+ const DebugLoc &DL = MI.getDebugLoc();
+ if (Src0.isReg() && RI.isVGPR(MRI, Src0.getReg())) {
+ unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+ BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
+ .add(Src0);
+ Src0.ChangeToRegister(Reg, false);
+ }
+ if (Src1.isReg() && RI.isVGPR(MRI, Src1.getReg())) {
+ unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+ const DebugLoc &DL = MI.getDebugLoc();
+ BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
+ .add(Src1);
+ Src1.ChangeToRegister(Reg, false);
+ }
+ return;
+ }
+
// VOP2 src0 instructions support all operand types, so we don't need to check
// their legality. If src1 is already legal, we don't need to do anything.
if (isLegalRegOperand(MRI, InstrDesc.OpInfo[Src1Idx], Src1))
@@ -3261,6 +3373,13 @@ unsigned SIInstrInfo::readlaneVGPRToSGPR(unsigned SrcReg, MachineInstr &UseMI,
unsigned DstReg = MRI.createVirtualRegister(SRC);
unsigned SubRegs = RI.getRegSizeInBits(*VRC) / 32;
+ if (SubRegs == 1) {
+ BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
+ get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
+ .addReg(SrcReg);
+ return DstReg;
+ }
+
SmallVector<unsigned, 8> SRegs;
for (unsigned i = 0; i < SubRegs; ++i) {
unsigned SGPR = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
@@ -3438,6 +3557,14 @@ void SIInstrInfo::legalizeOperands(MachineInstr &MI) const {
return;
}
+ // Legalize SI_INIT_M0
+ if (MI.getOpcode() == AMDGPU::SI_INIT_M0) {
+ MachineOperand &Src = MI.getOperand(0);
+ if (Src.isReg() && RI.hasVGPRs(MRI.getRegClass(Src.getReg())))
+ Src.setReg(readlaneVGPRToSGPR(Src.getReg(), MI, MRI));
+ return;
+ }
+
// Legalize MIMG and MUBUF/MTBUF for shaders.
//
// Shaders only generate MUBUF/MTBUF instructions via intrinsics or via
@@ -3539,8 +3666,8 @@ void SIInstrInfo::legalizeOperands(MachineInstr &MI) const {
} else {
// This instructions is the _OFFSET variant, so we need to convert it to
// ADDR64.
- assert(MBB.getParent()->getSubtarget<SISubtarget>().getGeneration()
- < SISubtarget::VOLCANIC_ISLANDS &&
+ assert(MBB.getParent()->getSubtarget<GCNSubtarget>().getGeneration()
+ < AMDGPUSubtarget::VOLCANIC_ISLANDS &&
"FIXME: Need to emit flat atomics here");
MachineOperand *VData = getNamedOperand(MI, AMDGPU::OpName::vdata);
@@ -3676,37 +3803,37 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const {
continue;
case AMDGPU::S_LSHL_B32:
- if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
+ if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
NewOpcode = AMDGPU::V_LSHLREV_B32_e64;
swapOperands(Inst);
}
break;
case AMDGPU::S_ASHR_I32:
- if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
+ if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
NewOpcode = AMDGPU::V_ASHRREV_I32_e64;
swapOperands(Inst);
}
break;
case AMDGPU::S_LSHR_B32:
- if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
+ if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
NewOpcode = AMDGPU::V_LSHRREV_B32_e64;
swapOperands(Inst);
}
break;
case AMDGPU::S_LSHL_B64:
- if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
+ if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
NewOpcode = AMDGPU::V_LSHLREV_B64;
swapOperands(Inst);
}
break;
case AMDGPU::S_ASHR_I64:
- if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
+ if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
NewOpcode = AMDGPU::V_ASHRREV_I64;
swapOperands(Inst);
}
break;
case AMDGPU::S_LSHR_B64:
- if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
+ if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
NewOpcode = AMDGPU::V_LSHRREV_B64;
swapOperands(Inst);
}
@@ -3756,39 +3883,49 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const {
// FIXME: This isn't safe because the addressing mode doesn't work
// correctly if vaddr is negative.
//
- // FIXME: Handle v_add_u32 and VOP3 form. Also don't rely on immediate
- // being in src0.
- //
// FIXME: Should probably be done somewhere else, maybe SIFoldOperands.
//
// See if we can extract an immediate offset by recognizing one of these:
// V_ADD_I32_e32 dst, imm, src1
// V_ADD_I32_e32 dst, (S_MOV_B32 imm), src1
// V_ADD will be removed by "Remove dead machine instructions".
- if (Add && Add->getOpcode() == AMDGPU::V_ADD_I32_e32) {
- const MachineOperand *Src =
- getNamedOperand(*Add, AMDGPU::OpName::src0);
-
- if (Src->isReg()) {
- auto Mov = MRI.getUniqueVRegDef(Src->getReg());
- if (Mov && Mov->getOpcode() == AMDGPU::S_MOV_B32)
- Src = &Mov->getOperand(1);
- }
-
- if (Src) {
- if (Src->isImm())
- Offset = Src->getImm();
- else if (Src->isCImm())
- Offset = Src->getCImm()->getZExtValue();
- }
+ if (Add &&
+ (Add->getOpcode() == AMDGPU::V_ADD_I32_e32 ||
+ Add->getOpcode() == AMDGPU::V_ADD_U32_e64)) {
+ static const unsigned SrcNames[2] = {
+ AMDGPU::OpName::src0,
+ AMDGPU::OpName::src1,
+ };
+
+ // Find a literal offset in one of the source operands.
+ for (int i = 0; i < 2; i++) {
+ const MachineOperand *Src =
+ getNamedOperand(*Add, SrcNames[i]);
+
+ if (Src->isReg()) {
+ auto Mov = MRI.getUniqueVRegDef(Src->getReg());
+ if (Mov && Mov->getOpcode() == AMDGPU::S_MOV_B32)
+ Src = &Mov->getOperand(1);
+ }
+
+ if (Src) {
+ if (Src->isImm())
+ Offset = Src->getImm();
+ else if (Src->isCImm())
+ Offset = Src->getCImm()->getZExtValue();
+ }
+
+ if (Offset && isLegalMUBUFImmOffset(Offset)) {
+ VAddr = getNamedOperand(*Add, SrcNames[!i]);
+ break;
+ }
- if (Offset && isLegalMUBUFImmOffset(Offset))
- VAddr = getNamedOperand(*Add, AMDGPU::OpName::src1);
- else
Offset = 0;
+ }
}
- BuildMI(*MBB, Inst, Inst.getDebugLoc(),
+ MachineInstr *NewInstr =
+ BuildMI(*MBB, Inst, Inst.getDebugLoc(),
get(AMDGPU::BUFFER_LOAD_DWORD_OFFEN), VDst)
.add(*VAddr) // vaddr
.add(*getNamedOperand(Inst, AMDGPU::OpName::sbase)) // srsrc
@@ -3797,12 +3934,17 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const {
.addImm(getNamedOperand(Inst, AMDGPU::OpName::glc)->getImm())
.addImm(0) // slc
.addImm(0) // tfe
- .setMemRefs(Inst.memoperands_begin(), Inst.memoperands_end());
+ .setMemRefs(Inst.memoperands_begin(), Inst.memoperands_end())
+ .getInstr();
MRI.replaceRegWith(getNamedOperand(Inst, AMDGPU::OpName::sdst)->getReg(),
VDst);
addUsersToMoveToVALUWorklist(VDst, MRI, Worklist);
Inst.eraseFromParent();
+
+ // Legalize all operands other than the offset. Notably, convert the srsrc
+ // into SGPRs using v_readfirstlane if needed.
+ legalizeOperands(*NewInstr);
continue;
}
}
@@ -3884,6 +4026,13 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const {
MRI.replaceRegWith(DstReg, Inst.getOperand(1).getReg());
MRI.clearKillFlags(Inst.getOperand(1).getReg());
Inst.getOperand(0).setReg(DstReg);
+
+ // Make sure we don't leave around a dead VGPR->SGPR copy. Normally
+ // these are deleted later, but at -O0 it would leave a suspicious
+ // looking illegal copy of an undef register.
+ for (unsigned I = Inst.getNumOperands() - 1; I != 0; --I)
+ Inst.RemoveOperand(I);
+ Inst.setDesc(get(AMDGPU::IMPLICIT_DEF));
continue;
}
@@ -3975,17 +4124,23 @@ void SIInstrInfo::lowerScalarXnor(SetVectorType &Worklist,
legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src0, MRI, DL);
legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src1, MRI, DL);
- unsigned Xor = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
- BuildMI(MBB, MII, DL, get(AMDGPU::V_XOR_B32_e64), Xor)
- .add(Src0)
- .add(Src1);
+ unsigned NewDest = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ if (ST.hasDLInsts()) {
+ BuildMI(MBB, MII, DL, get(AMDGPU::V_XNOR_B32_e64), NewDest)
+ .add(Src0)
+ .add(Src1);
+ } else {
+ unsigned Xor = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ BuildMI(MBB, MII, DL, get(AMDGPU::V_XOR_B32_e64), Xor)
+ .add(Src0)
+ .add(Src1);
- unsigned Not = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
- BuildMI(MBB, MII, DL, get(AMDGPU::V_NOT_B32_e64), Not)
- .addReg(Xor);
+ BuildMI(MBB, MII, DL, get(AMDGPU::V_NOT_B32_e64), NewDest)
+ .addReg(Xor);
+ }
- MRI.replaceRegWith(Dest.getReg(), Not);
- addUsersToMoveToVALUWorklist(Not, MRI, Worklist);
+ MRI.replaceRegWith(Dest.getReg(), NewDest);
+ addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
}
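
Not part of the patch, just an aside on the fallback path above: on subtargets without DL instructions the scalar xnor is lowered as V_NOT_B32(V_XOR_B32(a, b)), which relies on the plain bitwise identity xnor(a, b) == ~(a ^ b). A quick check of that identity:

#include <cassert>
#include <cstdint>

int main() {
  uint32_t A = 0xDEADBEEF, B = 0x12345678;
  uint32_t Xnor = ~(A ^ B);   // NOT of XOR, as in the pre-DL lowering above
  assert(Xnor == (~A ^ B));   // equivalent ways of writing xnor
  assert(Xnor == (A ^ ~B));
}
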
void SIInstrInfo::splitScalar64BitUnaryOp(
@@ -4478,12 +4633,12 @@ uint64_t SIInstrInfo::getDefaultRsrcDataFormat() const {
uint64_t RsrcDataFormat = AMDGPU::RSRC_DATA_FORMAT;
if (ST.isAmdHsaOS()) {
// Set ATC = 1. GFX9 doesn't have this bit.
- if (ST.getGeneration() <= SISubtarget::VOLCANIC_ISLANDS)
+ if (ST.getGeneration() <= AMDGPUSubtarget::VOLCANIC_ISLANDS)
RsrcDataFormat |= (1ULL << 56);
// Set MTYPE = 2 (MTYPE_UC = uncached). GFX9 doesn't have this.
// BTW, it disables TC L2 and therefore decreases performance.
- if (ST.getGeneration() == SISubtarget::VOLCANIC_ISLANDS)
+ if (ST.getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS)
RsrcDataFormat |= (2ULL << 59);
}
@@ -4496,7 +4651,7 @@ uint64_t SIInstrInfo::getScratchRsrcWords23() const {
0xffffffff; // Size;
// GFX9 doesn't have ELEMENT_SIZE.
- if (ST.getGeneration() <= SISubtarget::VOLCANIC_ISLANDS) {
+ if (ST.getGeneration() <= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
uint64_t EltSizeValue = Log2_32(ST.getMaxPrivateElementSize()) - 1;
Rsrc23 |= EltSizeValue << AMDGPU::RSRC_ELEMENT_SIZE_SHIFT;
}
@@ -4506,7 +4661,7 @@ uint64_t SIInstrInfo::getScratchRsrcWords23() const {
// If TID_ENABLE is set, DATA_FORMAT specifies stride bits [14:17].
// Clear them unless we want a huge stride.
- if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS)
+ if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
Rsrc23 &= ~AMDGPU::RSRC_DATA_FORMAT;
return Rsrc23;
@@ -4531,7 +4686,7 @@ unsigned SIInstrInfo::isStackAccess(const MachineInstr &MI,
return AMDGPU::NoRegister;
assert(!MI.memoperands_empty() &&
- (*MI.memoperands_begin())->getAddrSpace() == AMDGPUASI.PRIVATE_ADDRESS);
+ (*MI.memoperands_begin())->getAddrSpace() == ST.getAMDGPUAS().PRIVATE_ADDRESS);
FrameIndex = Addr->getIndex();
return getNamedOperand(MI, AMDGPU::OpName::vdata)->getReg();
@@ -4598,12 +4753,12 @@ unsigned SIInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
if (DescSize != 0 && DescSize != 4)
return DescSize;
+ if (isFixedSize(MI))
+ return DescSize;
+
// 4-byte instructions may have a 32-bit literal encoded after them. Check
// operands that coud ever be literals.
if (isVALU(MI) || isSALU(MI)) {
- if (isFixedSize(MI))
- return DescSize;
-
int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
if (Src0Idx == -1)
return 4; // No operands.
@@ -4650,7 +4805,7 @@ bool SIInstrInfo::mayAccessFlatAddressSpace(const MachineInstr &MI) const {
return true;
for (const MachineMemOperand *MMO : MI.memoperands()) {
- if (MMO->getAddrSpace() == AMDGPUASI.FLAT_ADDRESS)
+ if (MMO->getAddrSpace() == ST.getAMDGPUAS().FLAT_ADDRESS)
return true;
}
return false;
@@ -4817,3 +4972,70 @@ const MCInstrDesc &SIInstrInfo::getKillTerminatorFromPseudo(unsigned Opcode) con
llvm_unreachable("invalid opcode, expected SI_KILL_*_PSEUDO");
}
}
+
+bool SIInstrInfo::isBufferSMRD(const MachineInstr &MI) const {
+ if (!isSMRD(MI))
+ return false;
+
+ // Check that it is using a buffer resource.
+ int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::sbase);
+ if (Idx == -1) // e.g. s_memtime
+ return false;
+
+ const auto RCID = MI.getDesc().OpInfo[Idx].RegClass;
+ return RCID == AMDGPU::SReg_128RegClassID;
+}
+
+// This must be kept in sync with the SIEncodingFamily class in SIInstrInfo.td
+enum SIEncodingFamily {
+ SI = 0,
+ VI = 1,
+ SDWA = 2,
+ SDWA9 = 3,
+ GFX80 = 4,
+ GFX9 = 5
+};
+
+static SIEncodingFamily subtargetEncodingFamily(const GCNSubtarget &ST) {
+ switch (ST.getGeneration()) {
+ default:
+ break;
+ case AMDGPUSubtarget::SOUTHERN_ISLANDS:
+ case AMDGPUSubtarget::SEA_ISLANDS:
+ return SIEncodingFamily::SI;
+ case AMDGPUSubtarget::VOLCANIC_ISLANDS:
+ case AMDGPUSubtarget::GFX9:
+ return SIEncodingFamily::VI;
+ }
+ llvm_unreachable("Unknown subtarget generation!");
+}
+
+int SIInstrInfo::pseudoToMCOpcode(int Opcode) const {
+ SIEncodingFamily Gen = subtargetEncodingFamily(ST);
+
+ if ((get(Opcode).TSFlags & SIInstrFlags::renamedInGFX9) != 0 &&
+ ST.getGeneration() >= AMDGPUSubtarget::GFX9)
+ Gen = SIEncodingFamily::GFX9;
+
+ if (get(Opcode).TSFlags & SIInstrFlags::SDWA)
+ Gen = ST.getGeneration() == AMDGPUSubtarget::GFX9 ? SIEncodingFamily::SDWA9
+ : SIEncodingFamily::SDWA;
+ // Adjust the encoding family to GFX80 for D16 buffer instructions when the
+ // subtarget has the UnpackedD16VMem feature.
+ // TODO: remove this when we discard GFX80 encoding.
+ if (ST.hasUnpackedD16VMem() && (get(Opcode).TSFlags & SIInstrFlags::D16Buf))
+ Gen = SIEncodingFamily::GFX80;
+
+ int MCOp = AMDGPU::getMCOpcode(Opcode, Gen);
+
+ // -1 means that Opcode is already a native instruction.
+ if (MCOp == -1)
+ return Opcode;
+
+ // (uint16_t)-1 means that Opcode is a pseudo instruction that has
+ // no encoding in the given subtarget generation.
+ if (MCOp == (uint16_t)-1)
+ return -1;
+
+ return MCOp;
+}
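
As a usage note (not from the patch): pseudoToMCOpcode returns -1 only when the pseudo has no encoding for the current subtarget; any other value — possibly the input opcode itself — is a usable MC opcode. A hedged sketch of how a caller might consume it, assuming the usual in-tree build context (resolveOpcode is a hypothetical helper, not code from this commit):

#include "SIInstrInfo.h"
#include "llvm/Support/ErrorHandling.h"

// Illustrative caller only.
static int resolveOpcode(const llvm::SIInstrInfo &TII, int PseudoOpc) {
  int MCOp = TII.pseudoToMCOpcode(PseudoOpc);
  if (MCOp == -1)
    llvm::report_fatal_error("pseudo instruction has no encoding on this subtarget");
  return MCOp;
}
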
diff --git a/lib/Target/AMDGPU/SIInstrInfo.h b/lib/Target/AMDGPU/SIInstrInfo.h
index 24ee843e6ade..0a735257d34e 100644
--- a/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/lib/Target/AMDGPU/SIInstrInfo.h
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
//
/// \file
-/// \brief Interface definition for SIInstrInfo.
+/// Interface definition for SIInstrInfo.
//
//===----------------------------------------------------------------------===//
@@ -31,20 +31,23 @@
#include <cassert>
#include <cstdint>
+#define GET_INSTRINFO_HEADER
+#include "AMDGPUGenInstrInfo.inc"
+
namespace llvm {
class APInt;
class MachineRegisterInfo;
class RegScavenger;
-class SISubtarget;
+class GCNSubtarget;
class TargetRegisterClass;
-class SIInstrInfo final : public AMDGPUInstrInfo {
+class SIInstrInfo final : public AMDGPUGenInstrInfo {
private:
const SIRegisterInfo RI;
- const SISubtarget &ST;
+ const GCNSubtarget &ST;
- // The the inverse predicate should have the negative value.
+ // The inverse predicate should have the negative value.
enum BranchPredicate {
INVALID_BR = 0,
SCC_TRUE = 1,
@@ -144,7 +147,7 @@ public:
MO_REL32_HI = 5
};
- explicit SIInstrInfo(const SISubtarget &ST);
+ explicit SIInstrInfo(const GCNSubtarget &ST);
const SIRegisterInfo &getRegisterInfo() const {
return RI;
@@ -163,7 +166,10 @@ public:
bool shouldClusterMemOps(MachineInstr &FirstLdSt, unsigned BaseReg1,
MachineInstr &SecondLdSt, unsigned BaseReg2,
- unsigned NumLoads) const final;
+ unsigned NumLoads) const override;
+
+ bool shouldScheduleLoadsNear(SDNode *Load0, SDNode *Load1, int64_t Offset0,
+ int64_t Offset1, unsigned NumLoads) const override;
void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
const DebugLoc &DL, unsigned DestReg, unsigned SrcReg,
@@ -203,7 +209,7 @@ public:
bool expandPostRAPseudo(MachineInstr &MI) const override;
- // \brief Returns an opcode that can be used to move a value to a \p DstRC
+ // Returns an opcode that can be used to move a value to a \p DstRC
// register. If there is no hardware instruction that can store to \p
// DstRC, then AMDGPU::COPY is returned.
unsigned getMovOpcode(const TargetRegisterClass *DstRC) const;
@@ -419,18 +425,7 @@ public:
return get(Opcode).TSFlags & SIInstrFlags::SMRD;
}
- bool isBufferSMRD(const MachineInstr &MI) const {
- if (!isSMRD(MI))
- return false;
-
- // Check that it is using a buffer resource.
- int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::sbase);
- if (Idx == -1) // e.g. s_memtime
- return false;
-
- const auto RCID = MI.getDesc().OpInfo[Idx].RegClass;
- return RCID == AMDGPU::SReg_128RegClassID;
- }
+ bool isBufferSMRD(const MachineInstr &MI) const;
static bool isDS(const MachineInstr &MI) {
return MI.getDesc().TSFlags & SIInstrFlags::DS;
@@ -674,16 +669,16 @@ public:
bool isImmOperandLegal(const MachineInstr &MI, unsigned OpNo,
const MachineOperand &MO) const;
- /// \brief Return true if this 64-bit VALU instruction has a 32-bit encoding.
+ /// Return true if this 64-bit VALU instruction has a 32-bit encoding.
/// This function will return false if you pass it a 32-bit instruction.
bool hasVALU32BitEncoding(unsigned Opcode) const;
- /// \brief Returns true if this operand uses the constant bus.
+ /// Returns true if this operand uses the constant bus.
bool usesConstantBus(const MachineRegisterInfo &MRI,
const MachineOperand &MO,
const MCOperandInfo &OpInfo) const;
- /// \brief Return true if this instruction has any modifiers.
+ /// Return true if this instruction has any modifiers.
/// e.g. src[012]_mod, omod, clamp.
bool hasModifiers(unsigned Opcode) const;
@@ -696,7 +691,7 @@ public:
unsigned getVALUOp(const MachineInstr &MI) const;
- /// \brief Return the correct register class for \p OpNo. For target-specific
+ /// Return the correct register class for \p OpNo. For target-specific
/// instructions, this will return the register class that has been defined
/// in tablegen. For generic instructions, like REG_SEQUENCE it will return
/// the register class of its machine operand.
@@ -704,7 +699,7 @@ public:
const TargetRegisterClass *getOpRegClass(const MachineInstr &MI,
unsigned OpNo) const;
- /// \brief Return the size in bytes of the operand OpNo on the given
+ /// Return the size in bytes of the operand OpNo on the given
// instruction opcode.
unsigned getOpSize(uint16_t Opcode, unsigned OpNo) const {
const MCOperandInfo &OpInfo = get(Opcode).OpInfo[OpNo];
@@ -718,7 +713,7 @@ public:
return RI.getRegSizeInBits(*RI.getRegClass(OpInfo.RegClass)) / 8;
}
- /// \brief This form should usually be preferred since it handles operands
+ /// This form should usually be preferred since it handles operands
/// with unknown register classes.
unsigned getOpSize(const MachineInstr &MI, unsigned OpNo) const {
return RI.getRegSizeInBits(*getOpRegClass(MI, OpNo)) / 8;
@@ -728,7 +723,7 @@ public:
/// to read a VGPR.
bool canReadVGPR(const MachineInstr &MI, unsigned OpNo) const;
- /// \brief Legalize the \p OpIndex operand of this instruction by inserting
+ /// Legalize the \p OpIndex operand of this instruction by inserting
/// a MOV. For example:
/// ADD_I32_e32 VGPR0, 15
/// to
@@ -739,29 +734,29 @@ public:
/// instead of MOV.
void legalizeOpWithMove(MachineInstr &MI, unsigned OpIdx) const;
- /// \brief Check if \p MO is a legal operand if it was the \p OpIdx Operand
+ /// Check if \p MO is a legal operand if it was the \p OpIdx Operand
/// for \p MI.
bool isOperandLegal(const MachineInstr &MI, unsigned OpIdx,
const MachineOperand *MO = nullptr) const;
- /// \brief Check if \p MO would be a valid operand for the given operand
+ /// Check if \p MO would be a valid operand for the given operand
/// definition \p OpInfo. Note this does not attempt to validate constant bus
/// restrictions (e.g. literal constant usage).
bool isLegalVSrcOperand(const MachineRegisterInfo &MRI,
const MCOperandInfo &OpInfo,
const MachineOperand &MO) const;
- /// \brief Check if \p MO (a register operand) is a legal register for the
+ /// Check if \p MO (a register operand) is a legal register for the
/// given operand description.
bool isLegalRegOperand(const MachineRegisterInfo &MRI,
const MCOperandInfo &OpInfo,
const MachineOperand &MO) const;
- /// \brief Legalize operands in \p MI by either commuting it or inserting a
+ /// Legalize operands in \p MI by either commuting it or inserting a
/// copy of src1.
void legalizeOperandsVOP2(MachineRegisterInfo &MRI, MachineInstr &MI) const;
- /// \brief Fix operands in \p MI to satisfy constant bus requirements.
+ /// Fix operands in \p MI to satisfy constant bus requirements.
void legalizeOperandsVOP3(MachineRegisterInfo &MRI, MachineInstr &MI) const;
/// Copy a value from a VGPR (\p SrcReg) to SGPR. This function can only
@@ -779,11 +774,11 @@ public:
MachineOperand &Op, MachineRegisterInfo &MRI,
const DebugLoc &DL) const;
- /// \brief Legalize all operands in this instruction. This function may
+ /// Legalize all operands in this instruction. This function may
/// create new instruction and insert them before \p MI.
void legalizeOperands(MachineInstr &MI) const;
- /// \brief Replace this instruction's opcode with the equivalent VALU
+ /// Replace this instruction's opcode with the equivalent VALU
/// opcode. This function will also move the users of \p MI to the
/// VALU if necessary.
void moveToVALU(MachineInstr &MI) const;
@@ -795,11 +790,11 @@ public:
MachineBasicBlock::iterator MI) const override;
void insertReturn(MachineBasicBlock &MBB) const;
- /// \brief Return the number of wait states that result from executing this
+ /// Return the number of wait states that result from executing this
/// instruction.
unsigned getNumWaitStates(const MachineInstr &MI) const;
- /// \brief Returns the operand named \p Op. If \p MI does not have an
+ /// Returns the operand named \p Op. If \p MI does not have an
/// operand named \c Op, this function returns nullptr.
LLVM_READONLY
MachineOperand *getNamedOperand(MachineInstr &MI, unsigned OperandName) const;
@@ -822,7 +817,7 @@ public:
bool isLowLatencyInstruction(const MachineInstr &MI) const;
bool isHighLatencyInstruction(const MachineInstr &MI) const;
- /// \brief Return the descriptor of the target-specific machine instruction
+ /// Return the descriptor of the target-specific machine instruction
/// that corresponds to the specified pseudo or native opcode.
const MCInstrDesc &getMCOpcodeFromPseudo(unsigned Opcode) const {
return get(pseudoToMCOpcode(Opcode));
@@ -867,7 +862,7 @@ public:
bool isBasicBlockPrologue(const MachineInstr &MI) const override;
- /// \brief Return a partially built integer add instruction without carry.
+ /// Return a partially built integer add instruction without carry.
/// Caller must add source operands.
/// For pre-GFX9 it will generate unused carry destination operand.
/// TODO: After GFX9 it should return a no-carry operation.
@@ -882,6 +877,12 @@ public:
static bool isLegalMUBUFImmOffset(unsigned Imm) {
return isUInt<12>(Imm);
}
+
+ /// \brief Return a target-specific opcode if Opcode is a pseudo instruction.
+ /// Return -1 if the target-specific opcode for the pseudo instruction does
+ /// not exist. If Opcode is not a pseudo instruction, this is identity.
+ int pseudoToMCOpcode(int Opcode) const;
+
};
namespace AMDGPU {
@@ -908,6 +909,9 @@ namespace AMDGPU {
int getAddr64Inst(uint16_t Opcode);
LLVM_READONLY
+ int getMUBUFNoLdsInst(uint16_t Opcode);
+
+ LLVM_READONLY
int getAtomicRetOp(uint16_t Opcode);
LLVM_READONLY
diff --git a/lib/Target/AMDGPU/SIInstrInfo.td b/lib/Target/AMDGPU/SIInstrInfo.td
index fc2d35d873aa..8fa37aa83dae 100644
--- a/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/lib/Target/AMDGPU/SIInstrInfo.td
@@ -7,16 +7,21 @@
//
//===----------------------------------------------------------------------===//
def isCI : Predicate<"Subtarget->getGeneration() "
- ">= SISubtarget::SEA_ISLANDS">;
+ ">= AMDGPUSubtarget::SEA_ISLANDS">;
def isCIOnly : Predicate<"Subtarget->getGeneration() =="
- "SISubtarget::SEA_ISLANDS">,
+ "AMDGPUSubtarget::SEA_ISLANDS">,
AssemblerPredicate <"FeatureSeaIslands">;
def isVIOnly : Predicate<"Subtarget->getGeneration() =="
- "SISubtarget::VOLCANIC_ISLANDS">,
+ "AMDGPUSubtarget::VOLCANIC_ISLANDS">,
AssemblerPredicate <"FeatureVolcanicIslands">;
def DisableInst : Predicate <"false">, AssemblerPredicate<"FeatureDisable">;
+class GCNPredicateControl : PredicateControl {
+ Predicate SIAssemblerPredicate = isSICI;
+ Predicate VIAssemblerPredicate = isVI;
+}
+
// Except for the NONE field, this must be kept in sync with the
// SIEncodingFamily enum in AMDGPUInstrInfo.cpp
def SIEncodingFamily {
@@ -25,13 +30,16 @@ def SIEncodingFamily {
int VI = 1;
int SDWA = 2;
int SDWA9 = 3;
- int GFX9 = 4;
+ int GFX80 = 4;
+ int GFX9 = 5;
}
//===----------------------------------------------------------------------===//
// SI DAG Nodes
//===----------------------------------------------------------------------===//
+def AMDGPUclamp : SDNode<"AMDGPUISD::CLAMP", SDTFPUnaryOp>;
+
def SIload_constant : SDNode<"AMDGPUISD::LOAD_CONSTANT",
SDTypeProfile<1, 2, [SDTCisVT<0, f32>, SDTCisVT<1, v4i32>, SDTCisVT<2, i32>]>,
[SDNPMayLoad, SDNPMemOperand]
@@ -45,22 +53,41 @@ def SIatomic_dec : SDNode<"AMDGPUISD::ATOMIC_DEC", SDTAtomic2,
[SDNPMayLoad, SDNPMayStore, SDNPMemOperand, SDNPHasChain]
>;
-def SItbuffer_load : SDNode<"AMDGPUISD::TBUFFER_LOAD_FORMAT",
- SDTypeProfile<1, 9,
- [ // vdata
- SDTCisVT<1, v4i32>, // rsrc
- SDTCisVT<2, i32>, // vindex(VGPR)
- SDTCisVT<3, i32>, // voffset(VGPR)
- SDTCisVT<4, i32>, // soffset(SGPR)
- SDTCisVT<5, i32>, // offset(imm)
- SDTCisVT<6, i32>, // dfmt(imm)
- SDTCisVT<7, i32>, // nfmt(imm)
- SDTCisVT<8, i32>, // glc(imm)
- SDTCisVT<9, i32> // slc(imm)
- ]>,
- [SDNPMayLoad, SDNPMemOperand, SDNPHasChain]
+def SDTAtomic2_f32 : SDTypeProfile<1, 2, [
+ SDTCisSameAs<0,2>, SDTCisFP<0>, SDTCisPtrTy<1>
+]>;
+
+def SIatomic_fadd : SDNode<"AMDGPUISD::ATOMIC_LOAD_FADD", SDTAtomic2_f32,
+ [SDNPMayLoad, SDNPMayStore, SDNPMemOperand, SDNPHasChain]
+>;
+
+def SIatomic_fmin : SDNode<"AMDGPUISD::ATOMIC_LOAD_FMIN", SDTAtomic2_f32,
+ [SDNPMayLoad, SDNPMayStore, SDNPMemOperand, SDNPHasChain]
>;
+def SIatomic_fmax : SDNode<"AMDGPUISD::ATOMIC_LOAD_FMAX", SDTAtomic2_f32,
+ [SDNPMayLoad, SDNPMayStore, SDNPMemOperand, SDNPHasChain]
+>;
+
+def SDTbuffer_load : SDTypeProfile<1, 9,
+ [ // vdata
+ SDTCisVT<1, v4i32>, // rsrc
+ SDTCisVT<2, i32>, // vindex(VGPR)
+ SDTCisVT<3, i32>, // voffset(VGPR)
+ SDTCisVT<4, i32>, // soffset(SGPR)
+ SDTCisVT<5, i32>, // offset(imm)
+ SDTCisVT<6, i32>, // dfmt(imm)
+ SDTCisVT<7, i32>, // nfmt(imm)
+ SDTCisVT<8, i32>, // glc(imm)
+ SDTCisVT<9, i32> // slc(imm)
+ ]>;
+
+def SItbuffer_load : SDNode<"AMDGPUISD::TBUFFER_LOAD_FORMAT", SDTbuffer_load,
+ [SDNPMayLoad, SDNPMemOperand, SDNPHasChain]>;
+def SItbuffer_load_d16 : SDNode<"AMDGPUISD::TBUFFER_LOAD_FORMAT_D16",
+ SDTbuffer_load,
+ [SDNPMayLoad, SDNPMemOperand, SDNPHasChain]>;
+
def SDTtbuffer_store : SDTypeProfile<0, 10,
[ // vdata
SDTCisVT<1, v4i32>, // rsrc
@@ -79,6 +106,9 @@ def SItbuffer_store : SDNode<"AMDGPUISD::TBUFFER_STORE_FORMAT", SDTtbuffer_store
def SItbuffer_store_x3 : SDNode<"AMDGPUISD::TBUFFER_STORE_FORMAT_X3",
SDTtbuffer_store,
[SDNPMayStore, SDNPMemOperand, SDNPHasChain]>;
+def SItbuffer_store_d16 : SDNode<"AMDGPUISD::TBUFFER_STORE_FORMAT_D16",
+ SDTtbuffer_store,
+ [SDNPMayStore, SDNPMemOperand, SDNPHasChain]>;
def SDTBufferLoad : SDTypeProfile<1, 5,
[ // vdata
@@ -92,6 +122,9 @@ def SIbuffer_load : SDNode <"AMDGPUISD::BUFFER_LOAD", SDTBufferLoad,
[SDNPMemOperand, SDNPHasChain, SDNPMayLoad]>;
def SIbuffer_load_format : SDNode <"AMDGPUISD::BUFFER_LOAD_FORMAT", SDTBufferLoad,
[SDNPMemOperand, SDNPHasChain, SDNPMayLoad]>;
+def SIbuffer_load_format_d16 : SDNode <"AMDGPUISD::BUFFER_LOAD_FORMAT_D16",
+ SDTBufferLoad,
+ [SDNPMemOperand, SDNPHasChain, SDNPMayLoad]>;
def SDTBufferStore : SDTypeProfile<0, 6,
[ // vdata
@@ -102,9 +135,13 @@ def SDTBufferStore : SDTypeProfile<0, 6,
SDTCisVT<5, i1>]>; // slc
def SIbuffer_store : SDNode <"AMDGPUISD::BUFFER_STORE", SDTBufferStore,
- [SDNPMemOperand, SDNPHasChain, SDNPMayStore]>;
-def SIbuffer_store_format : SDNode <"AMDGPUISD::BUFFER_STORE_FORMAT", SDTBufferStore,
- [SDNPMemOperand, SDNPHasChain, SDNPMayStore]>;
+ [SDNPMayStore, SDNPMemOperand, SDNPHasChain]>;
+def SIbuffer_store_format : SDNode <"AMDGPUISD::BUFFER_STORE_FORMAT",
+ SDTBufferStore,
+ [SDNPMayStore, SDNPMemOperand, SDNPHasChain]>;
+def SIbuffer_store_format_d16 : SDNode <"AMDGPUISD::BUFFER_STORE_FORMAT_D16",
+ SDTBufferStore,
+ [SDNPMayStore, SDNPMemOperand, SDNPHasChain]>;
class SDBufferAtomic<string opcode> : SDNode <opcode,
SDTypeProfile<1, 5,
@@ -140,21 +177,41 @@ def SIbuffer_atomic_cmpswap : SDNode <"AMDGPUISD::BUFFER_ATOMIC_CMPSWAP",
[SDNPMemOperand, SDNPHasChain, SDNPMayLoad, SDNPMayStore]
>;
-class SDSample<string opcode> : SDNode <opcode,
- SDTypeProfile<1, 4, [SDTCisVT<0, v4f32>, SDTCisVT<2, v8i32>,
- SDTCisVT<3, v4i32>, SDTCisVT<4, i32>]>
->;
-
-def SIsample : SDSample<"AMDGPUISD::SAMPLE">;
-def SIsampleb : SDSample<"AMDGPUISD::SAMPLEB">;
-def SIsampled : SDSample<"AMDGPUISD::SAMPLED">;
-def SIsamplel : SDSample<"AMDGPUISD::SAMPLEL">;
-
def SIpc_add_rel_offset : SDNode<"AMDGPUISD::PC_ADD_REL_OFFSET",
SDTypeProfile<1, 2, [SDTCisVT<0, iPTR>, SDTCisSameAs<0,1>, SDTCisSameAs<0,2>]>
>;
//===----------------------------------------------------------------------===//
+// ValueType helpers
+//===----------------------------------------------------------------------===//
+
+// Returns 1 if the source value type is a floating-point type, 0 if it is not.
+// XXX - do f16 instructions?
+class isFloatType<ValueType SrcVT> {
+ bit ret =
+ !if(!eq(SrcVT.Value, f16.Value), 1,
+ !if(!eq(SrcVT.Value, f32.Value), 1,
+ !if(!eq(SrcVT.Value, f64.Value), 1,
+ !if(!eq(SrcVT.Value, v2f16.Value), 1,
+ 0))));
+}
+
+class isIntType<ValueType SrcVT> {
+ bit ret =
+ !if(!eq(SrcVT.Value, i16.Value), 1,
+ !if(!eq(SrcVT.Value, i32.Value), 1,
+ !if(!eq(SrcVT.Value, i64.Value), 1,
+ 0)));
+}
+
+class isPackedType<ValueType SrcVT> {
+ bit ret =
+ !if(!eq(SrcVT.Value, v2i16.Value), 1,
+ !if(!eq(SrcVT.Value, v2f16.Value), 1, 0)
+ );
+}
+
+//===----------------------------------------------------------------------===//
// PatFrags for global memory operations
//===----------------------------------------------------------------------===//
@@ -163,6 +220,9 @@ defm atomic_dec_global : global_binary_atomic_op<SIatomic_dec>;
def atomic_inc_local : local_binary_atomic_op<SIatomic_inc>;
def atomic_dec_local : local_binary_atomic_op<SIatomic_dec>;
+def atomic_load_fadd_local : local_binary_atomic_op<SIatomic_fadd>;
+def atomic_load_fmin_local : local_binary_atomic_op<SIatomic_fmin>;
+def atomic_load_fmax_local : local_binary_atomic_op<SIatomic_fmax>;
//===----------------------------------------------------------------------===//
// SDNodes PatFrags for loads/stores with a glue input.
@@ -178,6 +238,10 @@ def AMDGPUld_glue : SDNode <"ISD::LOAD", SDTLoad,
[SDNPHasChain, SDNPMayLoad, SDNPMemOperand, SDNPInGlue]
>;
+def AMDGPUatomic_ld_glue : SDNode <"ISD::ATOMIC_LOAD", SDTAtomicLoad,
+ [SDNPHasChain, SDNPMayLoad, SDNPMemOperand, SDNPInGlue]
+>;
+
def unindexedload_glue : PatFrag <(ops node:$ptr), (AMDGPUld_glue node:$ptr), [{
return cast<LoadSDNode>(N)->getAddressingMode() == ISD::UNINDEXED;
}]>;
@@ -186,6 +250,18 @@ def load_glue : PatFrag <(ops node:$ptr), (unindexedload_glue node:$ptr), [{
return cast<LoadSDNode>(N)->getExtensionType() == ISD::NON_EXTLOAD;
}]>;
+def atomic_load_32_glue : PatFrag<(ops node:$ptr),
+ (AMDGPUatomic_ld_glue node:$ptr)> {
+ let IsAtomic = 1;
+ let MemoryVT = i32;
+}
+
+def atomic_load_64_glue : PatFrag<(ops node:$ptr),
+ (AMDGPUatomic_ld_glue node:$ptr)> {
+ let IsAtomic = 1;
+ let MemoryVT = i64;
+}
+
def extload_glue : PatFrag<(ops node:$ptr), (load_glue node:$ptr), [{
return cast<LoadSDNode>(N)->getExtensionType() == ISD::EXTLOAD;
}]>;
@@ -219,6 +295,9 @@ def sextloadi16_glue : PatFrag<(ops node:$ptr), (sextload_glue node:$ptr), [{
def load_glue_align8 : Aligned8Bytes <
(ops node:$ptr), (load_glue node:$ptr)
>;
+def load_glue_align16 : Aligned16Bytes <
+ (ops node:$ptr), (load_glue node:$ptr)
+>;
def load_local_m0 : LoadFrag<load_glue>, LocalAddress;
@@ -227,12 +306,23 @@ def sextloadi16_local_m0 : LoadFrag<sextloadi16_glue>, LocalAddress;
def az_extloadi8_local_m0 : LoadFrag<az_extloadi8_glue>, LocalAddress;
def az_extloadi16_local_m0 : LoadFrag<az_extloadi16_glue>, LocalAddress;
def load_align8_local_m0 : LoadFrag <load_glue_align8>, LocalAddress;
+def load_align16_local_m0 : LoadFrag <load_glue_align16>, LocalAddress;
+def atomic_load_32_local_m0 : LoadFrag<atomic_load_32_glue>, LocalAddress;
+def atomic_load_64_local_m0 : LoadFrag<atomic_load_64_glue>, LocalAddress;
def AMDGPUst_glue : SDNode <"ISD::STORE", SDTStore,
[SDNPHasChain, SDNPMayStore, SDNPMemOperand, SDNPInGlue]
>;
+def AMDGPUatomic_st_glue : SDNode <"ISD::ATOMIC_STORE", SDTAtomicStore,
+ [SDNPHasChain, SDNPMayStore, SDNPMemOperand, SDNPInGlue]
+>;
+
+def atomic_store_glue : PatFrag<(ops node:$ptr, node:$val),
+ (AMDGPUatomic_st_glue node:$ptr, node:$val)> {
+}
+
def unindexedstore_glue : PatFrag<(ops node:$val, node:$ptr),
(AMDGPUst_glue node:$val, node:$ptr), [{
return cast<StoreSDNode>(N)->getAddressingMode() == ISD::UNINDEXED;
@@ -262,11 +352,17 @@ def store_glue_align8 : Aligned8Bytes <
(ops node:$value, node:$ptr), (store_glue node:$value, node:$ptr)
>;
+def store_glue_align16 : Aligned16Bytes <
+ (ops node:$value, node:$ptr), (store_glue node:$value, node:$ptr)
+>;
+
def store_local_m0 : StoreFrag<store_glue>, LocalAddress;
def truncstorei8_local_m0 : StoreFrag<truncstorei8_glue>, LocalAddress;
def truncstorei16_local_m0 : StoreFrag<truncstorei16_glue>, LocalAddress;
+def atomic_store_local_m0 : StoreFrag<AMDGPUatomic_st_glue>, LocalAddress;
def store_align8_local_m0 : StoreFrag<store_glue_align8>, LocalAddress;
+def store_align16_local_m0 : StoreFrag<store_glue_align16>, LocalAddress;
def si_setcc_uniform : PatFrag <
(ops node:$lhs, node:$rhs, node:$cond),
@@ -297,10 +393,11 @@ def lshl_rev : PatFrag <
(shl $src0, $src1)
>;
-multiclass SIAtomicM0Glue2 <string op_name, bit is_amdgpu = 0> {
+multiclass SIAtomicM0Glue2 <string op_name, bit is_amdgpu = 0,
+ SDTypeProfile tc = SDTAtomic2> {
def _glue : SDNode <
- !if(is_amdgpu, "AMDGPUISD", "ISD")#"::ATOMIC_"#op_name, SDTAtomic2,
+ !if(is_amdgpu, "AMDGPUISD", "ISD")#"::ATOMIC_"#op_name, tc,
[SDNPHasChain, SDNPMayStore, SDNPMayLoad, SDNPMemOperand, SDNPInGlue]
>;
@@ -319,6 +416,9 @@ defm atomic_load_xor : SIAtomicM0Glue2 <"LOAD_XOR">;
defm atomic_load_umin : SIAtomicM0Glue2 <"LOAD_UMIN">;
defm atomic_load_umax : SIAtomicM0Glue2 <"LOAD_UMAX">;
defm atomic_swap : SIAtomicM0Glue2 <"SWAP">;
+defm atomic_load_fadd : SIAtomicM0Glue2 <"LOAD_FADD", 1, SDTAtomic2_f32>;
+defm atomic_load_fmin : SIAtomicM0Glue2 <"LOAD_FMIN", 1, SDTAtomic2_f32>;
+defm atomic_load_fmax : SIAtomicM0Glue2 <"LOAD_FMAX", 1, SDTAtomic2_f32>;
def atomic_cmp_swap_glue : SDNode <"ISD::ATOMIC_CMP_SWAP", SDTAtomic3,
[SDNPHasChain, SDNPMayStore, SDNPMayLoad, SDNPMemOperand, SDNPInGlue]
@@ -368,6 +468,12 @@ return CurDAG->getTargetConstant(
N->getValueAPF().bitcastToAPInt().getZExtValue(), SDLoc(N), MVT::i64);
}]>;
+class bitextract_imm<int bitnum> : SDNodeXForm<imm, [{
+ uint64_t Imm = N->getZExtValue();
+ unsigned Bit = (Imm >> }] # bitnum # [{ ) & 1;
+ return CurDAG->getTargetConstant(Bit, SDLoc(N), MVT::i1);
+}]>;
+
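
Purely illustrative (not part of the patch): the bitextract_imm<bitnum> SDNodeXForm above turns an immediate into the single bit at position bitnum, emitted as an i1 target constant. The same arithmetic in plain C++, under that reading:

#include <cassert>
#include <cstdint>

static unsigned bitExtract(uint64_t Imm, unsigned BitNum) {
  return (Imm >> BitNum) & 1;  // the selected bit becomes the i1 constant
}

int main() {
  assert(bitExtract(0b1010, 1) == 1);
  assert(bitExtract(0b1010, 2) == 0);
  assert(bitExtract(0b1010, 3) == 1);
}
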
def SIMM16bit : PatLeaf <(imm),
[{return isInt<16>(N->getSExtValue());}]
>;
@@ -381,7 +487,7 @@ class InlineFPImm <ValueType vt> : PatLeaf <(vt fpimm), [{
}]>;
class VGPRImm <dag frag> : PatLeaf<frag, [{
- if (Subtarget->getGeneration() < SISubtarget::SOUTHERN_ISLANDS) {
+ if (Subtarget->getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS) {
return false;
}
const SIRegisterInfo *SIRI =
@@ -552,19 +658,18 @@ def ExpSrc3 : RegisterOperand<VGPR_32> {
let ParserMatchClass = VReg32OrOffClass;
}
-class SDWASrc : RegisterOperand<VS_32> {
+class SDWASrc<ValueType vt> : RegisterOperand<VS_32> {
let OperandNamespace = "AMDGPU";
- let OperandType = "OPERAND_SDWA_SRC";
+ string Type = !if(isFloatType<vt>.ret, "FP", "INT");
+ let OperandType = "OPERAND_REG_INLINE_C_"#Type#vt.Size;
+ let DecoderMethod = "decodeSDWASrc"#vt.Size;
let EncoderMethod = "getSDWASrcEncoding";
}
-def SDWASrc32 : SDWASrc {
- let DecoderMethod = "decodeSDWASrc32";
-}
-
-def SDWASrc16 : SDWASrc {
- let DecoderMethod = "decodeSDWASrc16";
-}
+def SDWASrc_i32 : SDWASrc<i32>;
+def SDWASrc_i16 : SDWASrc<i16>;
+def SDWASrc_f32 : SDWASrc<f32>;
+def SDWASrc_f16 : SDWASrc<f16>;
def SDWAVopcDst : VOPDstOperand<SReg_64> {
let OperandNamespace = "AMDGPU";
@@ -637,19 +742,20 @@ def clampmod : NamedOperandBit<"ClampSI", NamedMatchClass<"ClampSI">>;
def highmod : NamedOperandBit<"High", NamedMatchClass<"High">>;
def GLC : NamedOperandBit<"GLC", NamedMatchClass<"GLC">>;
-def slc : NamedOperandBit<"SLC", NamedMatchClass<"SLC">>;
-def tfe : NamedOperandBit<"TFE", NamedMatchClass<"TFE">>;
-def unorm : NamedOperandBit<"UNorm", NamedMatchClass<"UNorm">>;
-def da : NamedOperandBit<"DA", NamedMatchClass<"DA">>;
-def r128 : NamedOperandBit<"R128", NamedMatchClass<"R128">>;
-def lwe : NamedOperandBit<"LWE", NamedMatchClass<"LWE">>;
+def SLC : NamedOperandBit<"SLC", NamedMatchClass<"SLC">>;
+def TFE : NamedOperandBit<"TFE", NamedMatchClass<"TFE">>;
+def UNorm : NamedOperandBit<"UNorm", NamedMatchClass<"UNorm">>;
+def DA : NamedOperandBit<"DA", NamedMatchClass<"DA">>;
+def R128 : NamedOperandBit<"R128", NamedMatchClass<"R128">>;
+def D16 : NamedOperandBit<"D16", NamedMatchClass<"D16">>;
+def LWE : NamedOperandBit<"LWE", NamedMatchClass<"LWE">>;
def exp_compr : NamedOperandBit<"ExpCompr", NamedMatchClass<"ExpCompr">>;
def exp_vm : NamedOperandBit<"ExpVM", NamedMatchClass<"ExpVM">>;
def DFMT : NamedOperandU8<"DFMT", NamedMatchClass<"DFMT">>;
def NFMT : NamedOperandU8<"NFMT", NamedMatchClass<"NFMT">>;
-def dmask : NamedOperandU16<"DMask", NamedMatchClass<"DMask">>;
+def DMask : NamedOperandU16<"DMask", NamedMatchClass<"DMask">>;
def dpp_ctrl : NamedOperandU32<"DPPCtrl", NamedMatchClass<"DPPCtrl", 0>>;
def row_mask : NamedOperandU32<"RowMask", NamedMatchClass<"RowMask">>;
@@ -747,16 +853,23 @@ class OpSelModsMatchClass : AsmOperandClass {
def IntOpSelModsMatchClass : OpSelModsMatchClass;
def IntOpSelMods : InputMods<IntOpSelModsMatchClass>;
-def FPRegSDWAInputModsMatchClass : AsmOperandClass {
- let Name = "SDWARegWithFPInputMods";
- let ParserMethod = "parseRegWithFPInputMods";
- let PredicateMethod = "isSDWARegKind";
+class FPSDWAInputModsMatchClass <int opSize> : AsmOperandClass {
+ let Name = "SDWAWithFP"#opSize#"InputMods";
+ let ParserMethod = "parseRegOrImmWithFPInputMods";
+ let PredicateMethod = "isSDWAFP"#opSize#"Operand";
}
-def FPRegSDWAInputMods : InputMods <FPRegSDWAInputModsMatchClass> {
+def FP16SDWAInputModsMatchClass : FPSDWAInputModsMatchClass<16>;
+def FP32SDWAInputModsMatchClass : FPSDWAInputModsMatchClass<32>;
+
+class FPSDWAInputMods <FPSDWAInputModsMatchClass matchClass> :
+ InputMods <matchClass> {
let PrintMethod = "printOperandAndFPInputMods";
}
+def FP16SDWAInputMods : FPSDWAInputMods<FP16SDWAInputModsMatchClass>;
+def FP32SDWAInputMods : FPSDWAInputMods<FP32SDWAInputModsMatchClass>;
+
def FPVRegInputModsMatchClass : AsmOperandClass {
let Name = "VRegWithFPInputMods";
let ParserMethod = "parseRegWithFPInputMods";
@@ -767,17 +880,23 @@ def FPVRegInputMods : InputMods <FPVRegInputModsMatchClass> {
let PrintMethod = "printOperandAndFPInputMods";
}
-
-def IntRegSDWAInputModsMatchClass : AsmOperandClass {
- let Name = "SDWARegWithIntInputMods";
- let ParserMethod = "parseRegWithIntInputMods";
- let PredicateMethod = "isSDWARegKind";
+class IntSDWAInputModsMatchClass <int opSize> : AsmOperandClass {
+ let Name = "SDWAWithInt"#opSize#"InputMods";
+ let ParserMethod = "parseRegOrImmWithIntInputMods";
+ let PredicateMethod = "isSDWAInt"#opSize#"Operand";
}
-def IntRegSDWAInputMods : InputMods <IntRegSDWAInputModsMatchClass> {
+def Int16SDWAInputModsMatchClass : IntSDWAInputModsMatchClass<16>;
+def Int32SDWAInputModsMatchClass : IntSDWAInputModsMatchClass<32>;
+
+class IntSDWAInputMods <IntSDWAInputModsMatchClass matchClass> :
+ InputMods <matchClass> {
let PrintMethod = "printOperandAndIntInputMods";
}
+def Int16SDWAInputMods : IntSDWAInputMods<Int16SDWAInputModsMatchClass>;
+def Int32SDWAInputMods : IntSDWAInputMods<Int32SDWAInputModsMatchClass>;
+
def IntVRegInputModsMatchClass : AsmOperandClass {
let Name = "VRegWithIntInputMods";
let ParserMethod = "parseRegWithIntInputMods";
@@ -1023,7 +1142,12 @@ class getVregSrcForVT<ValueType VT> {
}
class getSDWASrcForVT <ValueType VT> {
- RegisterOperand ret = !if(!eq(VT.Size, 16), SDWASrc16, SDWASrc32);
+ bit isFP = !if(!eq(VT.Value, f16.Value), 1,
+ !if(!eq(VT.Value, f32.Value), 1,
+ 0));
+ RegisterOperand retFlt = !if(!eq(VT.Size, 16), SDWASrc_f16, SDWASrc_f32);
+ RegisterOperand retInt = !if(!eq(VT.Size, 16), SDWASrc_i16, SDWASrc_i32);
+ RegisterOperand ret = !if(isFP, retFlt, retInt);
}
// Returns the register class to use for sources of VOP3 instructions for the
@@ -1064,32 +1188,6 @@ class getVOP3SrcForVT<ValueType VT> {
);
}
-// Returns 1 if the source arguments have modifiers, 0 if they do not.
-// XXX - do f16 instructions?
-class isFloatType<ValueType SrcVT> {
- bit ret =
- !if(!eq(SrcVT.Value, f16.Value), 1,
- !if(!eq(SrcVT.Value, f32.Value), 1,
- !if(!eq(SrcVT.Value, f64.Value), 1,
- !if(!eq(SrcVT.Value, v2f16.Value), 1,
- 0))));
-}
-
-class isIntType<ValueType SrcVT> {
- bit ret =
- !if(!eq(SrcVT.Value, i16.Value), 1,
- !if(!eq(SrcVT.Value, i32.Value), 1,
- !if(!eq(SrcVT.Value, i64.Value), 1,
- 0)));
-}
-
-class isPackedType<ValueType SrcVT> {
- bit ret =
- !if(!eq(SrcVT.Value, v2i16.Value), 1,
- !if(!eq(SrcVT.Value, v2f16.Value), 1, 0)
- );
-}
-
// Float or packed int
class isModifierType<ValueType SrcVT> {
bit ret =
@@ -1134,11 +1232,10 @@ class getSrcModExt <ValueType VT> {
// Return type of input modifiers operand specified input operand for SDWA
class getSrcModSDWA <ValueType VT> {
- bit isFP = !if(!eq(VT.Value, f16.Value), 1,
- !if(!eq(VT.Value, f32.Value), 1,
- !if(!eq(VT.Value, f64.Value), 1,
- 0)));
- Operand ret = !if(isFP, FPRegSDWAInputMods, IntRegSDWAInputMods);
+ Operand ret = !if(!eq(VT.Value, f16.Value), FP16SDWAInputMods,
+ !if(!eq(VT.Value, f32.Value), FP32SDWAInputMods,
+ !if(!eq(VT.Value, i16.Value), Int16SDWAInputMods,
+ Int32SDWAInputMods)));
}
// Returns the input arguments for VOP[12C] instructions for the given SrcVT.
@@ -1733,6 +1830,9 @@ def VOP_I32_F32_I32_I32 : VOPProfile <[i32, f32, i32, i32]>;
def VOP_I64_I64_I32_I64 : VOPProfile <[i64, i64, i32, i64]>;
def VOP_V4I32_I64_I32_V4I32 : VOPProfile <[v4i32, i64, i32, v4i32]>;
+def VOP_F32_V2F16_V2F16_F32 : VOPProfile <[f32, v2f16, v2f16, f32]>;
+def VOP_I32_V2I16_V2I16_I32 : VOPProfile <[i32, v2i16, v2i16, i32]>;
+
class Commutable_REV <string revOp, bit isOrig> {
string RevOp = revOp;
bit IsOrig = isOrig;
@@ -1747,6 +1847,8 @@ class AtomicNoRet <string noRetOp, bit isRet> {
// Interpolation opcodes
//===----------------------------------------------------------------------===//
+class VINTRPDstOperand <RegisterClass rc> : RegisterOperand <rc, "printVINTRPDst">;
+
class VINTRP_Pseudo <string opName, dag outs, dag ins, list<dag> pattern> :
VINTRPCommon <outs, ins, "", pattern>,
SIMCInstr<opName, SIEncodingFamily.NONE> {
@@ -1823,38 +1925,6 @@ def getBasicFromSDWAOp : InstrMapping {
let ValueCols = [["Default"]];
}
-def getMaskedMIMGOp1 : InstrMapping {
- let FilterClass = "MIMG_Mask";
- let RowFields = ["Op"];
- let ColFields = ["Channels"];
- let KeyCol = ["1"];
- let ValueCols = [["2"], ["3"], ["4"] ];
-}
-
-def getMaskedMIMGOp2 : InstrMapping {
- let FilterClass = "MIMG_Mask";
- let RowFields = ["Op"];
- let ColFields = ["Channels"];
- let KeyCol = ["2"];
- let ValueCols = [["1"], ["3"], ["4"] ];
-}
-
-def getMaskedMIMGOp3 : InstrMapping {
- let FilterClass = "MIMG_Mask";
- let RowFields = ["Op"];
- let ColFields = ["Channels"];
- let KeyCol = ["3"];
- let ValueCols = [["1"], ["2"], ["4"] ];
-}
-
-def getMaskedMIMGOp4 : InstrMapping {
- let FilterClass = "MIMG_Mask";
- let RowFields = ["Op"];
- let ColFields = ["Channels"];
- let KeyCol = ["4"];
- let ValueCols = [["1"], ["2"], ["3"] ];
-}
-
// Maps an commuted opcode to its original version
def getCommuteOrig : InstrMapping {
let FilterClass = "Commutable_REV";
@@ -1882,6 +1952,11 @@ def getMCOpcodeGen : InstrMapping {
[!cast<string>(SIEncodingFamily.VI)],
[!cast<string>(SIEncodingFamily.SDWA)],
[!cast<string>(SIEncodingFamily.SDWA9)],
+ // GFX80 encoding is added to work around a multiple matching
+ // issue for buffer instructions with unpacked d16 data. This
+ // does not actually change the encoding, and thus may be
+ // removed later.
+ [!cast<string>(SIEncodingFamily.GFX80)],
[!cast<string>(SIEncodingFamily.GFX9)]];
}
@@ -1902,6 +1977,14 @@ def getAddr64Inst : InstrMapping {
let ValueCols = [["1"]];
}
+def getMUBUFNoLdsInst : InstrMapping {
+ let FilterClass = "MUBUFLdsTable";
+ let RowFields = ["OpName"];
+ let ColFields = ["IsLds"];
+ let KeyCol = ["1"];
+ let ValueCols = [["0"]];
+}
+
// Maps an atomic opcode to its version with a return value.
def getAtomicRetOp : InstrMapping {
let FilterClass = "AtomicNoRet";
diff --git a/lib/Target/AMDGPU/SIInstructions.td b/lib/Target/AMDGPU/SIInstructions.td
index 9740a18b7248..c3f8bfb53ef4 100644
--- a/lib/Target/AMDGPU/SIInstructions.td
+++ b/lib/Target/AMDGPU/SIInstructions.td
@@ -11,18 +11,10 @@
// that are not yet supported remain commented out.
//===----------------------------------------------------------------------===//
-def has16BankLDS : Predicate<"Subtarget->getLDSBankCount() == 16">;
-def has32BankLDS : Predicate<"Subtarget->getLDSBankCount() == 32">;
-def HasVGPRIndexMode : Predicate<"Subtarget->hasVGPRIndexMode()">,
- AssemblerPredicate<"FeatureVGPRIndexMode">;
-def HasMovrel : Predicate<"Subtarget->hasMovrel()">,
- AssemblerPredicate<"FeatureMovrel">;
-
-class GCNPat<dag pattern, dag result> : AMDGPUPat<pattern, result> {
+class GCNPat<dag pattern, dag result> : Pat<pattern, result>, GCNPredicateControl {
let SubtargetPredicate = isGCN;
}
-
include "VOPInstructions.td"
include "SOPInstructions.td"
include "SMInstructions.td"
@@ -40,15 +32,18 @@ defm EXP_DONE : EXP_m<1, AMDGPUexport_done>;
// VINTRP Instructions
//===----------------------------------------------------------------------===//
+// Used to inject printing of "_e32" suffix for VI (there are "_e64" variants for VI)
+def VINTRPDst : VINTRPDstOperand <VGPR_32>;
+
let Uses = [M0, EXEC] in {
// FIXME: Specify SchedRW for VINTRP instructions.
multiclass V_INTERP_P1_F32_m : VINTRP_m <
0x00000000,
- (outs VGPR_32:$vdst),
+ (outs VINTRPDst:$vdst),
(ins VGPR_32:$vsrc, Attr:$attr, AttrChan:$attrchan),
- "v_interp_p1_f32 $vdst, $vsrc, $attr$attrchan",
+ "v_interp_p1_f32$vdst, $vsrc, $attr$attrchan",
[(set f32:$vdst, (AMDGPUinterp_p1 f32:$vsrc, (i32 imm:$attrchan),
(i32 imm:$attr)))]
>;
@@ -69,9 +64,9 @@ let DisableEncoding = "$src0", Constraints = "$src0 = $vdst" in {
defm V_INTERP_P2_F32 : VINTRP_m <
0x00000001,
- (outs VGPR_32:$vdst),
+ (outs VINTRPDst:$vdst),
(ins VGPR_32:$src0, VGPR_32:$vsrc, Attr:$attr, AttrChan:$attrchan),
- "v_interp_p2_f32 $vdst, $vsrc, $attr$attrchan",
+ "v_interp_p2_f32$vdst, $vsrc, $attr$attrchan",
[(set f32:$vdst, (AMDGPUinterp_p2 f32:$src0, f32:$vsrc, (i32 imm:$attrchan),
(i32 imm:$attr)))]>;
@@ -79,9 +74,9 @@ defm V_INTERP_P2_F32 : VINTRP_m <
defm V_INTERP_MOV_F32 : VINTRP_m <
0x00000002,
- (outs VGPR_32:$vdst),
+ (outs VINTRPDst:$vdst),
(ins InterpSlot:$vsrc, Attr:$attr, AttrChan:$attrchan),
- "v_interp_mov_f32 $vdst, $vsrc, $attr$attrchan",
+ "v_interp_mov_f32$vdst, $vsrc, $attr$attrchan",
[(set f32:$vdst, (AMDGPUinterp_mov (i32 imm:$vsrc), (i32 imm:$attrchan),
(i32 imm:$attr)))]>;
@@ -186,6 +181,7 @@ def S_XOR_B64_term : PseudoInstSI<(outs SReg_64:$dst),
let SALU = 1;
let isAsCheapAsAMove = 1;
let isTerminator = 1;
+ let Defs = [SCC];
}
def S_ANDN2_B64_term : PseudoInstSI<(outs SReg_64:$dst),
@@ -246,7 +242,6 @@ def SI_IF: CFPseudoInstSI <
def SI_ELSE : CFPseudoInstSI <
(outs SReg_64:$dst),
(ins SReg_64:$src, brtarget:$target, i1imm:$execfix), [], 1, 1> {
- let Constraints = "$src = $dst";
let Size = 12;
let hasSideEffects = 1;
}
@@ -296,14 +291,21 @@ def SI_ELSE_BREAK : CFPseudoInstSI <
let isReMaterializable = 1;
}
-let Uses = [EXEC], Defs = [EXEC,VCC] in {
+let Uses = [EXEC] in {
multiclass PseudoInstKill <dag ins> {
+ // Even though this pseudo can usually be expanded without an SCC def, we
+ // conservatively assume that it has an SCC def, both because it is sometimes
+ // required in degenerate cases (when V_CMPX cannot be used due to constant
+ // bus limitations) and because it allows us to avoid having to track SCC
+ // liveness across basic blocks.
+ let Defs = [EXEC,VCC,SCC] in
def _PSEUDO : PseudoInstSI <(outs), ins> {
let isConvergent = 1;
let usesCustomInserter = 1;
}
+ let Defs = [EXEC,VCC,SCC] in
def _TERMINATOR : SPseudoInstSI <(outs), ins> {
let isTerminator = 1;
}
@@ -312,6 +314,7 @@ multiclass PseudoInstKill <dag ins> {
defm SI_KILL_I1 : PseudoInstKill <(ins SSrc_b64:$src, i1imm:$killvalue)>;
defm SI_KILL_F32_COND_IMM : PseudoInstKill <(ins VSrc_b32:$src0, i32imm:$src1, i32imm:$cond)>;
+let Defs = [EXEC,VCC] in
def SI_ILLEGAL_COPY : SPseudoInstSI <
(outs unknown:$dst), (ins unknown:$src),
[], " ; illegal copy $src to $dst">;
@@ -371,6 +374,7 @@ def SI_RETURN_TO_EPILOG : SPseudoInstSI <
let isReturn = 1;
let hasNoSchedulingInfo = 1;
let DisableWQM = 1;
+ let FixedSize = 1;
}
// Return for returning function calls.
@@ -449,7 +453,7 @@ def ADJCALLSTACKDOWN : SPseudoInstSI<
let usesCustomInserter = 1;
}
-let Defs = [M0, EXEC],
+let Defs = [M0, EXEC, SCC],
UseNamedOperandTable = 1 in {
class SI_INDIRECT_SRC<RegisterClass rc> : VPseudoInstSI <
@@ -569,11 +573,6 @@ def : GCNPat<
(SI_ELSE $src, $target, 0)
>;
-def : GCNPat <
- (int_AMDGPU_kilp),
- (SI_KILL_I1_PSEUDO (i1 0), 0)
->;
-
def : Pat <
// -1.0 as i32 (LowerINTRINSIC_VOID converts all other constants to -1.0)
(AMDGPUkill (i32 -1082130432)),
@@ -643,6 +642,11 @@ def : GCNPat <
>;
def : GCNPat <
+ (f32 (f16_to_fp (i32 (srl_oneuse (and_oneuse i32:$src0, 0x7fff0000), (i32 16))))),
+ (V_CVT_F32_F16_e64 SRCMODS.ABS, (i32 (V_LSHRREV_B32_e64 (i32 16), i32:$src0)), DSTCLAMP.NONE, DSTOMOD.NONE)
+>;
+
+def : GCNPat <
(f32 (f16_to_fp (or_oneuse i32:$src0, 0x8000))),
(V_CVT_F32_F16_e64 SRCMODS.NEG_ABS, $src0, DSTCLAMP.NONE, DSTOMOD.NONE)
>;
@@ -700,15 +704,19 @@ multiclass FMADPat <ValueType vt, Instruction inst> {
defm : FMADPat <f16, V_MAC_F16_e64>;
defm : FMADPat <f32, V_MAC_F32_e64>;
-class FMADModsPat<Instruction inst, SDPatternOperator mad_opr> : GCNPat<
- (f32 (mad_opr (VOP3Mods f32:$src0, i32:$src0_mod),
- (VOP3Mods f32:$src1, i32:$src1_mod),
- (VOP3Mods f32:$src2, i32:$src2_mod))),
+class FMADModsPat<Instruction inst, SDPatternOperator mad_opr, ValueType Ty>
+ : GCNPat<
+ (Ty (mad_opr (VOP3Mods Ty:$src0, i32:$src0_mod),
+ (VOP3Mods Ty:$src1, i32:$src1_mod),
+ (VOP3Mods Ty:$src2, i32:$src2_mod))),
(inst $src0_mod, $src0, $src1_mod, $src1,
$src2_mod, $src2, DSTCLAMP.NONE, DSTOMOD.NONE)
>;
-def : FMADModsPat<V_MAD_F32, AMDGPUfmad_ftz>;
+def : FMADModsPat<V_MAD_F32, AMDGPUfmad_ftz, f32>;
+def : FMADModsPat<V_MAD_F16, AMDGPUfmad_ftz, f16> {
+ let SubtargetPredicate = Has16BitInsts;
+}
multiclass SelectPat <ValueType vt, Instruction inst> {
def : GCNPat <
@@ -726,6 +734,10 @@ def : GCNPat <
(i32 (add (i32 (ctpop i32:$popcnt)), i32:$val)),
(V_BCNT_U32_B32_e64 $popcnt, $val)
>;
+def : GCNPat <
+ (i16 (add (i16 (trunc (ctpop i32:$popcnt))), i16:$val)),
+ (V_BCNT_U32_B32_e64 $popcnt, $val)
+>;
/********** ============================================ **********/
/********** Extraction, Insertion, Building and Casting **********/
@@ -795,6 +807,27 @@ foreach Index = 0-15 in {
>;
}
+
+def : Pat <
+ (extract_subvector v4i16:$vec, (i32 0)),
+ (v2i16 (EXTRACT_SUBREG v4i16:$vec, sub0))
+>;
+
+def : Pat <
+ (extract_subvector v4i16:$vec, (i32 2)),
+ (v2i16 (EXTRACT_SUBREG v4i16:$vec, sub1))
+>;
+
+def : Pat <
+ (extract_subvector v4f16:$vec, (i32 0)),
+ (v2f16 (EXTRACT_SUBREG v4f16:$vec, sub0))
+>;
+
+def : Pat <
+ (extract_subvector v4f16:$vec, (i32 2)),
+ (v2f16 (EXTRACT_SUBREG v4f16:$vec, sub1))
+>;
+
let SubtargetPredicate = isGCN in {
// FIXME: Why do only some of these type combinations for SReg and
@@ -834,6 +867,26 @@ def : BitConvert <f64, v2f32, VReg_64>;
def : BitConvert <v2f32, f64, VReg_64>;
def : BitConvert <f64, v2i32, VReg_64>;
def : BitConvert <v2i32, f64, VReg_64>;
+
+// FIXME: Make SGPR
+def : BitConvert <v2i32, v4f16, VReg_64>;
+def : BitConvert <v4f16, v2i32, VReg_64>;
+def : BitConvert <v2i32, v4f16, VReg_64>;
+def : BitConvert <v2i32, v4i16, VReg_64>;
+def : BitConvert <v4i16, v2i32, VReg_64>;
+def : BitConvert <v2f32, v4f16, VReg_64>;
+def : BitConvert <v4f16, v2f32, VReg_64>;
+def : BitConvert <v2f32, v4i16, VReg_64>;
+def : BitConvert <v4i16, v2f32, VReg_64>;
+def : BitConvert <v4i16, f64, VReg_64>;
+def : BitConvert <v4f16, f64, VReg_64>;
+def : BitConvert <f64, v4i16, VReg_64>;
+def : BitConvert <f64, v4f16, VReg_64>;
+def : BitConvert <v4i16, i64, VReg_64>;
+def : BitConvert <v4f16, i64, VReg_64>;
+def : BitConvert <i64, v4i16, VReg_64>;
+def : BitConvert <i64, v4f16, VReg_64>;
+
def : BitConvert <v4i32, v4f32, VReg_128>;
def : BitConvert <v4f32, v4i32, VReg_128>;
@@ -876,11 +929,13 @@ def : ClampPat<V_MAX_F32_e64, f32>;
def : ClampPat<V_MAX_F64, f64>;
def : ClampPat<V_MAX_F16_e64, f16>;
+let SubtargetPredicate = HasVOP3PInsts in {
def : GCNPat <
(v2f16 (AMDGPUclamp (VOP3PMods v2f16:$src0, i32:$src0_modifiers))),
(V_PK_MAX_F16 $src0_modifiers, $src0,
$src0_modifiers, $src0, DSTCLAMP.ENABLE)
>;
+}
/********** ================================ **********/
/********** Floating point absolute/negative **********/
@@ -906,7 +961,7 @@ def : GCNPat <
def : GCNPat <
(fabs f32:$src),
- (V_AND_B32_e64 $src, (V_MOV_B32_e32 (i32 0x7fffffff)))
+ (S_AND_B32 $src, (S_MOV_B32 (i32 0x7fffffff)))
>;
def : GCNPat <
@@ -967,12 +1022,12 @@ def : GCNPat <
def : GCNPat <
(fneg f16:$src),
- (V_XOR_B32_e32 $src, (V_MOV_B32_e32 (i32 0x00008000)))
+ (S_XOR_B32 $src, (S_MOV_B32 (i32 0x00008000)))
>;
def : GCNPat <
(fabs f16:$src),
- (V_AND_B32_e64 $src, (V_MOV_B32_e32 (i32 0x00007fff)))
+ (S_AND_B32 $src, (S_MOV_B32 (i32 0x00007fff)))
>;
def : GCNPat <
@@ -982,12 +1037,12 @@ def : GCNPat <
def : GCNPat <
(fneg v2f16:$src),
- (V_XOR_B32_e64 (S_MOV_B32 (i32 0x80008000)), $src)
+ (S_XOR_B32 $src, (S_MOV_B32 (i32 0x80008000)))
>;
def : GCNPat <
(fabs v2f16:$src),
- (V_AND_B32_e64 (S_MOV_B32 (i32 0x7fff7fff)), $src)
+ (S_AND_B32 $src, (S_MOV_B32 (i32 0x7fff7fff)))
>;
// This is really (fneg (fabs v2f16:$src))
@@ -996,7 +1051,12 @@ def : GCNPat <
// VOP3P instructions, so it is turned into the bit op.
def : GCNPat <
(fneg (v2f16 (bitconvert (and_oneuse i32:$src, 0x7fff7fff)))),
- (S_OR_B32 (S_MOV_B32 (i32 0x80008000)), $src) // Set sign bit
+ (S_OR_B32 $src, (S_MOV_B32 (i32 0x80008000))) // Set sign bit
+>;
+
+def : GCNPat <
+ (fneg (v2f16 (fabs v2f16:$src))),
+ (S_OR_B32 $src, (S_MOV_B32 (i32 0x80008000))) // Set sign bit
>;
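The patterns above and in the preceding hunk lower fneg/fabs on f16 and packed v2f16 to scalar bit operations; the immediates simply clear, toggle, or set the IEEE sign bit(s) held in a 32-bit register. A self-contained C++ illustration of that bit-level effect (for reference only, not target code):

  #include <cstdint>
  #include <cstdio>
  #include <cstring>

  static uint32_t bitsOf(float F) {
    uint32_t B;
    std::memcpy(&B, &F, sizeof(B));
    return B;
  }

  int main() {
    uint32_t F32 = bitsOf(-1.5f);                            // 0xbfc00000
    std::printf("fabs f32:   0x%08x\n", F32 & 0x7fffffffu);  // S_AND_B32 mask
    uint32_t F16 = 0x0000bc00u;                 // -1.0 as f16 in a 32-bit reg
    std::printf("fneg f16:   0x%08x\n", F16 ^ 0x00008000u);  // S_XOR_B32 mask
    std::printf("fabs f16:   0x%08x\n", F16 & 0x00007fffu);  // S_AND_B32 mask
    uint32_t V2F16 = 0xbc00bc00u;               // <-1.0, -1.0> as packed v2f16
    std::printf("fneg v2f16: 0x%08x\n", V2F16 ^ 0x80008000u);
    std::printf("fabs v2f16: 0x%08x\n", V2F16 & 0x7fff7fffu);
    std::printf("fneg(fabs): 0x%08x\n", V2F16 | 0x80008000u); // S_OR_B32 mask
    return 0;
  }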
/********** ================== **********/
@@ -1097,6 +1157,7 @@ let SubtargetPredicate = isGCN in {
def : IMad24Pat<V_MAD_I32_I24, 1>;
def : UMad24Pat<V_MAD_U32_U24, 1>;
+// FIXME: This should only be done for VALU inputs
defm : BFIPatterns <V_BFI_B32, S_MOV_B32, SReg_64>;
def : ROTRPattern <V_ALIGNBIT_B32>;
@@ -1337,11 +1398,13 @@ def : GCNPat<
(V_MAX_F16_e64 $src_mods, $src, $src_mods, $src, 0, 0)
>;
+let SubtargetPredicate = HasVOP3PInsts in {
def : GCNPat<
(fcanonicalize (v2f16 (VOP3PMods v2f16:$src, i32:$src_mods))),
(V_PK_MAX_F16 $src_mods, $src, $src_mods, $src, DSTCLAMP.NONE)
>;
}
+}
let OtherPredicates = [NoFP32Denormals] in {
def : GCNPat<
@@ -1371,6 +1434,16 @@ def : GCNPat<
>;
}
+let OtherPredicates = [HasDLInsts] in {
+def : GCNPat <
+ (fma (f32 (VOP3Mods0 f32:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod)),
+ (f32 (VOP3Mods f32:$src1, i32:$src1_modifiers)),
+ (f32 (VOP3NoMods f32:$src2))),
+ (V_FMAC_F32_e64 $src0_modifiers, $src0, $src1_modifiers, $src1,
+ SRCMODS.NONE, $src2, $clamp, $omod)
+>;
+} // End OtherPredicates = [HasDLInsts]
+
// Allow integer inputs
class ExpPattern<SDPatternOperator node, ValueType vt, Instruction Inst> : GCNPat<
@@ -1381,11 +1454,6 @@ class ExpPattern<SDPatternOperator node, ValueType vt, Instruction Inst> : GCNPa
def : ExpPattern<AMDGPUexport, i32, EXP>;
def : ExpPattern<AMDGPUexport_done, i32, EXP_DONE>;
-def : GCNPat <
- (v2i16 (build_vector i16:$src0, i16:$src1)),
- (v2i16 (S_PACK_LL_B32_B16 $src0, $src1))
->;
-
// COPY_TO_REGCLASS is a workaround for a tablegen bug caused by
// S_LSHL_B32's multiple outputs (the implicit scc def).
def : GCNPat <
@@ -1393,6 +1461,13 @@ def : GCNPat <
(v2i16 (COPY_TO_REGCLASS (S_LSHL_B32 i16:$src1, (i16 16)), SReg_32_XM0))
>;
+
+let SubtargetPredicate = HasVOP3PInsts in {
+def : GCNPat <
+ (v2i16 (build_vector i16:$src0, i16:$src1)),
+ (v2i16 (S_PACK_LL_B32_B16 $src0, $src1))
+>;
+
// With multiple uses of the shift, this will duplicate the shift and
// increase register pressure.
def : GCNPat <
@@ -1400,6 +1475,7 @@ def : GCNPat <
(v2i16 (S_PACK_LH_B32_B16 i16:$src0, i32:$src1))
>;
+
def : GCNPat <
(v2i16 (build_vector (i16 (trunc (srl_oneuse i32:$src0, (i32 16)))),
(i16 (trunc (srl_oneuse i32:$src1, (i32 16)))))),
@@ -1412,6 +1488,9 @@ def : GCNPat <
(v2f16 (S_PACK_LL_B32_B16 $src0, $src1))
>;
+} // End SubtargetPredicate = HasVOP3PInsts
+
+
// def : GCNPat <
// (v2f16 (scalar_to_vector f16:$src0)),
// (COPY $src0)
@@ -1422,6 +1501,16 @@ def : GCNPat <
// (COPY $src0)
// >;
+def : GCNPat <
+ (v4i16 (scalar_to_vector i16:$src0)),
+ (INSERT_SUBREG (IMPLICIT_DEF), $src0, sub0)
+>;
+
+def : GCNPat <
+ (v4f16 (scalar_to_vector f16:$src0)),
+ (INSERT_SUBREG (IMPLICIT_DEF), $src0, sub0)
+>;
+
//===----------------------------------------------------------------------===//
// Fract Patterns
//===----------------------------------------------------------------------===//
@@ -1486,7 +1575,7 @@ defm : BFMPatterns <i32, S_BFM_B32, S_MOV_B32>;
// FIXME: defm : BFMPatterns <i64, S_BFM_B64, S_MOV_B64>;
defm : BFEPattern <V_BFE_U32, V_BFE_I32, S_MOV_B32>;
-def : SHA256MaPattern <V_BFI_B32, V_XOR_B32_e64>;
+defm : SHA256MaPattern <V_BFI_B32, V_XOR_B32_e64, SReg_64>;
def : IntMed3Pat<V_MED3_I32, smax, smax_oneuse, smin_oneuse>;
def : IntMed3Pat<V_MED3_U32, umax, umax_oneuse, umin_oneuse>;
diff --git a/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
index 84cd47a101a8..4b537540046f 100644
--- a/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
+++ b/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
@@ -45,6 +45,7 @@
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
#include "SIRegisterInfo.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/SmallVector.h"
@@ -102,7 +103,7 @@ class SILoadStoreOptimizer : public MachineFunctionPass {
};
private:
- const SISubtarget *STM = nullptr;
+ const GCNSubtarget *STM = nullptr;
const SIInstrInfo *TII = nullptr;
const SIRegisterInfo *TRI = nullptr;
MachineRegisterInfo *MRI = nullptr;
@@ -137,7 +138,7 @@ public:
bool runOnMachineFunction(MachineFunction &MF) override;
- StringRef getPassName() const override { return "SI Load / Store Optimizer"; }
+ StringRef getPassName() const override { return "SI Load Store Optimizer"; }
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesCFG();
@@ -150,10 +151,10 @@ public:
} // end anonymous namespace.
INITIALIZE_PASS_BEGIN(SILoadStoreOptimizer, DEBUG_TYPE,
- "SI Load / Store Optimizer", false, false)
+ "SI Load Store Optimizer", false, false)
INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
INITIALIZE_PASS_END(SILoadStoreOptimizer, DEBUG_TYPE,
- "SI Load / Store Optimizer", false, false)
+ "SI Load Store Optimizer", false, false)
char SILoadStoreOptimizer::ID = 0;
@@ -173,10 +174,18 @@ static void moveInstsAfter(MachineBasicBlock::iterator I,
}
}
-static void addDefsToList(const MachineInstr &MI, DenseSet<unsigned> &Defs) {
- // XXX: Should this be looking for implicit defs?
- for (const MachineOperand &Def : MI.defs())
- Defs.insert(Def.getReg());
+static void addDefsUsesToList(const MachineInstr &MI,
+ DenseSet<unsigned> &RegDefs,
+ DenseSet<unsigned> &PhysRegUses) {
+ for (const MachineOperand &Op : MI.operands()) {
+ if (Op.isReg()) {
+ if (Op.isDef())
+ RegDefs.insert(Op.getReg());
+ else if (Op.readsReg() &&
+ TargetRegisterInfo::isPhysicalRegister(Op.getReg()))
+ PhysRegUses.insert(Op.getReg());
+ }
+ }
}
static bool memAccessesCanBeReordered(MachineBasicBlock::iterator A,
@@ -194,16 +203,24 @@ static bool memAccessesCanBeReordered(MachineBasicBlock::iterator A,
// already in the list. Returns true in that case.
static bool
addToListsIfDependent(MachineInstr &MI,
- DenseSet<unsigned> &Defs,
+ DenseSet<unsigned> &RegDefs,
+ DenseSet<unsigned> &PhysRegUses,
SmallVectorImpl<MachineInstr*> &Insts) {
for (MachineOperand &Use : MI.operands()) {
// If one of the defs is read, then there is a use of Def between I and the
// instruction that I will potentially be merged with. We will need to move
// this instruction after the merged instructions.
-
- if (Use.isReg() && Use.readsReg() && Defs.count(Use.getReg())) {
+ //
+ // Similarly, if there is a def which is read by an instruction that is to
+ // be moved for merging, then we need to move the def-instruction as well.
+ // This can only happen for physical registers such as M0; virtual
+ // registers are in SSA form.
+ if (Use.isReg() &&
+ ((Use.readsReg() && RegDefs.count(Use.getReg())) ||
+ (Use.isDef() && TargetRegisterInfo::isPhysicalRegister(Use.getReg()) &&
+ PhysRegUses.count(Use.getReg())))) {
Insts.push_back(&MI);
- addDefsToList(MI, Defs);
+ addDefsUsesToList(MI, RegDefs, PhysRegUses);
return true;
}
}
@@ -332,8 +349,9 @@ bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI) {
++MBBI;
- DenseSet<unsigned> DefsToMove;
- addDefsToList(*CI.I, DefsToMove);
+ DenseSet<unsigned> RegDefsToMove;
+ DenseSet<unsigned> PhysRegUsesToMove;
+ addDefsUsesToList(*CI.I, RegDefsToMove, PhysRegUsesToMove);
for ( ; MBBI != E; ++MBBI) {
if (MBBI->getOpcode() != CI.I->getOpcode()) {
@@ -356,14 +374,15 @@ bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI) {
// #2. Add this instruction to the move list and then we will check
// if condition #2 holds once we have selected the matching instruction.
CI.InstsToMove.push_back(&*MBBI);
- addDefsToList(*MBBI, DefsToMove);
+ addDefsUsesToList(*MBBI, RegDefsToMove, PhysRegUsesToMove);
continue;
}
// When we match I with another DS instruction we will be moving I down
    // to the location of the matched instruction; any uses of I will need to
// be moved down as well.
- addToListsIfDependent(*MBBI, DefsToMove, CI.InstsToMove);
+ addToListsIfDependent(*MBBI, RegDefsToMove, PhysRegUsesToMove,
+ CI.InstsToMove);
continue;
}
@@ -377,7 +396,8 @@ bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI) {
// DS_WRITE_B32 addr, f(w), idx1
// where the DS_READ_B32 ends up in InstsToMove and therefore prevents
// merging of the two writes.
- if (addToListsIfDependent(*MBBI, DefsToMove, CI.InstsToMove))
+ if (addToListsIfDependent(*MBBI, RegDefsToMove, PhysRegUsesToMove,
+ CI.InstsToMove))
continue;
bool Match = true;
@@ -436,7 +456,7 @@ bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI) {
// down past this instruction.
// check if we can move I across MBBI and if we can move all I's users
if (!memAccessesCanBeReordered(*CI.I, *MBBI, TII, AA) ||
- !canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, TII, AA))
+ !canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, TII, AA))
break;
}
return false;
@@ -496,13 +516,15 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeRead2Pair(
unsigned BaseReg = AddrReg->getReg();
unsigned BaseRegFlags = 0;
if (CI.BaseOff) {
+ unsigned ImmReg = MRI->createVirtualRegister(&AMDGPU::SGPR_32RegClass);
+ BuildMI(*MBB, CI.Paired, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
+ .addImm(CI.BaseOff);
+
BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
BaseRegFlags = RegState::Kill;
- unsigned AddOpc = STM->hasAddNoCarry() ?
- AMDGPU::V_ADD_U32_e32 : AMDGPU::V_ADD_I32_e32;
- BuildMI(*MBB, CI.Paired, DL, TII->get(AddOpc), BaseReg)
- .addImm(CI.BaseOff)
+ TII->getAddNoCarry(*MBB, CI.Paired, DL, BaseReg)
+ .addReg(ImmReg)
.addReg(AddrReg->getReg());
}
@@ -532,7 +554,7 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeRead2Pair(
CI.I->eraseFromParent();
CI.Paired->eraseFromParent();
- DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n');
+ LLVM_DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n');
return Next;
}
@@ -556,7 +578,7 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair(
// Be sure to use .addOperand(), and not .addReg() with these. We want to be
// sure we preserve the subregister index and any register flags set on them.
- const MachineOperand *Addr = TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
+ const MachineOperand *AddrReg = TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
const MachineOperand *Data0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::data0);
const MachineOperand *Data1
= TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::data0);
@@ -579,17 +601,19 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair(
const MCInstrDesc &Write2Desc = TII->get(Opc);
DebugLoc DL = CI.I->getDebugLoc();
- unsigned BaseReg = Addr->getReg();
+ unsigned BaseReg = AddrReg->getReg();
unsigned BaseRegFlags = 0;
if (CI.BaseOff) {
+ unsigned ImmReg = MRI->createVirtualRegister(&AMDGPU::SGPR_32RegClass);
+ BuildMI(*MBB, CI.Paired, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
+ .addImm(CI.BaseOff);
+
BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
BaseRegFlags = RegState::Kill;
- unsigned AddOpc = STM->hasAddNoCarry() ?
- AMDGPU::V_ADD_U32_e32 : AMDGPU::V_ADD_I32_e32;
- BuildMI(*MBB, CI.Paired, DL, TII->get(AddOpc), BaseReg)
- .addImm(CI.BaseOff)
- .addReg(Addr->getReg());
+ TII->getAddNoCarry(*MBB, CI.Paired, DL, BaseReg)
+ .addReg(ImmReg)
+ .addReg(AddrReg->getReg());
}
MachineInstrBuilder Write2 =
@@ -608,7 +632,7 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair(
CI.I->eraseFromParent();
CI.Paired->eraseFromParent();
- DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n');
+ LLVM_DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n');
return Next;
}
@@ -849,9 +873,8 @@ bool SILoadStoreOptimizer::optimizeBlock(MachineBasicBlock &MBB) {
continue;
}
- if (STM->hasSBufferLoadStoreAtomicDwordxN() &&
- (Opc == AMDGPU::S_BUFFER_LOAD_DWORD_IMM ||
- Opc == AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM)) {
+ if (Opc == AMDGPU::S_BUFFER_LOAD_DWORD_IMM ||
+ Opc == AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM) {
// EltSize is in units of the offset encoding.
CI.InstClass = S_BUFFER_LOAD_IMM;
CI.EltSize = AMDGPU::getSMRDEncodedOffset(*STM, 4);
@@ -916,7 +939,7 @@ bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) {
if (skipFunction(MF.getFunction()))
return false;
- STM = &MF.getSubtarget<SISubtarget>();
+ STM = &MF.getSubtarget<GCNSubtarget>();
if (!STM->loadStoreOptEnabled())
return false;
@@ -928,7 +951,7 @@ bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) {
assert(MRI->isSSA() && "Must be run on SSA");
- DEBUG(dbgs() << "Running SILoadStoreOptimizer\n");
+ LLVM_DEBUG(dbgs() << "Running SILoadStoreOptimizer\n");
bool Modified = false;
diff --git a/lib/Target/AMDGPU/SILowerControlFlow.cpp b/lib/Target/AMDGPU/SILowerControlFlow.cpp
index a9af83323976..ad30317c344c 100644
--- a/lib/Target/AMDGPU/SILowerControlFlow.cpp
+++ b/lib/Target/AMDGPU/SILowerControlFlow.cpp
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
//
/// \file
-/// \brief This pass lowers the pseudo control flow instructions to real
+/// This pass lowers the pseudo control flow instructions to real
/// machine instructions.
///
/// All control flow is handled using predicated instructions and
@@ -51,6 +51,7 @@
#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/CodeGen/LiveIntervals.h"
@@ -343,11 +344,49 @@ void SILowerControlFlow::emitBreak(MachineInstr &MI) {
}
void SILowerControlFlow::emitIfBreak(MachineInstr &MI) {
- MI.setDesc(TII->get(AMDGPU::S_OR_B64));
+ MachineBasicBlock &MBB = *MI.getParent();
+ const DebugLoc &DL = MI.getDebugLoc();
+ auto Dst = MI.getOperand(0).getReg();
+
+ // Skip ANDing with exec if the break condition is already masked by exec
+ // because it is a V_CMP in the same basic block. (We know the break
+ // condition operand was an i1 in IR, so if it is a VALU instruction it must
+ // be one with a carry-out.)
+ bool SkipAnding = false;
+ if (MI.getOperand(1).isReg()) {
+ if (MachineInstr *Def = MRI->getUniqueVRegDef(MI.getOperand(1).getReg())) {
+ SkipAnding = Def->getParent() == MI.getParent()
+ && SIInstrInfo::isVALU(*Def);
+ }
+ }
+
+ // AND the break condition operand with exec, then OR that into the "loop
+ // exit" mask.
+ MachineInstr *And = nullptr, *Or = nullptr;
+ if (!SkipAnding) {
+ And = BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_AND_B64), Dst)
+ .addReg(AMDGPU::EXEC)
+ .add(MI.getOperand(1));
+ Or = BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst)
+ .addReg(Dst)
+ .add(MI.getOperand(2));
+ } else
+ Or = BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst)
+ .add(MI.getOperand(1))
+ .add(MI.getOperand(2));
+
+ if (LIS) {
+ if (And)
+ LIS->InsertMachineInstrInMaps(*And);
+ LIS->ReplaceMachineInstrInMaps(MI, *Or);
+ }
+
+ MI.eraseFromParent();
}
void SILowerControlFlow::emitElseBreak(MachineInstr &MI) {
- MI.setDesc(TII->get(AMDGPU::S_OR_B64));
+ // Lowered in the same way as emitIfBreak above.
+ emitIfBreak(MI);
}
void SILowerControlFlow::emitLoop(MachineInstr &MI) {
@@ -414,8 +453,8 @@ void SILowerControlFlow::findMaskOperands(MachineInstr &MI, unsigned OpNo,
return;
for (const auto &SrcOp : Def->explicit_operands())
- if (SrcOp.isUse() && (!SrcOp.isReg() ||
- TargetRegisterInfo::isVirtualRegister(SrcOp.getReg()) ||
+ if (SrcOp.isReg() && SrcOp.isUse() &&
+ (TargetRegisterInfo::isVirtualRegister(SrcOp.getReg()) ||
SrcOp.getReg() == AMDGPU::EXEC))
Src.push_back(SrcOp);
}
@@ -447,7 +486,7 @@ void SILowerControlFlow::combineMasks(MachineInstr &MI) {
}
bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) {
- const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
TII = ST.getInstrInfo();
TRI = &TII->getRegisterInfo();
diff --git a/lib/Target/AMDGPU/SILowerI1Copies.cpp b/lib/Target/AMDGPU/SILowerI1Copies.cpp
index da57b90dd8c4..ecc6cff407e1 100644
--- a/lib/Target/AMDGPU/SILowerI1Copies.cpp
+++ b/lib/Target/AMDGPU/SILowerI1Copies.cpp
@@ -17,6 +17,8 @@
#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "Utils/AMDGPULaneDominator.h"
#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
@@ -64,7 +66,7 @@ FunctionPass *llvm::createSILowerI1CopiesPass() {
bool SILowerI1Copies::runOnMachineFunction(MachineFunction &MF) {
MachineRegisterInfo &MRI = MF.getRegInfo();
- const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
const SIInstrInfo *TII = ST.getInstrInfo();
const TargetRegisterInfo *TRI = &TII->getRegisterInfo();
@@ -141,7 +143,8 @@ bool SILowerI1Copies::runOnMachineFunction(MachineFunction &MF) {
DefInst->getOperand(3).getReg()) &&
TRI->getCommonSubClass(
MRI.getRegClass(DefInst->getOperand(3).getReg()),
- &AMDGPU::SGPR_64RegClass)) {
+ &AMDGPU::SGPR_64RegClass) &&
+ AMDGPU::laneDominates(DefInst->getParent(), &MBB)) {
BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_AND_B64))
.add(Dst)
.addReg(AMDGPU::EXEC)
diff --git a/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
index 6013ebc81d9f..0d5ff75e37ed 100644
--- a/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ b/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -11,6 +11,7 @@
#include "AMDGPUArgumentUsageInfo.h"
#include "AMDGPUSubtarget.h"
#include "SIRegisterInfo.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/Optional.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
@@ -28,17 +29,12 @@ using namespace llvm;
SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
: AMDGPUMachineFunction(MF),
- BufferPSV(*(MF.getSubtarget().getInstrInfo())),
- ImagePSV(*(MF.getSubtarget().getInstrInfo())),
PrivateSegmentBuffer(false),
DispatchPtr(false),
QueuePtr(false),
KernargSegmentPtr(false),
DispatchID(false),
FlatScratchInit(false),
- GridWorkgroupCountX(false),
- GridWorkgroupCountY(false),
- GridWorkgroupCountZ(false),
WorkGroupIDX(false),
WorkGroupIDY(false),
WorkGroupIDZ(false),
@@ -49,12 +45,26 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
WorkItemIDZ(false),
ImplicitBufferPtr(false),
ImplicitArgPtr(false),
- GITPtrHigh(0xffffffff) {
- const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+ GITPtrHigh(0xffffffff),
+ HighBitsOf32BitAddress(0) {
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
const Function &F = MF.getFunction();
FlatWorkGroupSizes = ST.getFlatWorkGroupSizes(F);
WavesPerEU = ST.getWavesPerEU(F);
+ Occupancy = getMaxWavesPerEU();
+ limitOccupancy(MF);
+ CallingConv::ID CC = F.getCallingConv();
+
+ if (CC == CallingConv::AMDGPU_KERNEL || CC == CallingConv::SPIR_KERNEL) {
+ if (!F.arg_empty())
+ KernargSegmentPtr = true;
+ WorkGroupIDX = true;
+ WorkItemIDX = true;
+ } else if (CC == CallingConv::AMDGPU_PS) {
+ PSInputAddr = AMDGPU::getInitialPSInputAddr(F);
+ }
+
if (!isEntryFunction()) {
    // Non-entry functions have no special inputs for now, other than
    // registers required for scratch access.
@@ -71,18 +81,11 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
if (F.hasFnAttribute("amdgpu-implicitarg-ptr"))
ImplicitArgPtr = true;
} else {
- if (F.hasFnAttribute("amdgpu-implicitarg-ptr"))
- KernargSegmentPtr = true;
- }
-
- CallingConv::ID CC = F.getCallingConv();
- if (CC == CallingConv::AMDGPU_KERNEL || CC == CallingConv::SPIR_KERNEL) {
- if (!F.arg_empty())
+ if (F.hasFnAttribute("amdgpu-implicitarg-ptr")) {
KernargSegmentPtr = true;
- WorkGroupIDX = true;
- WorkItemIDX = true;
- } else if (CC == CallingConv::AMDGPU_PS) {
- PSInputAddr = AMDGPU::getInitialPSInputAddr(F);
+ MaxKernArgAlign = std::max(ST.getAlignmentForImplicitArgPtr(),
+ MaxKernArgAlign);
+ }
}
if (ST.debuggerEmitPrologue()) {
@@ -134,7 +137,7 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
}
}
- bool IsCOV2 = ST.isAmdCodeObjectV2(MF);
+ bool IsCOV2 = ST.isAmdCodeObjectV2(F);
if (IsCOV2) {
if (HasStackObjects || MaySpill)
PrivateSegmentBuffer = true;
@@ -147,7 +150,7 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
if (F.hasFnAttribute("amdgpu-dispatch-id"))
DispatchID = true;
- } else if (ST.isMesaGfxShader(MF)) {
+ } else if (ST.isMesaGfxShader(F)) {
if (HasStackObjects || MaySpill)
ImplicitBufferPtr = true;
}
@@ -166,6 +169,18 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
StringRef S = A.getValueAsString();
if (!S.empty())
S.consumeInteger(0, GITPtrHigh);
+
+ A = F.getFnAttribute("amdgpu-32bit-address-high-bits");
+ S = A.getValueAsString();
+ if (!S.empty())
+ S.consumeInteger(0, HighBitsOf32BitAddress);
+}
+
+void SIMachineFunctionInfo::limitOccupancy(const MachineFunction &MF) {
+ limitOccupancy(getMaxWavesPerEU());
+ const GCNSubtarget& ST = MF.getSubtarget<GCNSubtarget>();
+ limitOccupancy(ST.getOccupancyWithLocalMemSize(getLDSSize(),
+ MF.getFunction()));
}
unsigned SIMachineFunctionInfo::addPrivateSegmentBuffer(
@@ -238,7 +253,7 @@ bool SIMachineFunctionInfo::allocateSGPRSpillToVGPR(MachineFunction &MF,
if (!SpillLanes.empty())
return true;
- const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
const SIRegisterInfo *TRI = ST.getRegisterInfo();
MachineFrameInfo &FrameInfo = MF.getFrameInfo();
MachineRegisterInfo &MRI = MF.getRegInfo();
@@ -269,10 +284,9 @@ bool SIMachineFunctionInfo::allocateSGPRSpillToVGPR(MachineFunction &MF,
}
Optional<int> CSRSpillFI;
- if (FrameInfo.hasCalls() && CSRegs && isCalleeSavedReg(CSRegs, LaneVGPR)) {
- // TODO: Should this be a CreateSpillStackObject? This is technically a
- // weird CSR spill.
- CSRSpillFI = FrameInfo.CreateStackObject(4, 4, false);
+ if ((FrameInfo.hasCalls() || !isEntryFunction()) && CSRegs &&
+ isCalleeSavedReg(CSRegs, LaneVGPR)) {
+ CSRSpillFI = FrameInfo.CreateSpillStackObject(4, 4);
}
SpillVGPRs.push_back(SGPRSpillVGPRCSR(LaneVGPR, CSRSpillFI));
@@ -295,3 +309,29 @@ void SIMachineFunctionInfo::removeSGPRToVGPRFrameIndices(MachineFrameInfo &MFI)
for (auto &R : SGPRToVGPRSpills)
MFI.RemoveStackObject(R.first);
}
+
+
+/// \returns VGPR used for \p Dim's work item ID.
+unsigned SIMachineFunctionInfo::getWorkItemIDVGPR(unsigned Dim) const {
+ switch (Dim) {
+ case 0:
+ assert(hasWorkItemIDX());
+ return AMDGPU::VGPR0;
+ case 1:
+ assert(hasWorkItemIDY());
+ return AMDGPU::VGPR1;
+ case 2:
+ assert(hasWorkItemIDZ());
+ return AMDGPU::VGPR2;
+ }
+ llvm_unreachable("unexpected dimension");
+}
+
+MCPhysReg SIMachineFunctionInfo::getNextUserSGPR() const {
+ assert(NumSystemSGPRs == 0 && "System SGPRs must be added after user SGPRs");
+ return AMDGPU::SGPR0 + NumUserSGPRs;
+}
+
+MCPhysReg SIMachineFunctionInfo::getNextSystemSGPR() const {
+ return AMDGPU::SGPR0 + NumUserSGPRs + NumSystemSGPRs;
+}
diff --git a/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/lib/Target/AMDGPU/SIMachineFunctionInfo.h
index 5dde72910ee3..ef91d1e43075 100644
--- a/lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ b/lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -16,7 +16,9 @@
#include "AMDGPUArgumentUsageInfo.h"
#include "AMDGPUMachineFunction.h"
+#include "SIInstrInfo.h"
#include "SIRegisterInfo.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/Optional.h"
@@ -38,8 +40,9 @@ class TargetRegisterClass;
class AMDGPUImagePseudoSourceValue : public PseudoSourceValue {
public:
+ // TODO: Is the img rsrc useful?
explicit AMDGPUImagePseudoSourceValue(const TargetInstrInfo &TII) :
- PseudoSourceValue(PseudoSourceValue::TargetCustom, TII) { }
+ PseudoSourceValue(PseudoSourceValue::TargetCustom, TII) {}
bool isConstant(const MachineFrameInfo *) const override {
// This should probably be true for most images, but we will start by being
@@ -48,15 +51,11 @@ public:
}
bool isAliased(const MachineFrameInfo *) const override {
- // FIXME: If we ever change image intrinsics to accept fat pointers, then
- // this could be true for some cases.
- return false;
+ return true;
}
bool mayAlias(const MachineFrameInfo *) const override {
- // FIXME: If we ever change image intrinsics to accept fat pointers, then
- // this could be true for some cases.
- return false;
+ return true;
}
};
@@ -72,15 +71,11 @@ public:
}
bool isAliased(const MachineFrameInfo *) const override {
- // FIXME: If we ever change image intrinsics to accept fat pointers, then
- // this could be true for some cases.
- return false;
+ return true;
}
bool mayAlias(const MachineFrameInfo *) const override {
- // FIXME: If we ever change image intrinsics to accept fat pointers, then
- // this could be true for some cases.
- return false;
+ return true;
}
};
@@ -135,8 +130,10 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction {
// Stack object indices for work item IDs.
std::array<int, 3> DebuggerWorkItemIDStackObjectIndices = {{0, 0, 0}};
- AMDGPUBufferPseudoSourceValue BufferPSV;
- AMDGPUImagePseudoSourceValue ImagePSV;
+ DenseMap<const Value *,
+ std::unique_ptr<const AMDGPUBufferPseudoSourceValue>> BufferPSVs;
+ DenseMap<const Value *,
+ std::unique_ptr<const AMDGPUImagePseudoSourceValue>> ImagePSVs;
private:
unsigned LDSWaveSpillSize = 0;
@@ -146,6 +143,7 @@ private:
bool HasSpilledSGPRs = false;
bool HasSpilledVGPRs = false;
bool HasNonSpillStackObjects = false;
+ bool IsStackRealigned = false;
unsigned NumSpilledSGPRs = 0;
unsigned NumSpilledVGPRs = 0;
@@ -157,9 +155,6 @@ private:
bool KernargSegmentPtr : 1;
bool DispatchID : 1;
bool FlatScratchInit : 1;
- bool GridWorkgroupCountX : 1;
- bool GridWorkgroupCountY : 1;
- bool GridWorkgroupCountZ : 1;
// Feature bits required for inputs passed in system SGPRs.
bool WorkGroupIDX : 1; // Always initialized.
@@ -186,25 +181,25 @@ private:
// current hardware only allows a 16 bit value.
unsigned GITPtrHigh;
- MCPhysReg getNextUserSGPR() const {
- assert(NumSystemSGPRs == 0 && "System SGPRs must be added after user SGPRs");
- return AMDGPU::SGPR0 + NumUserSGPRs;
- }
+ unsigned HighBitsOf32BitAddress;
- MCPhysReg getNextSystemSGPR() const {
- return AMDGPU::SGPR0 + NumUserSGPRs + NumSystemSGPRs;
- }
+ // Current recorded maximum possible occupancy.
+ unsigned Occupancy;
+
+ MCPhysReg getNextUserSGPR() const;
+
+ MCPhysReg getNextSystemSGPR() const;
public:
struct SpilledReg {
- unsigned VGPR = AMDGPU::NoRegister;
+ unsigned VGPR = 0;
int Lane = -1;
SpilledReg() = default;
SpilledReg(unsigned R, int L) : VGPR (R), Lane (L) {}
bool hasLane() { return Lane != -1;}
- bool hasReg() { return VGPR != AMDGPU::NoRegister;}
+ bool hasReg() { return VGPR != 0;}
};
struct SGPRSpillVGPRCSR {
@@ -244,8 +239,8 @@ public:
bool allocateSGPRSpillToVGPR(MachineFunction &MF, int FI);
void removeSGPRToVGPRFrameIndices(MachineFrameInfo &MFI);
- bool hasCalculatedTID() const { return TIDReg != AMDGPU::NoRegister; }
- unsigned getTIDReg() const { return TIDReg; }
+  bool hasCalculatedTID() const { return TIDReg != 0; }
+  unsigned getTIDReg() const { return TIDReg; }
void setTIDReg(unsigned Reg) { TIDReg = Reg; }
unsigned getBytesInStackArgArea() const {
@@ -338,18 +333,6 @@ public:
return FlatScratchInit;
}
- bool hasGridWorkgroupCountX() const {
- return GridWorkgroupCountX;
- }
-
- bool hasGridWorkgroupCountY() const {
- return GridWorkgroupCountY;
- }
-
- bool hasGridWorkgroupCountZ() const {
- return GridWorkgroupCountZ;
- }
-
bool hasWorkGroupIDX() const {
return WorkGroupIDX;
}
@@ -411,6 +394,10 @@ public:
return GITPtrHigh;
}
+ unsigned get32BitAddressHighBits() const {
+ return HighBitsOf32BitAddress;
+ }
+
unsigned getNumUserSGPRs() const {
return NumUserSGPRs;
}
@@ -423,14 +410,14 @@ public:
return ArgInfo.PrivateSegmentWaveByteOffset.getRegister();
}
- /// \brief Returns the physical register reserved for use as the resource
+ /// Returns the physical register reserved for use as the resource
/// descriptor for scratch accesses.
unsigned getScratchRSrcReg() const {
return ScratchRSrcReg;
}
void setScratchRSrcReg(unsigned Reg) {
- assert(Reg != AMDGPU::NoRegister && "Should never be unset");
+ assert(Reg != 0 && "Should never be unset");
ScratchRSrcReg = Reg;
}
@@ -443,6 +430,7 @@ public:
}
void setStackPtrOffsetReg(unsigned Reg) {
+ assert(Reg != 0 && "Should never be unset");
StackPtrOffsetReg = Reg;
}
@@ -455,7 +443,7 @@ public:
}
void setScratchWaveOffsetReg(unsigned Reg) {
- assert(Reg != AMDGPU::NoRegister && "Should never be unset");
+ assert(Reg != 0 && "Should never be unset");
ScratchWaveOffsetReg = Reg;
if (isEntryFunction())
FrameOffsetReg = ScratchWaveOffsetReg;
@@ -493,6 +481,14 @@ public:
HasNonSpillStackObjects = StackObject;
}
+ bool isStackRealigned() const {
+ return IsStackRealigned;
+ }
+
+ void setIsStackRealigned(bool Realigned = true) {
+ IsStackRealigned = Realigned;
+ }
+
unsigned getNumSpilledSGPRs() const {
return NumSpilledSGPRs;
}
@@ -575,7 +571,7 @@ public:
return DebuggerWorkGroupIDStackObjectIndices[Dim];
}
- /// \brief Sets stack object index for \p Dim's work group ID to \p ObjectIdx.
+ /// Sets stack object index for \p Dim's work group ID to \p ObjectIdx.
void setDebuggerWorkGroupIDStackObjectIndex(unsigned Dim, int ObjectIdx) {
assert(Dim < 3);
DebuggerWorkGroupIDStackObjectIndices[Dim] = ObjectIdx;
@@ -587,7 +583,7 @@ public:
return DebuggerWorkItemIDStackObjectIndices[Dim];
}
- /// \brief Sets stack object index for \p Dim's work item ID to \p ObjectIdx.
+ /// Sets stack object index for \p Dim's work item ID to \p ObjectIdx.
void setDebuggerWorkItemIDStackObjectIndex(unsigned Dim, int ObjectIdx) {
assert(Dim < 3);
DebuggerWorkItemIDStackObjectIndices[Dim] = ObjectIdx;
@@ -610,31 +606,51 @@ public:
}
  /// \returns VGPR used for \p Dim's work item ID.
- unsigned getWorkItemIDVGPR(unsigned Dim) const {
- switch (Dim) {
- case 0:
- assert(hasWorkItemIDX());
- return AMDGPU::VGPR0;
- case 1:
- assert(hasWorkItemIDY());
- return AMDGPU::VGPR1;
- case 2:
- assert(hasWorkItemIDZ());
- return AMDGPU::VGPR2;
- }
- llvm_unreachable("unexpected dimension");
- }
+ unsigned getWorkItemIDVGPR(unsigned Dim) const;
unsigned getLDSWaveSpillSize() const {
return LDSWaveSpillSize;
}
- const AMDGPUBufferPseudoSourceValue *getBufferPSV() const {
- return &BufferPSV;
+ const AMDGPUBufferPseudoSourceValue *getBufferPSV(const SIInstrInfo &TII,
+ const Value *BufferRsrc) {
+ assert(BufferRsrc);
+ auto PSV = BufferPSVs.try_emplace(
+ BufferRsrc,
+ llvm::make_unique<AMDGPUBufferPseudoSourceValue>(TII));
+ return PSV.first->second.get();
+ }
+
+ const AMDGPUImagePseudoSourceValue *getImagePSV(const SIInstrInfo &TII,
+ const Value *ImgRsrc) {
+ assert(ImgRsrc);
+ auto PSV = ImagePSVs.try_emplace(
+ ImgRsrc,
+ llvm::make_unique<AMDGPUImagePseudoSourceValue>(TII));
+ return PSV.first->second.get();
+ }
+
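getBufferPSV and getImagePSV above keep one pseudo source value per distinct resource operand and create it lazily on first use. A standalone sketch of that lazy-caching idiom in plain C++ (illustrative types, not the LLVM ones):

  #include <map>
  #include <memory>

  struct ResourcePSV {
    explicit ResourcePSV(int Kind) : Kind(Kind) {}
    int Kind;
  };

  // One object per distinct key, constructed only on the first lookup.
  static std::map<const void *, std::unique_ptr<ResourcePSV>> Cache;

  static ResourcePSV *getPSV(const void *Rsrc, int Kind) {
    auto It = Cache.find(Rsrc);
    if (It == Cache.end())
      It = Cache.emplace(Rsrc, std::make_unique<ResourcePSV>(Kind)).first;
    return It->second.get();
  }

  int main() {
    int A = 0, B = 0;                            // stand-ins for resources
    bool Same = getPSV(&A, 1) == getPSV(&A, 1);  // same key -> same object
    bool Diff = getPSV(&A, 1) != getPSV(&B, 1);  // different keys -> distinct
    return (Same && Diff) ? 0 : 1;
  }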
+ unsigned getOccupancy() const {
+ return Occupancy;
+ }
+
+ unsigned getMinAllowedOccupancy() const {
+ if (!isMemoryBound() && !needsWaveLimiter())
+ return Occupancy;
+ return (Occupancy < 4) ? Occupancy : 4;
+ }
+
+ void limitOccupancy(const MachineFunction &MF);
+
+ void limitOccupancy(unsigned Limit) {
+ if (Occupancy > Limit)
+ Occupancy = Limit;
}
- const AMDGPUImagePseudoSourceValue *getImagePSV() const {
- return &ImagePSV;
+ void increaseOccupancy(const MachineFunction &MF, unsigned Limit) {
+ if (Occupancy < Limit)
+ Occupancy = Limit;
+ limitOccupancy(MF);
}
};
diff --git a/lib/Target/AMDGPU/SIMachineScheduler.cpp b/lib/Target/AMDGPU/SIMachineScheduler.cpp
index 6b67b76652ed..18754442898f 100644
--- a/lib/Target/AMDGPU/SIMachineScheduler.cpp
+++ b/lib/Target/AMDGPU/SIMachineScheduler.cpp
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
//
/// \file
-/// \brief SI Machine Scheduler interface
+/// SI Machine Scheduler interface
//
//===----------------------------------------------------------------------===//
@@ -16,6 +16,7 @@
#include "AMDGPU.h"
#include "SIInstrInfo.h"
#include "SIRegisterInfo.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/LiveInterval.h"
@@ -154,6 +155,8 @@ static const char *getReasonStr(SIScheduleCandReason Reason) {
#endif
+namespace llvm {
+namespace SISched {
static bool tryLess(int TryVal, int CandVal,
SISchedulerCandidate &TryCand,
SISchedulerCandidate &Cand,
@@ -187,6 +190,8 @@ static bool tryGreater(int TryVal, int CandVal,
Cand.setRepeat(Reason);
return false;
}
+} // end namespace SISched
+} // end namespace llvm
// SIScheduleBlock //
@@ -212,7 +217,8 @@ void SIScheduleBlock::tryCandidateTopDown(SISchedCandidate &Cand,
}
if (Cand.SGPRUsage > 60 &&
- tryLess(TryCand.SGPRUsage, Cand.SGPRUsage, TryCand, Cand, RegUsage))
+ SISched::tryLess(TryCand.SGPRUsage, Cand.SGPRUsage,
+ TryCand, Cand, RegUsage))
return;
// Schedule low latency instructions as top as possible.
@@ -230,21 +236,22 @@ void SIScheduleBlock::tryCandidateTopDown(SISchedCandidate &Cand,
// could go quite high, thus above the arbitrary limit of 60 will encourage
// use the already loaded constants (in order to release some SGPRs) before
// loading more.
- if (tryLess(TryCand.HasLowLatencyNonWaitedParent,
- Cand.HasLowLatencyNonWaitedParent,
- TryCand, Cand, SIScheduleCandReason::Depth))
+ if (SISched::tryLess(TryCand.HasLowLatencyNonWaitedParent,
+ Cand.HasLowLatencyNonWaitedParent,
+ TryCand, Cand, SIScheduleCandReason::Depth))
return;
- if (tryGreater(TryCand.IsLowLatency, Cand.IsLowLatency,
- TryCand, Cand, SIScheduleCandReason::Depth))
+ if (SISched::tryGreater(TryCand.IsLowLatency, Cand.IsLowLatency,
+ TryCand, Cand, SIScheduleCandReason::Depth))
return;
if (TryCand.IsLowLatency &&
- tryLess(TryCand.LowLatencyOffset, Cand.LowLatencyOffset,
- TryCand, Cand, SIScheduleCandReason::Depth))
+ SISched::tryLess(TryCand.LowLatencyOffset, Cand.LowLatencyOffset,
+ TryCand, Cand, SIScheduleCandReason::Depth))
return;
- if (tryLess(TryCand.VGPRUsage, Cand.VGPRUsage, TryCand, Cand, RegUsage))
+ if (SISched::tryLess(TryCand.VGPRUsage, Cand.VGPRUsage,
+ TryCand, Cand, RegUsage))
return;
// Fall through to original instruction order.
@@ -1201,7 +1208,7 @@ void SIScheduleBlockCreator::createBlocksForVariant(SISchedulerBlockCreatorVaria
NextReservedID = 1;
NextNonReservedID = DAGSize + 1;
- DEBUG(dbgs() << "Coloring the graph\n");
+ LLVM_DEBUG(dbgs() << "Coloring the graph\n");
if (BlockVariant == SISchedulerBlockCreatorVariant::LatenciesGrouped)
colorHighLatenciesGroups();
@@ -1258,13 +1265,11 @@ void SIScheduleBlockCreator::createBlocksForVariant(SISchedulerBlockCreatorVaria
SIScheduleBlock *Block = CurrentBlocks[i];
Block->finalizeUnits();
}
- DEBUG(
- dbgs() << "Blocks created:\n\n";
- for (unsigned i = 0, e = CurrentBlocks.size(); i != e; ++i) {
- SIScheduleBlock *Block = CurrentBlocks[i];
- Block->printDebug(true);
- }
- );
+ LLVM_DEBUG(dbgs() << "Blocks created:\n\n";
+ for (unsigned i = 0, e = CurrentBlocks.size(); i != e; ++i) {
+ SIScheduleBlock *Block = CurrentBlocks[i];
+ Block->printDebug(true);
+ });
}
// Two functions taken from Codegen/MachineScheduler.cpp
@@ -1274,7 +1279,7 @@ static MachineBasicBlock::iterator
nextIfDebug(MachineBasicBlock::iterator I,
MachineBasicBlock::const_iterator End) {
for (; I != End; ++I) {
- if (!I->isDebugValue())
+ if (!I->isDebugInstr())
break;
}
return I;
@@ -1284,7 +1289,7 @@ void SIScheduleBlockCreator::topologicalSort() {
unsigned DAGSize = CurrentBlocks.size();
std::vector<int> WorkList;
- DEBUG(dbgs() << "Topological Sort\n");
+ LLVM_DEBUG(dbgs() << "Topological Sort\n");
WorkList.reserve(DAGSize);
TopDownIndex2Block.resize(DAGSize);
@@ -1331,11 +1336,11 @@ void SIScheduleBlockCreator::topologicalSort() {
void SIScheduleBlockCreator::scheduleInsideBlocks() {
unsigned DAGSize = CurrentBlocks.size();
- DEBUG(dbgs() << "\nScheduling Blocks\n\n");
+ LLVM_DEBUG(dbgs() << "\nScheduling Blocks\n\n");
// We do schedule a valid scheduling such that a Block corresponds
// to a range of instructions.
- DEBUG(dbgs() << "First phase: Fast scheduling for Reg Liveness\n");
+ LLVM_DEBUG(dbgs() << "First phase: Fast scheduling for Reg Liveness\n");
for (unsigned i = 0, e = DAGSize; i != e; ++i) {
SIScheduleBlock *Block = CurrentBlocks[i];
Block->fastSchedule();
@@ -1389,7 +1394,7 @@ void SIScheduleBlockCreator::scheduleInsideBlocks() {
Block->schedule((*SUs.begin())->getInstr(), (*SUs.rbegin())->getInstr());
}
- DEBUG(dbgs() << "Restoring MI Pos\n");
+ LLVM_DEBUG(dbgs() << "Restoring MI Pos\n");
// Restore old ordering (which prevents a LIS->handleMove bug).
for (unsigned i = PosOld.size(), e = 0; i != e; --i) {
MachineBasicBlock::iterator POld = PosOld[i-1];
@@ -1403,12 +1408,10 @@ void SIScheduleBlockCreator::scheduleInsideBlocks() {
}
}
- DEBUG(
- for (unsigned i = 0, e = CurrentBlocks.size(); i != e; ++i) {
- SIScheduleBlock *Block = CurrentBlocks[i];
- Block->printDebug(true);
- }
- );
+ LLVM_DEBUG(for (unsigned i = 0, e = CurrentBlocks.size(); i != e; ++i) {
+ SIScheduleBlock *Block = CurrentBlocks[i];
+ Block->printDebug(true);
+ });
}
void SIScheduleBlockCreator::fillStats() {
@@ -1559,13 +1562,10 @@ SIScheduleBlockScheduler::SIScheduleBlockScheduler(SIScheduleDAGMI *DAG,
blockScheduled(Block);
}
- DEBUG(
- dbgs() << "Block Order:";
- for (SIScheduleBlock* Block : BlocksScheduled) {
- dbgs() << ' ' << Block->getID();
- }
- dbgs() << '\n';
- );
+ LLVM_DEBUG(dbgs() << "Block Order:"; for (SIScheduleBlock *Block
+ : BlocksScheduled) {
+ dbgs() << ' ' << Block->getID();
+ } dbgs() << '\n';);
}
bool SIScheduleBlockScheduler::tryCandidateLatency(SIBlockSchedCandidate &Cand,
@@ -1576,19 +1576,19 @@ bool SIScheduleBlockScheduler::tryCandidateLatency(SIBlockSchedCandidate &Cand,
}
// Try to hide high latencies.
- if (tryLess(TryCand.LastPosHighLatParentScheduled,
- Cand.LastPosHighLatParentScheduled, TryCand, Cand, Latency))
+ if (SISched::tryLess(TryCand.LastPosHighLatParentScheduled,
+ Cand.LastPosHighLatParentScheduled, TryCand, Cand, Latency))
return true;
// Schedule high latencies early so you can hide them better.
- if (tryGreater(TryCand.IsHighLatency, Cand.IsHighLatency,
- TryCand, Cand, Latency))
+ if (SISched::tryGreater(TryCand.IsHighLatency, Cand.IsHighLatency,
+ TryCand, Cand, Latency))
return true;
- if (TryCand.IsHighLatency && tryGreater(TryCand.Height, Cand.Height,
- TryCand, Cand, Depth))
+ if (TryCand.IsHighLatency && SISched::tryGreater(TryCand.Height, Cand.Height,
+ TryCand, Cand, Depth))
return true;
- if (tryGreater(TryCand.NumHighLatencySuccessors,
- Cand.NumHighLatencySuccessors,
- TryCand, Cand, Successor))
+ if (SISched::tryGreater(TryCand.NumHighLatencySuccessors,
+ Cand.NumHighLatencySuccessors,
+ TryCand, Cand, Successor))
return true;
return false;
}
@@ -1600,17 +1600,17 @@ bool SIScheduleBlockScheduler::tryCandidateRegUsage(SIBlockSchedCandidate &Cand,
return true;
}
- if (tryLess(TryCand.VGPRUsageDiff > 0, Cand.VGPRUsageDiff > 0,
- TryCand, Cand, RegUsage))
+ if (SISched::tryLess(TryCand.VGPRUsageDiff > 0, Cand.VGPRUsageDiff > 0,
+ TryCand, Cand, RegUsage))
return true;
- if (tryGreater(TryCand.NumSuccessors > 0,
- Cand.NumSuccessors > 0,
- TryCand, Cand, Successor))
+ if (SISched::tryGreater(TryCand.NumSuccessors > 0,
+ Cand.NumSuccessors > 0,
+ TryCand, Cand, Successor))
return true;
- if (tryGreater(TryCand.Height, Cand.Height, TryCand, Cand, Depth))
+ if (SISched::tryGreater(TryCand.Height, Cand.Height, TryCand, Cand, Depth))
return true;
- if (tryLess(TryCand.VGPRUsageDiff, Cand.VGPRUsageDiff,
- TryCand, Cand, RegUsage))
+ if (SISched::tryLess(TryCand.VGPRUsageDiff, Cand.VGPRUsageDiff,
+ TryCand, Cand, RegUsage))
return true;
return false;
}
@@ -1628,18 +1628,17 @@ SIScheduleBlock *SIScheduleBlockScheduler::pickBlock() {
maxVregUsage = VregCurrentUsage;
if (SregCurrentUsage > maxSregUsage)
maxSregUsage = SregCurrentUsage;
- DEBUG(
- dbgs() << "Picking New Blocks\n";
- dbgs() << "Available: ";
- for (SIScheduleBlock* Block : ReadyBlocks)
- dbgs() << Block->getID() << ' ';
- dbgs() << "\nCurrent Live:\n";
- for (unsigned Reg : LiveRegs)
- dbgs() << printVRegOrUnit(Reg, DAG->getTRI()) << ' ';
- dbgs() << '\n';
- dbgs() << "Current VGPRs: " << VregCurrentUsage << '\n';
- dbgs() << "Current SGPRs: " << SregCurrentUsage << '\n';
- );
+ LLVM_DEBUG(dbgs() << "Picking New Blocks\n"; dbgs() << "Available: ";
+ for (SIScheduleBlock *Block
+ : ReadyBlocks) dbgs()
+ << Block->getID() << ' ';
+ dbgs() << "\nCurrent Live:\n";
+ for (unsigned Reg
+ : LiveRegs) dbgs()
+ << printVRegOrUnit(Reg, DAG->getTRI()) << ' ';
+ dbgs() << '\n';
+ dbgs() << "Current VGPRs: " << VregCurrentUsage << '\n';
+ dbgs() << "Current SGPRs: " << SregCurrentUsage << '\n';);
Cand.Block = nullptr;
for (std::vector<SIScheduleBlock*>::iterator I = ReadyBlocks.begin(),
@@ -1671,20 +1670,18 @@ SIScheduleBlock *SIScheduleBlockScheduler::pickBlock() {
if (TryCand.Reason != NoCand) {
Cand.setBest(TryCand);
Best = I;
- DEBUG(dbgs() << "Best Current Choice: " << Cand.Block->getID() << ' '
- << getReasonStr(Cand.Reason) << '\n');
+ LLVM_DEBUG(dbgs() << "Best Current Choice: " << Cand.Block->getID() << ' '
+ << getReasonStr(Cand.Reason) << '\n');
}
}
- DEBUG(
- dbgs() << "Picking: " << Cand.Block->getID() << '\n';
- dbgs() << "Is a block with high latency instruction: "
- << (Cand.IsHighLatency ? "yes\n" : "no\n");
- dbgs() << "Position of last high latency dependency: "
- << Cand.LastPosHighLatParentScheduled << '\n';
- dbgs() << "VGPRUsageDiff: " << Cand.VGPRUsageDiff << '\n';
- dbgs() << '\n';
- );
+ LLVM_DEBUG(dbgs() << "Picking: " << Cand.Block->getID() << '\n';
+ dbgs() << "Is a block with high latency instruction: "
+ << (Cand.IsHighLatency ? "yes\n" : "no\n");
+ dbgs() << "Position of last high latency dependency: "
+ << Cand.LastPosHighLatParentScheduled << '\n';
+ dbgs() << "VGPRUsageDiff: " << Cand.VGPRUsageDiff << '\n';
+ dbgs() << '\n';);
Block = Cand.Block;
ReadyBlocks.erase(Best);
@@ -1933,13 +1930,10 @@ void SIScheduleDAGMI::schedule()
{
SmallVector<SUnit*, 8> TopRoots, BotRoots;
SIScheduleBlockResult Best, Temp;
- DEBUG(dbgs() << "Preparing Scheduling\n");
+ LLVM_DEBUG(dbgs() << "Preparing Scheduling\n");
buildDAGWithRegPressure();
- DEBUG(
- for(SUnit& SU : SUnits)
- SU.dumpAll(this)
- );
+ LLVM_DEBUG(for (SUnit &SU : SUnits) SU.dumpAll(this));
topologicalSort();
findRootsAndBiasEdges(TopRoots, BotRoots);
@@ -2041,15 +2035,15 @@ void SIScheduleDAGMI::schedule()
scheduleMI(SU, true);
- DEBUG(dbgs() << "Scheduling SU(" << SU->NodeNum << ") "
- << *SU->getInstr());
+ LLVM_DEBUG(dbgs() << "Scheduling SU(" << SU->NodeNum << ") "
+ << *SU->getInstr());
}
assert(CurrentTop == CurrentBottom && "Nonempty unscheduled zone.");
placeDebugValues();
- DEBUG({
+ LLVM_DEBUG({
dbgs() << "*** Final schedule for "
<< printMBBReference(*begin()->getParent()) << " ***\n";
dumpSchedule();
diff --git a/lib/Target/AMDGPU/SIMachineScheduler.h b/lib/Target/AMDGPU/SIMachineScheduler.h
index d824e38504e6..0ce68ac6a897 100644
--- a/lib/Target/AMDGPU/SIMachineScheduler.h
+++ b/lib/Target/AMDGPU/SIMachineScheduler.h
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
//
/// \file
-/// \brief SI Machine Scheduler interface
+/// SI Machine Scheduler interface
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
index c73fb10b7ea0..938cdaf1ef8f 100644
--- a/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
+++ b/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
//
/// \file
-/// \brief Memory legalizer - implements memory model. More information can be
+/// Memory legalizer - implements memory model. More information can be
/// found here:
/// http://llvm.org/docs/AMDGPUUsage.html#memory-model
//
@@ -19,7 +19,9 @@
#include "AMDGPUSubtarget.h"
#include "SIDefines.h"
#include "SIInstrInfo.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "Utils/AMDGPUBaseInfo.h"
+#include "llvm/ADT/BitmaskEnum.h"
#include "llvm/ADT/None.h"
#include "llvm/ADT/Optional.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
@@ -36,6 +38,7 @@
#include "llvm/MC/MCInstrDesc.h"
#include "llvm/Pass.h"
#include "llvm/Support/AtomicOrdering.h"
+#include "llvm/Support/MathExtras.h"
#include <cassert>
#include <list>
@@ -47,42 +50,142 @@ using namespace llvm::AMDGPU;
namespace {
+LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE();
+
+/// Memory operation flags. Can be ORed together.
+enum class SIMemOp {
+ NONE = 0u,
+ LOAD = 1u << 0,
+ STORE = 1u << 1,
+ LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ STORE)
+};
+
+/// Position to insert a new instruction relative to an existing
+/// instruction.
+enum class Position {
+ BEFORE,
+ AFTER
+};
+
+/// The atomic synchronization scopes supported by the AMDGPU target.
+enum class SIAtomicScope {
+ NONE,
+ SINGLETHREAD,
+ WAVEFRONT,
+ WORKGROUP,
+ AGENT,
+ SYSTEM
+};
+
+/// The distinct address spaces supported by the AMDGPU target for
+/// atomic memory operations. Can be ORed together.
+enum class SIAtomicAddrSpace {
+ NONE = 0u,
+ GLOBAL = 1u << 0,
+ LDS = 1u << 1,
+ SCRATCH = 1u << 2,
+ GDS = 1u << 3,
+ OTHER = 1u << 4,
+
+ /// The address spaces that can be accessed by a FLAT instruction.
+ FLAT = GLOBAL | LDS | SCRATCH,
+
+ /// The address spaces that support atomic instructions.
+ ATOMIC = GLOBAL | LDS | SCRATCH | GDS,
+
+ /// All address spaces.
+ ALL = GLOBAL | LDS | SCRATCH | GDS | OTHER,
+
+ LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ ALL)
+};
+
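LLVM_MARK_AS_BITMASK_ENUM gives these scoped enums the usual bitwise operators, so address-space sets can be combined and tested directly. A standalone sketch of the idiom (it mirrors a subset of SIAtomicAddrSpace with hand-written operators instead of pulling in the LLVM headers):

  #include <cassert>
  #include <cstdint>

  enum class AddrSpace : uint32_t {
    NONE    = 0u,
    GLOBAL  = 1u << 0,
    LDS     = 1u << 1,
    SCRATCH = 1u << 2,
    GDS     = 1u << 3,
    FLAT    = GLOBAL | LDS | SCRATCH,
  };

  constexpr AddrSpace operator|(AddrSpace A, AddrSpace B) {
    return AddrSpace(uint32_t(A) | uint32_t(B));
  }
  constexpr AddrSpace operator&(AddrSpace A, AddrSpace B) {
    return AddrSpace(uint32_t(A) & uint32_t(B));
  }

  int main() {
    // An ordering over global + LDS memory overlaps what a FLAT access touches.
    AddrSpace Ordering = AddrSpace::GLOBAL | AddrSpace::LDS;
    assert((Ordering & AddrSpace::FLAT) != AddrSpace::NONE);
    return 0;
  }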
+/// Sets named bit \p BitName to "true" if present in instruction \p MI.
+/// \returns True if \p MI is modified, false otherwise.
+template <uint16_t BitName>
+bool enableNamedBit(const MachineBasicBlock::iterator &MI) {
+ int BitIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), BitName);
+ if (BitIdx == -1)
+ return false;
+
+ MachineOperand &Bit = MI->getOperand(BitIdx);
+ if (Bit.getImm() != 0)
+ return false;
+
+ Bit.setImm(1);
+ return true;
+}
+
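enableNamedBit above uses the generated named-operand tables to flip a cache-control style bit in place. A hedged usage sketch (the AMDGPU::OpName::glc operand name and the surrounding in-tree context are assumed, not introduced by this patch):

  // Set the globally coherent (glc) bit on a memory instruction if it has one.
  // enableNamedBit returns false when the operand is absent or already set.
  static bool setGLCBit(const MachineBasicBlock::iterator &MI) {
    return enableNamedBit<AMDGPU::OpName::glc>(MI);
  }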
class SIMemOpInfo final {
private:
- SyncScope::ID SSID = SyncScope::System;
+
+ friend class SIMemOpAccess;
+
AtomicOrdering Ordering = AtomicOrdering::NotAtomic;
AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic;
+ SIAtomicScope Scope = SIAtomicScope::SYSTEM;
+ SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
+ SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE;
+ bool IsCrossAddressSpaceOrdering = false;
bool IsNonTemporal = false;
- SIMemOpInfo(SyncScope::ID SSID, AtomicOrdering Ordering)
- : SSID(SSID), Ordering(Ordering) {}
-
- SIMemOpInfo(SyncScope::ID SSID, AtomicOrdering Ordering,
- AtomicOrdering FailureOrdering, bool IsNonTemporal = false)
- : SSID(SSID), Ordering(Ordering), FailureOrdering(FailureOrdering),
- IsNonTemporal(IsNonTemporal) {}
-
- /// \returns Info constructed from \p MI, which has at least machine memory
- /// operand.
- static Optional<SIMemOpInfo> constructFromMIWithMMO(
- const MachineBasicBlock::iterator &MI);
+ SIMemOpInfo(AtomicOrdering Ordering = AtomicOrdering::SequentiallyConsistent,
+ SIAtomicScope Scope = SIAtomicScope::SYSTEM,
+ SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::ATOMIC,
+ SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::ALL,
+ bool IsCrossAddressSpaceOrdering = true,
+ AtomicOrdering FailureOrdering =
+ AtomicOrdering::SequentiallyConsistent,
+ bool IsNonTemporal = false)
+ : Ordering(Ordering), FailureOrdering(FailureOrdering),
+ Scope(Scope), OrderingAddrSpace(OrderingAddrSpace),
+ InstrAddrSpace(InstrAddrSpace),
+ IsCrossAddressSpaceOrdering(IsCrossAddressSpaceOrdering),
+ IsNonTemporal(IsNonTemporal) {
+ // There is also no cross address space ordering if the ordering
+ // address space is the same as the instruction address space and
+ // only contains a single address space.
+ if ((OrderingAddrSpace == InstrAddrSpace) &&
+ isPowerOf2_32(uint32_t(InstrAddrSpace)))
+ IsCrossAddressSpaceOrdering = false;
+ }
public:
- /// \returns Synchronization scope ID of the machine instruction used to
+ /// \returns Atomic synchronization scope of the machine instruction used to
/// create this SIMemOpInfo.
- SyncScope::ID getSSID() const {
- return SSID;
+ SIAtomicScope getScope() const {
+ return Scope;
}
+
/// \returns Ordering constraint of the machine instruction used to
/// create this SIMemOpInfo.
AtomicOrdering getOrdering() const {
return Ordering;
}
+
/// \returns Failure ordering constraint of the machine instruction used to
/// create this SIMemOpInfo.
AtomicOrdering getFailureOrdering() const {
return FailureOrdering;
}
+
+ /// \returns The address spaces accessed by the machine
+ /// instruction used to create this SIMemOpInfo.
+ SIAtomicAddrSpace getInstrAddrSpace() const {
+ return InstrAddrSpace;
+ }
+
+ /// \returns The address spaces that must be ordered by the machine
+ /// instruction used to create this SIMemOpInfo.
+ SIAtomicAddrSpace getOrderingAddrSpace() const {
+ return OrderingAddrSpace;
+ }
+
+ /// \returns True iff memory ordering of operations on
+ /// different address spaces is required.
+ bool getIsCrossAddressSpaceOrdering() const {
+ return IsCrossAddressSpaceOrdering;
+ }
+
/// \returns True if memory access of the machine instruction used to
/// create this SIMemOpInfo is non-temporal, false otherwise.
bool isNonTemporal() const {
@@ -95,109 +198,198 @@ public:
return Ordering != AtomicOrdering::NotAtomic;
}
+};
+
+class SIMemOpAccess final {
+private:
+
+ AMDGPUAS SIAddrSpaceInfo;
+ AMDGPUMachineModuleInfo *MMI = nullptr;
+
+ /// Reports unsupported message \p Msg for \p MI to LLVM context.
+ void reportUnsupported(const MachineBasicBlock::iterator &MI,
+ const char *Msg) const;
+
+ /// Inspects the target synchronization scope \p SSID and determines
+ /// the SI atomic scope it corresponds to, the address spaces it
+ /// covers, and whether the memory ordering applies between address
+ /// spaces.
+ Optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>>
+ toSIAtomicScope(SyncScope::ID SSID, SIAtomicAddrSpace InstrScope) const;
+
+ /// \returns A bit set of the address spaces accessed by \p AS.
+ SIAtomicAddrSpace toSIAtomicAddrSpace(unsigned AS) const;
+
+ /// \returns Info constructed from \p MI, which has at least one
+ /// machine memory operand.
+ Optional<SIMemOpInfo> constructFromMIWithMMO(
+ const MachineBasicBlock::iterator &MI) const;
+
+public:
+ /// Construct class to support accessing the machine memory operands
+ /// of instructions in the machine function \p MF.
+ SIMemOpAccess(MachineFunction &MF);
+
/// \returns Load info if \p MI is a load operation, "None" otherwise.
- static Optional<SIMemOpInfo> getLoadInfo(
- const MachineBasicBlock::iterator &MI);
+ Optional<SIMemOpInfo> getLoadInfo(
+ const MachineBasicBlock::iterator &MI) const;
+
/// \returns Store info if \p MI is a store operation, "None" otherwise.
- static Optional<SIMemOpInfo> getStoreInfo(
- const MachineBasicBlock::iterator &MI);
+ Optional<SIMemOpInfo> getStoreInfo(
+ const MachineBasicBlock::iterator &MI) const;
+
/// \returns Atomic fence info if \p MI is an atomic fence operation,
/// "None" otherwise.
- static Optional<SIMemOpInfo> getAtomicFenceInfo(
- const MachineBasicBlock::iterator &MI);
- /// \returns Atomic cmpxchg info if \p MI is an atomic cmpxchg operation,
- /// "None" otherwise.
- static Optional<SIMemOpInfo> getAtomicCmpxchgInfo(
- const MachineBasicBlock::iterator &MI);
- /// \returns Atomic rmw info if \p MI is an atomic rmw operation,
- /// "None" otherwise.
- static Optional<SIMemOpInfo> getAtomicRmwInfo(
- const MachineBasicBlock::iterator &MI);
+ Optional<SIMemOpInfo> getAtomicFenceInfo(
+ const MachineBasicBlock::iterator &MI) const;
- /// \brief Reports unknown synchronization scope used in \p MI to LLVM
- /// context.
- static void reportUnknownSyncScope(
- const MachineBasicBlock::iterator &MI);
+ /// \returns Atomic cmpxchg/rmw info if \p MI is an atomic cmpxchg or
+ /// rmw operation, "None" otherwise.
+ Optional<SIMemOpInfo> getAtomicCmpxchgOrRmwInfo(
+ const MachineBasicBlock::iterator &MI) const;
};
-class SIMemoryLegalizer final : public MachineFunctionPass {
-private:
- /// \brief Machine module info.
- const AMDGPUMachineModuleInfo *MMI = nullptr;
+class SICacheControl {
+protected:
- /// \brief Instruction info.
+ /// Instruction info.
const SIInstrInfo *TII = nullptr;
- /// \brief Immediate for "vmcnt(0)".
- unsigned Vmcnt0Immediate = 0;
+ IsaInfo::IsaVersion IV;
- /// \brief Opcode for cache invalidation instruction (L1).
- unsigned Wbinvl1Opcode = 0;
+ SICacheControl(const GCNSubtarget &ST);
- /// \brief List of atomic pseudo instructions.
- std::list<MachineBasicBlock::iterator> AtomicPseudoMIs;
+public:
- /// \brief Sets named bit (BitName) to "true" if present in \p MI. Returns
- /// true if \p MI is modified, false otherwise.
- template <uint16_t BitName>
- bool enableNamedBit(const MachineBasicBlock::iterator &MI) const {
- int BitIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), BitName);
- if (BitIdx == -1)
- return false;
+ /// Create a cache control for the subtarget \p ST.
+ static std::unique_ptr<SICacheControl> create(const GCNSubtarget &ST);
+
+ /// Update \p MI memory load instruction to bypass any caches up to
+ /// the \p Scope memory scope for address spaces \p
+ /// AddrSpace. Return true iff the instruction was modified.
+ virtual bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
+ SIAtomicScope Scope,
+ SIAtomicAddrSpace AddrSpace) const = 0;
+
+ /// Update \p MI memory instruction to indicate it is
+ /// nontemporal. Return true iff the instruction was modified.
+ virtual bool enableNonTemporal(const MachineBasicBlock::iterator &MI)
+ const = 0;
+
+ /// Inserts any necessary instructions at position \p Pos relative
+ /// to instruction \p MI to ensure any caches associated with
+ /// address spaces \p AddrSpace for memory scopes up to memory scope
+ /// \p Scope are invalidated. Returns true iff any instructions
+ /// inserted.
+ virtual bool insertCacheInvalidate(MachineBasicBlock::iterator &MI,
+ SIAtomicScope Scope,
+ SIAtomicAddrSpace AddrSpace,
+ Position Pos) const = 0;
+
+ /// Inserts any necessary instructions at position \p Pos relative
+ /// to instruction \p MI to ensure memory instructions of kind \p Op
+ /// associated with address spaces \p AddrSpace have completed as
+ /// observed by other memory instructions executing in memory scope
+ /// \p Scope. \p IsCrossAddrSpaceOrdering indicates if the memory
+ /// ordering is between address spaces. Returns true iff any
+ /// instructions inserted.
+ virtual bool insertWait(MachineBasicBlock::iterator &MI,
+ SIAtomicScope Scope,
+ SIAtomicAddrSpace AddrSpace,
+ SIMemOp Op,
+ bool IsCrossAddrSpaceOrdering,
+ Position Pos) const = 0;
+
+ /// Virtual destructor to allow derived classes to be deleted.
+ virtual ~SICacheControl() = default;
- MachineOperand &Bit = MI->getOperand(BitIdx);
- if (Bit.getImm() != 0)
- return false;
+};
- Bit.setImm(1);
- return true;
- }
+class SIGfx6CacheControl : public SICacheControl {
+protected:
- /// \brief Sets GLC bit to "true" if present in \p MI. Returns true if \p MI
+ /// Sets GLC bit to "true" if present in \p MI. Returns true if \p MI
/// is modified, false otherwise.
bool enableGLCBit(const MachineBasicBlock::iterator &MI) const {
return enableNamedBit<AMDGPU::OpName::glc>(MI);
}
- /// \brief Sets SLC bit to "true" if present in \p MI. Returns true if \p MI
+ /// Sets SLC bit to "true" if present in \p MI. Returns true if \p MI
/// is modified, false otherwise.
bool enableSLCBit(const MachineBasicBlock::iterator &MI) const {
return enableNamedBit<AMDGPU::OpName::slc>(MI);
}
- /// \brief Inserts "buffer_wbinvl1_vol" instruction \p Before or after \p MI.
- /// Always returns true.
- bool insertBufferWbinvl1Vol(MachineBasicBlock::iterator &MI,
- bool Before = true) const;
- /// \brief Inserts "s_waitcnt vmcnt(0)" instruction \p Before or after \p MI.
- /// Always returns true.
- bool insertWaitcntVmcnt0(MachineBasicBlock::iterator &MI,
- bool Before = true) const;
+public:
+
+ SIGfx6CacheControl(const GCNSubtarget &ST) : SICacheControl(ST) {};
+
+ bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
+ SIAtomicScope Scope,
+ SIAtomicAddrSpace AddrSpace) const override;
+
+ bool enableNonTemporal(const MachineBasicBlock::iterator &MI) const override;
+
+ bool insertCacheInvalidate(MachineBasicBlock::iterator &MI,
+ SIAtomicScope Scope,
+ SIAtomicAddrSpace AddrSpace,
+ Position Pos) const override;
+
+ bool insertWait(MachineBasicBlock::iterator &MI,
+ SIAtomicScope Scope,
+ SIAtomicAddrSpace AddrSpace,
+ SIMemOp Op,
+ bool IsCrossAddrSpaceOrdering,
+ Position Pos) const override;
+};
+
+class SIGfx7CacheControl : public SIGfx6CacheControl {
+public:
- /// \brief Removes all processed atomic pseudo instructions from the current
+ SIGfx7CacheControl(const GCNSubtarget &ST) : SIGfx6CacheControl(ST) {};
+
+ bool insertCacheInvalidate(MachineBasicBlock::iterator &MI,
+ SIAtomicScope Scope,
+ SIAtomicAddrSpace AddrSpace,
+ Position Pos) const override;
+
+};
+
+class SIMemoryLegalizer final : public MachineFunctionPass {
+private:
+
+ /// Cache Control.
+ std::unique_ptr<SICacheControl> CC = nullptr;
+
+ /// List of atomic pseudo instructions.
+ std::list<MachineBasicBlock::iterator> AtomicPseudoMIs;
+
+ /// Return true iff instruction \p MI is an atomic instruction that
+ /// returns a result.
+ bool isAtomicRet(const MachineInstr &MI) const {
+ return AMDGPU::getAtomicNoRetOp(MI.getOpcode()) != -1;
+ }
+
+ /// Removes all processed atomic pseudo instructions from the current
/// function. Returns true if current function is modified, false otherwise.
bool removeAtomicPseudoMIs();
- /// \brief Expands load operation \p MI. Returns true if instructions are
+ /// Expands load operation \p MI. Returns true if instructions are
/// added/deleted or \p MI is modified, false otherwise.
bool expandLoad(const SIMemOpInfo &MOI,
MachineBasicBlock::iterator &MI);
- /// \brief Expands store operation \p MI. Returns true if instructions are
+ /// Expands store operation \p MI. Returns true if instructions are
/// added/deleted or \p MI is modified, false otherwise.
bool expandStore(const SIMemOpInfo &MOI,
MachineBasicBlock::iterator &MI);
- /// \brief Expands atomic fence operation \p MI. Returns true if
+ /// Expands atomic fence operation \p MI. Returns true if
/// instructions are added/deleted or \p MI is modified, false otherwise.
bool expandAtomicFence(const SIMemOpInfo &MOI,
MachineBasicBlock::iterator &MI);
- /// \brief Expands atomic cmpxchg operation \p MI. Returns true if
- /// instructions are added/deleted or \p MI is modified, false otherwise.
- bool expandAtomicCmpxchg(const SIMemOpInfo &MOI,
- MachineBasicBlock::iterator &MI);
- /// \brief Expands atomic rmw operation \p MI. Returns true if
+ /// Expands atomic cmpxchg or rmw operation \p MI. Returns true if
/// instructions are added/deleted or \p MI is modified, false otherwise.
- bool expandAtomicRmw(const SIMemOpInfo &MOI,
- MachineBasicBlock::iterator &MI);
+ bool expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
+ MachineBasicBlock::iterator &MI);
public:
static char ID;
@@ -218,48 +410,129 @@ public:
} // end namespace anonymous
-/* static */
-Optional<SIMemOpInfo> SIMemOpInfo::constructFromMIWithMMO(
- const MachineBasicBlock::iterator &MI) {
- assert(MI->getNumMemOperands() > 0);
+void SIMemOpAccess::reportUnsupported(const MachineBasicBlock::iterator &MI,
+ const char *Msg) const {
+ const Function &Func = MI->getParent()->getParent()->getFunction();
+ DiagnosticInfoUnsupported Diag(Func, Msg, MI->getDebugLoc());
+ Func.getContext().diagnose(Diag);
+}
- const MachineFunction *MF = MI->getParent()->getParent();
- const AMDGPUMachineModuleInfo *MMI =
- &MF->getMMI().getObjFileInfo<AMDGPUMachineModuleInfo>();
+Optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>>
+SIMemOpAccess::toSIAtomicScope(SyncScope::ID SSID,
+ SIAtomicAddrSpace InstrScope) const {
+ /// TODO: For now assume the OpenCL memory model, which treats each
+ /// address space as having a separate happens-before relation, and
+ /// so an instruction only has ordering with respect to the address
+ /// space it accesses, and if it accesses multiple address spaces it
+ /// does not require ordering of operations in different address
+ /// spaces.
+ if (SSID == SyncScope::System)
+ return std::make_tuple(SIAtomicScope::SYSTEM,
+ SIAtomicAddrSpace::ATOMIC & InstrScope,
+ false);
+ if (SSID == MMI->getAgentSSID())
+ return std::make_tuple(SIAtomicScope::AGENT,
+ SIAtomicAddrSpace::ATOMIC & InstrScope,
+ false);
+ if (SSID == MMI->getWorkgroupSSID())
+ return std::make_tuple(SIAtomicScope::WORKGROUP,
+ SIAtomicAddrSpace::ATOMIC & InstrScope,
+ false);
+ if (SSID == MMI->getWavefrontSSID())
+ return std::make_tuple(SIAtomicScope::WAVEFRONT,
+ SIAtomicAddrSpace::ATOMIC & InstrScope,
+ false);
+ if (SSID == SyncScope::SingleThread)
+ return std::make_tuple(SIAtomicScope::SINGLETHREAD,
+ SIAtomicAddrSpace::ATOMIC & InstrScope,
+ false);
+ /// TODO: To support the HSA memory model, additional memory scopes
+ /// that do require cross address space ordering need to be added.
+ return None;
+}
+
+SIAtomicAddrSpace SIMemOpAccess::toSIAtomicAddrSpace(unsigned AS) const {
+ if (AS == SIAddrSpaceInfo.FLAT_ADDRESS)
+ return SIAtomicAddrSpace::FLAT;
+ if (AS == SIAddrSpaceInfo.GLOBAL_ADDRESS)
+ return SIAtomicAddrSpace::GLOBAL;
+ if (AS == SIAddrSpaceInfo.LOCAL_ADDRESS)
+ return SIAtomicAddrSpace::LDS;
+ if (AS == SIAddrSpaceInfo.PRIVATE_ADDRESS)
+ return SIAtomicAddrSpace::SCRATCH;
+ if (AS == SIAddrSpaceInfo.REGION_ADDRESS)
+ return SIAtomicAddrSpace::GDS;
+
+ return SIAtomicAddrSpace::OTHER;
+}
+
+SIMemOpAccess::SIMemOpAccess(MachineFunction &MF) {
+ SIAddrSpaceInfo = getAMDGPUAS(MF.getTarget());
+ MMI = &MF.getMMI().getObjFileInfo<AMDGPUMachineModuleInfo>();
+}
+
+Optional<SIMemOpInfo> SIMemOpAccess::constructFromMIWithMMO(
+ const MachineBasicBlock::iterator &MI) const {
+ assert(MI->getNumMemOperands() > 0);
SyncScope::ID SSID = SyncScope::SingleThread;
AtomicOrdering Ordering = AtomicOrdering::NotAtomic;
AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic;
+ SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE;
bool IsNonTemporal = true;
// Validator should check whether or not MMOs cover the entire set of
// locations accessed by the memory instruction.
for (const auto &MMO : MI->memoperands()) {
- const auto &IsSyncScopeInclusion =
- MMI->isSyncScopeInclusion(SSID, MMO->getSyncScopeID());
- if (!IsSyncScopeInclusion) {
- reportUnknownSyncScope(MI);
- return None;
- }
-
- SSID = IsSyncScopeInclusion.getValue() ? SSID : MMO->getSyncScopeID();
- Ordering =
- isStrongerThan(Ordering, MMO->getOrdering()) ?
- Ordering : MMO->getOrdering();
- FailureOrdering =
- isStrongerThan(FailureOrdering, MMO->getFailureOrdering()) ?
- FailureOrdering : MMO->getFailureOrdering();
+ IsNonTemporal &= MMO->isNonTemporal();
+ InstrAddrSpace |=
+ toSIAtomicAddrSpace(MMO->getPointerInfo().getAddrSpace());
+ AtomicOrdering OpOrdering = MMO->getOrdering();
+ if (OpOrdering != AtomicOrdering::NotAtomic) {
+ const auto &IsSyncScopeInclusion =
+ MMI->isSyncScopeInclusion(SSID, MMO->getSyncScopeID());
+ if (!IsSyncScopeInclusion) {
+ reportUnsupported(MI,
+ "Unsupported non-inclusive atomic synchronization scope");
+ return None;
+ }
- if (!(MMO->getFlags() & MachineMemOperand::MONonTemporal))
- IsNonTemporal = false;
+ SSID = IsSyncScopeInclusion.getValue() ? SSID : MMO->getSyncScopeID();
+ Ordering =
+ isStrongerThan(Ordering, OpOrdering) ?
+ Ordering : MMO->getOrdering();
+ assert(MMO->getFailureOrdering() != AtomicOrdering::Release &&
+ MMO->getFailureOrdering() != AtomicOrdering::AcquireRelease);
+ FailureOrdering =
+ isStrongerThan(FailureOrdering, MMO->getFailureOrdering()) ?
+ FailureOrdering : MMO->getFailureOrdering();
+ }
}
- return SIMemOpInfo(SSID, Ordering, FailureOrdering, IsNonTemporal);
+ SIAtomicScope Scope = SIAtomicScope::NONE;
+ SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
+ bool IsCrossAddressSpaceOrdering = false;
+ if (Ordering != AtomicOrdering::NotAtomic) {
+ auto ScopeOrNone = toSIAtomicScope(SSID, InstrAddrSpace);
+ if (!ScopeOrNone) {
+ reportUnsupported(MI, "Unsupported atomic synchronization scope");
+ return None;
+ }
+ std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) =
+ ScopeOrNone.getValue();
+ if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) ||
+ ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace)) {
+ reportUnsupported(MI, "Unsupported atomic address space");
+ return None;
+ }
+ }
+ return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace, InstrAddrSpace,
+ IsCrossAddressSpaceOrdering, FailureOrdering, IsNonTemporal);
}
-/* static */
-Optional<SIMemOpInfo> SIMemOpInfo::getLoadInfo(
- const MachineBasicBlock::iterator &MI) {
+Optional<SIMemOpInfo> SIMemOpAccess::getLoadInfo(
+ const MachineBasicBlock::iterator &MI) const {
assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
if (!(MI->mayLoad() && !MI->mayStore()))
@@ -267,15 +540,13 @@ Optional<SIMemOpInfo> SIMemOpInfo::getLoadInfo(
// Be conservative if there are no memory operands.
if (MI->getNumMemOperands() == 0)
- return SIMemOpInfo(SyncScope::System,
- AtomicOrdering::SequentiallyConsistent);
+ return SIMemOpInfo();
- return SIMemOpInfo::constructFromMIWithMMO(MI);
+ return constructFromMIWithMMO(MI);
}
-/* static */
-Optional<SIMemOpInfo> SIMemOpInfo::getStoreInfo(
- const MachineBasicBlock::iterator &MI) {
+Optional<SIMemOpInfo> SIMemOpAccess::getStoreInfo(
+ const MachineBasicBlock::iterator &MI) const {
assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
if (!(!MI->mayLoad() && MI->mayStore()))
@@ -283,30 +554,46 @@ Optional<SIMemOpInfo> SIMemOpInfo::getStoreInfo(
// Be conservative if there are no memory operands.
if (MI->getNumMemOperands() == 0)
- return SIMemOpInfo(SyncScope::System,
- AtomicOrdering::SequentiallyConsistent);
+ return SIMemOpInfo();
- return SIMemOpInfo::constructFromMIWithMMO(MI);
+ return constructFromMIWithMMO(MI);
}
-/* static */
-Optional<SIMemOpInfo> SIMemOpInfo::getAtomicFenceInfo(
- const MachineBasicBlock::iterator &MI) {
+Optional<SIMemOpInfo> SIMemOpAccess::getAtomicFenceInfo(
+ const MachineBasicBlock::iterator &MI) const {
assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
if (MI->getOpcode() != AMDGPU::ATOMIC_FENCE)
return None;
- SyncScope::ID SSID =
- static_cast<SyncScope::ID>(MI->getOperand(1).getImm());
AtomicOrdering Ordering =
- static_cast<AtomicOrdering>(MI->getOperand(0).getImm());
- return SIMemOpInfo(SSID, Ordering);
+ static_cast<AtomicOrdering>(MI->getOperand(0).getImm());
+
+ SyncScope::ID SSID = static_cast<SyncScope::ID>(MI->getOperand(1).getImm());
+ auto ScopeOrNone = toSIAtomicScope(SSID, SIAtomicAddrSpace::ATOMIC);
+ if (!ScopeOrNone) {
+ reportUnsupported(MI, "Unsupported atomic synchronization scope");
+ return None;
+ }
+
+ SIAtomicScope Scope = SIAtomicScope::NONE;
+ SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
+ bool IsCrossAddressSpaceOrdering = false;
+ std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) =
+ ScopeOrNone.getValue();
+
+ if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) ||
+ ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace)) {
+ reportUnsupported(MI, "Unsupported atomic address space");
+ return None;
+ }
+
+ return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace, SIAtomicAddrSpace::ATOMIC,
+ IsCrossAddressSpaceOrdering);
}
-/* static */
-Optional<SIMemOpInfo> SIMemOpInfo::getAtomicCmpxchgInfo(
- const MachineBasicBlock::iterator &MI) {
+Optional<SIMemOpInfo> SIMemOpAccess::getAtomicCmpxchgOrRmwInfo(
+ const MachineBasicBlock::iterator &MI) const {
assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
if (!(MI->mayLoad() && MI->mayStore()))
@@ -314,68 +601,251 @@ Optional<SIMemOpInfo> SIMemOpInfo::getAtomicCmpxchgInfo(
// Be conservative if there are no memory operands.
if (MI->getNumMemOperands() == 0)
- return SIMemOpInfo(SyncScope::System,
- AtomicOrdering::SequentiallyConsistent,
- AtomicOrdering::SequentiallyConsistent);
+ return SIMemOpInfo();
- return SIMemOpInfo::constructFromMIWithMMO(MI);
+ return constructFromMIWithMMO(MI);
+}
+
+SICacheControl::SICacheControl(const GCNSubtarget &ST) {
+ TII = ST.getInstrInfo();
+ IV = IsaInfo::getIsaVersion(ST.getFeatureBits());
}
/* static */
-Optional<SIMemOpInfo> SIMemOpInfo::getAtomicRmwInfo(
- const MachineBasicBlock::iterator &MI) {
- assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
+std::unique_ptr<SICacheControl> SICacheControl::create(const GCNSubtarget &ST) {
+ GCNSubtarget::Generation Generation = ST.getGeneration();
+ if (Generation <= AMDGPUSubtarget::SOUTHERN_ISLANDS)
+ return make_unique<SIGfx6CacheControl>(ST);
+ return make_unique<SIGfx7CacheControl>(ST);
+}
- if (!(MI->mayLoad() && MI->mayStore()))
- return None;
+bool SIGfx6CacheControl::enableLoadCacheBypass(
+ const MachineBasicBlock::iterator &MI,
+ SIAtomicScope Scope,
+ SIAtomicAddrSpace AddrSpace) const {
+ assert(MI->mayLoad() && !MI->mayStore());
+ bool Changed = false;
- // Be conservative if there are no memory operands.
- if (MI->getNumMemOperands() == 0)
- return SIMemOpInfo(SyncScope::System,
- AtomicOrdering::SequentiallyConsistent);
+ if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
+ /// TODO: Do not set glc for rmw atomic operations as they
+ /// implicitly bypass the L1 cache.
- return SIMemOpInfo::constructFromMIWithMMO(MI);
+ switch (Scope) {
+ case SIAtomicScope::SYSTEM:
+ case SIAtomicScope::AGENT:
+ Changed |= enableGLCBit(MI);
+ break;
+ case SIAtomicScope::WORKGROUP:
+ case SIAtomicScope::WAVEFRONT:
+ case SIAtomicScope::SINGLETHREAD:
+ // No cache to bypass.
+ break;
+ default:
+ llvm_unreachable("Unsupported synchronization scope");
+ }
+ }
+
+ /// The scratch address space does not need the global memory caches
+ /// to be bypassed as all memory operations by the same thread are
+ /// sequentially consistent, and no other thread can access scratch
+ /// memory.
+
+ /// Other address spaces do not have a cache.
+
+ return Changed;
}
-/* static */
-void SIMemOpInfo::reportUnknownSyncScope(
- const MachineBasicBlock::iterator &MI) {
- DiagnosticInfoUnsupported Diag(MI->getParent()->getParent()->getFunction(),
- "Unsupported synchronization scope");
- LLVMContext *CTX = &MI->getParent()->getParent()->getFunction().getContext();
- CTX->diagnose(Diag);
+bool SIGfx6CacheControl::enableNonTemporal(
+ const MachineBasicBlock::iterator &MI) const {
+ assert(MI->mayLoad() ^ MI->mayStore());
+ bool Changed = false;
+
+ /// TODO: Do not enableGLCBit if rmw atomic.
+ Changed |= enableGLCBit(MI);
+ Changed |= enableSLCBit(MI);
+
+ return Changed;
}
-bool SIMemoryLegalizer::insertBufferWbinvl1Vol(MachineBasicBlock::iterator &MI,
- bool Before) const {
+bool SIGfx6CacheControl::insertCacheInvalidate(MachineBasicBlock::iterator &MI,
+ SIAtomicScope Scope,
+ SIAtomicAddrSpace AddrSpace,
+ Position Pos) const {
+ bool Changed = false;
+
MachineBasicBlock &MBB = *MI->getParent();
DebugLoc DL = MI->getDebugLoc();
- if (!Before)
+ if (Pos == Position::AFTER)
++MI;
- BuildMI(MBB, MI, DL, TII->get(Wbinvl1Opcode));
+ if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
+ switch (Scope) {
+ case SIAtomicScope::SYSTEM:
+ case SIAtomicScope::AGENT:
+ BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBINVL1));
+ Changed = true;
+ break;
+ case SIAtomicScope::WORKGROUP:
+ case SIAtomicScope::WAVEFRONT:
+ case SIAtomicScope::SINGLETHREAD:
+ // No cache to invalidate.
+ break;
+ default:
+ llvm_unreachable("Unsupported synchronization scope");
+ }
+ }
+
+ /// The scratch address space does not need the global memory cache
+ /// to be flushed as all memory operations by the same thread are
+ /// sequentially consistent, and no other thread can access scratch
+ /// memory.
+
+ /// Other address spaces do not have a cache.
- if (!Before)
+ if (Pos == Position::AFTER)
--MI;
- return true;
+ return Changed;
}
-bool SIMemoryLegalizer::insertWaitcntVmcnt0(MachineBasicBlock::iterator &MI,
- bool Before) const {
+bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI,
+ SIAtomicScope Scope,
+ SIAtomicAddrSpace AddrSpace,
+ SIMemOp Op,
+ bool IsCrossAddrSpaceOrdering,
+ Position Pos) const {
+ bool Changed = false;
+
MachineBasicBlock &MBB = *MI->getParent();
DebugLoc DL = MI->getDebugLoc();
- if (!Before)
+ if (Pos == Position::AFTER)
++MI;
- BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(Vmcnt0Immediate);
+ bool VMCnt = false;
+ bool LGKMCnt = false;
+ bool EXPCnt = false;
+
+ if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
+ switch (Scope) {
+ case SIAtomicScope::SYSTEM:
+ case SIAtomicScope::AGENT:
+ VMCnt = true;
+ break;
+ case SIAtomicScope::WORKGROUP:
+ case SIAtomicScope::WAVEFRONT:
+ case SIAtomicScope::SINGLETHREAD:
+ // The L1 cache keeps all memory operations in order for
+ // wavefronts in the same work-group.
+ break;
+ default:
+ llvm_unreachable("Unsupported synchronization scope");
+ }
+ }
+
+ if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
+ switch (Scope) {
+ case SIAtomicScope::SYSTEM:
+ case SIAtomicScope::AGENT:
+ case SIAtomicScope::WORKGROUP:
+ // If no cross address space ordering then an LDS waitcnt is not
+ // needed as LDS operations for all waves are executed in a
+ // total global ordering as observed by all waves. Required if
+ // also synchronizing with global/GDS memory as LDS operations
+ // could be reordered with respect to later global/GDS memory
+ // operations of the same wave.
+ LGKMCnt = IsCrossAddrSpaceOrdering;
+ break;
+ case SIAtomicScope::WAVEFRONT:
+ case SIAtomicScope::SINGLETHREAD:
+ // The LDS keeps all memory operations in order for
+ // the same wavefront.
+ break;
+ default:
+ llvm_unreachable("Unsupported synchronization scope");
+ }
+ }
+
+ if ((AddrSpace & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE) {
+ switch (Scope) {
+ case SIAtomicScope::SYSTEM:
+ case SIAtomicScope::AGENT:
+ // If no cross address space ordering then a GDS waitcnt is not
+ // needed as GDS operations for all waves are executed in a
+ // total global ordering as observed by all waves. Required if
+ // also synchronizing with global/LDS memory as GDS operations
+ // could be reordered with respect to later global/LDS memory
+ // operations of the same wave.
+ EXPCnt = IsCrossAddrSpaceOrdering;
+ break;
+ case SIAtomicScope::WORKGROUP:
+ case SIAtomicScope::WAVEFRONT:
+ case SIAtomicScope::SINGLETHREAD:
+ // The GDS keeps all memory operations in order for
+ // the same work-group.
+ break;
+ default:
+ llvm_unreachable("Unsupported synchronization scope");
+ }
+ }
- if (!Before)
+ if (VMCnt || LGKMCnt || EXPCnt) {
+ unsigned WaitCntImmediate =
+ AMDGPU::encodeWaitcnt(IV,
+ VMCnt ? 0 : getVmcntBitMask(IV),
+ EXPCnt ? 0 : getExpcntBitMask(IV),
+ LGKMCnt ? 0 : getLgkmcntBitMask(IV));
+ BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(WaitCntImmediate);
+ Changed = true;
+ }
+
+ if (Pos == Position::AFTER)
--MI;
- return true;
+ return Changed;
+}
+
+bool SIGfx7CacheControl::insertCacheInvalidate(MachineBasicBlock::iterator &MI,
+ SIAtomicScope Scope,
+ SIAtomicAddrSpace AddrSpace,
+ Position Pos) const {
+ bool Changed = false;
+
+ MachineBasicBlock &MBB = *MI->getParent();
+ DebugLoc DL = MI->getDebugLoc();
+
+ if (Pos == Position::AFTER)
+ ++MI;
+
+ if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
+ switch (Scope) {
+ case SIAtomicScope::SYSTEM:
+ case SIAtomicScope::AGENT:
+ BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBINVL1_VOL));
+ Changed = true;
+ break;
+ case SIAtomicScope::WORKGROUP:
+ case SIAtomicScope::WAVEFRONT:
+ case SIAtomicScope::SINGLETHREAD:
+ // No cache to invalidate.
+ break;
+ default:
+ llvm_unreachable("Unsupported synchronization scope");
+ }
+ }
+
+ /// The scratch address space does not need the global memory cache
+ /// to be flushed as all memory operations by the same thread are
+ /// sequentially consistent, and no other thread can access scratch
+ /// memory.
+
+ /// Other address spaces do not have a cache.
+
+ if (Pos == Position::AFTER)
+ --MI;
+
+ return Changed;
}
bool SIMemoryLegalizer::removeAtomicPseudoMIs() {
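For reference: insertWait above builds a single S_WAITCNT immediate by zeroing only the counters that must drain and leaving the others at their "do not wait" maximum (AMDGPU::encodeWaitcnt with the get*cntBitMask helpers). The standalone sketch below shows the general pack-and-mask shape of that encoding with made-up field widths and shifts; it is not the real ISA layout and uses no LLVM helpers.

    #include <cstdint>
    #include <cstdio>

    // Illustrative counter field layout (widths/shifts are NOT the real ISA encoding).
    constexpr unsigned VmcntWidth = 4, ExpcntWidth = 3, LgkmcntWidth = 4;
    constexpr unsigned VmcntShift = 0, ExpcntShift = 4, LgkmcntShift = 8;

    constexpr uint32_t maxVal(unsigned Width) { return (1u << Width) - 1; }

    // Same shape as insertWait: a counter that must drain is forced to 0,
    // every other counter keeps its maximum ("do not wait") value.
    uint32_t encodeWaitSketch(bool WaitVM, bool WaitEXP, bool WaitLGKM) {
      uint32_t Vm   = WaitVM   ? 0 : maxVal(VmcntWidth);
      uint32_t Exp  = WaitEXP  ? 0 : maxVal(ExpcntWidth);
      uint32_t Lgkm = WaitLGKM ? 0 : maxVal(LgkmcntWidth);
      return (Vm << VmcntShift) | (Exp << ExpcntShift) | (Lgkm << LgkmcntShift);
    }

    int main() {
      // Wait only on vector memory, as for a global acquire at agent scope.
      std::printf("imm = 0x%x\n", encodeWaitSketch(true, false, false));
    }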
@@ -396,37 +866,38 @@ bool SIMemoryLegalizer::expandLoad(const SIMemOpInfo &MOI,
bool Changed = false;
if (MOI.isAtomic()) {
- if (MOI.getSSID() == SyncScope::System ||
- MOI.getSSID() == MMI->getAgentSSID()) {
- if (MOI.getOrdering() == AtomicOrdering::Acquire ||
- MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
- Changed |= enableGLCBit(MI);
-
- if (MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
- Changed |= insertWaitcntVmcnt0(MI);
-
- if (MOI.getOrdering() == AtomicOrdering::Acquire ||
- MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
- Changed |= insertWaitcntVmcnt0(MI, false);
- Changed |= insertBufferWbinvl1Vol(MI, false);
- }
-
- return Changed;
+ if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
+ MOI.getOrdering() == AtomicOrdering::Acquire ||
+ MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
+ Changed |= CC->enableLoadCacheBypass(MI, MOI.getScope(),
+ MOI.getOrderingAddrSpace());
}
- if (MOI.getSSID() == SyncScope::SingleThread ||
- MOI.getSSID() == MMI->getWorkgroupSSID() ||
- MOI.getSSID() == MMI->getWavefrontSSID()) {
- return Changed;
+ if (MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
+ Changed |= CC->insertWait(MI, MOI.getScope(),
+ MOI.getOrderingAddrSpace(),
+ SIMemOp::LOAD | SIMemOp::STORE,
+ MOI.getIsCrossAddressSpaceOrdering(),
+ Position::BEFORE);
+
+ if (MOI.getOrdering() == AtomicOrdering::Acquire ||
+ MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
+ Changed |= CC->insertWait(MI, MOI.getScope(),
+ MOI.getInstrAddrSpace(),
+ SIMemOp::LOAD,
+ MOI.getIsCrossAddressSpaceOrdering(),
+ Position::AFTER);
+ Changed |= CC->insertCacheInvalidate(MI, MOI.getScope(),
+ MOI.getOrderingAddrSpace(),
+ Position::AFTER);
}
- llvm_unreachable("Unsupported synchronization scope");
+ return Changed;
}
// Atomic instructions do not have the nontemporal attribute.
if (MOI.isNonTemporal()) {
- Changed |= enableGLCBit(MI);
- Changed |= enableSLCBit(MI);
+ Changed |= CC->enableNonTemporal(MI);
return Changed;
}
@@ -440,28 +911,20 @@ bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI,
bool Changed = false;
if (MOI.isAtomic()) {
- if (MOI.getSSID() == SyncScope::System ||
- MOI.getSSID() == MMI->getAgentSSID()) {
- if (MOI.getOrdering() == AtomicOrdering::Release ||
- MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
- Changed |= insertWaitcntVmcnt0(MI);
+ if (MOI.getOrdering() == AtomicOrdering::Release ||
+ MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
+ Changed |= CC->insertWait(MI, MOI.getScope(),
+ MOI.getOrderingAddrSpace(),
+ SIMemOp::LOAD | SIMemOp::STORE,
+ MOI.getIsCrossAddressSpaceOrdering(),
+ Position::BEFORE);
- return Changed;
- }
-
- if (MOI.getSSID() == SyncScope::SingleThread ||
- MOI.getSSID() == MMI->getWorkgroupSSID() ||
- MOI.getSSID() == MMI->getWavefrontSSID()) {
- return Changed;
- }
-
- llvm_unreachable("Unsupported synchronization scope");
+ return Changed;
}
// Atomic instructions do not have the nontemporal attribute.
if (MOI.isNonTemporal()) {
- Changed |= enableGLCBit(MI);
- Changed |= enableSLCBit(MI);
+ Changed |= CC->enableNonTemporal(MI);
return Changed;
}
@@ -472,111 +935,74 @@ bool SIMemoryLegalizer::expandAtomicFence(const SIMemOpInfo &MOI,
MachineBasicBlock::iterator &MI) {
assert(MI->getOpcode() == AMDGPU::ATOMIC_FENCE);
+ AtomicPseudoMIs.push_back(MI);
bool Changed = false;
if (MOI.isAtomic()) {
- if (MOI.getSSID() == SyncScope::System ||
- MOI.getSSID() == MMI->getAgentSSID()) {
- if (MOI.getOrdering() == AtomicOrdering::Acquire ||
- MOI.getOrdering() == AtomicOrdering::Release ||
- MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
- MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
- Changed |= insertWaitcntVmcnt0(MI);
-
- if (MOI.getOrdering() == AtomicOrdering::Acquire ||
- MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
- MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
- Changed |= insertBufferWbinvl1Vol(MI);
-
- AtomicPseudoMIs.push_back(MI);
- return Changed;
- }
+ if (MOI.getOrdering() == AtomicOrdering::Acquire ||
+ MOI.getOrdering() == AtomicOrdering::Release ||
+ MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
+ MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
+ /// TODO: This relies on a barrier always generating a waitcnt
+ /// for LDS to ensure it is not reordered with the completion of
+ /// the preceding LDS operations. If the barrier had a memory
+ /// ordering and memory scope, then the library would not need to
+ /// generate a fence. Support for the barrier could be added in
+ /// this file. SIInsertWaitcnt.cpp could then stop unconditionally
+ /// adding a waitcnt before an S_BARRIER.
+ Changed |= CC->insertWait(MI, MOI.getScope(),
+ MOI.getOrderingAddrSpace(),
+ SIMemOp::LOAD | SIMemOp::STORE,
+ MOI.getIsCrossAddressSpaceOrdering(),
+ Position::BEFORE);
+
+ if (MOI.getOrdering() == AtomicOrdering::Acquire ||
+ MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
+ MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
+ Changed |= CC->insertCacheInvalidate(MI, MOI.getScope(),
+ MOI.getOrderingAddrSpace(),
+ Position::BEFORE);
- if (MOI.getSSID() == SyncScope::SingleThread ||
- MOI.getSSID() == MMI->getWorkgroupSSID() ||
- MOI.getSSID() == MMI->getWavefrontSSID()) {
- AtomicPseudoMIs.push_back(MI);
- return Changed;
- }
-
- SIMemOpInfo::reportUnknownSyncScope(MI);
- }
-
- return Changed;
-}
-
-bool SIMemoryLegalizer::expandAtomicCmpxchg(const SIMemOpInfo &MOI,
- MachineBasicBlock::iterator &MI) {
- assert(MI->mayLoad() && MI->mayStore());
-
- bool Changed = false;
-
- if (MOI.isAtomic()) {
- if (MOI.getSSID() == SyncScope::System ||
- MOI.getSSID() == MMI->getAgentSSID()) {
- if (MOI.getOrdering() == AtomicOrdering::Release ||
- MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
- MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent ||
- MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent)
- Changed |= insertWaitcntVmcnt0(MI);
-
- if (MOI.getOrdering() == AtomicOrdering::Acquire ||
- MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
- MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent ||
- MOI.getFailureOrdering() == AtomicOrdering::Acquire ||
- MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent) {
- Changed |= insertWaitcntVmcnt0(MI, false);
- Changed |= insertBufferWbinvl1Vol(MI, false);
- }
-
- return Changed;
- }
-
- if (MOI.getSSID() == SyncScope::SingleThread ||
- MOI.getSSID() == MMI->getWorkgroupSSID() ||
- MOI.getSSID() == MMI->getWavefrontSSID()) {
- Changed |= enableGLCBit(MI);
- return Changed;
- }
-
- llvm_unreachable("Unsupported synchronization scope");
+ return Changed;
}
return Changed;
}
-bool SIMemoryLegalizer::expandAtomicRmw(const SIMemOpInfo &MOI,
- MachineBasicBlock::iterator &MI) {
+bool SIMemoryLegalizer::expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
+ MachineBasicBlock::iterator &MI) {
assert(MI->mayLoad() && MI->mayStore());
bool Changed = false;
if (MOI.isAtomic()) {
- if (MOI.getSSID() == SyncScope::System ||
- MOI.getSSID() == MMI->getAgentSSID()) {
- if (MOI.getOrdering() == AtomicOrdering::Release ||
- MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
- MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
- Changed |= insertWaitcntVmcnt0(MI);
-
- if (MOI.getOrdering() == AtomicOrdering::Acquire ||
- MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
- MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
- Changed |= insertWaitcntVmcnt0(MI, false);
- Changed |= insertBufferWbinvl1Vol(MI, false);
- }
-
- return Changed;
+ if (MOI.getOrdering() == AtomicOrdering::Release ||
+ MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
+ MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent ||
+ MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent)
+ Changed |= CC->insertWait(MI, MOI.getScope(),
+ MOI.getOrderingAddrSpace(),
+ SIMemOp::LOAD | SIMemOp::STORE,
+ MOI.getIsCrossAddressSpaceOrdering(),
+ Position::BEFORE);
+
+ if (MOI.getOrdering() == AtomicOrdering::Acquire ||
+ MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
+ MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent ||
+ MOI.getFailureOrdering() == AtomicOrdering::Acquire ||
+ MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent) {
+ Changed |= CC->insertWait(MI, MOI.getScope(),
+ MOI.getOrderingAddrSpace(),
+ isAtomicRet(*MI) ? SIMemOp::LOAD :
+ SIMemOp::STORE,
+ MOI.getIsCrossAddressSpaceOrdering(),
+ Position::AFTER);
+ Changed |= CC->insertCacheInvalidate(MI, MOI.getScope(),
+ MOI.getOrderingAddrSpace(),
+ Position::AFTER);
}
- if (MOI.getSSID() == SyncScope::SingleThread ||
- MOI.getSSID() == MMI->getWorkgroupSSID() ||
- MOI.getSSID() == MMI->getWavefrontSSID()) {
- Changed |= enableGLCBit(MI);
- return Changed;
- }
-
- llvm_unreachable("Unsupported synchronization scope");
+ return Changed;
}
return Changed;
@@ -584,32 +1010,23 @@ bool SIMemoryLegalizer::expandAtomicRmw(const SIMemOpInfo &MOI,
bool SIMemoryLegalizer::runOnMachineFunction(MachineFunction &MF) {
bool Changed = false;
- const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
- const IsaInfo::IsaVersion IV = IsaInfo::getIsaVersion(ST.getFeatureBits());
-
- MMI = &MF.getMMI().getObjFileInfo<AMDGPUMachineModuleInfo>();
- TII = ST.getInstrInfo();
- Vmcnt0Immediate =
- AMDGPU::encodeWaitcnt(IV, 0, getExpcntBitMask(IV), getLgkmcntBitMask(IV));
- Wbinvl1Opcode = ST.getGeneration() <= AMDGPUSubtarget::SOUTHERN_ISLANDS ?
- AMDGPU::BUFFER_WBINVL1 : AMDGPU::BUFFER_WBINVL1_VOL;
+ SIMemOpAccess MOA(MF);
+ CC = SICacheControl::create(MF.getSubtarget<GCNSubtarget>());
for (auto &MBB : MF) {
for (auto MI = MBB.begin(); MI != MBB.end(); ++MI) {
if (!(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic))
continue;
- if (const auto &MOI = SIMemOpInfo::getLoadInfo(MI))
+ if (const auto &MOI = MOA.getLoadInfo(MI))
Changed |= expandLoad(MOI.getValue(), MI);
- else if (const auto &MOI = SIMemOpInfo::getStoreInfo(MI))
+ else if (const auto &MOI = MOA.getStoreInfo(MI))
Changed |= expandStore(MOI.getValue(), MI);
- else if (const auto &MOI = SIMemOpInfo::getAtomicFenceInfo(MI))
+ else if (const auto &MOI = MOA.getAtomicFenceInfo(MI))
Changed |= expandAtomicFence(MOI.getValue(), MI);
- else if (const auto &MOI = SIMemOpInfo::getAtomicCmpxchgInfo(MI))
- Changed |= expandAtomicCmpxchg(MOI.getValue(), MI);
- else if (const auto &MOI = SIMemOpInfo::getAtomicRmwInfo(MI))
- Changed |= expandAtomicRmw(MOI.getValue(), MI);
+ else if (const auto &MOI = MOA.getAtomicCmpxchgOrRmwInfo(MI))
+ Changed |= expandAtomicCmpxchgOrRmw(MOI.getValue(), MI);
}
}
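For reference: the SIMemOp and SIAtomicAddrSpace flags added in this file lean on llvm/ADT/BitmaskEnum.h so that scoped enums can be ORed and masked like plain bit flags. The minimal sketch below reproduces the pattern with hand-written operators instead of the LLVM macros, to make the `(AddrSpace & GLOBAL) != NONE` style tests used throughout the pass easy to follow; the enumerator names mirror the patch, but the operators are only an illustration.

    #include <cstdint>

    enum class AddrSpace : uint32_t {
      NONE    = 0u,
      GLOBAL  = 1u << 0,
      LDS     = 1u << 1,
      SCRATCH = 1u << 2,
      GDS     = 1u << 3,
      FLAT    = GLOBAL | LDS | SCRATCH, // what a FLAT access may touch
    };

    // Hand-rolled stand-ins for what LLVM_MARK_AS_BITMASK_ENUM provides.
    constexpr AddrSpace operator|(AddrSpace A, AddrSpace B) {
      return AddrSpace(uint32_t(A) | uint32_t(B));
    }
    constexpr AddrSpace operator&(AddrSpace A, AddrSpace B) {
      return AddrSpace(uint32_t(A) & uint32_t(B));
    }

    // Same shape as the checks in SIGfx6CacheControl::insertWait.
    constexpr bool touchesGlobal(AddrSpace AS) {
      return (AS & AddrSpace::GLOBAL) != AddrSpace::NONE;
    }

    static_assert(touchesGlobal(AddrSpace::FLAT), "FLAT covers the global space");
    static_assert(!touchesGlobal(AddrSpace::LDS | AddrSpace::GDS), "LDS/GDS do not");

    int main() {}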
diff --git a/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp b/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp
index 2dc6f2702b3b..ebcad30a1866 100644
--- a/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp
+++ b/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp
@@ -10,6 +10,7 @@
#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
@@ -76,7 +77,7 @@ static unsigned isCopyToExec(const MachineInstr &MI) {
case AMDGPU::COPY:
case AMDGPU::S_MOV_B64: {
const MachineOperand &Dst = MI.getOperand(0);
- if (Dst.isReg() && Dst.getReg() == AMDGPU::EXEC)
+ if (Dst.isReg() && Dst.getReg() == AMDGPU::EXEC && MI.getOperand(1).isReg())
return MI.getOperand(1).getReg();
break;
}
@@ -208,7 +209,7 @@ bool SIOptimizeExecMasking::runOnMachineFunction(MachineFunction &MF) {
if (skipFunction(MF.getFunction()))
return false;
- const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
const SIRegisterInfo *TRI = ST.getRegisterInfo();
const SIInstrInfo *TII = ST.getInstrInfo();
@@ -243,11 +244,11 @@ bool SIOptimizeExecMasking::runOnMachineFunction(MachineFunction &MF) {
// Fold exec = COPY (S_AND_B64 reg, exec) -> exec = S_AND_B64 reg, exec
if (CopyToExecInst->getOperand(1).isKill() &&
isLogicalOpOnExec(*PrepareExecInst) == CopyToExec) {
- DEBUG(dbgs() << "Fold exec copy: " << *PrepareExecInst);
+ LLVM_DEBUG(dbgs() << "Fold exec copy: " << *PrepareExecInst);
PrepareExecInst->getOperand(0).setReg(AMDGPU::EXEC);
- DEBUG(dbgs() << "into: " << *PrepareExecInst << '\n');
+ LLVM_DEBUG(dbgs() << "into: " << *PrepareExecInst << '\n');
CopyToExecInst->eraseFromParent();
}
@@ -257,7 +258,7 @@ bool SIOptimizeExecMasking::runOnMachineFunction(MachineFunction &MF) {
if (isLiveOut(MBB, CopyToExec)) {
// The copied register is live out and has a second use in another block.
- DEBUG(dbgs() << "Exec copy source register is live out\n");
+ LLVM_DEBUG(dbgs() << "Exec copy source register is live out\n");
continue;
}
@@ -269,7 +270,7 @@ bool SIOptimizeExecMasking::runOnMachineFunction(MachineFunction &MF) {
= std::next(CopyFromExecInst->getIterator()), JE = I->getIterator();
J != JE; ++J) {
if (SaveExecInst && J->readsRegister(AMDGPU::EXEC, TRI)) {
- DEBUG(dbgs() << "exec read prevents saveexec: " << *J << '\n');
+ LLVM_DEBUG(dbgs() << "exec read prevents saveexec: " << *J << '\n');
// Make sure this is inserted after any VALU ops that may have been
// scheduled in between.
SaveExecInst = nullptr;
@@ -280,8 +281,8 @@ bool SIOptimizeExecMasking::runOnMachineFunction(MachineFunction &MF) {
if (J->modifiesRegister(CopyToExec, TRI)) {
if (SaveExecInst) {
- DEBUG(dbgs() << "Multiple instructions modify "
- << printReg(CopyToExec, TRI) << '\n');
+ LLVM_DEBUG(dbgs() << "Multiple instructions modify "
+ << printReg(CopyToExec, TRI) << '\n');
SaveExecInst = nullptr;
break;
}
@@ -292,10 +293,11 @@ bool SIOptimizeExecMasking::runOnMachineFunction(MachineFunction &MF) {
if (ReadsCopyFromExec) {
SaveExecInst = &*J;
- DEBUG(dbgs() << "Found save exec op: " << *SaveExecInst << '\n');
+ LLVM_DEBUG(dbgs() << "Found save exec op: " << *SaveExecInst << '\n');
continue;
} else {
- DEBUG(dbgs() << "Instruction does not read exec copy: " << *J << '\n');
+ LLVM_DEBUG(dbgs()
+ << "Instruction does not read exec copy: " << *J << '\n');
break;
}
} else if (ReadsCopyFromExec && !SaveExecInst) {
@@ -307,8 +309,8 @@ bool SIOptimizeExecMasking::runOnMachineFunction(MachineFunction &MF) {
// spill %sgpr0_sgpr1
// %sgpr2_sgpr3 = S_AND_B64 %sgpr0_sgpr1
//
- DEBUG(dbgs() << "Found second use of save inst candidate: "
- << *J << '\n');
+ LLVM_DEBUG(dbgs() << "Found second use of save inst candidate: " << *J
+ << '\n');
break;
}
@@ -321,7 +323,7 @@ bool SIOptimizeExecMasking::runOnMachineFunction(MachineFunction &MF) {
if (!SaveExecInst)
continue;
- DEBUG(dbgs() << "Insert save exec op: " << *SaveExecInst << '\n');
+ LLVM_DEBUG(dbgs() << "Insert save exec op: " << *SaveExecInst << '\n');
MachineOperand &Src0 = SaveExecInst->getOperand(1);
MachineOperand &Src1 = SaveExecInst->getOperand(2);
diff --git a/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp b/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp
index 83074773c495..7b678d12ba81 100644
--- a/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp
+++ b/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
//
/// \file
-/// \brief This pass removes redundant S_OR_B64 instructions enabling lanes in
+/// This pass removes redundant S_OR_B64 instructions enabling lanes in
/// the exec. If two SI_END_CF (lowered as S_OR_B64) come together without any
/// vector instructions between them we can only keep outer SI_END_CF, given
/// that CFG is structured and exec bits of the outer end statement are always
@@ -23,6 +23,7 @@
#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
@@ -106,7 +107,7 @@ bool SIOptimizeExecMaskingPreRA::runOnMachineFunction(MachineFunction &MF) {
if (skipFunction(MF.getFunction()))
return false;
- const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
const SIRegisterInfo *TRI = ST.getRegisterInfo();
const SIInstrInfo *TII = ST.getInstrInfo();
MachineRegisterInfo &MRI = MF.getRegInfo();
@@ -134,7 +135,7 @@ bool SIOptimizeExecMaskingPreRA::runOnMachineFunction(MachineFunction &MF) {
}
while (I != E) {
- if (I->isDebugValue()) {
+ if (I->isDebugInstr()) {
I = std::next(I);
continue;
}
@@ -143,7 +144,8 @@ bool SIOptimizeExecMaskingPreRA::runOnMachineFunction(MachineFunction &MF) {
I->hasUnmodeledSideEffects() || I->hasOrderedMemoryRef())
break;
- DEBUG(dbgs() << "Removing no effect instruction: " << *I << '\n');
+ LLVM_DEBUG(dbgs()
+ << "Removing no effect instruction: " << *I << '\n');
for (auto &Op : I->operands()) {
if (Op.isReg())
@@ -193,7 +195,7 @@ bool SIOptimizeExecMaskingPreRA::runOnMachineFunction(MachineFunction &MF) {
!getOrExecSource(*NextLead, *TII, MRI))
continue;
- DEBUG(dbgs() << "Redundant EXEC = S_OR_B64 found: " << *Lead << '\n');
+ LLVM_DEBUG(dbgs() << "Redundant EXEC = S_OR_B64 found: " << *Lead << '\n');
auto SaveExec = getOrExecSource(*Lead, *TII, MRI);
unsigned SaveExecReg = getOrNonExecReg(*Lead, *TII);
@@ -224,7 +226,7 @@ bool SIOptimizeExecMaskingPreRA::runOnMachineFunction(MachineFunction &MF) {
break;
}
- DEBUG(dbgs() << "Redundant EXEC COPY: " << *SaveExec << '\n');
+ LLVM_DEBUG(dbgs() << "Redundant EXEC COPY: " << *SaveExec << '\n');
}
if (SafeToReplace) {
diff --git a/lib/Target/AMDGPU/SIPeepholeSDWA.cpp b/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
index 5ed7fdf220bf..0e000b72962e 100644
--- a/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
+++ b/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
@@ -25,6 +25,7 @@
#include "SIDefines.h"
#include "SIInstrInfo.h"
#include "SIRegisterInfo.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/None.h"
#include "llvm/ADT/Optional.h"
@@ -39,6 +40,7 @@
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/Config/llvm-config.h"
#include "llvm/MC/LaneBitmask.h"
#include "llvm/MC/MCInstrDesc.h"
#include "llvm/Pass.h"
@@ -86,11 +88,11 @@ public:
}
bool runOnMachineFunction(MachineFunction &MF) override;
- void matchSDWAOperands(MachineFunction &MF);
+ void matchSDWAOperands(MachineBasicBlock &MBB);
std::unique_ptr<SDWAOperand> matchSDWAOperand(MachineInstr &MI);
- bool isConvertibleToSDWA(const MachineInstr &MI, const SISubtarget &ST) const;
+ bool isConvertibleToSDWA(const MachineInstr &MI, const GCNSubtarget &ST) const;
bool convertToSDWA(MachineInstr &MI, const SDWAOperandsVector &SDWAOperands);
- void legalizeScalarOperands(MachineInstr &MI, const SISubtarget &ST) const;
+ void legalizeScalarOperands(MachineInstr &MI, const GCNSubtarget &ST) const;
StringRef getPassName() const override { return "SI Peephole SDWA"; }
@@ -218,7 +220,7 @@ FunctionPass *llvm::createSIPeepholeSDWAPass() {
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
-static raw_ostream& operator<<(raw_ostream &OS, const SdwaSel &Sel) {
+static raw_ostream& operator<<(raw_ostream &OS, SdwaSel Sel) {
switch(Sel) {
case BYTE_0: OS << "BYTE_0"; break;
case BYTE_1: OS << "BYTE_1"; break;
@@ -366,18 +368,53 @@ MachineInstr *SDWASrcOperand::potentialToConvert(const SIInstrInfo *TII) {
bool SDWASrcOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) {
// Find operand in instruction that matches source operand and replace it with
// target operand. Set corresponding src_sel
-
+ bool IsPreserveSrc = false;
MachineOperand *Src = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
MachineOperand *SrcSel = TII->getNamedOperand(MI, AMDGPU::OpName::src0_sel);
MachineOperand *SrcMods =
TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
assert(Src && (Src->isReg() || Src->isImm()));
if (!isSameReg(*Src, *getReplacedOperand())) {
- // If this is not src0 then it should be src1
+ // If this is not src0 then it could be src1
Src = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
SrcSel = TII->getNamedOperand(MI, AMDGPU::OpName::src1_sel);
SrcMods = TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers);
+ if (!Src ||
+ !isSameReg(*Src, *getReplacedOperand())) {
+ // It's possible this Src is a tied operand for
+ // UNUSED_PRESERVE, in which case we can either
+ // abandon the peephole attempt, or if legal we can
+ // copy the target operand into the tied slot
+ // if the preserve operation will effectively cause the same
+ // result by overwriting the rest of the dst.
+ MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
+ MachineOperand *DstUnused =
+ TII->getNamedOperand(MI, AMDGPU::OpName::dst_unused);
+
+ if (Dst &&
+ DstUnused->getImm() == AMDGPU::SDWA::DstUnused::UNUSED_PRESERVE) {
+ // This will work if the tied src is accessing WORD_0, and the dst is
+ // writing WORD_1. Modifiers don't matter because all the bits that
+ // would be impacted are being overwritten by the dst.
+ // Any other case will not work.
+ SdwaSel DstSel = static_cast<SdwaSel>(
+ TII->getNamedImmOperand(MI, AMDGPU::OpName::dst_sel));
+ if (DstSel == AMDGPU::SDWA::SdwaSel::WORD_1 &&
+ getSrcSel() == AMDGPU::SDWA::SdwaSel::WORD_0) {
+ IsPreserveSrc = true;
+ auto DstIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
+ AMDGPU::OpName::vdst);
+ auto TiedIdx = MI.findTiedOperandIdx(DstIdx);
+ Src = &MI.getOperand(TiedIdx);
+ SrcSel = nullptr;
+ SrcMods = nullptr;
+ } else {
+ // Not legal to convert this src
+ return false;
+ }
+ }
+ }
assert(Src && Src->isReg());
if ((MI.getOpcode() == AMDGPU::V_MAC_F16_sdwa ||
@@ -388,11 +425,14 @@ bool SDWASrcOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) {
return false;
}
- assert(isSameReg(*Src, *getReplacedOperand()) && SrcSel && SrcMods);
+ assert(isSameReg(*Src, *getReplacedOperand()) &&
+ (IsPreserveSrc || (SrcSel && SrcMods)));
}
copyRegOperand(*Src, *getTargetOperand());
- SrcSel->setImm(getSrcSel());
- SrcMods->setImm(getSrcMods(TII, Src));
+ if (!IsPreserveSrc) {
+ SrcSel->setImm(getSrcSel());
+ SrcMods->setImm(getSrcMods(TII, Src));
+ }
getTargetOperand()->setIsKill(false);
return true;
}
@@ -661,7 +701,7 @@ SIPeepholeSDWA::matchSDWAOperand(MachineInstr &MI) {
MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
- if (TRI->isPhysicalRegister(Src1->getReg()) ||
+ if (TRI->isPhysicalRegister(ValSrc->getReg()) ||
TRI->isPhysicalRegister(Dst->getReg()))
break;
@@ -739,8 +779,8 @@ SIPeepholeSDWA::matchSDWAOperand(MachineInstr &MI) {
// TODO: add support for non-SDWA instructions as OtherInst.
// For now this only works with SDWA instructions. For regular instructions
- // there is no way to determine if instruction write only 8/16/24-bit out of
- // full register size and all registers are at min 32-bit wide.
+ // there is no way to determine if the instruction writes only 8/16/24-bit
+ // out of full register size and all registers are at min 32-bit wide.
if (!TII->isSDWA(*OtherInst))
break;
@@ -804,20 +844,18 @@ SIPeepholeSDWA::matchSDWAOperand(MachineInstr &MI) {
return std::unique_ptr<SDWAOperand>(nullptr);
}
-void SIPeepholeSDWA::matchSDWAOperands(MachineFunction &MF) {
- for (MachineBasicBlock &MBB : MF) {
- for (MachineInstr &MI : MBB) {
- if (auto Operand = matchSDWAOperand(MI)) {
- DEBUG(dbgs() << "Match: " << MI << "To: " << *Operand << '\n');
- SDWAOperands[&MI] = std::move(Operand);
- ++NumSDWAPatternsFound;
- }
+void SIPeepholeSDWA::matchSDWAOperands(MachineBasicBlock &MBB) {
+ for (MachineInstr &MI : MBB) {
+ if (auto Operand = matchSDWAOperand(MI)) {
+ LLVM_DEBUG(dbgs() << "Match: " << MI << "To: " << *Operand << '\n');
+ SDWAOperands[&MI] = std::move(Operand);
+ ++NumSDWAPatternsFound;
}
}
}
bool SIPeepholeSDWA::isConvertibleToSDWA(const MachineInstr &MI,
- const SISubtarget &ST) const {
+ const GCNSubtarget &ST) const {
// Check if this is already an SDWA instruction
unsigned Opc = MI.getOpcode();
if (TII->isSDWA(Opc))
@@ -854,11 +892,18 @@ bool SIPeepholeSDWA::isConvertibleToSDWA(const MachineInstr &MI,
Opc == AMDGPU::V_MAC_F32_e32))
return false;
+ // FIXME: has SDWA but require handling of implicit VCC use
+ if (Opc == AMDGPU::V_CNDMASK_B32_e32)
+ return false;
+
return true;
}
bool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI,
const SDWAOperandsVector &SDWAOperands) {
+
+ LLVM_DEBUG(dbgs() << "Convert instruction:" << MI);
+
// Convert to sdwa
int SDWAOpcode;
unsigned Opcode = MI.getOpcode();
@@ -984,9 +1029,29 @@ bool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI,
}
}
- // Apply all sdwa operand pattenrs
+ // Check for a preserved register that needs to be copied.
+ auto DstUnused = TII->getNamedOperand(MI, AMDGPU::OpName::dst_unused);
+ if (DstUnused &&
+ DstUnused->getImm() == AMDGPU::SDWA::DstUnused::UNUSED_PRESERVE) {
+ // We expect, if we are here, that the instruction was already in its SDWA form,
+ // with a tied operand.
+ assert(Dst && Dst->isTied());
+ assert(Opcode == static_cast<unsigned int>(SDWAOpcode));
+ // We also expect a vdst, since sdst can't preserve.
+ auto PreserveDstIdx = AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::vdst);
+ assert(PreserveDstIdx != -1);
+
+ auto TiedIdx = MI.findTiedOperandIdx(PreserveDstIdx);
+ auto Tied = MI.getOperand(TiedIdx);
+
+ SDWAInst.add(Tied);
+ SDWAInst->tieOperands(PreserveDstIdx, SDWAInst->getNumOperands() - 1);
+ }
+
+ // Apply all sdwa operand patterns.
bool Converted = false;
for (auto &Operand : SDWAOperands) {
+ LLVM_DEBUG(dbgs() << *SDWAInst << "\nOperand: " << *Operand);
// There should be no intersection between SDWA operands and potential MIs
// e.g.:
// v_and_b32 v0, 0xff, v1 -> src:v1 sel:BYTE_0
@@ -1007,8 +1072,7 @@ bool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI,
return false;
}
- DEBUG(dbgs() << "Convert instruction:" << MI
- << "Into:" << *SDWAInst << '\n');
+ LLVM_DEBUG(dbgs() << "\nInto:" << *SDWAInst << '\n');
++NumSDWAInstructionsPeepholed;
MI.eraseFromParent();
@@ -1017,7 +1081,8 @@ bool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI,
// If an instruction was converted to SDWA it should not have immediates or SGPR
// operands (allowed one SGPR on GFX9). Copy its scalar operands into VGPRs.
-void SIPeepholeSDWA::legalizeScalarOperands(MachineInstr &MI, const SISubtarget &ST) const {
+void SIPeepholeSDWA::legalizeScalarOperands(MachineInstr &MI,
+ const GCNSubtarget &ST) const {
const MCInstrDesc &Desc = TII->get(MI.getOpcode());
unsigned ConstantBusCount = 0;
for (MachineOperand &Op : MI.explicit_uses()) {
@@ -1048,7 +1113,7 @@ void SIPeepholeSDWA::legalizeScalarOperands(MachineInstr &MI, const SISubtarget
}
bool SIPeepholeSDWA::runOnMachineFunction(MachineFunction &MF) {
- const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
if (!ST.hasSDWA() || skipFunction(MF.getFunction()))
return false;
@@ -1058,35 +1123,36 @@ bool SIPeepholeSDWA::runOnMachineFunction(MachineFunction &MF) {
TII = ST.getInstrInfo();
// Find all SDWA operands in MF.
- bool Changed = false;
bool Ret = false;
- do {
- matchSDWAOperands(MF);
-
- for (const auto &OperandPair : SDWAOperands) {
- const auto &Operand = OperandPair.second;
- MachineInstr *PotentialMI = Operand->potentialToConvert(TII);
- if (PotentialMI && isConvertibleToSDWA(*PotentialMI, ST)) {
- PotentialMatches[PotentialMI].push_back(Operand.get());
+ for (MachineBasicBlock &MBB : MF) {
+ bool Changed = false;
+ do {
+ matchSDWAOperands(MBB);
+
+ for (const auto &OperandPair : SDWAOperands) {
+ const auto &Operand = OperandPair.second;
+ MachineInstr *PotentialMI = Operand->potentialToConvert(TII);
+ if (PotentialMI && isConvertibleToSDWA(*PotentialMI, ST)) {
+ PotentialMatches[PotentialMI].push_back(Operand.get());
+ }
}
- }
- for (auto &PotentialPair : PotentialMatches) {
- MachineInstr &PotentialMI = *PotentialPair.first;
- convertToSDWA(PotentialMI, PotentialPair.second);
- }
-
- PotentialMatches.clear();
- SDWAOperands.clear();
+ for (auto &PotentialPair : PotentialMatches) {
+ MachineInstr &PotentialMI = *PotentialPair.first;
+ convertToSDWA(PotentialMI, PotentialPair.second);
+ }
- Changed = !ConvertedInstructions.empty();
+ PotentialMatches.clear();
+ SDWAOperands.clear();
- if (Changed)
- Ret = true;
+ Changed = !ConvertedInstructions.empty();
- while (!ConvertedInstructions.empty())
- legalizeScalarOperands(*ConvertedInstructions.pop_back_val(), ST);
- } while (Changed);
+ if (Changed)
+ Ret = true;
+ while (!ConvertedInstructions.empty())
+ legalizeScalarOperands(*ConvertedInstructions.pop_back_val(), ST);
+ } while (Changed);
+ }
return Ret;
}
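
The reworked runOnMachineFunction above drives the SDWA peephole one basic block at a time and iterates each block to a fixed point: match candidates, convert them, legalize what was just converted, and repeat while anything changed. Below is a minimal standalone C++ sketch of that driver pattern only, written against placeholder types; every name in it (Inst, Block, matchCandidates, convert, legalize, runOnBlocks) is illustrative and none of it is LLVM API.

// Minimal sketch of a per-block fixed-point driver. Placeholder types only.
#include <vector>

struct Inst  { bool Converted = false; };
struct Block { std::vector<Inst> Insts; };

// Stand-ins for matchSDWAOperands / convertToSDWA / legalizeScalarOperands.
static std::vector<Inst *> matchCandidates(Block &B) {
  std::vector<Inst *> Out;
  for (Inst &I : B.Insts)
    if (!I.Converted)
      Out.push_back(&I);
  return Out;
}
static bool convert(Inst &I)  { I.Converted = true; return true; }
static void legalize(Inst &)  {}

bool runOnBlocks(std::vector<Block> &Blocks) {
  bool Ret = false;
  for (Block &B : Blocks) {
    bool Changed;
    do {
      Changed = false;
      for (Inst *C : matchCandidates(B)) {
        if (convert(*C)) {
          legalize(*C);   // clean up operands of what was just converted
          Changed = true; // a conversion may expose new candidates; go again
        }
      }
      Ret |= Changed;
    } while (Changed);
  }
  return Ret;
}
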
diff --git a/lib/Target/AMDGPU/SIProgramInfo.h b/lib/Target/AMDGPU/SIProgramInfo.h
new file mode 100644
index 000000000000..383f6b575808
--- /dev/null
+++ b/lib/Target/AMDGPU/SIProgramInfo.h
@@ -0,0 +1,77 @@
+//===--- SIProgramInfo.h ----------------------------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// Defines struct to track resource usage for kernels and entry functions.
+///
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_SIPROGRAMINFO_H
+#define LLVM_LIB_TARGET_AMDGPU_SIPROGRAMINFO_H
+
+#include <cstdint>
+#include <limits>
+
+namespace llvm {
+
+/// Track resource usage for kernels / entry functions.
+struct SIProgramInfo {
+ // Fields set in PGM_RSRC1 pm4 packet.
+ uint32_t VGPRBlocks = 0;
+ uint32_t SGPRBlocks = 0;
+ uint32_t Priority = 0;
+ uint32_t FloatMode = 0;
+ uint32_t Priv = 0;
+ uint32_t DX10Clamp = 0;
+ uint32_t DebugMode = 0;
+ uint32_t IEEEMode = 0;
+ uint64_t ScratchSize = 0;
+
+ uint64_t ComputePGMRSrc1 = 0;
+
+ // Fields set in PGM_RSRC2 pm4 packet.
+ uint32_t LDSBlocks = 0;
+ uint32_t ScratchBlocks = 0;
+
+ uint64_t ComputePGMRSrc2 = 0;
+
+ uint32_t NumVGPR = 0;
+ uint32_t NumSGPR = 0;
+ uint32_t LDSSize = 0;
+ bool FlatUsed = false;
+
+  // Number of SGPRs needed to satisfy the waves-per-execution-unit request.
+  uint32_t NumSGPRsForWavesPerEU = 0;
+
+  // Number of VGPRs needed to satisfy the waves-per-execution-unit request.
+  uint32_t NumVGPRsForWavesPerEU = 0;
+
+ // Fixed SGPR number used to hold wave scratch offset for entire kernel
+ // execution, or std::numeric_limits<uint16_t>::max() if the register is not
+ // used or not known.
+ uint16_t DebuggerWavefrontPrivateSegmentOffsetSGPR =
+ std::numeric_limits<uint16_t>::max();
+
+ // Fixed SGPR number of the first 4 SGPRs used to hold scratch V# for entire
+ // kernel execution, or std::numeric_limits<uint16_t>::max() if the register
+ // is not used or not known.
+ uint16_t DebuggerPrivateSegmentBufferSGPR =
+ std::numeric_limits<uint16_t>::max();
+
+  // Whether there is recursion, dynamic allocas, indirect calls, or some other
+  // reason the stack usage may be statically unknown.
+  bool DynamicCallStack = false;
+
+ // Bonus information for debugging.
+ bool VCCUsed = false;
+
+ SIProgramInfo() = default;
+};
+
+} // namespace llvm
+
+#endif // LLVM_LIB_TARGET_AMDGPU_SIPROGRAMINFO_H
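
Several of the fields above (VGPRBlocks, SGPRBlocks, LDSBlocks, ScratchBlocks) appear to hold allocation-granule ("block") counts rather than raw sizes, which is why they sit alongside the raw NumVGPR / NumSGPR / LDSSize fields. The self-contained sketch below shows only that kind of count-to-blocks conversion; the granule size and the minus-one encoding are assumptions made for illustration, not the authoritative values of any particular subtarget.

// Illustrative only: deriving a "blocks" value from a raw register count.
// The granule size and the (granules - 1) encoding are assumptions here.
#include <cstdint>

static uint32_t countToBlocks(uint32_t NumRegs, uint32_t Granule) {
  if (NumRegs == 0)
    return 0;
  // Round the raw count up to a whole number of granules, then encode as
  // (granules - 1), a common style for packed resource-descriptor fields.
  return (NumRegs + Granule - 1) / Granule - 1;
}

// Example: with an assumed granule of 4, 1..4 registers -> 0, 5..8 -> 1, ...
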
diff --git a/lib/Target/AMDGPU/SIRegisterInfo.cpp b/lib/Target/AMDGPU/SIRegisterInfo.cpp
index 65cdc13e03cd..624607f6ea54 100644
--- a/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -8,14 +8,16 @@
//===----------------------------------------------------------------------===//
//
/// \file
-/// \brief SI implementation of the TargetRegisterInfo class.
+/// SI implementation of the TargetRegisterInfo class.
//
//===----------------------------------------------------------------------===//
#include "SIRegisterInfo.h"
+#include "AMDGPURegisterBankInfo.h"
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/RegisterScavenging.h"
@@ -54,7 +56,7 @@ static cl::opt<bool> EnableSpillSGPRToVGPR(
cl::ReallyHidden,
cl::init(true));
-SIRegisterInfo::SIRegisterInfo(const SISubtarget &ST) :
+SIRegisterInfo::SIRegisterInfo(const GCNSubtarget &ST) :
AMDGPURegisterInfo(),
SGPRPressureSets(getNumRegPressureSets()),
VGPRPressureSets(getNumRegPressureSets()),
@@ -101,17 +103,10 @@ SIRegisterInfo::SIRegisterInfo(const SISubtarget &ST) :
VGPRSetID < NumRegPressureSets);
}
-void SIRegisterInfo::reserveRegisterTuples(BitVector &Reserved, unsigned Reg) const {
- MCRegAliasIterator R(Reg, this, true);
-
- for (; R.isValid(); ++R)
- Reserved.set(*R);
-}
-
unsigned SIRegisterInfo::reservedPrivateSegmentBufferReg(
const MachineFunction &MF) const {
- const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
unsigned BaseIdx = alignDown(ST.getMaxNumSGPRs(MF), 4) - 4;
unsigned BaseReg(AMDGPU::SGPR_32RegClass.getRegister(BaseIdx));
return getMatchingSuperReg(BaseReg, AMDGPU::sub0, &AMDGPU::SReg_128RegClass);
@@ -136,7 +131,7 @@ static unsigned findPrivateSegmentWaveByteOffsetRegIndex(unsigned RegCount) {
unsigned SIRegisterInfo::reservedPrivateSegmentWaveByteOffsetReg(
const MachineFunction &MF) const {
- const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
unsigned Reg = findPrivateSegmentWaveByteOffsetRegIndex(ST.getMaxNumSGPRs(MF));
return AMDGPU::SGPR_32RegClass.getRegister(Reg);
}
@@ -163,6 +158,9 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
reserveRegisterTuples(Reserved, AMDGPU::SRC_PRIVATE_BASE);
reserveRegisterTuples(Reserved, AMDGPU::SRC_PRIVATE_LIMIT);
+ // Reserve xnack_mask registers - support is not implemented in Codegen.
+ reserveRegisterTuples(Reserved, AMDGPU::XNACK_MASK);
+
// Reserve Trap Handler registers - support is not implemented in Codegen.
reserveRegisterTuples(Reserved, AMDGPU::TBA);
reserveRegisterTuples(Reserved, AMDGPU::TMA);
@@ -175,7 +173,7 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
reserveRegisterTuples(Reserved, AMDGPU::TTMP12_TTMP13);
reserveRegisterTuples(Reserved, AMDGPU::TTMP14_TTMP15);
- const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
unsigned MaxNumSGPRs = ST.getMaxNumSGPRs(MF);
unsigned TotalNumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
@@ -255,7 +253,7 @@ bool SIRegisterInfo::requiresFrameIndexReplacementScavenging(
// create a virtual register for it during frame index elimination, so the
// scavenger is directly needed.
return MF.getFrameInfo().hasStackObjects() &&
- MF.getSubtarget<SISubtarget>().hasScalarStores() &&
+ MF.getSubtarget<GCNSubtarget>().hasScalarStores() &&
MF.getInfo<SIMachineFunctionInfo>()->hasSpilledSGPRs();
}
@@ -310,7 +308,7 @@ void SIRegisterInfo::materializeFrameBaseRegister(MachineBasicBlock *MBB,
DL = Ins->getDebugLoc();
MachineFunction *MF = MBB->getParent();
- const SISubtarget &Subtarget = MF->getSubtarget<SISubtarget>();
+ const GCNSubtarget &Subtarget = MF->getSubtarget<GCNSubtarget>();
const SIInstrInfo *TII = Subtarget.getInstrInfo();
if (Offset == 0) {
@@ -339,7 +337,7 @@ void SIRegisterInfo::resolveFrameIndex(MachineInstr &MI, unsigned BaseReg,
MachineBasicBlock *MBB = MI.getParent();
MachineFunction *MF = MBB->getParent();
- const SISubtarget &Subtarget = MF->getSubtarget<SISubtarget>();
+ const GCNSubtarget &Subtarget = MF->getSubtarget<GCNSubtarget>();
const SIInstrInfo *TII = Subtarget.getInstrInfo();
#ifndef NDEBUG
@@ -526,7 +524,7 @@ void SIRegisterInfo::buildSpillLoadStore(MachineBasicBlock::iterator MI,
RegScavenger *RS) const {
MachineBasicBlock *MBB = MI->getParent();
MachineFunction *MF = MI->getParent()->getParent();
- const SISubtarget &ST = MF->getSubtarget<SISubtarget>();
+ const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
const SIInstrInfo *TII = ST.getInstrInfo();
const MachineFrameInfo &MFI = MF->getFrameInfo();
@@ -534,22 +532,29 @@ void SIRegisterInfo::buildSpillLoadStore(MachineBasicBlock::iterator MI,
const DebugLoc &DL = MI->getDebugLoc();
bool IsStore = Desc.mayStore();
- bool RanOutOfSGPRs = false;
bool Scavenged = false;
unsigned SOffset = ScratchOffsetReg;
+ const unsigned EltSize = 4;
const TargetRegisterClass *RC = getRegClassForReg(MF->getRegInfo(), ValueReg);
- unsigned NumSubRegs = AMDGPU::getRegBitWidth(RC->getID()) / 32;
- unsigned Size = NumSubRegs * 4;
+ unsigned NumSubRegs = AMDGPU::getRegBitWidth(RC->getID()) / (EltSize * CHAR_BIT);
+ unsigned Size = NumSubRegs * EltSize;
int64_t Offset = InstOffset + MFI.getObjectOffset(Index);
- const int64_t OriginalImmOffset = Offset;
+ int64_t ScratchOffsetRegDelta = 0;
unsigned Align = MFI.getObjectAlignment(Index);
const MachinePointerInfo &BasePtrInfo = MMO->getPointerInfo();
- if (!isUInt<12>(Offset + Size)) {
+ assert((Offset % EltSize) == 0 && "unexpected VGPR spill offset");
+
+ if (!isUInt<12>(Offset + Size - EltSize)) {
SOffset = AMDGPU::NoRegister;
+ // We currently only support spilling VGPRs to EltSize boundaries, meaning
+ // we can simplify the adjustment of Offset here to just scale with
+ // WavefrontSize.
+ Offset *= ST.getWavefrontSize();
+
// We don't have access to the register scavenger if this function is called
// during PEI::scavengeFrameVirtualRegs().
if (RS)
@@ -563,8 +568,8 @@ void SIRegisterInfo::buildSpillLoadStore(MachineBasicBlock::iterator MI,
// add the offset directly to the ScratchOffset register, and then
      // subtract the offset after the spill to return ScratchOffset to its
      // original value.
- RanOutOfSGPRs = true;
SOffset = ScratchOffsetReg;
+ ScratchOffsetRegDelta = Offset;
} else {
Scavenged = true;
}
@@ -576,8 +581,6 @@ void SIRegisterInfo::buildSpillLoadStore(MachineBasicBlock::iterator MI,
Offset = 0;
}
- const unsigned EltSize = 4;
-
for (unsigned i = 0, e = NumSubRegs; i != e; ++i, Offset += EltSize) {
unsigned SubReg = NumSubRegs == 1 ?
ValueReg : getSubReg(ValueReg, getSubRegFromChannel(i));
@@ -609,11 +612,11 @@ void SIRegisterInfo::buildSpillLoadStore(MachineBasicBlock::iterator MI,
MIB.addReg(ValueReg, RegState::Implicit | SrcDstRegState);
}
- if (RanOutOfSGPRs) {
+ if (ScratchOffsetRegDelta != 0) {
// Subtract the offset we added to the ScratchOffset register.
BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_SUB_U32), ScratchOffsetReg)
- .addReg(ScratchOffsetReg)
- .addImm(OriginalImmOffset);
+ .addReg(ScratchOffsetReg)
+ .addImm(ScratchOffsetRegDelta);
}
}
@@ -640,6 +643,7 @@ bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI,
MachineBasicBlock *MBB = MI->getParent();
MachineFunction *MF = MBB->getParent();
SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
+ DenseSet<unsigned> SGPRSpillVGPRDefinedSet;
ArrayRef<SIMachineFunctionInfo::SpilledReg> VGPRSpills
= MFI->getSGPRToVGPRSpills(Index);
@@ -648,7 +652,7 @@ bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI,
return false;
MachineRegisterInfo &MRI = MF->getRegInfo();
- const SISubtarget &ST = MF->getSubtarget<SISubtarget>();
+ const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
const SIInstrInfo *TII = ST.getInstrInfo();
unsigned SuperReg = MI->getOperand(0).getReg();
@@ -661,6 +665,10 @@ bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI,
if (SpillToSMEM && OnlyToVGPR)
return false;
+ assert(SpillToVGPR || (SuperReg != MFI->getStackPtrOffsetReg() &&
+ SuperReg != MFI->getFrameOffsetReg() &&
+ SuperReg != MFI->getScratchWaveOffsetReg()));
+
assert(SuperReg != AMDGPU::M0 && "m0 should never spill");
unsigned OffsetReg = AMDGPU::M0;
@@ -736,11 +744,21 @@ bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI,
if (SpillToVGPR) {
SIMachineFunctionInfo::SpilledReg Spill = VGPRSpills[i];
+ // During SGPR spilling to VGPR, determine if the VGPR is defined. The
+ // only circumstance in which we say it is undefined is when it is the
+ // first spill to this VGPR in the first basic block.
+ bool VGPRDefined = true;
+ if (MBB == &MF->front())
+ VGPRDefined = !SGPRSpillVGPRDefinedSet.insert(Spill.VGPR).second;
+
+ // Mark the "old value of vgpr" input undef only if this is the first sgpr
+ // spill to this specific vgpr in the first basic block.
BuildMI(*MBB, MI, DL,
TII->getMCOpcodeFromPseudo(AMDGPU::V_WRITELANE_B32),
Spill.VGPR)
.addReg(SubReg, getKillRegState(IsKill))
- .addImm(Spill.Lane);
+ .addImm(Spill.Lane)
+ .addReg(Spill.VGPR, VGPRDefined ? 0 : RegState::Undef);
// FIXME: Since this spills to another register instead of an actual
// frame index, we should delete the frame index when all references to
@@ -812,7 +830,7 @@ bool SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI,
return false;
MachineFrameInfo &FrameInfo = MF->getFrameInfo();
- const SISubtarget &ST = MF->getSubtarget<SISubtarget>();
+ const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
const SIInstrInfo *TII = ST.getInstrInfo();
const DebugLoc &DL = MI->getDebugLoc();
@@ -972,7 +990,7 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
MachineBasicBlock *MBB = MI->getParent();
SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
MachineFrameInfo &FrameInfo = MF->getFrameInfo();
- const SISubtarget &ST = MF->getSubtarget<SISubtarget>();
+ const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
const SIInstrInfo *TII = ST.getInstrInfo();
DebugLoc DL = MI->getDebugLoc();
@@ -1051,8 +1069,8 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
// Convert to an absolute stack address by finding the offset from the
// scratch wave base and scaling by the wave size.
//
- // In an entry function/kernel the stack address is already the absolute
- // address relative to the the scratch wave offset.
+ // In an entry function/kernel the stack address is already the
+ // absolute address relative to the scratch wave offset.
unsigned DiffReg
= MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
@@ -1219,6 +1237,8 @@ const TargetRegisterClass *SIRegisterInfo::getPhysRegClass(unsigned Reg) const {
&AMDGPU::VReg_512RegClass,
&AMDGPU::SReg_512RegClass,
&AMDGPU::SCC_CLASSRegClass,
+ &AMDGPU::Pseudo_SReg_32RegClass,
+ &AMDGPU::Pseudo_SReg_128RegClass,
};
for (const TargetRegisterClass *BaseClass : BaseClasses) {
@@ -1355,7 +1375,7 @@ bool SIRegisterInfo::shouldRewriteCopySrc(
return getCommonSubClass(DefRC, SrcRC) != nullptr;
}
-/// \brief Returns a register that is not used at any point in the function.
+/// Returns a register that is not used at any point in the function.
/// If all registers are used, then this function will return
// AMDGPU::NoRegister.
unsigned
@@ -1483,7 +1503,9 @@ SIRegisterInfo::getRegClassForReg(const MachineRegisterInfo &MRI,
bool SIRegisterInfo::isVGPR(const MachineRegisterInfo &MRI,
unsigned Reg) const {
- return hasVGPRs(getRegClassForReg(MRI, Reg));
+  const TargetRegisterClass *RC = getRegClassForReg(MRI, Reg);
+ assert(RC && "Register class for the reg not found");
+ return hasVGPRs(RC);
}
bool SIRegisterInfo::shouldCoalesce(MachineInstr *MI,
@@ -1510,7 +1532,7 @@ bool SIRegisterInfo::shouldCoalesce(MachineInstr *MI,
unsigned SIRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC,
MachineFunction &MF) const {
- const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
unsigned Occupancy = ST.getOccupancyWithLocalMemSize(MFI->getLDSSize(),
@@ -1545,3 +1567,34 @@ const int *SIRegisterInfo::getRegUnitPressureSets(unsigned RegUnit) const {
return Empty;
return AMDGPURegisterInfo::getRegUnitPressureSets(RegUnit);
}
+
+unsigned SIRegisterInfo::getReturnAddressReg(const MachineFunction &MF) const {
+ // Not a callee saved register.
+ return AMDGPU::SGPR30_SGPR31;
+}
+
+const TargetRegisterClass *
+SIRegisterInfo::getConstrainedRegClassForOperand(const MachineOperand &MO,
+ const MachineRegisterInfo &MRI) const {
+ unsigned Size = getRegSizeInBits(MO.getReg(), MRI);
+ const RegisterBank *RB = MRI.getRegBankOrNull(MO.getReg());
+ if (!RB)
+ return nullptr;
+
+ switch (Size) {
+ case 32:
+ return RB->getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VGPR_32RegClass :
+ &AMDGPU::SReg_32_XM0RegClass;
+ case 64:
+ return RB->getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VReg_64RegClass :
+ &AMDGPU::SReg_64_XEXECRegClass;
+ case 96:
+ return RB->getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VReg_96RegClass :
+ nullptr;
+ case 128:
+ return RB->getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VReg_128RegClass :
+ &AMDGPU::SReg_128RegClass;
+ default:
+ llvm_unreachable("not implemented");
+ }
+}
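
getConstrainedRegClassForOperand above is essentially a lookup from (register bank, operand size in bits) to a register class, with no class available for the 96-bit scalar case. The standalone sketch below shows only that mapping shape; the bank enum and the class-name strings are placeholders, not the LLVM types the function actually uses.

// Standalone sketch of a (bank, size) -> class lookup; names are placeholders.
#include <string>

enum class Bank { Scalar, Vector };

std::string classFor(Bank B, unsigned SizeInBits) {
  switch (SizeInBits) {
  case 32:  return B == Bank::Vector ? "VGPR_32"  : "SReg_32";
  case 64:  return B == Bank::Vector ? "VReg_64"  : "SReg_64";
  case 96:  return B == Bank::Vector ? "VReg_96"  : "<none>"; // no 96-bit scalar class
  case 128: return B == Bank::Vector ? "VReg_128" : "SReg_128";
  default:  return "<unhandled>";
  }
}
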
diff --git a/lib/Target/AMDGPU/SIRegisterInfo.h b/lib/Target/AMDGPU/SIRegisterInfo.h
index bf814b6974a8..5a51b67ca719 100644
--- a/lib/Target/AMDGPU/SIRegisterInfo.h
+++ b/lib/Target/AMDGPU/SIRegisterInfo.h
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
//
/// \file
-/// \brief Interface definition for SIRegisterInfo
+/// Interface definition for SIRegisterInfo
//
//===----------------------------------------------------------------------===//
@@ -16,15 +16,14 @@
#define LLVM_LIB_TARGET_AMDGPU_SIREGISTERINFO_H
#include "AMDGPURegisterInfo.h"
-#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIDefines.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
namespace llvm {
+class GCNSubtarget;
class LiveIntervals;
class MachineRegisterInfo;
-class SISubtarget;
class SIMachineFunctionInfo;
class SIRegisterInfo final : public AMDGPURegisterInfo {
@@ -36,11 +35,10 @@ private:
bool SpillSGPRToVGPR;
bool SpillSGPRToSMEM;
- void reserveRegisterTuples(BitVector &, unsigned Reg) const;
void classifyPressureSet(unsigned PSetID, unsigned Reg,
BitVector &PressureSets) const;
public:
- SIRegisterInfo(const SISubtarget &ST);
+ SIRegisterInfo(const GCNSubtarget &ST);
bool spillSGPRToVGPR() const {
return SpillSGPRToVGPR;
@@ -126,7 +124,7 @@ public:
return getEncodingValue(Reg) & 0xff;
}
- /// \brief Return the 'base' register class for this register.
+ /// Return the 'base' register class for this register.
/// e.g. SGPR0 => SReg_32, VGPR => VGPR_32 SGPR0_SGPR1 -> SReg_32, etc.
const TargetRegisterClass *getPhysRegClass(unsigned Reg) const;
@@ -224,10 +222,11 @@ public:
const int *getRegUnitPressureSets(unsigned RegUnit) const override;
- unsigned getReturnAddressReg(const MachineFunction &MF) const {
- // Not a callee saved register.
- return AMDGPU::SGPR30_SGPR31;
- }
+ unsigned getReturnAddressReg(const MachineFunction &MF) const;
+
+ const TargetRegisterClass *
+ getConstrainedRegClassForOperand(const MachineOperand &MO,
+ const MachineRegisterInfo &MRI) const override;
private:
void buildSpillLoadStore(MachineBasicBlock::iterator MI,
diff --git a/lib/Target/AMDGPU/SIRegisterInfo.td b/lib/Target/AMDGPU/SIRegisterInfo.td
index dd0efef7f91b..f87a0763b353 100644
--- a/lib/Target/AMDGPU/SIRegisterInfo.td
+++ b/lib/Target/AMDGPU/SIRegisterInfo.td
@@ -76,6 +76,16 @@ def SRC_SHARED_LIMIT : SIReg<"src_shared_limit", 236>;
def SRC_PRIVATE_BASE : SIReg<"src_private_base", 237>;
def SRC_PRIVATE_LIMIT : SIReg<"src_private_limit", 238>;
+def XNACK_MASK_LO : SIReg<"xnack_mask_lo", 104>;
+def XNACK_MASK_HI : SIReg<"xnack_mask_hi", 105>;
+
+def XNACK_MASK : RegisterWithSubRegs<"xnack_mask", [XNACK_MASK_LO, XNACK_MASK_HI]>,
+ DwarfRegAlias<XNACK_MASK_LO> {
+ let Namespace = "AMDGPU";
+ let SubRegIndices = [sub0, sub1];
+ let HWEncoding = 104;
+}
+
// Trap handler registers
def TBA_LO : SIReg<"tba_lo", 108>;
def TBA_HI : SIReg<"tba_hi", 109>;
@@ -394,7 +404,7 @@ def Pseudo_SReg_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16],
let CopyCost = -1;
}
-def Pseudo_SReg_128 : RegisterClass<"AMDGPU", [v4i32, v2i64], 32,
+def Pseudo_SReg_128 : RegisterClass<"AMDGPU", [v4i32, v2i64, v2f64], 32,
(add PRIVATE_RSRC_REG)> {
let isAllocatable = 0;
let CopyCost = -1;
@@ -403,7 +413,7 @@ def Pseudo_SReg_128 : RegisterClass<"AMDGPU", [v4i32, v2i64], 32,
// Subset of SReg_32 without M0 for SMRD instructions and alike.
// See comments in SIInstructions.td for more info.
def SReg_32_XM0_XEXEC : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,
- (add SGPR_32, VCC_LO, VCC_HI, FLAT_SCR_LO, FLAT_SCR_HI,
+ (add SGPR_32, VCC_LO, VCC_HI, FLAT_SCR_LO, FLAT_SCR_HI, XNACK_MASK_LO, XNACK_MASK_HI,
TTMP_32, TMA_LO, TMA_HI, TBA_LO, TBA_HI, SRC_SHARED_BASE, SRC_SHARED_LIMIT,
SRC_PRIVATE_BASE, SRC_PRIVATE_LIMIT)> {
let AllocationPriority = 7;
@@ -425,22 +435,22 @@ def SReg_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,
let AllocationPriority = 7;
}
-def SGPR_64 : RegisterClass<"AMDGPU", [v2i32, i64, f64], 32, (add SGPR_64Regs)> {
+def SGPR_64 : RegisterClass<"AMDGPU", [v2i32, i64, f64, v4i16, v4f16], 32, (add SGPR_64Regs)> {
let CopyCost = 1;
let AllocationPriority = 8;
}
-def TTMP_64 : RegisterClass<"AMDGPU", [v2i32, i64, f64], 32, (add TTMP_64Regs)> {
+def TTMP_64 : RegisterClass<"AMDGPU", [v2i32, i64, f64, v4i16, v4f16], 32, (add TTMP_64Regs)> {
let isAllocatable = 0;
}
-def SReg_64_XEXEC : RegisterClass<"AMDGPU", [v2i32, i64, f64, i1], 32,
- (add SGPR_64, VCC, FLAT_SCR, TTMP_64, TBA, TMA)> {
+def SReg_64_XEXEC : RegisterClass<"AMDGPU", [v2i32, i64, f64, i1, v4i16, v4f16], 32,
+ (add SGPR_64, VCC, FLAT_SCR, XNACK_MASK, TTMP_64, TBA, TMA)> {
let CopyCost = 1;
let AllocationPriority = 8;
}
-def SReg_64 : RegisterClass<"AMDGPU", [v2i32, i64, f64, i1], 32,
+def SReg_64 : RegisterClass<"AMDGPU", [v2i32, i64, f64, i1, v4i16, v4f16], 32,
(add SReg_64_XEXEC, EXEC)> {
let CopyCost = 1;
let AllocationPriority = 8;
@@ -457,7 +467,7 @@ def TTMP_128 : RegisterClass<"AMDGPU", [v4i32, v16i8, v2i64], 32, (add TTMP_128R
let isAllocatable = 0;
}
-def SReg_128 : RegisterClass<"AMDGPU", [v4i32, v16i8, v2i64], 32,
+def SReg_128 : RegisterClass<"AMDGPU", [v4i32, v16i8, v2i64, v2f64], 32,
(add SGPR_128, TTMP_128)> {
let AllocationPriority = 10;
}
@@ -495,7 +505,7 @@ def SReg_512 : RegisterClass<"AMDGPU", [v16i32, v16f32], 32,
}
// Register class for all vector registers (VGPRs + Interpolation Registers)
-def VReg_64 : RegisterClass<"AMDGPU", [i64, f64, v2i32, v2f32], 32, (add VGPR_64)> {
+def VReg_64 : RegisterClass<"AMDGPU", [i64, f64, v2i32, v2f32, v4f16, v4i16], 32, (add VGPR_64)> {
let Size = 64;
// Requires 2 v_mov_b32 to copy
diff --git a/lib/Target/AMDGPU/SISchedule.td b/lib/Target/AMDGPU/SISchedule.td
index 0f02f5825cb0..7af69cb6a46d 100644
--- a/lib/Target/AMDGPU/SISchedule.td
+++ b/lib/Target/AMDGPU/SISchedule.td
@@ -46,7 +46,7 @@ def Write64Bit : SchedWrite;
// instructions)
class SISchedMachineModel : SchedMachineModel {
- let CompleteModel = 1;
+ let CompleteModel = 0;
  // MicroOpBufferSize = 1 means that instructions will always be added to
  // the ready queue when they become available. This exposes them
// to the register pressure analysis.
diff --git a/lib/Target/AMDGPU/SIShrinkInstructions.cpp b/lib/Target/AMDGPU/SIShrinkInstructions.cpp
index 41f989ad3228..4189bcce52ea 100644
--- a/lib/Target/AMDGPU/SIShrinkInstructions.cpp
+++ b/lib/Target/AMDGPU/SIShrinkInstructions.cpp
@@ -10,9 +10,9 @@
//
#include "AMDGPU.h"
-#include "AMDGPUMCInstLower.h"
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
@@ -64,17 +64,6 @@ FunctionPass *llvm::createSIShrinkInstructionsPass() {
return new SIShrinkInstructions();
}
-static bool isVGPR(const MachineOperand *MO, const SIRegisterInfo &TRI,
- const MachineRegisterInfo &MRI) {
- if (!MO->isReg())
- return false;
-
- if (TargetRegisterInfo::isVirtualRegister(MO->getReg()))
- return TRI.hasVGPRs(MRI.getRegClass(MO->getReg()));
-
- return TRI.hasVGPRs(TRI.getPhysRegClass(MO->getReg()));
-}
-
static bool canShrink(MachineInstr &MI, const SIInstrInfo *TII,
const SIRegisterInfo &TRI,
const MachineRegisterInfo &MRI) {
@@ -92,14 +81,18 @@ static bool canShrink(MachineInstr &MI, const SIInstrInfo *TII,
case AMDGPU::V_ADDC_U32_e64:
case AMDGPU::V_SUBB_U32_e64:
- if (TII->getNamedOperand(MI, AMDGPU::OpName::src1)->isImm())
+ case AMDGPU::V_SUBBREV_U32_e64: {
+ const MachineOperand *Src1
+ = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
+ if (!Src1->isReg() || !TRI.isVGPR(MRI, Src1->getReg()))
return false;
// Additional verification is needed for sdst/src2.
return true;
-
+ }
case AMDGPU::V_MAC_F32_e64:
case AMDGPU::V_MAC_F16_e64:
- if (!isVGPR(Src2, TRI, MRI) ||
+ case AMDGPU::V_FMAC_F32_e64:
+ if (!Src2->isReg() || !TRI.isVGPR(MRI, Src2->getReg()) ||
TII->hasModifiersSet(MI, AMDGPU::OpName::src2_modifiers))
return false;
break;
@@ -110,7 +103,7 @@ static bool canShrink(MachineInstr &MI, const SIInstrInfo *TII,
}
const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
- if (Src1 && (!isVGPR(Src1, TRI, MRI) ||
+ if (Src1 && (!Src1->isReg() || !TRI.isVGPR(MRI, Src1->getReg()) ||
TII->hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers)))
return false;
@@ -124,7 +117,7 @@ static bool canShrink(MachineInstr &MI, const SIInstrInfo *TII,
!TII->hasModifiersSet(MI, AMDGPU::OpName::clamp);
}
-/// \brief This function checks \p MI for operands defined by a move immediate
+/// This function checks \p MI for operands defined by a move immediate
/// instruction and then folds the literal constant into the instruction if it
/// can. This function assumes that \p MI is a VOP1, VOP2, or VOPC instruction.
static bool foldImmediates(MachineInstr &MI, const SIInstrInfo *TII,
@@ -290,7 +283,7 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
return false;
MachineRegisterInfo &MRI = MF.getRegInfo();
- const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
const SIInstrInfo *TII = ST.getInstrInfo();
const SIRegisterInfo &TRI = TII->getRegisterInfo();
@@ -442,7 +435,7 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
// VCC, e.g. S_AND_B64 (vcc = V_CMP_...), (vcc = V_CMP_...)
//
// So, instead of forcing the instruction to write to VCC, we provide
- // a hint to the register allocator to use VCC and then we we will run
+ // a hint to the register allocator to use VCC and then we will run
// this pass again after RA and shrink it if it outputs to VCC.
MRI.setRegAllocationHint(MI.getOperand(0).getReg(), 0, AMDGPU::VCC);
continue;
@@ -493,7 +486,7 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
}
// We can shrink this instruction
- DEBUG(dbgs() << "Shrinking " << MI);
+ LLVM_DEBUG(dbgs() << "Shrinking " << MI);
MachineInstrBuilder Inst32 =
BuildMI(MBB, I, MI.getDebugLoc(), TII->get(Op32));
@@ -537,9 +530,7 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
MI.eraseFromParent();
foldImmediates(*Inst32, TII, MRI);
- DEBUG(dbgs() << "e32 MI = " << *Inst32 << '\n');
-
-
+ LLVM_DEBUG(dbgs() << "e32 MI = " << *Inst32 << '\n');
}
}
return false;
diff --git a/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/lib/Target/AMDGPU/SIWholeQuadMode.cpp
index 53aefe829737..879726b1528c 100644
--- a/lib/Target/AMDGPU/SIWholeQuadMode.cpp
+++ b/lib/Target/AMDGPU/SIWholeQuadMode.cpp
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
//
/// \file
-/// \brief This pass adds instructions to enable whole quad mode for pixel
+/// This pass adds instructions to enable whole quad mode for pixel
/// shaders, and whole wavefront mode for all programs.
///
/// Whole quad mode is required for derivative computations, but it interferes
@@ -60,6 +60,7 @@
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/SmallVector.h"
@@ -325,9 +326,7 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
unsigned Opcode = MI.getOpcode();
char Flags = 0;
- if (TII->isDS(Opcode) && CallingConv == CallingConv::AMDGPU_PS) {
- Flags = StateWQM;
- } else if (TII->isWQM(Opcode)) {
+ if (TII->isWQM(Opcode)) {
// Sampling instructions don't need to produce results for all pixels
// in a quad, they just require all inputs of a quad to have been
// computed for derivatives.
@@ -454,6 +453,11 @@ void SIWholeQuadMode::propagateInstruction(MachineInstr &MI,
if (II.Needs != 0)
markInstructionUses(MI, II.Needs, Worklist);
+
+ // Ensure we process a block containing WWM, even if it does not require any
+ // WQM transitions.
+ if (II.Needs & StateWWM)
+ BI.Needs |= StateWWM;
}
void SIWholeQuadMode::propagateBlock(MachineBasicBlock &MBB,
@@ -681,7 +685,8 @@ void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg,
if (!isEntry && BI.Needs == StateWQM && BI.OutNeeds != StateExact)
return;
- DEBUG(dbgs() << "\nProcessing block " << printMBBReference(MBB) << ":\n");
+ LLVM_DEBUG(dbgs() << "\nProcessing block " << printMBBReference(MBB)
+ << ":\n");
unsigned SavedWQMReg = 0;
unsigned SavedNonWWMReg = 0;
@@ -844,7 +849,7 @@ bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
LowerToCopyInstrs.clear();
CallingConv = MF.getFunction().getCallingConv();
- const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
TII = ST.getInstrInfo();
TRI = &TII->getRegisterInfo();
@@ -884,7 +889,7 @@ bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
}
}
- DEBUG(printInfo());
+ LLVM_DEBUG(printInfo());
lowerCopyInstrs();
diff --git a/lib/Target/AMDGPU/SMInstructions.td b/lib/Target/AMDGPU/SMInstructions.td
index 8f347986eb8a..7485326017b2 100644
--- a/lib/Target/AMDGPU/SMInstructions.td
+++ b/lib/Target/AMDGPU/SMInstructions.td
@@ -63,6 +63,18 @@ class SM_Real <SM_Pseudo ps>
bits<1> imm = !if(ps.has_offset, ps.offset_is_imm, 0);
}
+class SM_Probe_Pseudo <string opName, dag ins, bit isImm>
+ : SM_Pseudo<opName, (outs), ins, " $sdata, $sbase, $offset"> {
+ let mayLoad = 0;
+ let mayStore = 0;
+ let has_glc = 0;
+ let LGKM_CNT = 0;
+ let ScalarStore = 0;
+ let hasSideEffects = 1;
+ let offset_is_imm = isImm;
+ let PseudoInstr = opName # !if(isImm, "_IMM", "_SGPR");
+}
+
class SM_Load_Pseudo <string opName, dag outs, dag ins, string asmOps, list<dag> pattern=[]>
: SM_Pseudo<opName, outs, ins, asmOps, pattern> {
RegisterClass BaseClass;
@@ -81,6 +93,18 @@ class SM_Store_Pseudo <string opName, dag ins, string asmOps, list<dag> pattern
let ScalarStore = 1;
}
+class SM_Discard_Pseudo <string opName, dag ins, bit isImm>
+ : SM_Pseudo<opName, (outs), ins, " $sbase, $offset"> {
+ let mayLoad = 0;
+ let mayStore = 0;
+ let has_glc = 0;
+ let has_sdst = 0;
+ let ScalarStore = 0;
+ let hasSideEffects = 1;
+ let offset_is_imm = isImm;
+ let PseudoInstr = opName # !if(isImm, "_IMM", "_SGPR");
+}
+
multiclass SM_Pseudo_Loads<string opName,
RegisterClass baseClass,
RegisterClass dstClass> {
@@ -125,6 +149,11 @@ multiclass SM_Pseudo_Stores<string opName,
}
}
+multiclass SM_Pseudo_Discards<string opName> {
+ def _IMM : SM_Discard_Pseudo <opName, (ins SReg_64:$sbase, smrd_offset_20:$offset), 1>;
+ def _SGPR : SM_Discard_Pseudo <opName, (ins SReg_64:$sbase, SReg_32:$offset), 0>;
+}
+
class SM_Time_Pseudo<string opName, SDPatternOperator node> : SM_Pseudo<
opName, (outs SReg_64_XEXEC:$sdst), (ins),
" $sdst", [(set i64:$sdst, (node))]> {
@@ -144,6 +173,60 @@ class SM_Inval_Pseudo <string opName, SDPatternOperator node> : SM_Pseudo<
let has_offset = 0;
}
+multiclass SM_Pseudo_Probe<string opName, RegisterClass baseClass> {
+ def _IMM : SM_Probe_Pseudo <opName, (ins i8imm:$sdata, baseClass:$sbase, smrd_offset_20:$offset), 1>;
+ def _SGPR : SM_Probe_Pseudo <opName, (ins i8imm:$sdata, baseClass:$sbase, SReg_32:$offset), 0>;
+}
+
+//===----------------------------------------------------------------------===//
+// Scalar Atomic Memory Classes
+//===----------------------------------------------------------------------===//
+
+class SM_Atomic_Pseudo <string opName,
+ dag outs, dag ins, string asmOps, bit isRet>
+ : SM_Pseudo<opName, outs, ins, asmOps, []> {
+
+ bit glc = isRet;
+
+ let mayLoad = 1;
+ let mayStore = 1;
+ let has_glc = 1;
+
+ // Should these be set?
+ let ScalarStore = 1;
+ let hasSideEffects = 1;
+ let maybeAtomic = 1;
+}
+
+class SM_Pseudo_Atomic<string opName,
+ RegisterClass baseClass,
+ RegisterClass dataClass,
+ bit isImm,
+ bit isRet> :
+ SM_Atomic_Pseudo<opName,
+ !if(isRet, (outs dataClass:$sdst), (outs)),
+ !if(isImm,
+ (ins dataClass:$sdata, baseClass:$sbase, smrd_offset_20:$offset),
+ (ins dataClass:$sdata, baseClass:$sbase, SReg_32:$offset)),
+ !if(isRet, " $sdst", " $sdata") # ", $sbase, $offset" # !if(isRet, " glc", ""),
+ isRet> {
+ let offset_is_imm = isImm;
+ let PseudoInstr = opName # !if(isImm,
+ !if(isRet, "_IMM_RTN", "_IMM"),
+ !if(isRet, "_SGPR_RTN", "_SGPR"));
+
+ let Constraints = !if(isRet, "$sdst = $sdata", "");
+ let DisableEncoding = !if(isRet, "$sdata", "");
+}
+
+multiclass SM_Pseudo_Atomics<string opName,
+ RegisterClass baseClass,
+ RegisterClass dataClass> {
+ def _IMM : SM_Pseudo_Atomic <opName, baseClass, dataClass, 1, 0>;
+ def _SGPR : SM_Pseudo_Atomic <opName, baseClass, dataClass, 0, 0>;
+ def _IMM_RTN : SM_Pseudo_Atomic <opName, baseClass, dataClass, 1, 1>;
+ def _SGPR_RTN : SM_Pseudo_Atomic <opName, baseClass, dataClass, 0, 1>;
+}
//===----------------------------------------------------------------------===//
// Scalar Memory Instructions
@@ -211,9 +294,85 @@ let SubtargetPredicate = isVI in {
def S_DCACHE_WB : SM_Inval_Pseudo <"s_dcache_wb", int_amdgcn_s_dcache_wb>;
def S_DCACHE_WB_VOL : SM_Inval_Pseudo <"s_dcache_wb_vol", int_amdgcn_s_dcache_wb_vol>;
def S_MEMREALTIME : SM_Time_Pseudo <"s_memrealtime", int_amdgcn_s_memrealtime>;
-} // SubtargetPredicate = isVI
+defm S_ATC_PROBE : SM_Pseudo_Probe <"s_atc_probe", SReg_64>;
+defm S_ATC_PROBE_BUFFER : SM_Pseudo_Probe <"s_atc_probe_buffer", SReg_128>;
+} // SubtargetPredicate = isVI
+let SubtargetPredicate = HasFlatScratchInsts, Uses = [FLAT_SCR] in {
+defm S_SCRATCH_LOAD_DWORD : SM_Pseudo_Loads <"s_scratch_load_dword", SReg_64, SReg_32_XM0_XEXEC>;
+defm S_SCRATCH_LOAD_DWORDX2 : SM_Pseudo_Loads <"s_scratch_load_dwordx2", SReg_64, SReg_64_XEXEC>;
+defm S_SCRATCH_LOAD_DWORDX4 : SM_Pseudo_Loads <"s_scratch_load_dwordx4", SReg_64, SReg_128>;
+
+defm S_SCRATCH_STORE_DWORD : SM_Pseudo_Stores <"s_scratch_store_dword", SReg_64, SReg_32_XM0_XEXEC>;
+defm S_SCRATCH_STORE_DWORDX2 : SM_Pseudo_Stores <"s_scratch_store_dwordx2", SReg_64, SReg_64_XEXEC>;
+defm S_SCRATCH_STORE_DWORDX4 : SM_Pseudo_Stores <"s_scratch_store_dwordx4", SReg_64, SReg_128>;
+} // SubtargetPredicate = HasFlatScratchInsts
+
+let SubtargetPredicate = HasScalarAtomics in {
+
+defm S_BUFFER_ATOMIC_SWAP : SM_Pseudo_Atomics <"s_buffer_atomic_swap", SReg_128, SReg_32_XM0_XEXEC>;
+defm S_BUFFER_ATOMIC_CMPSWAP : SM_Pseudo_Atomics <"s_buffer_atomic_cmpswap", SReg_128, SReg_64_XEXEC>;
+defm S_BUFFER_ATOMIC_ADD : SM_Pseudo_Atomics <"s_buffer_atomic_add", SReg_128, SReg_32_XM0_XEXEC>;
+defm S_BUFFER_ATOMIC_SUB : SM_Pseudo_Atomics <"s_buffer_atomic_sub", SReg_128, SReg_32_XM0_XEXEC>;
+defm S_BUFFER_ATOMIC_SMIN : SM_Pseudo_Atomics <"s_buffer_atomic_smin", SReg_128, SReg_32_XM0_XEXEC>;
+defm S_BUFFER_ATOMIC_UMIN : SM_Pseudo_Atomics <"s_buffer_atomic_umin", SReg_128, SReg_32_XM0_XEXEC>;
+defm S_BUFFER_ATOMIC_SMAX : SM_Pseudo_Atomics <"s_buffer_atomic_smax", SReg_128, SReg_32_XM0_XEXEC>;
+defm S_BUFFER_ATOMIC_UMAX : SM_Pseudo_Atomics <"s_buffer_atomic_umax", SReg_128, SReg_32_XM0_XEXEC>;
+defm S_BUFFER_ATOMIC_AND : SM_Pseudo_Atomics <"s_buffer_atomic_and", SReg_128, SReg_32_XM0_XEXEC>;
+defm S_BUFFER_ATOMIC_OR : SM_Pseudo_Atomics <"s_buffer_atomic_or", SReg_128, SReg_32_XM0_XEXEC>;
+defm S_BUFFER_ATOMIC_XOR : SM_Pseudo_Atomics <"s_buffer_atomic_xor", SReg_128, SReg_32_XM0_XEXEC>;
+defm S_BUFFER_ATOMIC_INC : SM_Pseudo_Atomics <"s_buffer_atomic_inc", SReg_128, SReg_32_XM0_XEXEC>;
+defm S_BUFFER_ATOMIC_DEC : SM_Pseudo_Atomics <"s_buffer_atomic_dec", SReg_128, SReg_32_XM0_XEXEC>;
+
+defm S_BUFFER_ATOMIC_SWAP_X2 : SM_Pseudo_Atomics <"s_buffer_atomic_swap_x2", SReg_128, SReg_64_XEXEC>;
+defm S_BUFFER_ATOMIC_CMPSWAP_X2 : SM_Pseudo_Atomics <"s_buffer_atomic_cmpswap_x2", SReg_128, SReg_128>;
+defm S_BUFFER_ATOMIC_ADD_X2 : SM_Pseudo_Atomics <"s_buffer_atomic_add_x2", SReg_128, SReg_64_XEXEC>;
+defm S_BUFFER_ATOMIC_SUB_X2 : SM_Pseudo_Atomics <"s_buffer_atomic_sub_x2", SReg_128, SReg_64_XEXEC>;
+defm S_BUFFER_ATOMIC_SMIN_X2 : SM_Pseudo_Atomics <"s_buffer_atomic_smin_x2", SReg_128, SReg_64_XEXEC>;
+defm S_BUFFER_ATOMIC_UMIN_X2 : SM_Pseudo_Atomics <"s_buffer_atomic_umin_x2", SReg_128, SReg_64_XEXEC>;
+defm S_BUFFER_ATOMIC_SMAX_X2 : SM_Pseudo_Atomics <"s_buffer_atomic_smax_x2", SReg_128, SReg_64_XEXEC>;
+defm S_BUFFER_ATOMIC_UMAX_X2 : SM_Pseudo_Atomics <"s_buffer_atomic_umax_x2", SReg_128, SReg_64_XEXEC>;
+defm S_BUFFER_ATOMIC_AND_X2 : SM_Pseudo_Atomics <"s_buffer_atomic_and_x2", SReg_128, SReg_64_XEXEC>;
+defm S_BUFFER_ATOMIC_OR_X2 : SM_Pseudo_Atomics <"s_buffer_atomic_or_x2", SReg_128, SReg_64_XEXEC>;
+defm S_BUFFER_ATOMIC_XOR_X2 : SM_Pseudo_Atomics <"s_buffer_atomic_xor_x2", SReg_128, SReg_64_XEXEC>;
+defm S_BUFFER_ATOMIC_INC_X2 : SM_Pseudo_Atomics <"s_buffer_atomic_inc_x2", SReg_128, SReg_64_XEXEC>;
+defm S_BUFFER_ATOMIC_DEC_X2 : SM_Pseudo_Atomics <"s_buffer_atomic_dec_x2", SReg_128, SReg_64_XEXEC>;
+
+defm S_ATOMIC_SWAP : SM_Pseudo_Atomics <"s_atomic_swap", SReg_64, SReg_32_XM0_XEXEC>;
+defm S_ATOMIC_CMPSWAP : SM_Pseudo_Atomics <"s_atomic_cmpswap", SReg_64, SReg_64_XEXEC>;
+defm S_ATOMIC_ADD : SM_Pseudo_Atomics <"s_atomic_add", SReg_64, SReg_32_XM0_XEXEC>;
+defm S_ATOMIC_SUB : SM_Pseudo_Atomics <"s_atomic_sub", SReg_64, SReg_32_XM0_XEXEC>;
+defm S_ATOMIC_SMIN : SM_Pseudo_Atomics <"s_atomic_smin", SReg_64, SReg_32_XM0_XEXEC>;
+defm S_ATOMIC_UMIN : SM_Pseudo_Atomics <"s_atomic_umin", SReg_64, SReg_32_XM0_XEXEC>;
+defm S_ATOMIC_SMAX : SM_Pseudo_Atomics <"s_atomic_smax", SReg_64, SReg_32_XM0_XEXEC>;
+defm S_ATOMIC_UMAX : SM_Pseudo_Atomics <"s_atomic_umax", SReg_64, SReg_32_XM0_XEXEC>;
+defm S_ATOMIC_AND : SM_Pseudo_Atomics <"s_atomic_and", SReg_64, SReg_32_XM0_XEXEC>;
+defm S_ATOMIC_OR : SM_Pseudo_Atomics <"s_atomic_or", SReg_64, SReg_32_XM0_XEXEC>;
+defm S_ATOMIC_XOR : SM_Pseudo_Atomics <"s_atomic_xor", SReg_64, SReg_32_XM0_XEXEC>;
+defm S_ATOMIC_INC : SM_Pseudo_Atomics <"s_atomic_inc", SReg_64, SReg_32_XM0_XEXEC>;
+defm S_ATOMIC_DEC : SM_Pseudo_Atomics <"s_atomic_dec", SReg_64, SReg_32_XM0_XEXEC>;
+
+defm S_ATOMIC_SWAP_X2 : SM_Pseudo_Atomics <"s_atomic_swap_x2", SReg_64, SReg_64_XEXEC>;
+defm S_ATOMIC_CMPSWAP_X2 : SM_Pseudo_Atomics <"s_atomic_cmpswap_x2", SReg_64, SReg_128>;
+defm S_ATOMIC_ADD_X2 : SM_Pseudo_Atomics <"s_atomic_add_x2", SReg_64, SReg_64_XEXEC>;
+defm S_ATOMIC_SUB_X2 : SM_Pseudo_Atomics <"s_atomic_sub_x2", SReg_64, SReg_64_XEXEC>;
+defm S_ATOMIC_SMIN_X2 : SM_Pseudo_Atomics <"s_atomic_smin_x2", SReg_64, SReg_64_XEXEC>;
+defm S_ATOMIC_UMIN_X2 : SM_Pseudo_Atomics <"s_atomic_umin_x2", SReg_64, SReg_64_XEXEC>;
+defm S_ATOMIC_SMAX_X2 : SM_Pseudo_Atomics <"s_atomic_smax_x2", SReg_64, SReg_64_XEXEC>;
+defm S_ATOMIC_UMAX_X2 : SM_Pseudo_Atomics <"s_atomic_umax_x2", SReg_64, SReg_64_XEXEC>;
+defm S_ATOMIC_AND_X2 : SM_Pseudo_Atomics <"s_atomic_and_x2", SReg_64, SReg_64_XEXEC>;
+defm S_ATOMIC_OR_X2 : SM_Pseudo_Atomics <"s_atomic_or_x2", SReg_64, SReg_64_XEXEC>;
+defm S_ATOMIC_XOR_X2 : SM_Pseudo_Atomics <"s_atomic_xor_x2", SReg_64, SReg_64_XEXEC>;
+defm S_ATOMIC_INC_X2 : SM_Pseudo_Atomics <"s_atomic_inc_x2", SReg_64, SReg_64_XEXEC>;
+defm S_ATOMIC_DEC_X2 : SM_Pseudo_Atomics <"s_atomic_dec_x2", SReg_64, SReg_64_XEXEC>;
+
+} // let SubtargetPredicate = HasScalarAtomics
+
+let SubtargetPredicate = isGFX9 in {
+defm S_DCACHE_DISCARD : SM_Pseudo_Discards <"s_dcache_discard">;
+defm S_DCACHE_DISCARD_X2 : SM_Pseudo_Discards <"s_dcache_discard_x2">;
+}
//===----------------------------------------------------------------------===//
// Scalar Memory Patterns
@@ -223,11 +382,9 @@ def S_MEMREALTIME : SM_Time_Pseudo <"s_memrealtime", int_amdgcn_s_memrealtime>
def smrd_load : PatFrag <(ops node:$ptr), (load node:$ptr), [{
auto Ld = cast<LoadSDNode>(N);
return Ld->getAlignment() >= 4 &&
- ((Ld->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS &&
- static_cast<const SITargetLowering *>(getTargetLowering())->isMemOpUniform(N)) ||
+ ((((Ld->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS) || (Ld->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS_32BIT)) && !N->isDivergent()) ||
(Subtarget->getScalarizeGlobalBehavior() && Ld->getAddressSpace() == AMDGPUASI.GLOBAL_ADDRESS &&
- !Ld->isVolatile() &&
- static_cast<const SITargetLowering *>(getTargetLowering())->isMemOpUniform(N) &&
+ !Ld->isVolatile() && !N->isDivergent() &&
static_cast<const SITargetLowering *>(getTargetLowering())->isMemOpHasNoClobberedMemOperand(N)));
}]>;
@@ -407,6 +564,11 @@ multiclass SM_Real_Stores_vi<bits<8> op, string ps,
}
}
+multiclass SM_Real_Probe_vi<bits<8> op, string ps> {
+ def _IMM_vi : SMEM_Real_Store_vi <op, !cast<SM_Probe_Pseudo>(ps#_IMM)>;
+ def _SGPR_vi : SMEM_Real_Store_vi <op, !cast<SM_Probe_Pseudo>(ps#_SGPR)>;
+}
+
defm S_LOAD_DWORD : SM_Real_Loads_vi <0x00, "S_LOAD_DWORD">;
defm S_LOAD_DWORDX2 : SM_Real_Loads_vi <0x01, "S_LOAD_DWORDX2">;
defm S_LOAD_DWORDX4 : SM_Real_Loads_vi <0x02, "S_LOAD_DWORDX4">;
@@ -434,6 +596,103 @@ def S_DCACHE_WB_VOL_vi : SMEM_Real_vi <0x23, S_DCACHE_WB_VOL>;
def S_MEMTIME_vi : SMEM_Real_vi <0x24, S_MEMTIME>;
def S_MEMREALTIME_vi : SMEM_Real_vi <0x25, S_MEMREALTIME>;
+defm S_SCRATCH_LOAD_DWORD : SM_Real_Loads_vi <0x05, "S_SCRATCH_LOAD_DWORD">;
+defm S_SCRATCH_LOAD_DWORDX2 : SM_Real_Loads_vi <0x06, "S_SCRATCH_LOAD_DWORDX2">;
+defm S_SCRATCH_LOAD_DWORDX4 : SM_Real_Loads_vi <0x07, "S_SCRATCH_LOAD_DWORDX4">;
+
+defm S_SCRATCH_STORE_DWORD : SM_Real_Stores_vi <0x15, "S_SCRATCH_STORE_DWORD">;
+defm S_SCRATCH_STORE_DWORDX2 : SM_Real_Stores_vi <0x16, "S_SCRATCH_STORE_DWORDX2">;
+defm S_SCRATCH_STORE_DWORDX4 : SM_Real_Stores_vi <0x17, "S_SCRATCH_STORE_DWORDX4">;
+
+defm S_ATC_PROBE : SM_Real_Probe_vi <0x26, "S_ATC_PROBE">;
+defm S_ATC_PROBE_BUFFER : SM_Real_Probe_vi <0x27, "S_ATC_PROBE_BUFFER">;
+
+//===----------------------------------------------------------------------===//
+// GFX9
+//===----------------------------------------------------------------------===//
+
+class SMEM_Atomic_Real_vi <bits<8> op, SM_Atomic_Pseudo ps>
+ : SMEM_Real_vi <op, ps> {
+
+ bits<7> sdata;
+
+ let Constraints = ps.Constraints;
+ let DisableEncoding = ps.DisableEncoding;
+
+ let glc = ps.glc;
+ let Inst{12-6} = !if(glc, sdst{6-0}, sdata{6-0});
+}
+
+multiclass SM_Real_Atomics_vi<bits<8> op, string ps> {
+ def _IMM_vi : SMEM_Atomic_Real_vi <op, !cast<SM_Atomic_Pseudo>(ps#_IMM)>;
+ def _SGPR_vi : SMEM_Atomic_Real_vi <op, !cast<SM_Atomic_Pseudo>(ps#_SGPR)>;
+ def _IMM_RTN_vi : SMEM_Atomic_Real_vi <op, !cast<SM_Atomic_Pseudo>(ps#_IMM_RTN)>;
+ def _SGPR_RTN_vi : SMEM_Atomic_Real_vi <op, !cast<SM_Atomic_Pseudo>(ps#_SGPR_RTN)>;
+}
+
+defm S_BUFFER_ATOMIC_SWAP : SM_Real_Atomics_vi <0x40, "S_BUFFER_ATOMIC_SWAP">;
+defm S_BUFFER_ATOMIC_CMPSWAP : SM_Real_Atomics_vi <0x41, "S_BUFFER_ATOMIC_CMPSWAP">;
+defm S_BUFFER_ATOMIC_ADD : SM_Real_Atomics_vi <0x42, "S_BUFFER_ATOMIC_ADD">;
+defm S_BUFFER_ATOMIC_SUB : SM_Real_Atomics_vi <0x43, "S_BUFFER_ATOMIC_SUB">;
+defm S_BUFFER_ATOMIC_SMIN : SM_Real_Atomics_vi <0x44, "S_BUFFER_ATOMIC_SMIN">;
+defm S_BUFFER_ATOMIC_UMIN : SM_Real_Atomics_vi <0x45, "S_BUFFER_ATOMIC_UMIN">;
+defm S_BUFFER_ATOMIC_SMAX : SM_Real_Atomics_vi <0x46, "S_BUFFER_ATOMIC_SMAX">;
+defm S_BUFFER_ATOMIC_UMAX : SM_Real_Atomics_vi <0x47, "S_BUFFER_ATOMIC_UMAX">;
+defm S_BUFFER_ATOMIC_AND : SM_Real_Atomics_vi <0x48, "S_BUFFER_ATOMIC_AND">;
+defm S_BUFFER_ATOMIC_OR : SM_Real_Atomics_vi <0x49, "S_BUFFER_ATOMIC_OR">;
+defm S_BUFFER_ATOMIC_XOR : SM_Real_Atomics_vi <0x4a, "S_BUFFER_ATOMIC_XOR">;
+defm S_BUFFER_ATOMIC_INC : SM_Real_Atomics_vi <0x4b, "S_BUFFER_ATOMIC_INC">;
+defm S_BUFFER_ATOMIC_DEC : SM_Real_Atomics_vi <0x4c, "S_BUFFER_ATOMIC_DEC">;
+
+defm S_BUFFER_ATOMIC_SWAP_X2 : SM_Real_Atomics_vi <0x60, "S_BUFFER_ATOMIC_SWAP_X2">;
+defm S_BUFFER_ATOMIC_CMPSWAP_X2 : SM_Real_Atomics_vi <0x61, "S_BUFFER_ATOMIC_CMPSWAP_X2">;
+defm S_BUFFER_ATOMIC_ADD_X2 : SM_Real_Atomics_vi <0x62, "S_BUFFER_ATOMIC_ADD_X2">;
+defm S_BUFFER_ATOMIC_SUB_X2 : SM_Real_Atomics_vi <0x63, "S_BUFFER_ATOMIC_SUB_X2">;
+defm S_BUFFER_ATOMIC_SMIN_X2 : SM_Real_Atomics_vi <0x64, "S_BUFFER_ATOMIC_SMIN_X2">;
+defm S_BUFFER_ATOMIC_UMIN_X2 : SM_Real_Atomics_vi <0x65, "S_BUFFER_ATOMIC_UMIN_X2">;
+defm S_BUFFER_ATOMIC_SMAX_X2 : SM_Real_Atomics_vi <0x66, "S_BUFFER_ATOMIC_SMAX_X2">;
+defm S_BUFFER_ATOMIC_UMAX_X2 : SM_Real_Atomics_vi <0x67, "S_BUFFER_ATOMIC_UMAX_X2">;
+defm S_BUFFER_ATOMIC_AND_X2 : SM_Real_Atomics_vi <0x68, "S_BUFFER_ATOMIC_AND_X2">;
+defm S_BUFFER_ATOMIC_OR_X2 : SM_Real_Atomics_vi <0x69, "S_BUFFER_ATOMIC_OR_X2">;
+defm S_BUFFER_ATOMIC_XOR_X2 : SM_Real_Atomics_vi <0x6a, "S_BUFFER_ATOMIC_XOR_X2">;
+defm S_BUFFER_ATOMIC_INC_X2 : SM_Real_Atomics_vi <0x6b, "S_BUFFER_ATOMIC_INC_X2">;
+defm S_BUFFER_ATOMIC_DEC_X2 : SM_Real_Atomics_vi <0x6c, "S_BUFFER_ATOMIC_DEC_X2">;
+
+defm S_ATOMIC_SWAP : SM_Real_Atomics_vi <0x80, "S_ATOMIC_SWAP">;
+defm S_ATOMIC_CMPSWAP : SM_Real_Atomics_vi <0x81, "S_ATOMIC_CMPSWAP">;
+defm S_ATOMIC_ADD : SM_Real_Atomics_vi <0x82, "S_ATOMIC_ADD">;
+defm S_ATOMIC_SUB : SM_Real_Atomics_vi <0x83, "S_ATOMIC_SUB">;
+defm S_ATOMIC_SMIN : SM_Real_Atomics_vi <0x84, "S_ATOMIC_SMIN">;
+defm S_ATOMIC_UMIN : SM_Real_Atomics_vi <0x85, "S_ATOMIC_UMIN">;
+defm S_ATOMIC_SMAX : SM_Real_Atomics_vi <0x86, "S_ATOMIC_SMAX">;
+defm S_ATOMIC_UMAX : SM_Real_Atomics_vi <0x87, "S_ATOMIC_UMAX">;
+defm S_ATOMIC_AND : SM_Real_Atomics_vi <0x88, "S_ATOMIC_AND">;
+defm S_ATOMIC_OR : SM_Real_Atomics_vi <0x89, "S_ATOMIC_OR">;
+defm S_ATOMIC_XOR : SM_Real_Atomics_vi <0x8a, "S_ATOMIC_XOR">;
+defm S_ATOMIC_INC : SM_Real_Atomics_vi <0x8b, "S_ATOMIC_INC">;
+defm S_ATOMIC_DEC : SM_Real_Atomics_vi <0x8c, "S_ATOMIC_DEC">;
+
+defm S_ATOMIC_SWAP_X2 : SM_Real_Atomics_vi <0xa0, "S_ATOMIC_SWAP_X2">;
+defm S_ATOMIC_CMPSWAP_X2 : SM_Real_Atomics_vi <0xa1, "S_ATOMIC_CMPSWAP_X2">;
+defm S_ATOMIC_ADD_X2 : SM_Real_Atomics_vi <0xa2, "S_ATOMIC_ADD_X2">;
+defm S_ATOMIC_SUB_X2 : SM_Real_Atomics_vi <0xa3, "S_ATOMIC_SUB_X2">;
+defm S_ATOMIC_SMIN_X2 : SM_Real_Atomics_vi <0xa4, "S_ATOMIC_SMIN_X2">;
+defm S_ATOMIC_UMIN_X2 : SM_Real_Atomics_vi <0xa5, "S_ATOMIC_UMIN_X2">;
+defm S_ATOMIC_SMAX_X2 : SM_Real_Atomics_vi <0xa6, "S_ATOMIC_SMAX_X2">;
+defm S_ATOMIC_UMAX_X2 : SM_Real_Atomics_vi <0xa7, "S_ATOMIC_UMAX_X2">;
+defm S_ATOMIC_AND_X2 : SM_Real_Atomics_vi <0xa8, "S_ATOMIC_AND_X2">;
+defm S_ATOMIC_OR_X2 : SM_Real_Atomics_vi <0xa9, "S_ATOMIC_OR_X2">;
+defm S_ATOMIC_XOR_X2 : SM_Real_Atomics_vi <0xaa, "S_ATOMIC_XOR_X2">;
+defm S_ATOMIC_INC_X2 : SM_Real_Atomics_vi <0xab, "S_ATOMIC_INC_X2">;
+defm S_ATOMIC_DEC_X2 : SM_Real_Atomics_vi <0xac, "S_ATOMIC_DEC_X2">;
+
+multiclass SM_Real_Discard_vi<bits<8> op, string ps> {
+ def _IMM_vi : SMEM_Real_vi <op, !cast<SM_Discard_Pseudo>(ps#_IMM)>;
+ def _SGPR_vi : SMEM_Real_vi <op, !cast<SM_Discard_Pseudo>(ps#_SGPR)>;
+}
+
+defm S_DCACHE_DISCARD : SM_Real_Discard_vi <0x28, "S_DCACHE_DISCARD">;
+defm S_DCACHE_DISCARD_X2 : SM_Real_Discard_vi <0x29, "S_DCACHE_DISCARD_X2">;
//===----------------------------------------------------------------------===//
// CI
@@ -502,7 +761,7 @@ let AddedComplexity = SM_LOAD_PATTERN.AddedComplexity in {
class SMRD_Pattern_ci <string Instr, ValueType vt> : GCNPat <
(smrd_load (SMRDImm32 i64:$sbase, i32:$offset)),
- (vt (!cast<SM_Pseudo>(Instr#"_IMM_ci") $sbase, $offset, 0))> {
+ (vt (!cast<InstSI>(Instr#"_IMM_ci") $sbase, $offset, 0))> {
let OtherPredicates = [isCIOnly];
}
diff --git a/lib/Target/AMDGPU/SOPInstructions.td b/lib/Target/AMDGPU/SOPInstructions.td
index 02a95a4b6f24..6f5db9644c86 100644
--- a/lib/Target/AMDGPU/SOPInstructions.td
+++ b/lib/Target/AMDGPU/SOPInstructions.td
@@ -19,17 +19,28 @@ def GPRIdxMode : Operand<i32> {
let OperandType = "OPERAND_IMMEDIATE";
}
+class SOP_Pseudo<string opName, dag outs, dag ins, string asmOps,
+ list<dag> pattern=[]> :
+ InstSI<outs, ins, "", pattern>,
+ SIMCInstr<opName, SIEncodingFamily.NONE> {
+
+ let isPseudo = 1;
+ let isCodeGenOnly = 1;
+ let SubtargetPredicate = isGCN;
+
+ string Mnemonic = opName;
+ string AsmOperands = asmOps;
+
+ bits<1> has_sdst = 0;
+}
+
//===----------------------------------------------------------------------===//
// SOP1 Instructions
//===----------------------------------------------------------------------===//
class SOP1_Pseudo <string opName, dag outs, dag ins,
string asmOps, list<dag> pattern=[]> :
- InstSI <outs, ins, "", pattern>,
- SIMCInstr<opName, SIEncodingFamily.NONE> {
- let isPseudo = 1;
- let isCodeGenOnly = 1;
- let SubtargetPredicate = isGCN;
+ SOP_Pseudo<opName, outs, ins, asmOps, pattern> {
let mayLoad = 0;
let mayStore = 0;
@@ -40,9 +51,6 @@ class SOP1_Pseudo <string opName, dag outs, dag ins,
let Size = 4;
let UseNamedOperandTable = 1;
- string Mnemonic = opName;
- string AsmOperands = asmOps;
-
bits<1> has_src0 = 1;
bits<1> has_sdst = 1;
}
@@ -247,17 +255,25 @@ def S_SET_GPR_IDX_IDX : SOP1_0_32<"s_set_gpr_idx_idx"> {
}
}
+let SubtargetPredicate = isGFX9 in {
+ let hasSideEffects = 1, Defs = [EXEC, SCC], Uses = [EXEC] in {
+ def S_ANDN1_SAVEEXEC_B64 : SOP1_64<"s_andn1_saveexec_b64">;
+ def S_ORN1_SAVEEXEC_B64 : SOP1_64<"s_orn1_saveexec_b64">;
+ def S_ANDN1_WREXEC_B64 : SOP1_64<"s_andn1_wrexec_b64">;
+ def S_ANDN2_WREXEC_B64 : SOP1_64<"s_andn2_wrexec_b64">;
+ } // End hasSideEffects = 1, Defs = [EXEC, SCC], Uses = [EXEC]
+
+ def S_BITREPLICATE_B64_B32 : SOP1_64_32<"s_bitreplicate_b64_b32">;
+} // End SubtargetPredicate = isGFX9
+
//===----------------------------------------------------------------------===//
// SOP2 Instructions
//===----------------------------------------------------------------------===//
class SOP2_Pseudo<string opName, dag outs, dag ins,
string asmOps, list<dag> pattern=[]> :
- InstSI<outs, ins, "", pattern>,
- SIMCInstr<opName, SIEncodingFamily.NONE> {
- let isPseudo = 1;
- let isCodeGenOnly = 1;
- let SubtargetPredicate = isGCN;
+ SOP_Pseudo<opName, outs, ins, asmOps, pattern> {
+
let mayLoad = 0;
let mayStore = 0;
let hasSideEffects = 0;
@@ -266,10 +282,7 @@ class SOP2_Pseudo<string opName, dag outs, dag ins,
let SchedRW = [WriteSALU];
let UseNamedOperandTable = 1;
- string Mnemonic = opName;
- string AsmOperands = asmOps;
-
- bits<1> has_sdst = 1;
+ let has_sdst = 1;
// Pseudo instructions have no encodings, but adding this field here allows
// us to do:
@@ -279,7 +292,7 @@ class SOP2_Pseudo<string opName, dag outs, dag ins,
// let Size = 4; // Do we need size here?
}
-class SOP2_Real<bits<7> op, SOP2_Pseudo ps> :
+class SOP2_Real<bits<7> op, SOP_Pseudo ps> :
InstSI <ps.OutOperandList, ps.InOperandList,
ps.Mnemonic # " " # ps.AsmOperands, []>,
Enc32 {
@@ -482,6 +495,16 @@ let SubtargetPredicate = isGFX9 in {
def S_PACK_LL_B32_B16 : SOP2_32<"s_pack_ll_b32_b16">;
def S_PACK_LH_B32_B16 : SOP2_32<"s_pack_lh_b32_b16">;
def S_PACK_HH_B32_B16 : SOP2_32<"s_pack_hh_b32_b16">;
+
+ let Defs = [SCC] in {
+ def S_LSHL1_ADD_U32 : SOP2_32<"s_lshl1_add_u32">;
+ def S_LSHL2_ADD_U32 : SOP2_32<"s_lshl2_add_u32">;
+ def S_LSHL3_ADD_U32 : SOP2_32<"s_lshl3_add_u32">;
+ def S_LSHL4_ADD_U32 : SOP2_32<"s_lshl4_add_u32">;
+ } // End Defs = [SCC]
+
+ def S_MUL_HI_U32 : SOP2_32<"s_mul_hi_u32">;
+ def S_MUL_HI_I32 : SOP2_32<"s_mul_hi_i32">;
}
//===----------------------------------------------------------------------===//
@@ -659,6 +682,16 @@ def S_SETREG_IMM32_B32 : SOPK_Pseudo <
} // End hasSideEffects = 1
+let SubtargetPredicate = isGFX9 in {
+ def S_CALL_B64 : SOPK_Pseudo<
+ "s_call_b64",
+ (outs SReg_64:$sdst),
+ (ins s16imm:$simm16),
+ "$sdst, $simm16"> {
+ let isCall = 1;
+ }
+}
+
//===----------------------------------------------------------------------===//
// SOPC Instructions
//===----------------------------------------------------------------------===//
@@ -806,6 +839,13 @@ def S_ENDPGM_SAVED : SOPP <0x0000001B, (ins), "s_endpgm_saved"> {
}
}
+let SubtargetPredicate = isGFX9 in {
+ let isBarrier = 1, isReturn = 1, simm16 = 0 in {
+ def S_ENDPGM_ORDERED_PS_DONE :
+ SOPP<0x01e, (ins), "s_endpgm_ordered_ps_done">;
+ } // End isBarrier = 1, isReturn = 1, simm16 = 0
+} // End SubtargetPredicate = isGFX9
+
let isBranch = 1, SchedRW = [WriteBranch] in {
def S_BRANCH : SOPP <
0x00000002, (ins sopp_brtarget:$simm16), "s_branch $simm16",
@@ -1312,3 +1352,26 @@ def S_SETREG_B32_vi : SOPK_Real_vi <0x12, S_SETREG_B32>;
//def S_GETREG_REGRD_B32_vi : SOPK_Real_vi <0x13, S_GETREG_REGRD_B32>; // see pseudo for comments
def S_SETREG_IMM32_B32_vi : SOPK_Real64<0x14, S_SETREG_IMM32_B32>,
Select_vi<S_SETREG_IMM32_B32.Mnemonic>;
+
+def S_CALL_B64_vi : SOPK_Real_vi <0x15, S_CALL_B64>;
+
+//===----------------------------------------------------------------------===//
+// SOP1 - GFX9.
+//===----------------------------------------------------------------------===//
+
+def S_ANDN1_SAVEEXEC_B64_vi : SOP1_Real_vi<0x33, S_ANDN1_SAVEEXEC_B64>;
+def S_ORN1_SAVEEXEC_B64_vi : SOP1_Real_vi<0x34, S_ORN1_SAVEEXEC_B64>;
+def S_ANDN1_WREXEC_B64_vi : SOP1_Real_vi<0x35, S_ANDN1_WREXEC_B64>;
+def S_ANDN2_WREXEC_B64_vi : SOP1_Real_vi<0x36, S_ANDN2_WREXEC_B64>;
+def S_BITREPLICATE_B64_B32_vi : SOP1_Real_vi<0x37, S_BITREPLICATE_B64_B32>;
+
+//===----------------------------------------------------------------------===//
+// SOP2 - GFX9.
+//===----------------------------------------------------------------------===//
+
+def S_LSHL1_ADD_U32_vi : SOP2_Real_vi<0x2e, S_LSHL1_ADD_U32>;
+def S_LSHL2_ADD_U32_vi : SOP2_Real_vi<0x2f, S_LSHL2_ADD_U32>;
+def S_LSHL3_ADD_U32_vi : SOP2_Real_vi<0x30, S_LSHL3_ADD_U32>;
+def S_LSHL4_ADD_U32_vi : SOP2_Real_vi<0x31, S_LSHL4_ADD_U32>;
+def S_MUL_HI_U32_vi : SOP2_Real_vi<0x2c, S_MUL_HI_U32>;
+def S_MUL_HI_I32_vi : SOP2_Real_vi<0x2d, S_MUL_HI_I32>;
diff --git a/lib/Target/AMDGPU/TargetInfo/AMDGPUTargetInfo.cpp b/lib/Target/AMDGPU/TargetInfo/AMDGPUTargetInfo.cpp
index f61e2e413ad4..e4c442db3016 100644
--- a/lib/Target/AMDGPU/TargetInfo/AMDGPUTargetInfo.cpp
+++ b/lib/Target/AMDGPU/TargetInfo/AMDGPUTargetInfo.cpp
@@ -16,19 +16,19 @@
using namespace llvm;
-/// \brief The target which supports all AMD GPUs. This will eventually
+/// The target which supports all AMD GPUs. This will eventually
/// be deprecated and there will be a R600 target and a GCN target.
Target &llvm::getTheAMDGPUTarget() {
static Target TheAMDGPUTarget;
return TheAMDGPUTarget;
}
-/// \brief The target for GCN GPUs
+/// The target for GCN GPUs
Target &llvm::getTheGCNTarget() {
static Target TheGCNTarget;
return TheGCNTarget;
}
-/// \brief Extern function to initialize the targets for the AMDGPU backend
+/// Extern function to initialize the targets for the AMDGPU backend
extern "C" void LLVMInitializeAMDGPUTargetInfo() {
RegisterTarget<Triple::r600, false> R600(getTheAMDGPUTarget(), "r600",
"AMD GPUs HD2XXX-HD6XXX", "AMDGPU");
diff --git a/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp b/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp
index 03b11ae80500..9eb4c6513cce 100644
--- a/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp
+++ b/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp
@@ -61,7 +61,15 @@ const char* const IdSymbolic[] = {
"HW_REG_HW_ID",
"HW_REG_GPR_ALLOC",
"HW_REG_LDS_ALLOC",
- "HW_REG_IB_STS"
+ "HW_REG_IB_STS",
+ nullptr,
+ nullptr,
+ nullptr,
+ nullptr,
+ nullptr,
+ nullptr,
+ nullptr,
+ "HW_REG_SH_MEM_BASES"
};
} // namespace Hwreg
diff --git a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index 125a3b22d0cf..3fd3c75874a3 100644
--- a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -8,6 +8,7 @@
//===----------------------------------------------------------------------===//
#include "AMDGPUBaseInfo.h"
+#include "AMDGPUTargetTransformInfo.h"
#include "AMDGPU.h"
#include "SIDefines.h"
#include "llvm/ADT/StringRef.h"
@@ -52,7 +53,7 @@ unsigned getBitMask(unsigned Shift, unsigned Width) {
return ((1 << Width) - 1) << Shift;
}
-/// \brief Packs \p Src into \p Dst for given bit \p Shift and bit \p Width.
+/// Packs \p Src into \p Dst for given bit \p Shift and bit \p Width.
///
/// \returns Packed \p Dst.
unsigned packBits(unsigned Src, unsigned Dst, unsigned Shift, unsigned Width) {
@@ -61,7 +62,7 @@ unsigned packBits(unsigned Src, unsigned Dst, unsigned Shift, unsigned Width) {
return Dst;
}
-/// \brief Unpacks bits from \p Src for given bit \p Shift and bit \p Width.
+/// Unpacks bits from \p Src for given bit \p Shift and bit \p Width.
///
/// \returns Unpacked bits.
unsigned unpackBits(unsigned Src, unsigned Shift, unsigned Width) {
@@ -96,64 +97,34 @@ unsigned getVmcntBitWidthHi() { return 2; }
namespace llvm {
-static cl::opt<bool> EnablePackedInlinableLiterals(
- "enable-packed-inlinable-literals",
- cl::desc("Enable packed inlinable literals (v2f16, v2i16)"),
- cl::init(false));
-
namespace AMDGPU {
-LLVM_READNONE
-static inline Channels indexToChannel(unsigned Channel) {
- switch (Channel) {
- case 1:
- return AMDGPU::Channels_1;
- case 2:
- return AMDGPU::Channels_2;
- case 3:
- return AMDGPU::Channels_3;
- case 4:
- return AMDGPU::Channels_4;
- default:
- llvm_unreachable("invalid MIMG channel");
- }
-}
+struct MIMGInfo {
+ uint16_t Opcode;
+ uint16_t BaseOpcode;
+ uint8_t MIMGEncoding;
+ uint8_t VDataDwords;
+ uint8_t VAddrDwords;
+};
+#define GET_MIMGBaseOpcodesTable_IMPL
+#define GET_MIMGDimInfoTable_IMPL
+#define GET_MIMGInfoTable_IMPL
+#include "AMDGPUGenSearchableTables.inc"
-// FIXME: Need to handle d16 images correctly.
-static unsigned rcToChannels(unsigned RCID) {
- switch (RCID) {
- case AMDGPU::VGPR_32RegClassID:
- return 1;
- case AMDGPU::VReg_64RegClassID:
- return 2;
- case AMDGPU::VReg_96RegClassID:
- return 3;
- case AMDGPU::VReg_128RegClassID:
- return 4;
- default:
- llvm_unreachable("invalid MIMG register class");
- }
+int getMIMGOpcode(unsigned BaseOpcode, unsigned MIMGEncoding,
+ unsigned VDataDwords, unsigned VAddrDwords) {
+ const MIMGInfo *Info = getMIMGOpcodeHelper(BaseOpcode, MIMGEncoding,
+ VDataDwords, VAddrDwords);
+ return Info ? Info->Opcode : -1;
}
-int getMaskedMIMGOp(const MCInstrInfo &MII, unsigned Opc, unsigned NewChannels) {
- AMDGPU::Channels Channel = AMDGPU::indexToChannel(NewChannels);
- unsigned OrigChannels = rcToChannels(MII.get(Opc).OpInfo[0].RegClass);
- if (NewChannels == OrigChannels)
- return Opc;
-
- switch (OrigChannels) {
- case 1:
- return AMDGPU::getMaskedMIMGOp1(Opc, Channel);
- case 2:
- return AMDGPU::getMaskedMIMGOp2(Opc, Channel);
- case 3:
- return AMDGPU::getMaskedMIMGOp3(Opc, Channel);
- case 4:
- return AMDGPU::getMaskedMIMGOp4(Opc, Channel);
- default:
- llvm_unreachable("invalid MIMG channel");
- }
+int getMaskedMIMGOp(unsigned Opc, unsigned NewChannels) {
+ const MIMGInfo *OrigInfo = getMIMGInfo(Opc);
+ const MIMGInfo *NewInfo =
+ getMIMGOpcodeHelper(OrigInfo->BaseOpcode, OrigInfo->MIMGEncoding,
+ NewChannels, OrigInfo->VAddrDwords);
+ return NewInfo ? NewInfo->Opcode : -1;
}
// Wrapper for Tablegen'd function. enum Subtarget is not defined in any
@@ -183,10 +154,10 @@ IsaVersion getIsaVersion(const FeatureBitset &Features) {
return {7, 0, 3};
if (Features.test(FeatureISAVersion7_0_4))
return {7, 0, 4};
+ if (Features.test(FeatureSeaIslands))
+ return {7, 0, 0};
// GCN GFX8 (Volcanic Islands (VI)).
- if (Features.test(FeatureISAVersion8_0_0))
- return {8, 0, 0};
if (Features.test(FeatureISAVersion8_0_1))
return {8, 0, 1};
if (Features.test(FeatureISAVersion8_0_2))
@@ -195,14 +166,22 @@ IsaVersion getIsaVersion(const FeatureBitset &Features) {
return {8, 0, 3};
if (Features.test(FeatureISAVersion8_1_0))
return {8, 1, 0};
+ if (Features.test(FeatureVolcanicIslands))
+ return {8, 0, 0};
// GCN GFX9.
if (Features.test(FeatureISAVersion9_0_0))
return {9, 0, 0};
if (Features.test(FeatureISAVersion9_0_2))
return {9, 0, 2};
+ if (Features.test(FeatureISAVersion9_0_4))
+ return {9, 0, 4};
+ if (Features.test(FeatureISAVersion9_0_6))
+ return {9, 0, 6};
+ if (Features.test(FeatureGFX9))
+ return {9, 0, 0};
- if (!Features.test(FeatureGCN) || Features.test(FeatureSouthernIslands))
+ if (Features.test(FeatureSouthernIslands))
return {0, 0, 0};
return {7, 0, 0};
}
@@ -219,11 +198,15 @@ void streamIsaVersion(const MCSubtargetInfo *STI, raw_ostream &Stream) {
<< ISAVersion.Major
<< ISAVersion.Minor
<< ISAVersion.Stepping;
+
+ if (hasXNACK(*STI))
+ Stream << "+xnack";
+
Stream.flush();
}
-bool hasCodeObjectV3(const FeatureBitset &Features) {
- return Features.test(FeatureCodeObjectV3);
+bool hasCodeObjectV3(const MCSubtargetInfo *STI) {
+ return STI->getFeatureBits().test(FeatureCodeObjectV3);
}
unsigned getWavefrontSize(const FeatureBitset &Features) {
@@ -260,7 +243,7 @@ unsigned getMaxWorkGroupsPerCU(const FeatureBitset &Features,
}
unsigned getMaxWavesPerCU(const FeatureBitset &Features) {
- return getMaxWavesPerEU(Features) * getEUsPerCU(Features);
+ return getMaxWavesPerEU() * getEUsPerCU(Features);
}
unsigned getMaxWavesPerCU(const FeatureBitset &Features,
@@ -272,9 +255,7 @@ unsigned getMinWavesPerEU(const FeatureBitset &Features) {
return 1;
}
-unsigned getMaxWavesPerEU(const FeatureBitset &Features) {
- if (!Features.test(FeatureGCN))
- return 8;
+unsigned getMaxWavesPerEU() {
// FIXME: Need to take scratch memory into account.
return 10;
}
@@ -330,11 +311,13 @@ unsigned getAddressableNumSGPRs(const FeatureBitset &Features) {
unsigned getMinNumSGPRs(const FeatureBitset &Features, unsigned WavesPerEU) {
assert(WavesPerEU != 0);
- if (WavesPerEU >= getMaxWavesPerEU(Features))
+ if (WavesPerEU >= getMaxWavesPerEU())
return 0;
- unsigned MinNumSGPRs =
- alignDown(getTotalNumSGPRs(Features) / (WavesPerEU + 1),
- getSGPRAllocGranule(Features)) + 1;
+
+ unsigned MinNumSGPRs = getTotalNumSGPRs(Features) / (WavesPerEU + 1);
+ if (Features.test(FeatureTrapHandler))
+ MinNumSGPRs -= std::min(MinNumSGPRs, (unsigned)TRAP_NUM_SGPRS);
+ MinNumSGPRs = alignDown(MinNumSGPRs, getSGPRAllocGranule(Features)) + 1;
return std::min(MinNumSGPRs, getAddressableNumSGPRs(Features));
}
@@ -343,14 +326,49 @@ unsigned getMaxNumSGPRs(const FeatureBitset &Features, unsigned WavesPerEU,
assert(WavesPerEU != 0);
IsaVersion Version = getIsaVersion(Features);
- unsigned MaxNumSGPRs = alignDown(getTotalNumSGPRs(Features) / WavesPerEU,
- getSGPRAllocGranule(Features));
unsigned AddressableNumSGPRs = getAddressableNumSGPRs(Features);
if (Version.Major >= 8 && !Addressable)
AddressableNumSGPRs = 112;
+ unsigned MaxNumSGPRs = getTotalNumSGPRs(Features) / WavesPerEU;
+ if (Features.test(FeatureTrapHandler))
+ MaxNumSGPRs -= std::min(MaxNumSGPRs, (unsigned)TRAP_NUM_SGPRS);
+ MaxNumSGPRs = alignDown(MaxNumSGPRs, getSGPRAllocGranule(Features));
return std::min(MaxNumSGPRs, AddressableNumSGPRs);
}
+unsigned getNumExtraSGPRs(const FeatureBitset &Features, bool VCCUsed,
+ bool FlatScrUsed, bool XNACKUsed) {
+ unsigned ExtraSGPRs = 0;
+ if (VCCUsed)
+ ExtraSGPRs = 2;
+
+ IsaVersion Version = getIsaVersion(Features);
+ if (Version.Major < 8) {
+ if (FlatScrUsed)
+ ExtraSGPRs = 4;
+ } else {
+ if (XNACKUsed)
+ ExtraSGPRs = 4;
+
+ if (FlatScrUsed)
+ ExtraSGPRs = 6;
+ }
+
+ return ExtraSGPRs;
+}
+
+unsigned getNumExtraSGPRs(const FeatureBitset &Features, bool VCCUsed,
+ bool FlatScrUsed) {
+ return getNumExtraSGPRs(Features, VCCUsed, FlatScrUsed,
+ Features[AMDGPU::FeatureXNACK]);
+}
+
+unsigned getNumSGPRBlocks(const FeatureBitset &Features, unsigned NumSGPRs) {
+ NumSGPRs = alignTo(std::max(1u, NumSGPRs), getSGPREncodingGranule(Features));
+ // SGPRBlocks is actual number of SGPR blocks minus 1.
+ return NumSGPRs / getSGPREncodingGranule(Features) - 1;
+}
+
unsigned getVGPRAllocGranule(const FeatureBitset &Features) {
return 4;
}
@@ -370,7 +388,7 @@ unsigned getAddressableNumVGPRs(const FeatureBitset &Features) {
unsigned getMinNumVGPRs(const FeatureBitset &Features, unsigned WavesPerEU) {
assert(WavesPerEU != 0);
- if (WavesPerEU >= getMaxWavesPerEU(Features))
+ if (WavesPerEU >= getMaxWavesPerEU())
return 0;
unsigned MinNumVGPRs =
alignDown(getTotalNumVGPRs(Features) / (WavesPerEU + 1),
@@ -387,6 +405,12 @@ unsigned getMaxNumVGPRs(const FeatureBitset &Features, unsigned WavesPerEU) {
return std::min(MaxNumVGPRs, AddressableNumVGPRs);
}
+unsigned getNumVGPRBlocks(const FeatureBitset &Features, unsigned NumVGPRs) {
+ NumVGPRs = alignTo(std::max(1u, NumVGPRs), getVGPREncodingGranule(Features));
+ // VGPRBlocks is actual number of VGPR blocks minus 1.
+ return NumVGPRs / getVGPREncodingGranule(Features) - 1;
+}
+
} // end namespace IsaInfo
void initDefaultAMDKernelCodeT(amd_kernel_code_t &Header,
@@ -396,7 +420,7 @@ void initDefaultAMDKernelCodeT(amd_kernel_code_t &Header,
memset(&Header, 0, sizeof(Header));
Header.amd_kernel_code_version_major = 1;
- Header.amd_kernel_code_version_minor = 1;
+ Header.amd_kernel_code_version_minor = 2;
Header.amd_machine_kind = 1; // AMD_MACHINE_KIND_AMDGPU
Header.amd_machine_version_major = ISA.Major;
Header.amd_machine_version_minor = ISA.Minor;
@@ -416,6 +440,21 @@ void initDefaultAMDKernelCodeT(amd_kernel_code_t &Header,
Header.private_segment_alignment = 4;
}
+amdhsa::kernel_descriptor_t getDefaultAmdhsaKernelDescriptor() {
+ amdhsa::kernel_descriptor_t KD;
+ memset(&KD, 0, sizeof(KD));
+ AMDHSA_BITS_SET(KD.compute_pgm_rsrc1,
+ amdhsa::COMPUTE_PGM_RSRC1_FLOAT_DENORM_MODE_16_64,
+ amdhsa::FLOAT_DENORM_MODE_FLUSH_NONE);
+ AMDHSA_BITS_SET(KD.compute_pgm_rsrc1,
+ amdhsa::COMPUTE_PGM_RSRC1_ENABLE_DX10_CLAMP, 1);
+ AMDHSA_BITS_SET(KD.compute_pgm_rsrc1,
+ amdhsa::COMPUTE_PGM_RSRC1_ENABLE_IEEE_MODE, 1);
+ AMDHSA_BITS_SET(KD.compute_pgm_rsrc2,
+ amdhsa::COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_ID_X, 1);
+ return KD;
+}
+
bool isGroupSegment(const GlobalValue *GV) {
return GV->getType()->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS;
}
@@ -425,7 +464,8 @@ bool isGlobalSegment(const GlobalValue *GV) {
}
bool isReadOnlySegment(const GlobalValue *GV) {
- return GV->getType()->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS;
+ return GV->getType()->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
+ GV->getType()->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT;
}
bool shouldEmitConstantsToTextSection(const Triple &TT) {
@@ -598,6 +638,18 @@ bool isEntryFunctionCC(CallingConv::ID CC) {
}
}
+bool hasXNACK(const MCSubtargetInfo &STI) {
+ return STI.getFeatureBits()[AMDGPU::FeatureXNACK];
+}
+
+bool hasMIMG_R128(const MCSubtargetInfo &STI) {
+ return STI.getFeatureBits()[AMDGPU::FeatureMIMG_R128];
+}
+
+bool hasPackedD16(const MCSubtargetInfo &STI) {
+ return !STI.getFeatureBits()[AMDGPU::FeatureUnpackedD16VMem];
+}
+
bool isSI(const MCSubtargetInfo &STI) {
return STI.getFeatureBits()[AMDGPU::FeatureSouthernIslands];
}
@@ -681,6 +733,8 @@ bool isRegIntersect(unsigned Reg0, unsigned Reg1, const MCRegisterInfo* TRI) {
case node: return isGFX9(STI) ? node##_gfx9 : node##_vi;
unsigned getMCReg(unsigned Reg, const MCSubtargetInfo &STI) {
+ if (STI.getTargetTriple().getArch() == Triple::r600)
+ return Reg;
MAP_REG2REG
}
@@ -837,9 +891,6 @@ bool isInlinableLiteral16(int16_t Literal, bool HasInv2Pi) {
bool isInlinableLiteralV216(int32_t Literal, bool HasInv2Pi) {
assert(HasInv2Pi);
- if (!EnablePackedInlinableLiterals)
- return false;
-
int16_t Lo16 = static_cast<int16_t>(Literal);
int16_t Hi16 = static_cast<int16_t>(Literal >> 16);
return Lo16 == Hi16 && isInlinableLiteral16(Lo16, HasInv2Pi);
@@ -871,24 +922,6 @@ bool isArgPassedInSGPR(const Argument *A) {
}
}
-// TODO: Should largely merge with AMDGPUTTIImpl::isSourceOfDivergence.
-bool isUniformMMO(const MachineMemOperand *MMO) {
- const Value *Ptr = MMO->getValue();
- // UndefValue means this is a load of a kernel input. These are uniform.
- // Sometimes LDS instructions have constant pointers.
- // If Ptr is null, then that means this mem operand contains a
- // PseudoSourceValue like GOT.
- if (!Ptr || isa<UndefValue>(Ptr) ||
- isa<Constant>(Ptr) || isa<GlobalValue>(Ptr))
- return true;
-
- if (const Argument *Arg = dyn_cast<Argument>(Ptr))
- return isArgPassedInSGPR(Arg);
-
- const Instruction *I = dyn_cast<Instruction>(Ptr);
- return I && I->getMetadata("amdgpu.uniform");
-}
-
int64_t getSMRDEncodedOffset(const MCSubtargetInfo &ST, int64_t ByteOffset) {
if (isGCN3Encoding(ST))
return ByteOffset;
@@ -909,18 +942,10 @@ namespace llvm {
namespace AMDGPU {
AMDGPUAS getAMDGPUAS(Triple T) {
- auto Env = T.getEnvironmentName();
AMDGPUAS AS;
- if (Env == "amdgiz" || Env == "amdgizcl") {
- AS.FLAT_ADDRESS = 0;
- AS.PRIVATE_ADDRESS = 5;
- AS.REGION_ADDRESS = 4;
- }
- else {
- AS.FLAT_ADDRESS = 4;
- AS.PRIVATE_ADDRESS = 0;
- AS.REGION_ADDRESS = 5;
- }
+ AS.FLAT_ADDRESS = 0;
+ AS.PRIVATE_ADDRESS = 5;
+ AS.REGION_ADDRESS = 2;
return AS;
}
@@ -931,5 +956,21 @@ AMDGPUAS getAMDGPUAS(const TargetMachine &M) {
AMDGPUAS getAMDGPUAS(const Module &M) {
return getAMDGPUAS(Triple(M.getTargetTriple()));
}
+
+namespace {
+
+struct SourceOfDivergence {
+ unsigned Intr;
+};
+const SourceOfDivergence *lookupSourceOfDivergence(unsigned Intr);
+
+#define GET_SourcesOfDivergence_IMPL
+#include "AMDGPUGenSearchableTables.inc"
+
+} // end anonymous namespace
+
+bool isIntrinsicSourceOfDivergence(unsigned IntrID) {
+ return lookupSourceOfDivergence(IntrID);
+}
} // namespace AMDGPU
} // namespace llvm
diff --git a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index a215b445378e..70681c271697 100644
--- a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -16,6 +16,7 @@
#include "llvm/ADT/StringRef.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/MC/MCInstrDesc.h"
+#include "llvm/Support/AMDHSAKernelDescriptor.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/ErrorHandling.h"
#include <cstdint>
@@ -28,24 +29,31 @@ class Argument;
class FeatureBitset;
class Function;
class GlobalValue;
-class MachineMemOperand;
class MCContext;
class MCRegisterClass;
class MCRegisterInfo;
class MCSection;
class MCSubtargetInfo;
+class MachineMemOperand;
class Triple;
namespace AMDGPU {
+
+#define GET_MIMGBaseOpcode_DECL
+#define GET_MIMGDim_DECL
+#define GET_MIMGEncoding_DECL
+#include "AMDGPUGenSearchableTables.inc"
+
namespace IsaInfo {
enum {
// The closed Vulkan driver sets 96, which limits the wave count to 8 but
// doesn't spill SGPRs as much as when 80 is set.
- FIXED_NUM_SGPRS_FOR_INIT_BUG = 96
+ FIXED_NUM_SGPRS_FOR_INIT_BUG = 96,
+ TRAP_NUM_SGPRS = 16
};
-/// \brief Instruction set architecture version.
+/// Instruction set architecture version.
struct IsaVersion {
unsigned Major;
unsigned Minor;
@@ -55,12 +63,12 @@ struct IsaVersion {
/// \returns Isa version for given subtarget \p Features.
IsaVersion getIsaVersion(const FeatureBitset &Features);
-/// \brief Streams isa version string for given subtarget \p STI into \p Stream.
+/// Streams isa version string for given subtarget \p STI into \p Stream.
void streamIsaVersion(const MCSubtargetInfo *STI, raw_ostream &Stream);
-/// \returns True if given subtarget \p Features support code object version 3,
+/// \returns True if given subtarget \p STI supports code object version 3,
/// false otherwise.
-bool hasCodeObjectV3(const FeatureBitset &Features);
+bool hasCodeObjectV3(const MCSubtargetInfo *STI);
/// \returns Wavefront size for given subtarget \p Features.
unsigned getWavefrontSize(const FeatureBitset &Features);
@@ -92,7 +100,7 @@ unsigned getMinWavesPerEU(const FeatureBitset &Features);
/// \returns Maximum number of waves per execution unit for given subtarget \p
/// Features without any kind of limitation.
-unsigned getMaxWavesPerEU(const FeatureBitset &Features);
+unsigned getMaxWavesPerEU();
/// \returns Maximum number of waves per execution unit for given subtarget \p
/// Features and limited by given \p FlatWorkGroupSize.
@@ -131,6 +139,22 @@ unsigned getMinNumSGPRs(const FeatureBitset &Features, unsigned WavesPerEU);
unsigned getMaxNumSGPRs(const FeatureBitset &Features, unsigned WavesPerEU,
bool Addressable);
+/// \returns Number of extra SGPRs implicitly required by given subtarget \p
+/// Features when the given special registers are used.
+unsigned getNumExtraSGPRs(const FeatureBitset &Features, bool VCCUsed,
+ bool FlatScrUsed, bool XNACKUsed);
+
+/// \returns Number of extra SGPRs implicitly required by given subtarget \p
+/// Features when the given special registers are used. XNACK is inferred from
+/// \p Features.
+unsigned getNumExtraSGPRs(const FeatureBitset &Features, bool VCCUsed,
+ bool FlatScrUsed);
+
+/// \returns Number of SGPR blocks needed for given subtarget \p Features when
+/// \p NumSGPRs are used. \p NumSGPRs should already include any special
+/// register counts.
+unsigned getNumSGPRBlocks(const FeatureBitset &Features, unsigned NumSGPRs);
+
/// \returns VGPR allocation granularity for given subtarget \p Features.
unsigned getVGPRAllocGranule(const FeatureBitset &Features);
@@ -151,20 +175,57 @@ unsigned getMinNumVGPRs(const FeatureBitset &Features, unsigned WavesPerEU);
/// execution unit requirement for given subtarget \p Features.
unsigned getMaxNumVGPRs(const FeatureBitset &Features, unsigned WavesPerEU);
+/// \returns Number of VGPR blocks needed for given subtarget \p Features when
+/// \p NumVGPRs are used.
+unsigned getNumVGPRBlocks(const FeatureBitset &Features, unsigned NumSGPRs);
+
} // end namespace IsaInfo
LLVM_READONLY
int16_t getNamedOperandIdx(uint16_t Opcode, uint16_t NamedIdx);
+struct MIMGBaseOpcodeInfo {
+ MIMGBaseOpcode BaseOpcode;
+ bool Store;
+ bool Atomic;
+ bool AtomicX2;
+ bool Sampler;
+
+ uint8_t NumExtraArgs;
+ bool Gradients;
+ bool Coordinates;
+ bool LodOrClampOrMip;
+ bool HasD16;
+};
+
+LLVM_READONLY
+const MIMGBaseOpcodeInfo *getMIMGBaseOpcodeInfo(unsigned BaseOpcode);
+
+struct MIMGDimInfo {
+ MIMGDim Dim;
+ uint8_t NumCoords;
+ uint8_t NumGradients;
+ bool DA;
+};
+
LLVM_READONLY
-int getMaskedMIMGOp(const MCInstrInfo &MII,
- unsigned Opc, unsigned NewChannels);
+const MIMGDimInfo *getMIMGDimInfo(unsigned Dim);
+
+LLVM_READONLY
+int getMIMGOpcode(unsigned BaseOpcode, unsigned MIMGEncoding,
+ unsigned VDataDwords, unsigned VAddrDwords);
+
+LLVM_READONLY
+int getMaskedMIMGOp(unsigned Opc, unsigned NewChannels);
+
LLVM_READONLY
int getMCOpcode(uint16_t Opcode, unsigned Gen);
void initDefaultAMDKernelCodeT(amd_kernel_code_t &Header,
const FeatureBitset &Features);
+amdhsa::kernel_descriptor_t getDefaultAmdhsaKernelDescriptor();
+
bool isGroupSegment(const GlobalValue *GV);
bool isGlobalSegment(const GlobalValue *GV);
bool isReadOnlySegment(const GlobalValue *GV);
@@ -216,7 +277,7 @@ unsigned decodeExpcnt(const IsaInfo::IsaVersion &Version, unsigned Waitcnt);
/// \returns Decoded Lgkmcnt from given \p Waitcnt for given isa \p Version.
unsigned decodeLgkmcnt(const IsaInfo::IsaVersion &Version, unsigned Waitcnt);
-/// \brief Decodes Vmcnt, Expcnt and Lgkmcnt from given \p Waitcnt for given isa
+/// Decodes Vmcnt, Expcnt and Lgkmcnt from given \p Waitcnt for given isa
/// \p Version, and writes decoded values into \p Vmcnt, \p Expcnt and
/// \p Lgkmcnt respectively.
///
@@ -240,7 +301,7 @@ unsigned encodeExpcnt(const IsaInfo::IsaVersion &Version, unsigned Waitcnt,
unsigned encodeLgkmcnt(const IsaInfo::IsaVersion &Version, unsigned Waitcnt,
unsigned Lgkmcnt);
-/// \brief Encodes \p Vmcnt, \p Expcnt and \p Lgkmcnt into Waitcnt for given isa
+/// Encodes \p Vmcnt, \p Expcnt and \p Lgkmcnt into Waitcnt for given isa
/// \p Version.
///
/// \details \p Vmcnt, \p Expcnt and \p Lgkmcnt are encoded as follows:
@@ -278,41 +339,45 @@ inline bool isKernel(CallingConv::ID CC) {
}
}
+bool hasXNACK(const MCSubtargetInfo &STI);
+bool hasMIMG_R128(const MCSubtargetInfo &STI);
+bool hasPackedD16(const MCSubtargetInfo &STI);
+
bool isSI(const MCSubtargetInfo &STI);
bool isCI(const MCSubtargetInfo &STI);
bool isVI(const MCSubtargetInfo &STI);
bool isGFX9(const MCSubtargetInfo &STI);
-/// \brief Is Reg - scalar register
+/// Is Reg - scalar register
bool isSGPR(unsigned Reg, const MCRegisterInfo* TRI);
-/// \brief Is there any intersection between registers
+/// Is there any intersection between registers
bool isRegIntersect(unsigned Reg0, unsigned Reg1, const MCRegisterInfo* TRI);
/// If \p Reg is a pseudo reg, return the correct hardware register given
/// \p STI otherwise return \p Reg.
unsigned getMCReg(unsigned Reg, const MCSubtargetInfo &STI);
-/// \brief Convert hardware register \p Reg to a pseudo register
+/// Convert hardware register \p Reg to a pseudo register
LLVM_READNONE
unsigned mc2PseudoReg(unsigned Reg);
-/// \brief Can this operand also contain immediate values?
+/// Can this operand also contain immediate values?
bool isSISrcOperand(const MCInstrDesc &Desc, unsigned OpNo);
-/// \brief Is this floating-point operand?
+/// Is this floating-point operand?
bool isSISrcFPOperand(const MCInstrDesc &Desc, unsigned OpNo);
-/// \brief Does this opearnd support only inlinable literals?
+/// Does this opearnd support only inlinable literals?
bool isSISrcInlinableOperand(const MCInstrDesc &Desc, unsigned OpNo);
-/// \brief Get the size in bits of a register from the register class \p RC.
+/// Get the size in bits of a register from the register class \p RC.
unsigned getRegBitWidth(unsigned RCID);
-/// \brief Get the size in bits of a register from the register class \p RC.
+/// Get the size in bits of a register from the register class \p RC.
unsigned getRegBitWidth(const MCRegisterClass &RC);
-/// \brief Get size of register operand
+/// Get size of register operand
unsigned getRegOperandSize(const MCRegisterInfo *MRI, const MCInstrDesc &Desc,
unsigned OpNo);
@@ -349,7 +414,7 @@ inline unsigned getOperandSize(const MCInstrDesc &Desc, unsigned OpNo) {
return getOperandSize(Desc.OpInfo[OpNo]);
}
-/// \brief Is this literal inlinable
+/// Is this literal inlinable
LLVM_READNONE
bool isInlinableLiteral64(int64_t Literal, bool HasInv2Pi);
@@ -363,7 +428,6 @@ LLVM_READNONE
bool isInlinableLiteralV216(int32_t Literal, bool HasInv2Pi);
bool isArgPassedInSGPR(const Argument *Arg);
-bool isUniformMMO(const MachineMemOperand *MMO);
/// \returns The encoding that will be used for \p ByteOffset in the SMRD
/// offset field.
@@ -374,6 +438,9 @@ int64_t getSMRDEncodedOffset(const MCSubtargetInfo &ST, int64_t ByteOffset);
/// not the encoded offset.
bool isLegalSMRDImmOffset(const MCSubtargetInfo &ST, int64_t ByteOffset);
+/// \returns true if the intrinsic is divergent
+bool isIntrinsicSourceOfDivergence(unsigned IntrID);
+
} // end namespace AMDGPU
} // end namespace llvm
diff --git a/lib/Target/AMDGPU/Utils/AMDGPULaneDominator.cpp b/lib/Target/AMDGPU/Utils/AMDGPULaneDominator.cpp
new file mode 100644
index 000000000000..1924f71f11c8
--- /dev/null
+++ b/lib/Target/AMDGPU/Utils/AMDGPULaneDominator.cpp
@@ -0,0 +1,75 @@
+//===-- AMDGPULaneDominator.cpp - Determine Lane Dominators ---------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// MBB A lane-dominates MBB B if
+// 1. A dominates B in the usual sense, i.e. every path from the entry to B
+// goes through A, and
+// 2. whenever B executes, every active lane during that execution of B was
+// also active during the most recent execution of A.
+//
+// The simplest example where A dominates B but does not lane-dominate it is
+// where A is a loop:
+//
+// |
+// +--+
+// A |
+// +--+
+// |
+// B
+//
+// Unfortunately, the second condition is not fully captured by the control
+// flow graph when it is unstructured (as may happen when branch conditions are
+// uniform).
+//
+// The following replacement of the second condition is a conservative
+// approximation. It is an equivalent condition when the CFG is fully
+// structured:
+//
+// 2'. every cycle in the CFG that contains A also contains B.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPULaneDominator.h"
+
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+
+namespace llvm {
+
+namespace AMDGPU {
+
+// Given machine basic blocks A and B where A dominates B, check whether
+// A lane-dominates B.
+//
+// The check is conservative, i.e. there can be false-negatives.
+bool laneDominates(MachineBasicBlock *A, MachineBasicBlock *B) {
+ // Check whether A is reachable from itself without going through B.
+ DenseSet<MachineBasicBlock *> Reachable;
+ SmallVector<MachineBasicBlock *, 8> Stack;
+
+ Stack.push_back(A);
+ do {
+ MachineBasicBlock *MBB = Stack.back();
+ Stack.pop_back();
+
+ for (MachineBasicBlock *Succ : MBB->successors()) {
+ if (Succ == A)
+ return false;
+ if (Succ != B && Reachable.insert(Succ).second)
+ Stack.push_back(Succ);
+ }
+ } while (!Stack.empty());
+
+ return true;
+}
+
+} // namespace AMDGPU
+
+} // namespace llvm
diff --git a/lib/Target/AMDGPU/Utils/AMDGPULaneDominator.h b/lib/Target/AMDGPU/Utils/AMDGPULaneDominator.h
new file mode 100644
index 000000000000..4f33a89a364b
--- /dev/null
+++ b/lib/Target/AMDGPU/Utils/AMDGPULaneDominator.h
@@ -0,0 +1,24 @@
+//===- AMDGPULaneDominator.h ------------------------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPULANEDOMINATOR_H
+#define LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPULANEDOMINATOR_H
+
+namespace llvm {
+
+class MachineBasicBlock;
+
+namespace AMDGPU {
+
+bool laneDominates(MachineBasicBlock *MBBA, MachineBasicBlock *MBBB);
+
+} // end namespace AMDGPU
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPULANEDOMINATOR_H
diff --git a/lib/Target/AMDGPU/Utils/AMDKernelCodeTInfo.h b/lib/Target/AMDGPU/Utils/AMDKernelCodeTInfo.h
index 991408c81c92..9f0a4d29b5e4 100644
--- a/lib/Target/AMDGPU/Utils/AMDKernelCodeTInfo.h
+++ b/lib/Target/AMDGPU/Utils/AMDKernelCodeTInfo.h
@@ -73,7 +73,6 @@ FIELD2(amd_machine_version_stepping, machine_version_stepping, amd_machine_ve
FIELD(kernel_code_entry_byte_offset),
FIELD(kernel_code_prefetch_byte_size),
-FIELD(max_scratch_backing_memory_byte_size),
COMPPGM1(granulated_workitem_vgpr_count, compute_pgm_rsrc1_vgprs, VGPRS),
COMPPGM1(granulated_wavefront_sgpr_count, compute_pgm_rsrc1_sgprs, SGPRS),
diff --git a/lib/Target/AMDGPU/Utils/CMakeLists.txt b/lib/Target/AMDGPU/Utils/CMakeLists.txt
index 01b80ebe8d3d..c5ed32e46821 100644
--- a/lib/Target/AMDGPU/Utils/CMakeLists.txt
+++ b/lib/Target/AMDGPU/Utils/CMakeLists.txt
@@ -2,4 +2,5 @@ add_llvm_library(LLVMAMDGPUUtils
AMDGPUBaseInfo.cpp
AMDKernelCodeTUtils.cpp
AMDGPUAsmUtils.cpp
+ AMDGPULaneDominator.cpp
)
diff --git a/lib/Target/AMDGPU/VOP1Instructions.td b/lib/Target/AMDGPU/VOP1Instructions.td
index ff2bd2454400..4c7a92219755 100644
--- a/lib/Target/AMDGPU/VOP1Instructions.td
+++ b/lib/Target/AMDGPU/VOP1Instructions.td
@@ -40,17 +40,9 @@ class VOP1_SDWA9Ae <bits<8> op, VOPProfile P> : VOP_SDWA9Ae <P> {
}
class VOP1_Pseudo <string opName, VOPProfile P, list<dag> pattern=[], bit VOP1Only = 0> :
- InstSI <P.Outs32, P.Ins32, "", pattern>,
- VOP <opName>,
- SIMCInstr <!if(VOP1Only, opName, opName#"_e32"), SIEncodingFamily.NONE>,
- MnemonicAlias<!if(VOP1Only, opName, opName#"_e32"), opName> {
+ VOP_Pseudo <opName, !if(VOP1Only, "", "_e32"), P, P.Outs32, P.Ins32, "", pattern> {
- let isPseudo = 1;
- let isCodeGenOnly = 1;
- let UseNamedOperandTable = 1;
-
- string Mnemonic = opName;
- string AsmOperands = P.Asm32;
+ let AsmOperands = P.Asm32;
let Size = 4;
let mayLoad = 0;
@@ -63,8 +55,6 @@ class VOP1_Pseudo <string opName, VOPProfile P, list<dag> pattern=[], bit VOP1On
let Uses = [EXEC];
let AsmVariantName = AMDGPUAsmVariants.Default;
-
- VOPProfile Pfl = P;
}
class VOP1_Real <VOP1_Pseudo ps, int EncodingFamily> :
@@ -86,6 +76,7 @@ class VOP1_Real <VOP1_Pseudo ps, int EncodingFamily> :
let TSFlags = ps.TSFlags;
let UseNamedOperandTable = ps.UseNamedOperandTable;
let Uses = ps.Uses;
+ let Defs = ps.Defs;
}
class VOP1_SDWA_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[]> :
@@ -202,13 +193,14 @@ defm V_TRUNC_F32 : VOP1Inst <"v_trunc_f32", VOP_F32_F32, ftrunc>;
defm V_CEIL_F32 : VOP1Inst <"v_ceil_f32", VOP_F32_F32, fceil>;
defm V_RNDNE_F32 : VOP1Inst <"v_rndne_f32", VOP_F32_F32, frint>;
defm V_FLOOR_F32 : VOP1Inst <"v_floor_f32", VOP_F32_F32, ffloor>;
-defm V_EXP_F32 : VOP1Inst <"v_exp_f32", VOP_F32_F32, fexp2>;
let SchedRW = [WriteQuarterRate32] in {
+defm V_EXP_F32 : VOP1Inst <"v_exp_f32", VOP_F32_F32, fexp2>;
defm V_LOG_F32 : VOP1Inst <"v_log_f32", VOP_F32_F32, flog2>;
defm V_RCP_F32 : VOP1Inst <"v_rcp_f32", VOP_F32_F32, AMDGPUrcp>;
-defm V_RCP_IFLAG_F32 : VOP1Inst <"v_rcp_iflag_f32", VOP_F32_F32>;
+defm V_RCP_IFLAG_F32 : VOP1Inst <"v_rcp_iflag_f32", VOP_F32_F32, AMDGPUrcp_iflag>;
defm V_RSQ_F32 : VOP1Inst <"v_rsq_f32", VOP_F32_F32, AMDGPUrsq>;
+defm V_SQRT_F32 : VOP1Inst <"v_sqrt_f32", VOP_F32_F32, fsqrt>;
} // End SchedRW = [WriteQuarterRate32]
let SchedRW = [WriteDouble] in {
@@ -216,8 +208,6 @@ defm V_RCP_F64 : VOP1Inst <"v_rcp_f64", VOP_F64_F64, AMDGPUrcp>;
defm V_RSQ_F64 : VOP1Inst <"v_rsq_f64", VOP_F64_F64, AMDGPUrsq>;
} // End SchedRW = [WriteDouble];
-defm V_SQRT_F32 : VOP1Inst <"v_sqrt_f32", VOP_F32_F32, fsqrt>;
-
let SchedRW = [WriteDouble] in {
defm V_SQRT_F64 : VOP1Inst <"v_sqrt_f64", VOP_F64_F64, fsqrt>;
} // End SchedRW = [WriteDouble]
@@ -232,9 +222,9 @@ defm V_BFREV_B32 : VOP1Inst <"v_bfrev_b32", VOP_I32_I32>;
defm V_FFBH_U32 : VOP1Inst <"v_ffbh_u32", VOP_I32_I32>;
defm V_FFBL_B32 : VOP1Inst <"v_ffbl_b32", VOP_I32_I32>;
defm V_FFBH_I32 : VOP1Inst <"v_ffbh_i32", VOP_I32_I32>;
-defm V_FREXP_EXP_I32_F64 : VOP1Inst <"v_frexp_exp_i32_f64", VOP_I32_F64, int_amdgcn_frexp_exp>;
let SchedRW = [WriteDoubleAdd] in {
+defm V_FREXP_EXP_I32_F64 : VOP1Inst <"v_frexp_exp_i32_f64", VOP_I32_F64, int_amdgcn_frexp_exp>;
defm V_FREXP_MANT_F64 : VOP1Inst <"v_frexp_mant_f64", VOP_F64_F64, int_amdgcn_frexp_mant>;
defm V_FRACT_F64 : VOP1Inst <"v_fract_f64", VOP_F64_F64, AMDGPUfract>;
} // End SchedRW = [WriteDoubleAdd]
@@ -298,9 +288,7 @@ defm V_MOVRELS_B32 : VOP1Inst <"v_movrels_b32", VOP_I32_VI32_NO_EXT>;
defm V_MOVRELSD_B32 : VOP1Inst <"v_movrelsd_b32", VOP_NO_EXT<VOP_I32_I32>>;
} // End Uses = [M0, EXEC]
-let SchedRW = [WriteQuarterRate32] in {
defm V_MOV_FED_B32 : VOP1Inst <"v_mov_fed_b32", VOP_I32_I32>;
-}
// These instruction only exist on SI and CI
let SubtargetPredicate = isSICI in {
@@ -344,11 +332,15 @@ defm V_CVT_F16_U16 : VOP1Inst <"v_cvt_f16_u16", VOP1_F16_I16, uint_to_fp>;
defm V_CVT_F16_I16 : VOP1Inst <"v_cvt_f16_i16", VOP1_F16_I16, sint_to_fp>;
defm V_CVT_U16_F16 : VOP1Inst <"v_cvt_u16_f16", VOP_I16_F16, fp_to_uint>;
defm V_CVT_I16_F16 : VOP1Inst <"v_cvt_i16_f16", VOP_I16_F16, fp_to_sint>;
+let SchedRW = [WriteQuarterRate32] in {
defm V_RCP_F16 : VOP1Inst <"v_rcp_f16", VOP_F16_F16, AMDGPUrcp>;
defm V_SQRT_F16 : VOP1Inst <"v_sqrt_f16", VOP_F16_F16, fsqrt>;
defm V_RSQ_F16 : VOP1Inst <"v_rsq_f16", VOP_F16_F16, AMDGPUrsq>;
defm V_LOG_F16 : VOP1Inst <"v_log_f16", VOP_F16_F16, flog2>;
defm V_EXP_F16 : VOP1Inst <"v_exp_f16", VOP_F16_F16, fexp2>;
+defm V_SIN_F16 : VOP1Inst <"v_sin_f16", VOP_F16_F16, AMDGPUsin>;
+defm V_COS_F16 : VOP1Inst <"v_cos_f16", VOP_F16_F16, AMDGPUcos>;
+} // End SchedRW = [WriteQuarterRate32]
defm V_FREXP_MANT_F16 : VOP1Inst <"v_frexp_mant_f16", VOP_F16_F16, int_amdgcn_frexp_mant>;
defm V_FREXP_EXP_I16_F16 : VOP1Inst <"v_frexp_exp_i16_f16", VOP_I16_F16, int_amdgcn_frexp_exp>;
defm V_FLOOR_F16 : VOP1Inst <"v_floor_f16", VOP_F16_F16, ffloor>;
@@ -356,8 +348,6 @@ defm V_CEIL_F16 : VOP1Inst <"v_ceil_f16", VOP_F16_F16, fceil>;
defm V_TRUNC_F16 : VOP1Inst <"v_trunc_f16", VOP_F16_F16, ftrunc>;
defm V_RNDNE_F16 : VOP1Inst <"v_rndne_f16", VOP_F16_F16, frint>;
defm V_FRACT_F16 : VOP1Inst <"v_fract_f16", VOP_F16_F16, AMDGPUfract>;
-defm V_SIN_F16 : VOP1Inst <"v_sin_f16", VOP_F16_F16, AMDGPUsin>;
-defm V_COS_F16 : VOP1Inst <"v_cos_f16", VOP_F16_F16, AMDGPUcos>;
}
@@ -392,6 +382,12 @@ let SubtargetPredicate = isGFX9 in {
def V_SWAP_B32 : VOP1_Pseudo <"v_swap_b32", VOP_SWAP_I32, [], 1>;
}
+defm V_SCREEN_PARTITION_4SE_B32 : VOP1Inst <"v_screen_partition_4se_b32", VOP_I32_I32>;
+
+defm V_SAT_PK_U8_I16 : VOP1Inst<"v_sat_pk_u8_i16", VOP_I32_I32>;
+defm V_CVT_NORM_I16_F16 : VOP1Inst<"v_cvt_norm_i16_f16", VOP_I16_F16>;
+defm V_CVT_NORM_U16_F16 : VOP1Inst<"v_cvt_norm_u16_f16", VOP_I16_F16>;
+
} // End SubtargetPredicate = isGFX9
//===----------------------------------------------------------------------===//
@@ -521,7 +517,7 @@ multiclass VOP1Only_Real_vi <bits<10> op> {
}
}
-multiclass VOP1_Real_vi <bits<10> op> {
+multiclass VOP1_Real_e32e64_vi <bits<10> op> {
let AssemblerPredicates = [isVI], DecoderNamespace = "VI" in {
def _e32_vi :
VOP1_Real<!cast<VOP1_Pseudo>(NAME#"_e32"), SIEncodingFamily.VI>,
@@ -530,6 +526,10 @@ multiclass VOP1_Real_vi <bits<10> op> {
VOP3_Real<!cast<VOP3_Pseudo>(NAME#"_e64"), SIEncodingFamily.VI>,
VOP3e_vi <!add(0x140, op), !cast<VOP3_Pseudo>(NAME#"_e64").Pfl>;
}
+}
+
+multiclass VOP1_Real_vi <bits<10> op> {
+ defm NAME : VOP1_Real_e32e64_vi <op>;
def _sdwa_vi :
VOP_SDWA_Real <!cast<VOP1_SDWA_Pseudo>(NAME#"_sdwa")>,
@@ -593,9 +593,9 @@ defm V_FRACT_F64 : VOP1_Real_vi <0x32>;
defm V_FREXP_EXP_I32_F32 : VOP1_Real_vi <0x33>;
defm V_FREXP_MANT_F32 : VOP1_Real_vi <0x34>;
defm V_CLREXCP : VOP1_Real_vi <0x35>;
-defm V_MOVRELD_B32 : VOP1_Real_vi <0x36>;
-defm V_MOVRELS_B32 : VOP1_Real_vi <0x37>;
-defm V_MOVRELSD_B32 : VOP1_Real_vi <0x38>;
+defm V_MOVRELD_B32 : VOP1_Real_e32e64_vi <0x36>;
+defm V_MOVRELS_B32 : VOP1_Real_e32e64_vi <0x37>;
+defm V_MOVRELSD_B32 : VOP1_Real_e32e64_vi <0x38>;
defm V_TRUNC_F64 : VOP1_Real_vi <0x17>;
defm V_CEIL_F64 : VOP1_Real_vi <0x18>;
defm V_FLOOR_F64 : VOP1_Real_vi <0x1A>;
@@ -622,6 +622,10 @@ defm V_SIN_F16 : VOP1_Real_vi <0x49>;
defm V_COS_F16 : VOP1_Real_vi <0x4a>;
defm V_SWAP_B32 : VOP1Only_Real_vi <0x51>;
+defm V_SAT_PK_U8_I16 : VOP1_Real_vi<0x4f>;
+defm V_CVT_NORM_I16_F16 : VOP1_Real_vi<0x4d>;
+defm V_CVT_NORM_U16_F16 : VOP1_Real_vi<0x4e>;
+
// Copy of v_mov_b32 with $vdst as a use operand for use with VGPR
// indexing mode. vdst can't be treated as a def for codegen purposes,
// and an implicit use and def of the super register should be added.
@@ -694,3 +698,23 @@ def : GCNPat <
>;
} // End OtherPredicates = [isVI]
+
+//===----------------------------------------------------------------------===//
+// GFX9
+//===----------------------------------------------------------------------===//
+
+multiclass VOP1_Real_gfx9 <bits<10> op> {
+ let AssemblerPredicates = [isGFX9], DecoderNamespace = "GFX9" in {
+ defm NAME : VOP1_Real_e32e64_vi <op>;
+ }
+
+ def _sdwa_gfx9 :
+ VOP_SDWA9_Real <!cast<VOP1_SDWA_Pseudo>(NAME#"_sdwa")>,
+ VOP1_SDWA9Ae <op{7-0}, !cast<VOP1_SDWA_Pseudo>(NAME#"_sdwa").Pfl>;
+
+ // For now left dpp only for asm/dasm
+ // TODO: add corresponding pseudo
+ def _dpp : VOP1_DPP<op{7-0}, !cast<VOP1_Pseudo>(NAME#"_e32")>;
+}
+
+defm V_SCREEN_PARTITION_4SE_B32 : VOP1_Real_gfx9 <0x37>;
diff --git a/lib/Target/AMDGPU/VOP2Instructions.td b/lib/Target/AMDGPU/VOP2Instructions.td
index ef90b68db1a8..5ec1a15c5cd2 100644
--- a/lib/Target/AMDGPU/VOP2Instructions.td
+++ b/lib/Target/AMDGPU/VOP2Instructions.td
@@ -61,17 +61,9 @@ class VOP2_SDWA9Ae <bits<6> op, VOPProfile P> : VOP_SDWA9Ae <P> {
}
class VOP2_Pseudo <string opName, VOPProfile P, list<dag> pattern=[], string suffix = "_e32"> :
- InstSI <P.Outs32, P.Ins32, "", pattern>,
- VOP <opName>,
- SIMCInstr <opName#suffix, SIEncodingFamily.NONE>,
- MnemonicAlias<opName#suffix, opName> {
+ VOP_Pseudo <opName, suffix, P, P.Outs32, P.Ins32, "", pattern> {
- let isPseudo = 1;
- let isCodeGenOnly = 1;
- let UseNamedOperandTable = 1;
-
- string Mnemonic = opName;
- string AsmOperands = P.Asm32;
+ let AsmOperands = P.Asm32;
let Size = 4;
let mayLoad = 0;
@@ -84,8 +76,6 @@ class VOP2_Pseudo <string opName, VOPProfile P, list<dag> pattern=[], string suf
let Uses = [EXEC];
let AsmVariantName = AMDGPUAsmVariants.Default;
-
- VOPProfile Pfl = P;
}
class VOP2_Real <VOP2_Pseudo ps, int EncodingFamily> :
@@ -107,6 +97,7 @@ class VOP2_Real <VOP2_Pseudo ps, int EncodingFamily> :
let TSFlags = ps.TSFlags;
let UseNamedOperandTable = ps.UseNamedOperandTable;
let Uses = ps.Uses;
+ let Defs = ps.Defs;
}
class VOP2_SDWA_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[]> :
@@ -177,6 +168,10 @@ multiclass VOP2eInst <string opName,
let Uses = !if(useSGPRInput, [VCC, EXEC], [EXEC]) in {
def _e32 : VOP2_Pseudo <opName, P>,
Commutable_REV<revOp#"_e32", !eq(revOp, opName)>;
+
+ def _sdwa : VOP2_SDWA_Pseudo <opName, P> {
+ let AsmMatchConverter = "cvtSdwaVOP2b";
+ }
}
def _e64 : VOP3_Pseudo <opName, P, getVOP2Pat64<node, P>.ret>,
@@ -303,12 +298,30 @@ def VOP2e_I32_I32_I32_I1 : VOPProfile<[i32, i32, i32, i1]> {
let Src0RC32 = VCSrc_b32; // See comment in def VOP2b_I32_I1_I32_I32_I1 above.
let Asm32 = "$vdst, $src0, $src1, vcc";
let Asm64 = "$vdst, $src0, $src1, $src2";
+ let AsmSDWA = "$vdst, $src0_modifiers, $src1_modifiers, vcc $clamp $dst_sel $dst_unused $src0_sel $src1_sel";
+ let AsmSDWA9 = "$vdst, $src0_modifiers, $src1_modifiers, vcc $clamp $dst_sel $dst_unused $src0_sel $src1_sel";
+ let AsmDPP = "$vdst, $src0, $src1, vcc $dpp_ctrl$row_mask$bank_mask$bound_ctrl";
+
let Outs32 = (outs DstRC:$vdst);
let Outs64 = (outs DstRC:$vdst);
// Suppress src2 implied by type since the 32-bit encoding uses an
// implicit VCC use.
let Ins32 = (ins Src0RC32:$src0, Src1RC32:$src1);
+
+ let InsSDWA = (ins Src0ModSDWA:$src0_modifiers, Src0SDWA:$src0,
+ Src1ModSDWA:$src1_modifiers, Src1SDWA:$src1,
+ clampmod:$clamp,
+ dst_sel:$dst_sel, dst_unused:$dst_unused,
+ src0_sel:$src0_sel, src1_sel:$src1_sel);
+
+ let InsDPP = (ins DstRCDPP:$old,
+ Src0DPP:$src0,
+ Src1DPP:$src1,
+ dpp_ctrl:$dpp_ctrl, row_mask:$row_mask,
+ bank_mask:$bank_mask, bound_ctrl:$bound_ctrl);
+ let HasExt = 1;
+ let HasSDWA9 = 1;
}
def VOP_READLANE : VOPProfile<[i32, i32, i32]> {
@@ -322,15 +335,17 @@ def VOP_READLANE : VOPProfile<[i32, i32, i32]> {
let HasSDWA9 = 0;
}
-def VOP_WRITELANE : VOPProfile<[i32, i32, i32]> {
+def VOP_WRITELANE : VOPProfile<[i32, i32, i32, i32]> {
let Outs32 = (outs VGPR_32:$vdst);
let Outs64 = Outs32;
- let Ins32 = (ins SCSrc_b32:$src0, SCSrc_b32:$src1);
+ let Ins32 = (ins SCSrc_b32:$src0, SCSrc_b32:$src1, VGPR_32:$vdst_in);
let Ins64 = Ins32;
let Asm32 = " $vdst, $src0, $src1";
let Asm64 = Asm32;
let HasExt = 0;
let HasSDWA9 = 0;
+ let HasSrc2 = 0;
+ let HasSrc2Mods = 0;
}
//===----------------------------------------------------------------------===//
@@ -398,7 +413,10 @@ let isConvergent = 1, Uses = []<Register> in {
def V_READLANE_B32 : VOP2_Pseudo<"v_readlane_b32", VOP_READLANE,
[(set i32:$vdst, (int_amdgcn_readlane i32:$src0, i32:$src1))], "">;
-def V_WRITELANE_B32 : VOP2_Pseudo<"v_writelane_b32", VOP_WRITELANE, [], "">;
+let Constraints = "$vdst = $vdst_in", DisableEncoding="$vdst_in" in {
+def V_WRITELANE_B32 : VOP2_Pseudo<"v_writelane_b32", VOP_WRITELANE,
+ [(set i32:$vdst, (int_amdgcn_writelane i32:$src0, i32:$src1, i32:$vdst_in))], "">;
+} // End $vdst = $vdst_in, DisableEncoding $vdst_in
} // End isConvergent = 1
defm V_BFM_B32 : VOP2Inst <"v_bfm_b32", VOP_NO_EXT<VOP_I32_I32_I32>>;
@@ -407,11 +425,11 @@ defm V_MBCNT_LO_U32_B32 : VOP2Inst <"v_mbcnt_lo_u32_b32", VOP_NO_EXT<VOP_I32_I32
defm V_MBCNT_HI_U32_B32 : VOP2Inst <"v_mbcnt_hi_u32_b32", VOP_NO_EXT<VOP_I32_I32_I32>, int_amdgcn_mbcnt_hi>;
defm V_LDEXP_F32 : VOP2Inst <"v_ldexp_f32", VOP_NO_EXT<VOP_F32_F32_I32>, AMDGPUldexp>;
defm V_CVT_PKACCUM_U8_F32 : VOP2Inst <"v_cvt_pkaccum_u8_f32", VOP_NO_EXT<VOP_I32_F32_I32>>; // TODO: set "Uses = dst"
-defm V_CVT_PKNORM_I16_F32 : VOP2Inst <"v_cvt_pknorm_i16_f32", VOP_NO_EXT<VOP_I32_F32_F32>>;
-defm V_CVT_PKNORM_U16_F32 : VOP2Inst <"v_cvt_pknorm_u16_f32", VOP_NO_EXT<VOP_I32_F32_F32>>;
+defm V_CVT_PKNORM_I16_F32 : VOP2Inst <"v_cvt_pknorm_i16_f32", VOP_NO_EXT<VOP_I32_F32_F32>, AMDGPUpknorm_i16_f32>;
+defm V_CVT_PKNORM_U16_F32 : VOP2Inst <"v_cvt_pknorm_u16_f32", VOP_NO_EXT<VOP_I32_F32_F32>, AMDGPUpknorm_u16_f32>;
defm V_CVT_PKRTZ_F16_F32 : VOP2Inst <"v_cvt_pkrtz_f16_f32", VOP_NO_EXT<VOP_I32_F32_F32>, AMDGPUpkrtz_f16_f32>;
-defm V_CVT_PK_U16_U32 : VOP2Inst <"v_cvt_pk_u16_u32", VOP_NO_EXT<VOP_I32_I32_I32>>;
-defm V_CVT_PK_I16_I32 : VOP2Inst <"v_cvt_pk_i16_i32", VOP_NO_EXT<VOP_I32_I32_I32>>;
+defm V_CVT_PK_U16_U32 : VOP2Inst <"v_cvt_pk_u16_u32", VOP_NO_EXT<VOP_I32_I32_I32>, AMDGPUpk_u16_u32>;
+defm V_CVT_PK_I16_I32 : VOP2Inst <"v_cvt_pk_i16_i32", VOP_NO_EXT<VOP_I32_I32_I32>, AMDGPUpk_i16_i32>;
} // End SubtargetPredicate = isGCN
@@ -473,6 +491,19 @@ defm V_MAC_F16 : VOP2Inst <"v_mac_f16", VOP_MAC_F16>;
} // End SubtargetPredicate = Has16BitInsts
+let SubtargetPredicate = HasDLInsts in {
+
+defm V_XNOR_B32 : VOP2Inst <"v_xnor_b32", VOP_I32_I32_I32>;
+
+let Constraints = "$vdst = $src2",
+ DisableEncoding="$src2",
+ isConvertibleToThreeAddress = 1,
+ isCommutable = 1 in {
+defm V_FMAC_F32 : VOP2Inst <"v_fmac_f32", VOP_MAC_F32>;
+}
+
+} // End SubtargetPredicate = HasDLInsts
+
// Note: 16-bit instructions produce a 0 result in the high 16-bits.
multiclass Arithmetic_i16_Pats <SDPatternOperator op, Instruction inst> {
@@ -639,7 +670,7 @@ defm V_SUBBREV_U32 : VOP2be_Real_e32e64_si <0x2a>;
defm V_READLANE_B32 : VOP2_Real_si <0x01>;
-let InOperandList = (ins SSrc_b32:$src0, SCSrc_b32:$src1) in {
+let InOperandList = (ins SSrc_b32:$src0, SCSrc_b32:$src1, VSrc_b32:$vdst_in) in {
defm V_WRITELANE_B32 : VOP2_Real_si <0x02>;
}
@@ -824,7 +855,7 @@ multiclass VOP2_Real_e32e64_vi <bits<6> op> :
def _dpp : VOP2_DPP<op, !cast<VOP2_Pseudo>(NAME#"_e32")>;
}
-defm V_CNDMASK_B32 : Base_VOP2_Real_e32e64_vi <0x0>;
+defm V_CNDMASK_B32 : VOP2_Real_e32e64_vi <0x0>;
defm V_ADD_F32 : VOP2_Real_e32e64_vi <0x1>;
defm V_SUB_F32 : VOP2_Real_e32e64_vi <0x2>;
defm V_SUBREV_F32 : VOP2_Real_e32e64_vi <0x3>;
@@ -926,3 +957,10 @@ def : SI2_VI3Alias <"v_cvt_pknorm_u16_f32", V_CVT_PKNORM_U16_F32_e64_vi>;
def : SI2_VI3Alias <"v_cvt_pkrtz_f16_f32", V_CVT_PKRTZ_F16_F32_e64_vi>;
} // End SubtargetPredicate = isVI
+
+let SubtargetPredicate = HasDLInsts in {
+
+defm V_FMAC_F32 : VOP2_Real_e32e64_vi <0x3b>;
+defm V_XNOR_B32 : VOP2_Real_e32e64_vi <0x3d>;
+
+} // End SubtargetPredicate = HasDLInsts
diff --git a/lib/Target/AMDGPU/VOP3Instructions.td b/lib/Target/AMDGPU/VOP3Instructions.td
index aedbfa015bf6..17ae08dc6267 100644
--- a/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/lib/Target/AMDGPU/VOP3Instructions.td
@@ -153,19 +153,24 @@ class getVOP3VCC<VOPProfile P, SDPatternOperator node> {
(i1 VCC)))];
}
-class VOP3Features<bit Clamp, bit OpSel> {
+class VOP3Features<bit Clamp, bit OpSel, bit Packed> {
bit HasClamp = Clamp;
bit HasOpSel = OpSel;
+ bit IsPacked = Packed;
}
-def VOP3_REGULAR : VOP3Features<0, 0>;
-def VOP3_CLAMP : VOP3Features<1, 0>;
-def VOP3_OPSEL : VOP3Features<1, 1>;
+def VOP3_REGULAR : VOP3Features<0, 0, 0>;
+def VOP3_CLAMP : VOP3Features<1, 0, 0>;
+def VOP3_OPSEL : VOP3Features<1, 1, 0>;
+def VOP3_PACKED : VOP3Features<1, 1, 1>;
class VOP3_Profile<VOPProfile P, VOP3Features Features = VOP3_REGULAR> : VOPProfile<P.ArgVT> {
let HasClamp = !if(Features.HasClamp, 1, P.HasClamp);
let HasOpSel = !if(Features.HasOpSel, 1, P.HasOpSel);
+ let IsPacked = !if(Features.IsPacked, 1, P.IsPacked);
+
+ let HasModifiers = !if(Features.IsPacked, 1, P.HasModifiers);
// FIXME: Hack to stop printing _e64
let Outs64 = (outs DstRC.RegClass:$vdst);
@@ -283,10 +288,10 @@ def V_MAD_F32 : VOP3Inst <"v_mad_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, fmad>;
def V_MAD_I32_I24 : VOP3Inst <"v_mad_i32_i24", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>>;
def V_MAD_U32_U24 : VOP3Inst <"v_mad_u32_u24", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>>;
def V_FMA_F32 : VOP3Inst <"v_fma_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, fma>;
-def V_FMA_F64 : VOP3Inst <"v_fma_f64", VOP3_Profile<VOP_F64_F64_F64_F64>, fma>;
def V_LERP_U8 : VOP3Inst <"v_lerp_u8", VOP3_Profile<VOP_I32_I32_I32_I32>, int_amdgcn_lerp>;
let SchedRW = [WriteDoubleAdd] in {
+def V_FMA_F64 : VOP3Inst <"v_fma_f64", VOP3_Profile<VOP_F64_F64_F64_F64>, fma>;
def V_ADD_F64 : VOP3Inst <"v_add_f64", VOP3_Profile<VOP_F64_F64_F64>, fadd, 1>;
def V_MUL_F64 : VOP3Inst <"v_mul_f64", VOP3_Profile<VOP_F64_F64_F64>, fmul, 1>;
def V_MIN_F64 : VOP3Inst <"v_min_f64", VOP3_Profile<VOP_F64_F64_F64>, fminnum, 1>;
@@ -355,14 +360,12 @@ def V_LDEXP_F64 : VOP3Inst <"v_ldexp_f64", VOP3_Profile<VOP_F64_F64_I32>, AMDGPU
def V_DIV_SCALE_F32 : VOP3_Pseudo <"v_div_scale_f32", VOP3b_F32_I1_F32_F32_F32, [], 1> {
let SchedRW = [WriteFloatFMA, WriteSALU];
- let hasExtraSrcRegAllocReq = 1;
let AsmMatchConverter = "";
}
// Double precision division pre-scale.
def V_DIV_SCALE_F64 : VOP3_Pseudo <"v_div_scale_f64", VOP3b_F64_I1_F64_F64_F64, [], 1> {
let SchedRW = [WriteDouble, WriteSALU];
- let hasExtraSrcRegAllocReq = 1;
let AsmMatchConverter = "";
}
@@ -376,6 +379,7 @@ def V_TRIG_PREOP_F64 : VOP3Inst <"v_trig_preop_f64", VOP3_Profile<VOP_F64_F64_I3
let SchedRW = [WriteDouble];
}
+let SchedRW = [Write64Bit] in {
// These instructions only exist on SI and CI
let SubtargetPredicate = isSICI in {
def V_LSHL_B64 : VOP3Inst <"v_lshl_b64", VOP3_Profile<VOP_I64_I64_I32>>;
@@ -389,17 +393,17 @@ def V_LSHLREV_B64 : VOP3Inst <"v_lshlrev_b64", VOP3_Profile<VOP_I64_I32_I64>>;
def V_LSHRREV_B64 : VOP3Inst <"v_lshrrev_b64", VOP3_Profile<VOP_I64_I32_I64>>;
def V_ASHRREV_I64 : VOP3Inst <"v_ashrrev_i64", VOP3_Profile<VOP_I64_I32_I64>>;
} // End SubtargetPredicate = isVI
-
+} // End SchedRW = [Write64Bit]
let SubtargetPredicate = isCIVI in {
-let Constraints = "@earlyclobber $vdst" in {
+let Constraints = "@earlyclobber $vdst", SchedRW = [WriteQuarterRate32] in {
def V_QSAD_PK_U16_U8 : VOP3Inst <"v_qsad_pk_u16_u8", VOP3_Profile<VOP_I64_I64_I32_I64, VOP3_CLAMP>>;
def V_MQSAD_U32_U8 : VOP3Inst <"v_mqsad_u32_u8", VOP3_Profile<VOP_V4I32_I64_I32_V4I32, VOP3_CLAMP>>;
-} // End Constraints = "@earlyclobber $vdst"
+} // End Constraints = "@earlyclobber $vdst", SchedRW = [WriteQuarterRate32]
let isCommutable = 1 in {
-let SchedRW = [WriteDouble, WriteSALU] in {
+let SchedRW = [WriteQuarterRate32, WriteSALU] in {
def V_MAD_U64_U32 : VOP3Inst <"v_mad_u64_u32", VOP3b_I64_I1_I32_I32_I64>;
def V_MAD_I64_I32 : VOP3Inst <"v_mad_i64_i32", VOP3b_I64_I1_I32_I32_I64>;
} // End SchedRW = [WriteDouble, WriteSALU]
@@ -408,16 +412,16 @@ def V_MAD_I64_I32 : VOP3Inst <"v_mad_i64_i32", VOP3b_I64_I1_I32_I32_I64>;
} // End SubtargetPredicate = isCIVI
-let SubtargetPredicate = Has16BitInsts in {
-
-let renamedInGFX9 = 1 in {
-def V_DIV_FIXUP_F16 : VOP3Inst <"v_div_fixup_f16", VOP3_Profile<VOP_F16_F16_F16_F16>, AMDGPUdiv_fixup>;
+def V_DIV_FIXUP_F16 : VOP3Inst <"v_div_fixup_f16", VOP3_Profile<VOP_F16_F16_F16_F16>, AMDGPUdiv_fixup> {
+ let Predicates = [Has16BitInsts, isVIOnly];
}
-let SubtargetPredicate = isGFX9 in {
-def V_DIV_FIXUP_F16_gfx9 : VOP3Inst <"v_div_fixup_f16_gfx9", VOP3_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL>>;
+def V_DIV_FIXUP_F16_gfx9 : VOP3Inst <"v_div_fixup_f16_gfx9",
+ VOP3_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL>, AMDGPUdiv_fixup> {
+ let renamedInGFX9 = 1;
+ let Predicates = [Has16BitInsts, isGFX9];
}
-let isCommutable = 1 in {
+let SubtargetPredicate = Has16BitInsts, isCommutable = 1 in {
let renamedInGFX9 = 1 in {
def V_MAD_F16 : VOP3Inst <"v_mad_f16", VOP3_Profile<VOP_F16_F16_F16_F16>, fmad>;
@@ -438,15 +442,14 @@ def V_INTERP_P2_F16_gfx9 : VOP3Interp <"v_interp_p2_f16_gfx9", VOP3_INTERP16<[f1
def V_INTERP_P1LL_F16 : VOP3Interp <"v_interp_p1ll_f16", VOP3_INTERP16<[f32, f32, i32, untyped]>>;
def V_INTERP_P1LV_F16 : VOP3Interp <"v_interp_p1lv_f16", VOP3_INTERP16<[f32, f32, i32, f16]>>;
-} // End isCommutable = 1
-} // End SubtargetPredicate = Has16BitInsts
+} // End SubtargetPredicate = Has16BitInsts, isCommutable = 1
let SubtargetPredicate = isVI in {
def V_INTERP_P1_F32_e64 : VOP3Interp <"v_interp_p1_f32", VOP3_INTERP>;
def V_INTERP_P2_F32_e64 : VOP3Interp <"v_interp_p2_f32", VOP3_INTERP>;
def V_INTERP_MOV_F32_e64 : VOP3Interp <"v_interp_mov_f32", VOP3_INTERP_MOV>;
-def V_PERM_B32 : VOP3Inst <"v_perm_b32", VOP3_Profile<VOP_I32_I32_I32_I32>>;
+def V_PERM_B32 : VOP3Inst <"v_perm_b32", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDGPUperm>;
} // End SubtargetPredicate = isVI
let Predicates = [Has16BitInsts] in {
@@ -697,7 +700,7 @@ multiclass VOP3Interp_F16_Real_vi<bits<10> op> {
let AssemblerPredicates = [isGFX9], DecoderNamespace = "GFX9" in {
multiclass VOP3_F16_Real_gfx9<bits<10> op, string OpName, string AsmName> {
- def _vi : VOP3_Real<!cast<VOP3_Pseudo>(OpName), SIEncodingFamily.GFX9>,
+ def _gfx9 : VOP3_Real<!cast<VOP3_Pseudo>(OpName), SIEncodingFamily.GFX9>,
VOP3e_vi <op, !cast<VOP3_Pseudo>(OpName).Pfl> {
VOP3_Pseudo ps = !cast<VOP3_Pseudo>(OpName);
let AsmString = AsmName # ps.AsmOperands;
@@ -705,7 +708,7 @@ multiclass VOP3_F16_Real_gfx9<bits<10> op, string OpName, string AsmName> {
}
multiclass VOP3OpSel_F16_Real_gfx9<bits<10> op, string AsmName> {
- def _vi : VOP3_Real<!cast<VOP3_Pseudo>(NAME), SIEncodingFamily.GFX9>,
+ def _gfx9 : VOP3_Real<!cast<VOP3_Pseudo>(NAME), SIEncodingFamily.GFX9>,
VOP3OpSel_gfx9 <op, !cast<VOP3_Pseudo>(NAME).Pfl> {
VOP3_Pseudo ps = !cast<VOP3_Pseudo>(NAME);
let AsmString = AsmName # ps.AsmOperands;
@@ -713,7 +716,7 @@ multiclass VOP3OpSel_F16_Real_gfx9<bits<10> op, string AsmName> {
}
multiclass VOP3Interp_F16_Real_gfx9<bits<10> op, string OpName, string AsmName> {
- def _vi : VOP3_Real<!cast<VOP3_Pseudo>(OpName), SIEncodingFamily.GFX9>,
+ def _gfx9 : VOP3_Real<!cast<VOP3_Pseudo>(OpName), SIEncodingFamily.GFX9>,
VOP3Interp_vi <op, !cast<VOP3_Pseudo>(OpName).Pfl> {
VOP3_Pseudo ps = !cast<VOP3_Pseudo>(OpName);
let AsmString = AsmName # ps.AsmOperands;
@@ -721,9 +724,9 @@ multiclass VOP3Interp_F16_Real_gfx9<bits<10> op, string OpName, string AsmName>
}
multiclass VOP3_Real_gfx9<bits<10> op, string AsmName> {
- def _vi : VOP3_Real<!cast<VOP3_Pseudo>(NAME), SIEncodingFamily.GFX9>,
- VOP3e_vi <op, !cast<VOP3_Pseudo>(NAME).Pfl> {
- VOP3_Pseudo ps = !cast<VOP3_Pseudo>(NAME);
+ def _gfx9 : VOP3_Real<!cast<VOP_Pseudo>(NAME), SIEncodingFamily.GFX9>,
+ VOP3e_vi <op, !cast<VOP_Pseudo>(NAME).Pfl> {
+ VOP_Pseudo ps = !cast<VOP_Pseudo>(NAME);
let AsmString = AsmName # ps.AsmOperands;
}
}
diff --git a/lib/Target/AMDGPU/VOP3PInstructions.td b/lib/Target/AMDGPU/VOP3PInstructions.td
index eeee8b36c175..5c78ada3211e 100644
--- a/lib/Target/AMDGPU/VOP3PInstructions.td
+++ b/lib/Target/AMDGPU/VOP3PInstructions.td
@@ -68,6 +68,67 @@ def V_PK_LSHLREV_B16 : VOP3PInst<"v_pk_lshlrev_b16", VOP3_Profile<VOP_V2I16_V2I1
def V_PK_ASHRREV_I16 : VOP3PInst<"v_pk_ashrrev_i16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, ashr_rev>;
def V_PK_LSHRREV_B16 : VOP3PInst<"v_pk_lshrrev_b16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, lshr_rev>;
+multiclass MadFmaMixPats<SDPatternOperator fma_like,
+ Instruction mix_inst,
+ Instruction mixlo_inst,
+ Instruction mixhi_inst> {
+ def : GCNPat <
+ (f16 (fpround (fma_like (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_modifiers)),
+ (f32 (VOP3PMadMixMods f16:$src1, i32:$src1_modifiers)),
+ (f32 (VOP3PMadMixMods f16:$src2, i32:$src2_modifiers))))),
+ (mixlo_inst $src0_modifiers, $src0,
+ $src1_modifiers, $src1,
+ $src2_modifiers, $src2,
+ DSTCLAMP.NONE,
+ (i32 (IMPLICIT_DEF)))
+ >;
+
+ // FIXME: Special case handling for maxhi (especially for clamp)
+ // because dealing with the write to high half of the register is
+ // difficult.
+ def : GCNPat <
+ (build_vector f16:$elt0, (fpround (fma_like (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_modifiers)),
+ (f32 (VOP3PMadMixMods f16:$src1, i32:$src1_modifiers)),
+ (f32 (VOP3PMadMixMods f16:$src2, i32:$src2_modifiers))))),
+ (v2f16 (mixhi_inst $src0_modifiers, $src0,
+ $src1_modifiers, $src1,
+ $src2_modifiers, $src2,
+ DSTCLAMP.NONE,
+ $elt0))
+ >;
+
+ def : GCNPat <
+ (build_vector
+ f16:$elt0,
+ (AMDGPUclamp (fpround (fma_like (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_modifiers)),
+ (f32 (VOP3PMadMixMods f16:$src1, i32:$src1_modifiers)),
+ (f32 (VOP3PMadMixMods f16:$src2, i32:$src2_modifiers)))))),
+ (v2f16 (mixhi_inst $src0_modifiers, $src0,
+ $src1_modifiers, $src1,
+ $src2_modifiers, $src2,
+ DSTCLAMP.ENABLE,
+ $elt0))
+ >;
+
+ def : GCNPat <
+ (AMDGPUclamp (build_vector
+ (fpround (fma_like (f32 (VOP3PMadMixMods f16:$lo_src0, i32:$lo_src0_modifiers)),
+ (f32 (VOP3PMadMixMods f16:$lo_src1, i32:$lo_src1_modifiers)),
+ (f32 (VOP3PMadMixMods f16:$lo_src2, i32:$lo_src2_modifiers)))),
+ (fpround (fma_like (f32 (VOP3PMadMixMods f16:$hi_src0, i32:$hi_src0_modifiers)),
+ (f32 (VOP3PMadMixMods f16:$hi_src1, i32:$hi_src1_modifiers)),
+ (f32 (VOP3PMadMixMods f16:$hi_src2, i32:$hi_src2_modifiers)))))),
+ (v2f16 (mixhi_inst $hi_src0_modifiers, $hi_src0,
+ $hi_src1_modifiers, $hi_src1,
+ $hi_src2_modifiers, $hi_src2,
+ DSTCLAMP.ENABLE,
+ (mixlo_inst $lo_src0_modifiers, $lo_src0,
+ $lo_src1_modifiers, $lo_src1,
+ $lo_src2_modifiers, $lo_src2,
+ DSTCLAMP.ENABLE,
+ (i32 (IMPLICIT_DEF)))))
+ >;
+}
let SubtargetPredicate = HasMadMixInsts in {
// These are VOP3a-like opcodes which accept no omod.
@@ -84,68 +145,41 @@ def V_MAD_MIXHI_F16 : VOP3_VOP3PInst<"v_mad_mixhi_f16", VOP3_Profile<VOP_F16_F16
}
}
-def : GCNPat <
- (f16 (fpround (fmad (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_modifiers)),
- (f32 (VOP3PMadMixMods f16:$src1, i32:$src1_modifiers)),
- (f32 (VOP3PMadMixMods f16:$src2, i32:$src2_modifiers))))),
- (V_MAD_MIXLO_F16 $src0_modifiers, $src0,
- $src1_modifiers, $src1,
- $src2_modifiers, $src2,
- DSTCLAMP.NONE,
- (i32 (IMPLICIT_DEF)))
->;
+defm : MadFmaMixPats<fmad, V_MAD_MIX_F32, V_MAD_MIXLO_F16, V_MAD_MIXHI_F16>;
+} // End SubtargetPredicate = HasMadMixInsts
-// FIXME: Special case handling for maxhi (especially for clamp)
-// because dealing with the write to high half of the register is
-// difficult.
-def : GCNPat <
- (build_vector f16:$elt0, (fpround (fmad (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_modifiers)),
- (f32 (VOP3PMadMixMods f16:$src1, i32:$src1_modifiers)),
- (f32 (VOP3PMadMixMods f16:$src2, i32:$src2_modifiers))))),
- (v2f16 (V_MAD_MIXHI_F16 $src0_modifiers, $src0,
- $src1_modifiers, $src1,
- $src2_modifiers, $src2,
- DSTCLAMP.NONE,
- $elt0))
->;
-def : GCNPat <
- (build_vector
- f16:$elt0,
- (AMDGPUclamp (fpround (fmad (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_modifiers)),
- (f32 (VOP3PMadMixMods f16:$src1, i32:$src1_modifiers)),
- (f32 (VOP3PMadMixMods f16:$src2, i32:$src2_modifiers)))))),
- (v2f16 (V_MAD_MIXHI_F16 $src0_modifiers, $src0,
- $src1_modifiers, $src1,
- $src2_modifiers, $src2,
- DSTCLAMP.ENABLE,
- $elt0))
->;
+// Essentially the same as the mad_mix versions
+let SubtargetPredicate = HasFmaMixInsts in {
+let isCommutable = 1 in {
+def V_FMA_MIX_F32 : VOP3_VOP3PInst<"v_fma_mix_f32", VOP3_Profile<VOP_F32_F16_F16_F16, VOP3_OPSEL>>;
-def : GCNPat <
- (AMDGPUclamp (build_vector
- (fpround (fmad (f32 (VOP3PMadMixMods f16:$lo_src0, i32:$lo_src0_modifiers)),
- (f32 (VOP3PMadMixMods f16:$lo_src1, i32:$lo_src1_modifiers)),
- (f32 (VOP3PMadMixMods f16:$lo_src2, i32:$lo_src2_modifiers)))),
- (fpround (fmad (f32 (VOP3PMadMixMods f16:$hi_src0, i32:$hi_src0_modifiers)),
- (f32 (VOP3PMadMixMods f16:$hi_src1, i32:$hi_src1_modifiers)),
- (f32 (VOP3PMadMixMods f16:$hi_src2, i32:$hi_src2_modifiers)))))),
- (v2f16 (V_MAD_MIXHI_F16 $hi_src0_modifiers, $hi_src0,
- $hi_src1_modifiers, $hi_src1,
- $hi_src2_modifiers, $hi_src2,
- DSTCLAMP.ENABLE,
- (V_MAD_MIXLO_F16 $lo_src0_modifiers, $lo_src0,
- $lo_src1_modifiers, $lo_src1,
- $lo_src2_modifiers, $lo_src2,
- DSTCLAMP.ENABLE,
- (i32 (IMPLICIT_DEF)))))
->;
+// Clamp modifier is applied after conversion to f16.
+def V_FMA_MIXLO_F16 : VOP3_VOP3PInst<"v_fma_mixlo_f16", VOP3_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL>, 1>;
+
+let ClampLo = 0, ClampHi = 1 in {
+def V_FMA_MIXHI_F16 : VOP3_VOP3PInst<"v_fma_mixhi_f16", VOP3_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL>, 1>;
+}
+}
+
+defm : MadFmaMixPats<fma, V_FMA_MIX_F32, V_FMA_MIXLO_F16, V_FMA_MIXHI_F16>;
+}
-} // End SubtargetPredicate = [HasMadMixInsts]
+let SubtargetPredicate = HasDLInsts in {
+
+def V_DOT2_F32_F16 : VOP3PInst<"v_dot2_f32_f16", VOP3_Profile<VOP_F32_V2F16_V2F16_F32>, AMDGPUfdot2>;
+def V_DOT2_I32_I16 : VOP3PInst<"v_dot2_i32_i16", VOP3_Profile<VOP_I32_V2I16_V2I16_I32>, int_amdgcn_sdot2>;
+def V_DOT2_U32_U16 : VOP3PInst<"v_dot2_u32_u16", VOP3_Profile<VOP_I32_V2I16_V2I16_I32>, int_amdgcn_udot2>;
+def V_DOT4_I32_I8 : VOP3PInst<"v_dot4_i32_i8", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>, int_amdgcn_sdot4>;
+def V_DOT4_U32_U8 : VOP3PInst<"v_dot4_u32_u8", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>, int_amdgcn_udot4>;
+def V_DOT8_I32_I4 : VOP3PInst<"v_dot8_i32_i4", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>, int_amdgcn_sdot8>;
+def V_DOT8_U32_U4 : VOP3PInst<"v_dot8_u32_u4", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>, int_amdgcn_udot8>;
+
+} // End SubtargetPredicate = HasDLInsts
multiclass VOP3P_Real_vi<bits<10> op> {
- def _vi : VOP3P_Real<!cast<VOP3P_Pseudo>(NAME), SIEncodingFamily.VI>,
- VOP3Pe <op, !cast<VOP3P_Pseudo>(NAME).Pfl> {
+ def _vi : VOP3P_Real<!cast<VOP3_Pseudo>(NAME), SIEncodingFamily.VI>,
+ VOP3Pe <op, !cast<VOP3_Pseudo>(NAME).Pfl> {
let AssemblerPredicates = [HasVOP3PInsts];
let DecoderNamespace = "VI";
}
@@ -172,6 +206,33 @@ defm V_PK_MUL_F16 : VOP3P_Real_vi <0x390>;
defm V_PK_MIN_F16 : VOP3P_Real_vi <0x391>;
defm V_PK_MAX_F16 : VOP3P_Real_vi <0x392>;
+
+let SubtargetPredicate = HasMadMixInsts in {
defm V_MAD_MIX_F32 : VOP3P_Real_vi <0x3a0>;
defm V_MAD_MIXLO_F16 : VOP3P_Real_vi <0x3a1>;
defm V_MAD_MIXHI_F16 : VOP3P_Real_vi <0x3a2>;
+}
+
+let SubtargetPredicate = HasFmaMixInsts in {
+let DecoderNamespace = "GFX9_DL" in {
+// The mad_mix instructions were renamed and their behavior changed,
+// but the opcodes stayed the same, so these go in a separate
+// DecoderNamespace to avoid decoding ambiguity.
+defm V_FMA_MIX_F32 : VOP3P_Real_vi <0x3a0>;
+defm V_FMA_MIXLO_F16 : VOP3P_Real_vi <0x3a1>;
+defm V_FMA_MIXHI_F16 : VOP3P_Real_vi <0x3a2>;
+}
+}
+
+
+let SubtargetPredicate = HasDLInsts in {
+
+defm V_DOT2_F32_F16 : VOP3P_Real_vi <0x3a3>;
+defm V_DOT2_I32_I16 : VOP3P_Real_vi <0x3a6>;
+defm V_DOT2_U32_U16 : VOP3P_Real_vi <0x3a7>;
+defm V_DOT4_I32_I8 : VOP3P_Real_vi <0x3a8>;
+defm V_DOT4_U32_U8 : VOP3P_Real_vi <0x3a9>;
+defm V_DOT8_I32_I4 : VOP3P_Real_vi <0x3aa>;
+defm V_DOT8_U32_U4 : VOP3P_Real_vi <0x3ab>;
+
+} // End SubtargetPredicate = HasDLInsts
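
A small scalar reference model of the fma_mix semantics defined above may help: the multiply-add is evaluated in f32 on sources promoted from f16 or taken as f32 (op_sel-controlled, omitted here), the result is converted to f16, and, per the comment above, the clamp modifier is applied after that conversion. This is an illustrative sketch only, not LLVM or ISA-manual code; RoundToHalfPrecision is a hypothetical placeholder.

#include <algorithm>
#include <cmath>

// Illustrative sketch of the mixed-precision FMA semantics; not LLVM code.
// RoundToHalfPrecision stands in for a proper f32 -> f16 -> f32 rounding
// step (identity here only to keep the sketch self-contained).
static float RoundToHalfPrecision(float X) { return X; }

// v_fma_mixlo_f16 / v_fma_mixhi_f16 (and the mad_mix forms they mirror):
// FMA in f32, convert the result to f16, then apply the clamp modifier
// after that conversion.
static float FmaMixHalfResult(float S0, float S1, float S2, bool Clamp) {
  float R = RoundToHalfPrecision(std::fma(S0, S1, S2)); // FMA in f32, then cvt
  if (Clamp)                                            // clamp after the cvt
    R = std::min(std::max(R, 0.0f), 1.0f);
  return R; // mixlo writes this to the low half of vdst, mixhi to the high half
}

The removed GCNPat blocks earlier in this hunk show the matching codegen idiom: the MIXLO result is fed into MIXHI's tied destination operand so the two halves of a v2f16 value are assembled in place.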
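
The new DL dot-product instructions map to the amdgcn dot intrinsics named above. Below is a minimal reference model of two of them, assuming the packed-operand semantics implied by the profiles (VOP_F32_V2F16_V2F16_F32, packed i8/i4 integer sources) and ignoring the clamp/saturation modifier; it is a sketch for orientation, not ISA-manual text.

#include <cstdint>

// Illustrative sketch only; clamp/saturation modifiers are ignored.

// v_dot2_f32_f16: multiply two pairs of f16 values and accumulate into f32.
// The unpacked halves are passed here as floats for simplicity.
static float Dot2F32F16(float A0, float A1, float B0, float B1, float Acc) {
  return A0 * B0 + A1 * B1 + Acc;
}

// v_dot4_i32_i8 (int_amdgcn_sdot4): four signed 8-bit lanes per 32-bit
// source, lane products summed into a signed 32-bit accumulator.
static int32_t Dot4I32I8(uint32_t A, uint32_t B, int32_t Acc) {
  for (int I = 0; I < 4; ++I) {
    int8_t ALane = int8_t((A >> (8 * I)) & 0xff);
    int8_t BLane = int8_t((B >> (8 * I)) & 0xff);
    Acc += int32_t(ALane) * int32_t(BLane);
  }
  return Acc;
}

// v_dot2_i32_i16, v_dot8_i32_i4 and the u* variants follow the same shape
// with 16-bit, 4-bit, or unsigned lanes respectively.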
diff --git a/lib/Target/AMDGPU/VOPCInstructions.td b/lib/Target/AMDGPU/VOPCInstructions.td
index 146870e21531..cc6b8116afee 100644
--- a/lib/Target/AMDGPU/VOPCInstructions.td
+++ b/lib/Target/AMDGPU/VOPCInstructions.td
@@ -30,8 +30,8 @@ class VOPC_SDWAe <bits<8> op, VOPProfile P> : VOP_SDWAe <P> {
let Inst{31-25} = 0x3e; // encoding
// VOPC disallows dst_sel and dst_unused as they have no effect on destination
- let Inst{42-40} = SDWA.DWORD;
- let Inst{44-43} = SDWA.UNUSED_PRESERVE;
+ let Inst{42-40} = 0;
+ let Inst{44-43} = 0;
}
class VOPC_SDWA9e <bits<8> op, VOPProfile P> : VOP_SDWA9Be <P> {
@@ -106,6 +106,7 @@ class VOPC_Real <VOPC_Pseudo ps, int EncodingFamily> :
let TSFlags = ps.TSFlags;
let UseNamedOperandTable = ps.UseNamedOperandTable;
let Uses = ps.Uses;
+ let Defs = ps.Defs;
}
class VOPC_SDWA_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[]> :
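
The added "let Defs = ps.Defs;" (also applied to VOP3_Real in the next file) makes the real, encoded opcodes inherit the implicit definitions of their pseudo, for example a VOPC compare's implicit def of VCC. A short sketch of how that property surfaces through the generic MC layer; Opcode and Reg are caller-supplied placeholders, not specific AMDGPU enum values.

#include "llvm/MC/MCInstrDesc.h"
#include "llvm/MC/MCInstrInfo.h"

// Returns true if Opcode implicitly defines the physical register Reg
// according to its MCInstrDesc, which is where the TableGen Defs list
// ends up. Opcode/Reg are placeholders (e.g. a real VOPC opcode and VCC).
static bool hasImplicitDef(const llvm::MCInstrInfo &MCII, unsigned Opcode,
                           unsigned Reg) {
  return MCII.get(Opcode).hasImplicitDefOfPhysReg(Reg);
}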
diff --git a/lib/Target/AMDGPU/VOPInstructions.td b/lib/Target/AMDGPU/VOPInstructions.td
index f24ff5ce8dea..f0f7f259f71d 100644
--- a/lib/Target/AMDGPU/VOPInstructions.td
+++ b/lib/Target/AMDGPU/VOPInstructions.td
@@ -38,6 +38,23 @@ class VOPAnyCommon <dag outs, dag ins, string asm, list<dag> pattern> :
let Uses = [EXEC];
}
+class VOP_Pseudo <string opName, string suffix, VOPProfile P, dag outs, dag ins,
+ string asm, list<dag> pattern> :
+ InstSI <outs, ins, asm, pattern>,
+ VOP <opName>,
+ SIMCInstr <opName#suffix, SIEncodingFamily.NONE>,
+ MnemonicAlias<opName#suffix, opName> {
+
+ let isPseudo = 1;
+ let isCodeGenOnly = 1;
+ let UseNamedOperandTable = 1;
+
+ string Mnemonic = opName;
+ VOPProfile Pfl = P;
+
+ string AsmOperands;
+}
+
class VOP3Common <dag outs, dag ins, string asm = "",
list<dag> pattern = [], bit HasMods = 0,
bit VOP3Only = 0> :
@@ -66,26 +83,18 @@ class VOP3Common <dag outs, dag ins, string asm = "",
class VOP3_Pseudo <string opName, VOPProfile P, list<dag> pattern = [],
bit VOP3Only = 0, bit isVOP3P = 0, bit isVop3OpSel = 0> :
- InstSI <P.Outs64,
- !if(isVop3OpSel,
- P.InsVOP3OpSel,
- !if(!and(isVOP3P, P.IsPacked), P.InsVOP3P, P.Ins64)),
- "",
- pattern>,
- VOP <opName>,
- SIMCInstr<opName#"_e64", SIEncodingFamily.NONE>,
- MnemonicAlias<opName#"_e64", opName> {
+ VOP_Pseudo <opName, "_e64", P, P.Outs64,
+ !if(isVop3OpSel,
+ P.InsVOP3OpSel,
+ !if(!and(isVOP3P, P.IsPacked), P.InsVOP3P, P.Ins64)),
+ "", pattern> {
- let isPseudo = 1;
- let isCodeGenOnly = 1;
- let UseNamedOperandTable = 1;
let VOP3_OPSEL = isVop3OpSel;
let IsPacked = P.IsPacked;
- string Mnemonic = opName;
- string AsmOperands = !if(isVop3OpSel,
- P.AsmVOP3OpSel,
- !if(!and(isVOP3P, P.IsPacked), P.AsmVOP3P, P.Asm64));
+ let AsmOperands = !if(isVop3OpSel,
+ P.AsmVOP3OpSel,
+ !if(!and(isVOP3P, P.IsPacked), P.AsmVOP3P, P.Asm64));
let Size = 8;
let mayLoad = 0;
@@ -120,8 +129,6 @@ class VOP3_Pseudo <string opName, VOPProfile P, list<dag> pattern = [],
!if(!or(P.HasModifiers, !or(P.HasOMod, P.HasIntClamp)),
"cvtVOP3",
""));
-
- VOPProfile Pfl = P;
}
class VOP3P_Pseudo <string opName, VOPProfile P, list<dag> pattern = []> :
@@ -129,7 +136,7 @@ class VOP3P_Pseudo <string opName, VOPProfile P, list<dag> pattern = []> :
let VOP3P = 1;
}
-class VOP3_Real <VOP3_Pseudo ps, int EncodingFamily> :
+class VOP3_Real <VOP_Pseudo ps, int EncodingFamily> :
InstSI <ps.OutOperandList, ps.InOperandList, ps.Mnemonic # ps.AsmOperands, []>,
SIMCInstr <ps.PseudoInstr, EncodingFamily> {
@@ -149,13 +156,14 @@ class VOP3_Real <VOP3_Pseudo ps, int EncodingFamily> :
let TSFlags = ps.TSFlags;
let UseNamedOperandTable = ps.UseNamedOperandTable;
let Uses = ps.Uses;
+ let Defs = ps.Defs;
VOPProfile Pfl = ps.Pfl;
}
// XXX - Is there any reason to distinguish this from regular VOP3
// here?
-class VOP3P_Real<VOP3P_Pseudo ps, int EncodingFamily> :
+class VOP3P_Real<VOP_Pseudo ps, int EncodingFamily> :
VOP3_Real<ps, EncodingFamily>;
class VOP3a<VOPProfile P> : Enc64 {
@@ -324,13 +332,13 @@ class VOP_SDWAe<VOPProfile P> : Enc64 {
bits<1> clamp;
let Inst{39-32} = !if(P.HasSrc0, src0{7-0}, 0);
- let Inst{42-40} = !if(P.EmitDst, dst_sel{2-0}, SDWA.DWORD);
- let Inst{44-43} = !if(P.EmitDst, dst_unused{1-0}, SDWA.UNUSED_PRESERVE);
+ let Inst{42-40} = !if(P.EmitDst, dst_sel{2-0}, 0);
+ let Inst{44-43} = !if(P.EmitDst, dst_unused{1-0}, 0);
let Inst{45} = !if(P.HasSDWAClamp, clamp{0}, 0);
- let Inst{50-48} = !if(P.HasSrc0, src0_sel{2-0}, SDWA.DWORD);
+ let Inst{50-48} = !if(P.HasSrc0, src0_sel{2-0}, 0);
let Inst{51} = !if(P.HasSrc0IntMods, src0_modifiers{0}, 0);
let Inst{53-52} = !if(P.HasSrc0FloatMods, src0_modifiers{1-0}, 0);
- let Inst{58-56} = !if(P.HasSrc1, src1_sel{2-0}, SDWA.DWORD);
+ let Inst{58-56} = !if(P.HasSrc1, src1_sel{2-0}, 0);
let Inst{59} = !if(P.HasSrc1IntMods, src1_modifiers{0}, 0);
let Inst{61-60} = !if(P.HasSrc1FloatMods, src1_modifiers{1-0}, 0);
}
@@ -358,11 +366,11 @@ class VOP_SDWA9e<VOPProfile P> : Enc64 {
bits<1> src1_sgpr;
let Inst{39-32} = !if(P.HasSrc0, src0{7-0}, 0);
- let Inst{50-48} = !if(P.HasSrc0, src0_sel{2-0}, SDWA.DWORD);
+ let Inst{50-48} = !if(P.HasSrc0, src0_sel{2-0}, 0);
let Inst{51} = !if(P.HasSrc0IntMods, src0_modifiers{0}, 0);
let Inst{53-52} = !if(P.HasSrc0FloatMods, src0_modifiers{1-0}, 0);
let Inst{55} = !if(P.HasSrc0, src0{8}, 0);
- let Inst{58-56} = !if(P.HasSrc1, src1_sel{2-0}, SDWA.DWORD);
+ let Inst{58-56} = !if(P.HasSrc1, src1_sel{2-0}, 0);
let Inst{59} = !if(P.HasSrc1IntMods, src1_modifiers{0}, 0);
let Inst{61-60} = !if(P.HasSrc1FloatMods, src1_modifiers{1-0}, 0);
let Inst{63} = 0; // src1_sgpr - should be specified in subclass
@@ -375,8 +383,8 @@ class VOP_SDWA9Ae<VOPProfile P> : VOP_SDWA9e<P> {
bits<1> clamp;
bits<2> omod;
- let Inst{42-40} = !if(P.EmitDst, dst_sel{2-0}, SDWA.DWORD);
- let Inst{44-43} = !if(P.EmitDst, dst_unused{1-0}, SDWA.UNUSED_PRESERVE);
+ let Inst{42-40} = !if(P.EmitDst, dst_sel{2-0}, 0);
+ let Inst{44-43} = !if(P.EmitDst, dst_unused{1-0}, 0);
let Inst{45} = !if(P.HasSDWAClamp, clamp{0}, 0);
let Inst{47-46} = !if(P.HasSDWAOMod, omod{1-0}, 0);
}
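
The SDWA hunks above replace the symbolic SDWA.DWORD / SDWA.UNUSED_PRESERVE defaults with literal 0 for fields whose operand is absent or whose destination is not emitted. Since those symbolic values are non-zero in the SDWA encodings, the emitted bits for such cases change: they now read as BYTE_0 / UNUSED_PAD rather than DWORD / UNUSED_PRESERVE. The enums below mirror the SdwaSel / DstUnused encodings (compare SIDefines.h in this tree) and are shown only for reference.

#include <cstdint>

// SDWA sub-dword select and dst_unused encodings (mirroring SIDefines.h);
// listed here only to show that DWORD and UNUSED_PRESERVE are non-zero,
// i.e. the new literal-0 defaults correspond to BYTE_0 / UNUSED_PAD.
enum SdwaSel : uint8_t {
  BYTE_0 = 0,
  BYTE_1 = 1,
  BYTE_2 = 2,
  BYTE_3 = 3,
  WORD_0 = 4,
  WORD_1 = 5,
  DWORD = 6
};

enum DstUnused : uint8_t {
  UNUSED_PAD = 0,
  UNUSED_SEXT = 1,
  UNUSED_PRESERVE = 2
};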