Diffstat (limited to 'lib/Target/AMDGPU')
-rw-r--r-- lib/Target/AMDGPU/AMDGPU.h | 129
-rw-r--r-- lib/Target/AMDGPU/AMDGPU.td | 134
-rw-r--r-- lib/Target/AMDGPU/AMDGPUAliasAnalysis.cpp | 22
-rw-r--r-- lib/Target/AMDGPU/AMDGPUAliasAnalysis.h | 25
-rw-r--r-- lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp | 16
-rw-r--r-- lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp | 29
-rw-r--r-- lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp | 131
-rw-r--r-- lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h | 177
-rw-r--r-- lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp | 524
-rw-r--r-- lib/Target/AMDGPU/AMDGPUAsmPrinter.h | 31
-rw-r--r-- lib/Target/AMDGPU/AMDGPUCallLowering.cpp | 40
-rw-r--r-- lib/Target/AMDGPU/AMDGPUCallingConv.td | 4
-rw-r--r-- lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp | 47
-rw-r--r-- lib/Target/AMDGPU/AMDGPUFrameLowering.h | 6
-rw-r--r-- lib/Target/AMDGPU/AMDGPUGenRegisterBankInfo.def | 4
-rw-r--r-- lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp | 686
-rw-r--r-- lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 555
-rw-r--r-- lib/Target/AMDGPU/AMDGPUISelLowering.h | 61
-rw-r--r-- lib/Target/AMDGPU/AMDGPUInline.cpp | 208
-rw-r--r-- lib/Target/AMDGPU/AMDGPUInstrInfo.cpp | 33
-rw-r--r-- lib/Target/AMDGPU/AMDGPUInstrInfo.h | 7
-rw-r--r-- lib/Target/AMDGPU/AMDGPUInstrInfo.td | 24
-rw-r--r-- lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp | 3
-rw-r--r-- lib/Target/AMDGPU/AMDGPUInstructionSelector.h | 3
-rw-r--r-- lib/Target/AMDGPU/AMDGPUInstructions.td | 373
-rw-r--r-- lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 12
-rw-r--r-- lib/Target/AMDGPU/AMDGPULibCalls.cpp | 1770
-rw-r--r-- lib/Target/AMDGPU/AMDGPULibFunc.cpp | 1054
-rw-r--r-- lib/Target/AMDGPU/AMDGPULibFunc.h | 459
-rw-r--r-- lib/Target/AMDGPU/AMDGPUMCInstLower.cpp | 26
-rw-r--r-- lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp | 354
-rw-r--r-- lib/Target/AMDGPU/AMDGPUMachineFunction.cpp | 2
-rw-r--r-- lib/Target/AMDGPU/AMDGPUMachineModuleInfo.cpp | 29
-rw-r--r-- lib/Target/AMDGPU/AMDGPUMachineModuleInfo.h | 97
-rw-r--r-- lib/Target/AMDGPU/AMDGPUOpenCLEnqueuedBlockLowering.cpp | 135
-rw-r--r-- lib/Target/AMDGPU/AMDGPUOpenCLImageTypeLoweringPass.cpp | 55
-rw-r--r-- lib/Target/AMDGPU/AMDGPUPTNote.h | 8
-rw-r--r-- lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp | 11
-rw-r--r-- lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp | 8
-rw-r--r-- lib/Target/AMDGPU/AMDGPURegisterInfo.cpp | 9
-rw-r--r-- lib/Target/AMDGPU/AMDGPURegisterInfo.td | 2
-rw-r--r-- lib/Target/AMDGPU/AMDGPURewriteOutArguments.cpp | 483
-rw-r--r-- lib/Target/AMDGPU/AMDGPUSubtarget.cpp | 174
-rw-r--r-- lib/Target/AMDGPU/AMDGPUSubtarget.h | 127
-rw-r--r-- lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 203
-rw-r--r-- lib/Target/AMDGPU/AMDGPUTargetMachine.h | 16
-rw-r--r-- lib/Target/AMDGPU/AMDGPUTargetObjectFile.cpp | 3
-rw-r--r-- lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp | 123
-rw-r--r-- lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h | 59
-rw-r--r-- lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp | 25
-rw-r--r-- lib/Target/AMDGPU/AMDGPUUnifyMetadata.cpp | 9
-rw-r--r-- lib/Target/AMDGPU/AMDILCFGStructurizer.cpp | 33
-rw-r--r-- lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp | 391
-rw-r--r-- lib/Target/AMDGPU/BUFInstructions.td | 375
-rw-r--r-- lib/Target/AMDGPU/CIInstructions.td | 15
-rw-r--r-- lib/Target/AMDGPU/CMakeLists.txt | 66
-rw-r--r-- lib/Target/AMDGPU/CaymanInstructions.td | 48
-rw-r--r-- lib/Target/AMDGPU/DSInstructions.td | 640
-rw-r--r-- lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp | 152
-rw-r--r-- lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h | 28
-rw-r--r-- lib/Target/AMDGPU/EvergreenInstructions.td | 89
-rw-r--r-- lib/Target/AMDGPU/FLATInstructions.td | 774
-rw-r--r-- lib/Target/AMDGPU/GCNHazardRecognizer.cpp | 248
-rw-r--r-- lib/Target/AMDGPU/GCNHazardRecognizer.h | 22
-rw-r--r-- lib/Target/AMDGPU/GCNILPSched.cpp | 364
-rw-r--r-- lib/Target/AMDGPU/GCNIterativeScheduler.cpp | 100
-rw-r--r-- lib/Target/AMDGPU/GCNIterativeScheduler.h | 31
-rw-r--r-- lib/Target/AMDGPU/GCNMinRegStrategy.cpp | 32
-rw-r--r-- lib/Target/AMDGPU/GCNProcessors.td | 154
-rw-r--r-- lib/Target/AMDGPU/GCNRegPressure.cpp | 39
-rw-r--r-- lib/Target/AMDGPU/GCNRegPressure.h | 29
-rw-r--r-- lib/Target/AMDGPU/GCNSchedStrategy.cpp | 34
-rw-r--r-- lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp | 401
-rw-r--r-- lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h | 39
-rw-r--r-- lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp | 31
-rw-r--r-- lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp | 21
-rw-r--r-- lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.cpp | 39
-rw-r--r-- lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.h | 15
-rw-r--r-- lib/Target/AMDGPU/MCTargetDesc/AMDGPUHSAMetadataStreamer.cpp (renamed from lib/Target/AMDGPU/MCTargetDesc/AMDGPUCodeObjectMetadataStreamer.cpp) | 213
-rw-r--r-- lib/Target/AMDGPU/MCTargetDesc/AMDGPUHSAMetadataStreamer.h (renamed from lib/Target/AMDGPU/MCTargetDesc/AMDGPUCodeObjectMetadataStreamer.h) | 47
-rw-r--r-- lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp | 17
-rw-r--r-- lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h | 10
-rw-r--r-- lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp | 95
-rw-r--r-- lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h | 38
-rw-r--r-- lib/Target/AMDGPU/MCTargetDesc/CMakeLists.txt | 2
-rw-r--r-- lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp | 2
-rw-r--r-- lib/Target/AMDGPU/MIMGInstructions.td | 43
-rw-r--r-- lib/Target/AMDGPU/Processors.td | 219
-rw-r--r-- lib/Target/AMDGPU/R600ClauseMergePass.cpp | 17
-rw-r--r-- lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp | 28
-rw-r--r-- lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp | 32
-rw-r--r-- lib/Target/AMDGPU/R600FrameLowering.h | 4
-rw-r--r-- lib/Target/AMDGPU/R600ISelLowering.cpp | 52
-rw-r--r-- lib/Target/AMDGPU/R600InstrFormats.td | 14
-rw-r--r-- lib/Target/AMDGPU/R600InstrInfo.cpp | 32
-rw-r--r-- lib/Target/AMDGPU/R600InstrInfo.h | 3
-rw-r--r-- lib/Target/AMDGPU/R600Instructions.td | 138
-rw-r--r-- lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp | 42
-rw-r--r-- lib/Target/AMDGPU/R600Packetizer.cpp | 10
-rw-r--r-- lib/Target/AMDGPU/R600Processors.td | 90
-rw-r--r-- lib/Target/AMDGPU/R600RegisterInfo.td | 1
-rw-r--r-- lib/Target/AMDGPU/SIAnnotateControlFlow.cpp | 36
-rw-r--r-- lib/Target/AMDGPU/SIDefines.h | 33
-rw-r--r-- lib/Target/AMDGPU/SIFixControlFlowLiveIntervals.cpp | 88
-rw-r--r-- lib/Target/AMDGPU/SIFixSGPRCopies.cpp | 192
-rw-r--r-- lib/Target/AMDGPU/SIFixWWMLiveness.cpp | 202
-rw-r--r-- lib/Target/AMDGPU/SIFoldOperands.cpp | 43
-rw-r--r-- lib/Target/AMDGPU/SIFrameLowering.cpp | 147
-rw-r--r-- lib/Target/AMDGPU/SIFrameLowering.h | 14
-rw-r--r-- lib/Target/AMDGPU/SIISelLowering.cpp | 1663
-rw-r--r-- lib/Target/AMDGPU/SIISelLowering.h | 58
-rw-r--r-- lib/Target/AMDGPU/SIInsertSkips.cpp | 125
-rw-r--r-- lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 249
-rw-r--r-- lib/Target/AMDGPU/SIInsertWaits.cpp | 21
-rw-r--r-- lib/Target/AMDGPU/SIInstrFormats.td | 49
-rw-r--r-- lib/Target/AMDGPU/SIInstrInfo.cpp | 636
-rw-r--r-- lib/Target/AMDGPU/SIInstrInfo.h | 116
-rw-r--r-- lib/Target/AMDGPU/SIInstrInfo.td | 397
-rw-r--r-- lib/Target/AMDGPU/SIInstructions.td | 547
-rw-r--r-- lib/Target/AMDGPU/SILoadStoreOptimizer.cpp | 532
-rw-r--r-- lib/Target/AMDGPU/SILowerControlFlow.cpp | 90
-rw-r--r-- lib/Target/AMDGPU/SILowerI1Copies.cpp | 7
-rw-r--r-- lib/Target/AMDGPU/SIMachineFunctionInfo.cpp | 189
-rw-r--r-- lib/Target/AMDGPU/SIMachineFunctionInfo.h | 231
-rw-r--r-- lib/Target/AMDGPU/SIMachineScheduler.cpp | 83
-rw-r--r-- lib/Target/AMDGPU/SIMachineScheduler.h | 3
-rw-r--r-- lib/Target/AMDGPU/SIMemoryLegalizer.cpp | 627
-rw-r--r-- lib/Target/AMDGPU/SIOptimizeExecMasking.cpp | 65
-rw-r--r-- lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp | 252
-rw-r--r-- lib/Target/AMDGPU/SIPeepholeSDWA.cpp | 846
-rw-r--r-- lib/Target/AMDGPU/SIRegisterInfo.cpp | 139
-rw-r--r-- lib/Target/AMDGPU/SIRegisterInfo.h | 30
-rw-r--r-- lib/Target/AMDGPU/SIRegisterInfo.td | 70
-rw-r--r-- lib/Target/AMDGPU/SIShrinkInstructions.cpp | 2
-rw-r--r-- lib/Target/AMDGPU/SIWholeQuadMode.cpp | 309
-rw-r--r-- lib/Target/AMDGPU/SMInstructions.td | 40
-rw-r--r-- lib/Target/AMDGPU/SOPInstructions.td | 46
-rw-r--r-- lib/Target/AMDGPU/TargetInfo/AMDGPUTargetInfo.cpp | 4
-rw-r--r-- lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp | 268
-rw-r--r-- lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h | 22
-rw-r--r-- lib/Target/AMDGPU/Utils/AMDKernelCodeTUtils.cpp | 27
-rw-r--r-- lib/Target/AMDGPU/Utils/AMDKernelCodeTUtils.h | 29
-rw-r--r-- lib/Target/AMDGPU/VOP1Instructions.td | 39
-rw-r--r-- lib/Target/AMDGPU/VOP2Instructions.td | 227
-rw-r--r-- lib/Target/AMDGPU/VOP3Instructions.td | 432
-rw-r--r-- lib/Target/AMDGPU/VOP3PInstructions.td | 99
-rw-r--r-- lib/Target/AMDGPU/VOPCInstructions.td | 8
-rw-r--r-- lib/Target/AMDGPU/VOPInstructions.td | 57
148 files changed, 18452 insertions, 4821 deletions
diff --git a/lib/Target/AMDGPU/AMDGPU.h b/lib/Target/AMDGPU/AMDGPU.h
index 568682899be5..0ddc43ad5033 100644
--- a/lib/Target/AMDGPU/AMDGPU.h
+++ b/lib/Target/AMDGPU/AMDGPU.h
@@ -23,6 +23,7 @@ class ModulePass;
class Pass;
class Target;
class TargetMachine;
+class TargetOptions;
class PassRegistry;
class Module;
@@ -34,6 +35,7 @@ FunctionPass *createR600ClauseMergePass();
FunctionPass *createR600Packetizer();
FunctionPass *createR600ControlFlowFinalizer();
FunctionPass *createAMDGPUCFGStructurizerPass();
+FunctionPass *createR600ISelDag(TargetMachine *TM, CodeGenOpt::Level OptLevel);
// SI Passes
FunctionPass *createSIAnnotateControlFlowPass();
@@ -44,12 +46,20 @@ FunctionPass *createSIShrinkInstructionsPass();
FunctionPass *createSILoadStoreOptimizerPass();
FunctionPass *createSIWholeQuadModePass();
FunctionPass *createSIFixControlFlowLiveIntervalsPass();
+FunctionPass *createSIOptimizeExecMaskingPreRAPass();
FunctionPass *createSIFixSGPRCopiesPass();
+FunctionPass *createSIMemoryLegalizerPass();
FunctionPass *createSIDebuggerInsertNopsPass();
FunctionPass *createSIInsertWaitsPass();
FunctionPass *createSIInsertWaitcntsPass();
+FunctionPass *createSIFixWWMLivenessPass();
+FunctionPass *createAMDGPUSimplifyLibCallsPass(const TargetOptions &);
+FunctionPass *createAMDGPUUseNativeCallsPass();
FunctionPass *createAMDGPUCodeGenPreparePass();
FunctionPass *createAMDGPUMachineCFGStructurizerPass();
+FunctionPass *createAMDGPURewriteOutArgumentsPass();
+
+void initializeAMDGPUDAGToDAGISelPass(PassRegistry&);
void initializeAMDGPUMachineCFGStructurizerPass(PassRegistry&);
extern char &AMDGPUMachineCFGStructurizerID;
@@ -64,6 +74,24 @@ ModulePass *createAMDGPULowerIntrinsicsPass();
void initializeAMDGPULowerIntrinsicsPass(PassRegistry &);
extern char &AMDGPULowerIntrinsicsID;
+void initializeAMDGPURewriteOutArgumentsPass(PassRegistry &);
+extern char &AMDGPURewriteOutArgumentsID;
+
+void initializeR600ClauseMergePassPass(PassRegistry &);
+extern char &R600ClauseMergePassID;
+
+void initializeR600ControlFlowFinalizerPass(PassRegistry &);
+extern char &R600ControlFlowFinalizerID;
+
+void initializeR600ExpandSpecialInstrsPassPass(PassRegistry &);
+extern char &R600ExpandSpecialInstrsPassID;
+
+void initializeR600VectorRegMergerPass(PassRegistry &);
+extern char &R600VectorRegMergerID;
+
+void initializeR600PacketizerPass(PassRegistry &);
+extern char &R600PacketizerID;
+
void initializeSIFoldOperandsPass(PassRegistry &);
extern char &SIFoldOperandsID;
@@ -97,14 +125,24 @@ extern char &SIInsertSkipsPassID;
void initializeSIOptimizeExecMaskingPass(PassRegistry &);
extern char &SIOptimizeExecMaskingID;
+void initializeSIFixWWMLivenessPass(PassRegistry &);
+extern char &SIFixWWMLivenessID;
+
+void initializeAMDGPUSimplifyLibCallsPass(PassRegistry &);
+extern char &AMDGPUSimplifyLibCallsID;
+
+void initializeAMDGPUUseNativeCallsPass(PassRegistry &);
+extern char &AMDGPUUseNativeCallsID;
+
// Passes common to R600 and SI
FunctionPass *createAMDGPUPromoteAlloca();
void initializeAMDGPUPromoteAllocaPass(PassRegistry&);
extern char &AMDGPUPromoteAllocaID;
Pass *createAMDGPUStructurizeCFGPass();
-FunctionPass *createAMDGPUISelDag(TargetMachine &TM,
- CodeGenOpt::Level OptLevel);
+FunctionPass *createAMDGPUISelDag(
+ TargetMachine *TM = nullptr,
+ CodeGenOpt::Level OptLevel = CodeGenOpt::Default);
ModulePass *createAMDGPUAlwaysInlinePass(bool GlobalOpt = true);
ModulePass *createAMDGPUOpenCLImageTypeLoweringPass();
FunctionPass *createAMDGPUAnnotateUniformValues();
@@ -113,8 +151,8 @@ ModulePass* createAMDGPUUnifyMetadataPass();
void initializeAMDGPUUnifyMetadataPass(PassRegistry&);
extern char &AMDGPUUnifyMetadataID;
-void initializeSIFixControlFlowLiveIntervalsPass(PassRegistry&);
-extern char &SIFixControlFlowLiveIntervalsID;
+void initializeSIOptimizeExecMaskingPreRAPass(PassRegistry&);
+extern char &SIOptimizeExecMaskingPreRAID;
void initializeAMDGPUAnnotateUniformValuesPass(PassRegistry&);
extern char &AMDGPUAnnotateUniformValuesPassID;
@@ -125,6 +163,9 @@ extern char &AMDGPUCodeGenPrepareID;
void initializeSIAnnotateControlFlowPass(PassRegistry&);
extern char &SIAnnotateControlFlowPassID;
+void initializeSIMemoryLegalizerPass(PassRegistry&);
+extern char &SIMemoryLegalizerID;
+
void initializeSIDebuggerInsertNopsPass(PassRegistry&);
extern char &SIDebuggerInsertNopsID;
@@ -140,6 +181,15 @@ extern char &AMDGPUUnifyDivergentExitNodesID;
ImmutablePass *createAMDGPUAAWrapperPass();
void initializeAMDGPUAAWrapperPassPass(PassRegistry&);
+void initializeAMDGPUArgumentUsageInfoPass(PassRegistry &);
+
+Pass *createAMDGPUFunctionInliningPass();
+void initializeAMDGPUInlinerPass(PassRegistry&);
+
+ModulePass *createAMDGPUOpenCLEnqueuedBlockLoweringPass();
+void initializeAMDGPUOpenCLEnqueuedBlockLoweringPass(PassRegistry &);
+extern char &AMDGPUOpenCLEnqueuedBlockLoweringID;
+
Target &getTheAMDGPUTarget();
Target &getTheGCNTarget();
@@ -167,39 +217,44 @@ struct AMDGPUAS {
unsigned FLAT_ADDRESS; ///< Address space for flat memory.
unsigned REGION_ADDRESS; ///< Address space for region memory.
- // The maximum value for flat, generic, local, private, constant and region.
- const static unsigned MAX_COMMON_ADDRESS = 5;
-
- const static unsigned GLOBAL_ADDRESS = 1; ///< Address space for global memory (RAT0, VTX0).
- const static unsigned CONSTANT_ADDRESS = 2; ///< Address space for constant memory (VTX2)
- const static unsigned LOCAL_ADDRESS = 3; ///< Address space for local memory.
- const static unsigned PARAM_D_ADDRESS = 6; ///< Address space for direct addressible parameter memory (CONST0)
- const static unsigned PARAM_I_ADDRESS = 7; ///< Address space for indirect addressible parameter memory (VTX1)
-
- // Do not re-order the CONSTANT_BUFFER_* enums. Several places depend on this
- // order to be able to dynamically index a constant buffer, for example:
- //
- // ConstantBufferAS = CONSTANT_BUFFER_0 + CBIdx
-
- const static unsigned CONSTANT_BUFFER_0 = 8;
- const static unsigned CONSTANT_BUFFER_1 = 9;
- const static unsigned CONSTANT_BUFFER_2 = 10;
- const static unsigned CONSTANT_BUFFER_3 = 11;
- const static unsigned CONSTANT_BUFFER_4 = 12;
- const static unsigned CONSTANT_BUFFER_5 = 13;
- const static unsigned CONSTANT_BUFFER_6 = 14;
- const static unsigned CONSTANT_BUFFER_7 = 15;
- const static unsigned CONSTANT_BUFFER_8 = 16;
- const static unsigned CONSTANT_BUFFER_9 = 17;
- const static unsigned CONSTANT_BUFFER_10 = 18;
- const static unsigned CONSTANT_BUFFER_11 = 19;
- const static unsigned CONSTANT_BUFFER_12 = 20;
- const static unsigned CONSTANT_BUFFER_13 = 21;
- const static unsigned CONSTANT_BUFFER_14 = 22;
- const static unsigned CONSTANT_BUFFER_15 = 23;
-
- // Some places use this if the address space can't be determined.
- const static unsigned UNKNOWN_ADDRESS_SPACE = ~0u;
+ enum : unsigned {
+ // The maximum value for flat, generic, local, private, constant and region.
+ MAX_COMMON_ADDRESS = 5,
+
+ GLOBAL_ADDRESS = 1, ///< Address space for global memory (RAT0, VTX0).
+ CONSTANT_ADDRESS = 2, ///< Address space for constant memory (VTX2)
+ LOCAL_ADDRESS = 3, ///< Address space for local memory.
+ /// Address space for direct addressible parameter memory (CONST0)
+ PARAM_D_ADDRESS = 6,
+ /// Address space for indirect addressible parameter memory (VTX1)
+ PARAM_I_ADDRESS = 7,
+
+ // Do not re-order the CONSTANT_BUFFER_* enums. Several places depend on
+ // this order to be able to dynamically index a constant buffer, for
+ // example:
+ //
+ // ConstantBufferAS = CONSTANT_BUFFER_0 + CBIdx
+
+ CONSTANT_BUFFER_0 = 8,
+ CONSTANT_BUFFER_1 = 9,
+ CONSTANT_BUFFER_2 = 10,
+ CONSTANT_BUFFER_3 = 11,
+ CONSTANT_BUFFER_4 = 12,
+ CONSTANT_BUFFER_5 = 13,
+ CONSTANT_BUFFER_6 = 14,
+ CONSTANT_BUFFER_7 = 15,
+ CONSTANT_BUFFER_8 = 16,
+ CONSTANT_BUFFER_9 = 17,
+ CONSTANT_BUFFER_10 = 18,
+ CONSTANT_BUFFER_11 = 19,
+ CONSTANT_BUFFER_12 = 20,
+ CONSTANT_BUFFER_13 = 21,
+ CONSTANT_BUFFER_14 = 22,
+ CONSTANT_BUFFER_15 = 23,
+
+ // Some places use this if the address space can't be determined.
+ UNKNOWN_ADDRESS_SPACE = ~0u,
+ };
};
namespace llvm {
diff --git a/lib/Target/AMDGPU/AMDGPU.td b/lib/Target/AMDGPU/AMDGPU.td
index f1d899c4d003..c02d0a131041 100644
--- a/lib/Target/AMDGPU/AMDGPU.td
+++ b/lib/Target/AMDGPU/AMDGPU.td
@@ -19,6 +19,12 @@ def FeatureFP64 : SubtargetFeature<"fp64",
"Enable double precision operations"
>;
+def FeatureFMA : SubtargetFeature<"fmaf",
+ "FMA",
+ "true",
+ "Enable single precision FMA (not as fast as mul+add, but fused)"
+>;
+
def FeatureFastFMAF32 : SubtargetFeature<"fast-fmaf",
"FastFMAF32",
"true",
@@ -79,6 +85,12 @@ def FeatureFlatScratchInsts : SubtargetFeature<"flat-scratch-insts",
"Have scratch_* flat memory instructions"
>;
+def FeatureAddNoCarryInsts : SubtargetFeature<"add-no-carry-insts",
+ "AddNoCarryInsts",
+ "true",
+ "Have VALU add/sub instructions without carry out"
+>;
+
def FeatureUnalignedBufferAccess : SubtargetFeature<"unaligned-buffer-access",
"UnalignedBufferAccess",
"true",
@@ -103,6 +115,12 @@ def FeatureApertureRegs : SubtargetFeature<"aperture-regs",
"Has Memory Aperture Base and Size Registers"
>;
+def FeatureMadMixInsts : SubtargetFeature<"mad-mix-insts",
+ "HasMadMixInsts",
+ "true",
+ "Has v_mad_mix_f32, v_mad_mixlo_f16, v_mad_mixhi_f16 instructions"
+>;
+
// XNACK is disabled if SH_MEM_CONFIG.ADDRESS_MODE = GPUVM on chips that support
// XNACK. The current default kernel driver setting is:
// - graphics ring: XNACK disabled
@@ -119,7 +137,7 @@ def FeatureXNACK : SubtargetFeature<"xnack",
def FeatureSGPRInitBug : SubtargetFeature<"sgpr-init-bug",
"SGPRInitBug",
"true",
- "VI SGPR initilization bug requiring a fixed SGPR allocation size"
+ "VI SGPR initialization bug requiring a fixed SGPR allocation size"
>;
class SubtargetFeatureFetchLimit <string Value> :
@@ -166,12 +184,6 @@ def FeatureGCN : SubtargetFeature<"gcn",
"GCN or newer GPU"
>;
-def FeatureGCN1Encoding : SubtargetFeature<"gcn1-encoding",
- "GCN1Encoding",
- "true",
- "Encoding format for SI and CI"
->;
-
def FeatureGCN3Encoding : SubtargetFeature<"gcn3-encoding",
"GCN3Encoding",
"true",
@@ -181,13 +193,13 @@ def FeatureGCN3Encoding : SubtargetFeature<"gcn3-encoding",
def FeatureCIInsts : SubtargetFeature<"ci-insts",
"CIInsts",
"true",
- "Additional intstructions for CI+"
+ "Additional instructions for CI+"
>;
def FeatureGFX9Insts : SubtargetFeature<"gfx9-insts",
"GFX9Insts",
"true",
- "Additional intstructions for GFX9+"
+ "Additional instructions for GFX9+"
>;
def FeatureSMemRealTime : SubtargetFeature<"s-memrealtime",
@@ -274,6 +286,12 @@ def FeatureDPP : SubtargetFeature<"dpp",
"Support DPP (Data Parallel Primitives) extension"
>;
+def FeatureIntClamp : SubtargetFeature<"int-clamp-insts",
+ "HasIntClamp",
+ "true",
+ "Support clamp for integer destination"
+>;
+
//===------------------------------------------------------------===//
// Subtarget Features (options and debugging)
//===------------------------------------------------------------===//
@@ -334,6 +352,13 @@ def FeatureMaxPrivateElementSize4 : FeatureMaxPrivateElementSize<4>;
def FeatureMaxPrivateElementSize8 : FeatureMaxPrivateElementSize<8>;
def FeatureMaxPrivateElementSize16 : FeatureMaxPrivateElementSize<16>;
+def FeatureEnableHugePrivateBuffer : SubtargetFeature<
+ "huge-private-buffer",
+ "EnableHugePrivateBuffer",
+ "true",
+ "Enable private/scratch buffer sizes greater than 128 GB"
+>;
+
def FeatureVGPRSpilling : SubtargetFeature<"vgpr-spilling",
"EnableVGPRSpilling",
"true",
@@ -402,6 +427,13 @@ def FeatureAutoWaitcntBeforeBarrier : SubtargetFeature <
"Hardware automatically inserts waitcnt before barrier"
>;
+def FeatureCodeObjectV3 : SubtargetFeature <
+ "code-object-v3",
+ "CodeObjectV3",
+ "true",
+ "Generate code object version 3"
+>;
+
// Dummy feature used to disable assembler instructions.
def FeatureDisable : SubtargetFeature<"",
"FeatureDisable","true",
@@ -436,14 +468,14 @@ def FeatureNorthernIslands : SubtargetFeatureGeneration<"NORTHERN_ISLANDS",
def FeatureSouthernIslands : SubtargetFeatureGeneration<"SOUTHERN_ISLANDS",
[FeatureFP64, FeatureLocalMemorySize32768,
- FeatureWavefrontSize64, FeatureGCN, FeatureGCN1Encoding,
+ FeatureWavefrontSize64, FeatureGCN,
FeatureLDSBankCount32, FeatureMovrel]
>;
def FeatureSeaIslands : SubtargetFeatureGeneration<"SEA_ISLANDS",
[FeatureFP64, FeatureLocalMemorySize65536,
FeatureWavefrontSize64, FeatureGCN, FeatureFlatAddressSpace,
- FeatureGCN1Encoding, FeatureCIInsts, FeatureMovrel]
+ FeatureCIInsts, FeatureMovrel]
>;
def FeatureVolcanicIslands : SubtargetFeatureGeneration<"VOLCANIC_ISLANDS",
@@ -452,7 +484,8 @@ def FeatureVolcanicIslands : SubtargetFeatureGeneration<"VOLCANIC_ISLANDS",
FeatureGCN3Encoding, FeatureCIInsts, Feature16BitInsts,
FeatureSMemRealTime, FeatureVGPRIndexMode, FeatureMovrel,
FeatureScalarStores, FeatureInv2PiInlineImm,
- FeatureSDWA, FeatureSDWAOutModsVOPC, FeatureSDWAMac, FeatureDPP
+ FeatureSDWA, FeatureSDWAOutModsVOPC, FeatureSDWAMac, FeatureDPP,
+ FeatureIntClamp
]
>;
@@ -462,9 +495,10 @@ def FeatureGFX9 : SubtargetFeatureGeneration<"GFX9",
FeatureGCN3Encoding, FeatureCIInsts, Feature16BitInsts,
FeatureSMemRealTime, FeatureScalarStores, FeatureInv2PiInlineImm,
FeatureApertureRegs, FeatureGFX9Insts, FeatureVOP3P, FeatureVGPRIndexMode,
- FeatureFastFMAF32, FeatureDPP,
+ FeatureFastFMAF32, FeatureDPP, FeatureIntClamp,
FeatureSDWA, FeatureSDWAOmod, FeatureSDWAScalar, FeatureSDWASdst,
- FeatureFlatInstOffsets, FeatureFlatGlobalInsts, FeatureFlatScratchInsts
+ FeatureFlatInstOffsets, FeatureFlatGlobalInsts, FeatureFlatScratchInsts,
+ FeatureAddNoCarryInsts
]
>;
@@ -506,6 +540,10 @@ def FeatureISAVersion7_0_3 : SubtargetFeatureISAVersion <7,0,3,
[FeatureSeaIslands,
FeatureLDSBankCount16]>;
+def FeatureISAVersion7_0_4 : SubtargetFeatureISAVersion <7,0,4,
+ [FeatureSeaIslands,
+ FeatureLDSBankCount32]>;
+
def FeatureISAVersion8_0_0 : SubtargetFeatureISAVersion <8,0,0,
[FeatureVolcanicIslands,
FeatureLDSBankCount32,
@@ -513,6 +551,8 @@ def FeatureISAVersion8_0_0 : SubtargetFeatureISAVersion <8,0,0,
def FeatureISAVersion8_0_1 : SubtargetFeatureISAVersion <8,0,1,
[FeatureVolcanicIslands,
+ FeatureFastFMAF32,
+ HalfRate64Ops,
FeatureLDSBankCount32,
FeatureXNACK]>;
@@ -525,10 +565,6 @@ def FeatureISAVersion8_0_3 : SubtargetFeatureISAVersion <8,0,3,
[FeatureVolcanicIslands,
FeatureLDSBankCount32]>;
-def FeatureISAVersion8_0_4 : SubtargetFeatureISAVersion <8,0,4,
- [FeatureVolcanicIslands,
- FeatureLDSBankCount32]>;
-
def FeatureISAVersion8_1_0 : SubtargetFeatureISAVersion <8,1,0,
[FeatureVolcanicIslands,
FeatureLDSBankCount16,
@@ -536,21 +572,15 @@ def FeatureISAVersion8_1_0 : SubtargetFeatureISAVersion <8,1,0,
def FeatureISAVersion9_0_0 : SubtargetFeatureISAVersion <9,0,0,
[FeatureGFX9,
- FeatureLDSBankCount32]>;
-
-def FeatureISAVersion9_0_1 : SubtargetFeatureISAVersion <9,0,1,
- [FeatureGFX9,
- FeatureLDSBankCount32,
- FeatureXNACK]>;
+ FeatureMadMixInsts,
+ FeatureLDSBankCount32
+ ]>;
def FeatureISAVersion9_0_2 : SubtargetFeatureISAVersion <9,0,2,
[FeatureGFX9,
- FeatureLDSBankCount32]>;
-
-def FeatureISAVersion9_0_3 : SubtargetFeatureISAVersion <9,0,3,
- [FeatureGFX9,
- FeatureLDSBankCount32,
- FeatureXNACK]>;
+ FeatureMadMixInsts,
+ FeatureLDSBankCount32
+ ]>;
//===----------------------------------------------------------------------===//
// Debugger related subtarget features.
@@ -660,7 +690,7 @@ def TruePredicate : Predicate<"true">;
def isSICI : Predicate<
"Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS ||"
"Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS"
->, AssemblerPredicate<"FeatureGCN1Encoding">;
+>, AssemblerPredicate<"!FeatureGCN3Encoding">;
def isVI : Predicate <
"Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS">,
@@ -680,6 +710,23 @@ def HasFlatAddressSpace : Predicate<"Subtarget->hasFlatAddressSpace()">,
def HasFlatGlobalInsts : Predicate<"Subtarget->hasFlatGlobalInsts()">,
AssemblerPredicate<"FeatureFlatGlobalInsts">;
+def HasFlatScratchInsts : Predicate<"Subtarget->hasFlatScratchInsts()">,
+ AssemblerPredicate<"FeatureFlatScratchInsts">;
+def HasD16LoadStore : Predicate<"Subtarget->hasD16LoadStore()">,
+ AssemblerPredicate<"FeatureGFX9Insts">;
+
+
+def LDSRequiresM0Init : Predicate<"Subtarget->ldsRequiresM0Init()">;
+def NotLDSRequiresM0Init : Predicate<"!Subtarget->ldsRequiresM0Init()">;
+
+def HasDSAddTid : Predicate<"Subtarget->getGeneration() >= AMDGPUSubtarget::GFX9">,
+ AssemblerPredicate<"FeatureGFX9Insts">;
+
+def HasAddNoCarryInsts : Predicate<"Subtarget->hasAddNoCarryInsts()">,
+ AssemblerPredicate<"FeatureAddNoCarryInsts">;
+
+def NotHasAddNoCarryInsts : Predicate<"!Subtarget->hasAddNoCarryInsts()">,
+ AssemblerPredicate<"!FeatureAddNoCarryInsts">;
def Has16BitInsts : Predicate<"Subtarget->has16BitInsts()">,
AssemblerPredicate<"Feature16BitInsts">;
@@ -695,22 +742,41 @@ def HasSDWA9 : Predicate<"Subtarget->hasSDWA()">,
def HasDPP : Predicate<"Subtarget->hasDPP()">,
AssemblerPredicate<"FeatureDPP">;
+def HasIntClamp : Predicate<"Subtarget->hasIntClamp()">,
+ AssemblerPredicate<"FeatureIntClamp">;
+
+def HasMadMixInsts : Predicate<"Subtarget->hasMadMixInsts()">,
+ AssemblerPredicate<"FeatureMadMixInsts">;
+
+def EnableLateCFGStructurize : Predicate<
+ "EnableLateStructurizeCFG">;
+
+// Exists to help track down where SubtargetPredicate isn't set rather
+// than letting tablegen crash with an unhelpful error.
+def InvalidPred : Predicate<"predicate not set on instruction or pattern">;
+
class PredicateControl {
- Predicate SubtargetPredicate;
+ Predicate SubtargetPredicate = InvalidPred;
Predicate SIAssemblerPredicate = isSICI;
Predicate VIAssemblerPredicate = isVI;
list<Predicate> AssemblerPredicates = [];
Predicate AssemblerPredicate = TruePredicate;
list<Predicate> OtherPredicates = [];
- list<Predicate> Predicates = !listconcat([SubtargetPredicate, AssemblerPredicate],
+ list<Predicate> Predicates = !listconcat([SubtargetPredicate,
+ AssemblerPredicate],
AssemblerPredicates,
OtherPredicates);
}
+class AMDGPUPat<dag pattern, dag result> : Pat<pattern, result>,
+ PredicateControl;
+
+
// Include AMDGPU TD files
include "R600Schedule.td"
+include "R600Processors.td"
include "SISchedule.td"
-include "Processors.td"
+include "GCNProcessors.td"
include "AMDGPUInstrInfo.td"
include "AMDGPUIntrinsics.td"
include "AMDGPURegisterInfo.td"
diff --git a/lib/Target/AMDGPU/AMDGPUAliasAnalysis.cpp b/lib/Target/AMDGPU/AMDGPUAliasAnalysis.cpp
index faa424eb0a64..392b011e387c 100644
--- a/lib/Target/AMDGPU/AMDGPUAliasAnalysis.cpp
+++ b/lib/Target/AMDGPU/AMDGPUAliasAnalysis.cpp
@@ -1,4 +1,4 @@
-//===- AMDGPUAliasAnalysis ---------------------------------------*- C++ -*-==//
+//===- AMDGPUAliasAnalysis ------------------------------------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -12,13 +12,21 @@
#include "AMDGPUAliasAnalysis.h"
#include "AMDGPU.h"
+#include "llvm/ADT/Triple.h"
#include "llvm/Analysis/AliasAnalysis.h"
-#include "llvm/Analysis/Passes.h"
+#include "llvm/Analysis/MemoryLocation.h"
#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/Argument.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/CallingConv.h"
#include "llvm/IR/Function.h"
-#include "llvm/IR/Module.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Value.h"
#include "llvm/Pass.h"
-#include "llvm/Support/raw_ostream.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/ErrorHandling.h"
+#include <cassert>
using namespace llvm;
@@ -26,6 +34,7 @@ using namespace llvm;
// Register this pass...
char AMDGPUAAWrapperPass::ID = 0;
+
INITIALIZE_PASS(AMDGPUAAWrapperPass, "amdgpu-aa",
"AMDGPU Address space based Alias Analysis", false, true)
@@ -120,8 +129,11 @@ bool AMDGPUAAResult::pointsToConstantMemory(const MemoryLocation &Loc,
switch (F->getCallingConv()) {
default:
return AAResultBase::pointsToConstantMemory(Loc, OrLocal);
- case CallingConv::AMDGPU_VS:
+ case CallingConv::AMDGPU_LS:
+ case CallingConv::AMDGPU_HS:
+ case CallingConv::AMDGPU_ES:
case CallingConv::AMDGPU_GS:
+ case CallingConv::AMDGPU_VS:
case CallingConv::AMDGPU_PS:
case CallingConv::AMDGPU_CS:
case CallingConv::AMDGPU_KERNEL:
diff --git a/lib/Target/AMDGPU/AMDGPUAliasAnalysis.h b/lib/Target/AMDGPU/AMDGPUAliasAnalysis.h
index 5f8ed9b1f9a3..645a38af753c 100644
--- a/lib/Target/AMDGPU/AMDGPUAliasAnalysis.h
+++ b/lib/Target/AMDGPU/AMDGPUAliasAnalysis.h
@@ -1,4 +1,4 @@
-//===- AMDGPUAliasAnalysis ---------------------------------------*- C++ -*-==//
+//===- AMDGPUAliasAnalysis --------------------------------------*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
@@ -10,17 +10,24 @@
/// This is the AMGPU address space based alias analysis pass.
//===----------------------------------------------------------------------===//
-#ifndef LLVM_ANALYSIS_AMDGPUALIASANALYSIS_H
-#define LLVM_ANALYSIS_AMDGPUALIASANALYSIS_H
+#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUALIASANALYSIS_H
+#define LLVM_LIB_TARGET_AMDGPU_AMDGPUALIASANALYSIS_H
#include "AMDGPU.h"
+#include "llvm/ADT/Triple.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Module.h"
#include "llvm/Pass.h"
+#include <algorithm>
+#include <memory>
namespace llvm {
+class DataLayout;
+class MDNode;
+class MemoryLocation;
+
/// A simple AA result that uses TBAA metadata to answer queries.
class AMDGPUAAResult : public AAResultBase<AMDGPUAAResult> {
friend AAResultBase<AMDGPUAAResult>;
@@ -50,7 +57,9 @@ private:
class ASAliasRulesTy {
public:
ASAliasRulesTy(AMDGPUAS AS_, Triple::ArchType Arch_);
+
AliasResult getAliasResult(unsigned AS1, unsigned AS2) const;
+
private:
Triple::ArchType Arch;
AMDGPUAS AS;
@@ -61,10 +70,11 @@ private:
/// Analysis pass providing a never-invalidated alias analysis result.
class AMDGPUAA : public AnalysisInfoMixin<AMDGPUAA> {
friend AnalysisInfoMixin<AMDGPUAA>;
+
static char PassID;
public:
- typedef AMDGPUAAResult Result;
+ using Result = AMDGPUAAResult;
AMDGPUAAResult run(Function &F, AnalysisManager<Function> &AM) {
return AMDGPUAAResult(F.getParent()->getDataLayout(),
@@ -91,12 +101,15 @@ public:
Triple(M.getTargetTriple())));
return false;
}
+
bool doFinalization(Module &M) override {
Result.reset();
return false;
}
+
void getAnalysisUsage(AnalysisUsage &AU) const override;
};
-}
-#endif // LLVM_ANALYSIS_AMDGPUALIASANALYSIS_H
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUALIASANALYSIS_H
diff --git a/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp b/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp
index 6f3742ed039b..c27425443abc 100644
--- a/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp
+++ b/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp
@@ -21,6 +21,12 @@ using namespace llvm;
namespace {
+static cl::opt<bool> StressCalls(
+ "amdgpu-stress-function-calls",
+ cl::Hidden,
+ cl::desc("Force all functions to be noinline"),
+ cl::init(false));
+
class AMDGPUAlwaysInline : public ModulePass {
bool GlobalOpt;
@@ -57,9 +63,13 @@ bool AMDGPUAlwaysInline::runOnModule(Module &M) {
}
}
+ auto NewAttr = StressCalls ? Attribute::NoInline : Attribute::AlwaysInline;
+ auto IncompatAttr
+ = StressCalls ? Attribute::AlwaysInline : Attribute::NoInline;
+
for (Function &F : M) {
if (!F.hasLocalLinkage() && !F.isDeclaration() && !F.use_empty() &&
- !F.hasFnAttribute(Attribute::NoInline))
+ !F.hasFnAttribute(IncompatAttr))
FuncsToClone.push_back(&F);
}
@@ -71,8 +81,8 @@ bool AMDGPUAlwaysInline::runOnModule(Module &M) {
}
for (Function &F : M) {
- if (F.hasLocalLinkage() && !F.hasFnAttribute(Attribute::NoInline)) {
- F.addFnAttr(Attribute::AlwaysInline);
+ if (F.hasLocalLinkage() && !F.hasFnAttribute(IncompatAttr)) {
+ F.addFnAttr(NewAttr);
}
}
return false;
diff --git a/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp b/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp
index c68e5861ff25..ce17202f3414 100644
--- a/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp
+++ b/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp
@@ -1,4 +1,4 @@
-//===-- AMDGPUAnnotateKernelFeaturesPass.cpp ------------------------------===//
+//===- AMDGPUAnnotateKernelFeaturesPass.cpp -------------------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -14,13 +14,28 @@
#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
+#include "Utils/AMDGPUBaseInfo.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Triple.h"
+#include "llvm/Analysis/CallGraph.h"
#include "llvm/Analysis/CallGraphSCCPass.h"
#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/IR/CallSite.h"
+#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
-#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Module.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Use.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Target/TargetMachine.h"
#define DEBUG_TYPE "amdgpu-annotate-kernel-features"
@@ -42,6 +57,7 @@ public:
bool doInitialization(CallGraph &CG) override;
bool runOnSCC(CallGraphSCC &SCC) override;
+
StringRef getPassName() const override {
return "AMDGPU Annotate Kernel Features";
}
@@ -58,7 +74,7 @@ public:
AMDGPUAS AS);
};
-}
+} // end anonymous namespace
char AMDGPUAnnotateKernelFeatures::ID = 0;
@@ -156,8 +172,9 @@ static StringRef intrinsicToAttrName(Intrinsic::ID ID,
case Intrinsic::amdgcn_dispatch_id:
return "amdgpu-dispatch-id";
case Intrinsic::amdgcn_kernarg_segment_ptr:
- case Intrinsic::amdgcn_implicitarg_ptr:
return "amdgpu-kernarg-segment-ptr";
+ case Intrinsic::amdgcn_implicitarg_ptr:
+ return "amdgpu-implicitarg-ptr";
case Intrinsic::amdgcn_queue_ptr:
case Intrinsic::trap:
case Intrinsic::debugtrap:
@@ -190,7 +207,8 @@ static void copyFeaturesToFunction(Function &Parent, const Function &Callee,
{ "amdgpu-work-group-id-z" },
{ "amdgpu-dispatch-ptr" },
{ "amdgpu-dispatch-id" },
- { "amdgpu-kernarg-segment-ptr" }
+ { "amdgpu-kernarg-segment-ptr" },
+ { "amdgpu-implicitarg-ptr" }
};
if (handleAttr(Parent, Callee, "amdgpu-queue-ptr"))
@@ -292,7 +310,6 @@ bool AMDGPUAnnotateKernelFeatures::runOnSCC(CallGraphSCC &SCC) {
Changed |= addFeatureAttributes(*F);
}
-
return Changed;
}
diff --git a/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp b/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp
new file mode 100644
index 000000000000..dcca3a2fab96
--- /dev/null
+++ b/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp
@@ -0,0 +1,131 @@
+//===----------------------------------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "AMDGPUArgumentUsageInfo.h"
+#include "SIRegisterInfo.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "amdgpu-argument-reg-usage-info"
+
+INITIALIZE_PASS(AMDGPUArgumentUsageInfo, DEBUG_TYPE,
+ "Argument Register Usage Information Storage", false, true)
+
+void ArgDescriptor::print(raw_ostream &OS,
+ const TargetRegisterInfo *TRI) const {
+ if (!isSet()) {
+ OS << "<not set>\n";
+ return;
+ }
+
+ if (isRegister())
+ OS << "Reg " << printReg(getRegister(), TRI) << '\n';
+ else
+ OS << "Stack offset " << getStackOffset() << '\n';
+}
+
+char AMDGPUArgumentUsageInfo::ID = 0;
+
+const AMDGPUFunctionArgInfo AMDGPUArgumentUsageInfo::ExternFunctionInfo{};
+
+bool AMDGPUArgumentUsageInfo::doInitialization(Module &M) {
+ return false;
+}
+
+bool AMDGPUArgumentUsageInfo::doFinalization(Module &M) {
+ ArgInfoMap.clear();
+ return false;
+}
+
+void AMDGPUArgumentUsageInfo::print(raw_ostream &OS, const Module *M) const {
+ for (const auto &FI : ArgInfoMap) {
+ OS << "Arguments for " << FI.first->getName() << '\n'
+ << " PrivateSegmentBuffer: " << FI.second.PrivateSegmentBuffer
+ << " DispatchPtr: " << FI.second.DispatchPtr
+ << " QueuePtr: " << FI.second.QueuePtr
+ << " KernargSegmentPtr: " << FI.second.KernargSegmentPtr
+ << " DispatchID: " << FI.second.DispatchID
+ << " FlatScratchInit: " << FI.second.FlatScratchInit
+ << " PrivateSegmentSize: " << FI.second.PrivateSegmentSize
+ << " GridWorkgroupCountX: " << FI.second.GridWorkGroupCountX
+ << " GridWorkgroupCountY: " << FI.second.GridWorkGroupCountY
+ << " GridWorkgroupCountZ: " << FI.second.GridWorkGroupCountZ
+ << " WorkGroupIDX: " << FI.second.WorkGroupIDX
+ << " WorkGroupIDY: " << FI.second.WorkGroupIDY
+ << " WorkGroupIDZ: " << FI.second.WorkGroupIDZ
+ << " WorkGroupInfo: " << FI.second.WorkGroupInfo
+ << " PrivateSegmentWaveByteOffset: "
+ << FI.second.PrivateSegmentWaveByteOffset
+ << " ImplicitBufferPtr: " << FI.second.ImplicitBufferPtr
+ << " ImplicitArgPtr: " << FI.second.ImplicitArgPtr
+ << " WorkItemIDX " << FI.second.WorkItemIDX
+ << " WorkItemIDY " << FI.second.WorkItemIDY
+ << " WorkItemIDZ " << FI.second.WorkItemIDZ
+ << '\n';
+ }
+}
+
+std::pair<const ArgDescriptor *, const TargetRegisterClass *>
+AMDGPUFunctionArgInfo::getPreloadedValue(
+ AMDGPUFunctionArgInfo::PreloadedValue Value) const {
+ switch (Value) {
+ case AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER: {
+ return std::make_pair(
+ PrivateSegmentBuffer ? &PrivateSegmentBuffer : nullptr,
+ &AMDGPU::SGPR_128RegClass);
+ }
+ case AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR:
+ return std::make_pair(ImplicitBufferPtr ? &ImplicitBufferPtr : nullptr,
+ &AMDGPU::SGPR_64RegClass);
+ case AMDGPUFunctionArgInfo::WORKGROUP_ID_X:
+ return std::make_pair(WorkGroupIDX ? &WorkGroupIDX : nullptr,
+ &AMDGPU::SGPR_32RegClass);
+
+ case AMDGPUFunctionArgInfo::WORKGROUP_ID_Y:
+ return std::make_pair(WorkGroupIDY ? &WorkGroupIDY : nullptr,
+ &AMDGPU::SGPR_32RegClass);
+ case AMDGPUFunctionArgInfo::WORKGROUP_ID_Z:
+ return std::make_pair(WorkGroupIDZ ? &WorkGroupIDZ : nullptr,
+ &AMDGPU::SGPR_32RegClass);
+ case AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET:
+ return std::make_pair(
+ PrivateSegmentWaveByteOffset ? &PrivateSegmentWaveByteOffset : nullptr,
+ &AMDGPU::SGPR_32RegClass);
+ case AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR:
+ return std::make_pair(KernargSegmentPtr ? &KernargSegmentPtr : nullptr,
+ &AMDGPU::SGPR_64RegClass);
+ case AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR:
+ return std::make_pair(ImplicitArgPtr ? &ImplicitArgPtr : nullptr,
+ &AMDGPU::SGPR_64RegClass);
+ case AMDGPUFunctionArgInfo::DISPATCH_ID:
+ return std::make_pair(DispatchID ? &DispatchID : nullptr,
+ &AMDGPU::SGPR_64RegClass);
+ case AMDGPUFunctionArgInfo::FLAT_SCRATCH_INIT:
+ return std::make_pair(FlatScratchInit ? &FlatScratchInit : nullptr,
+ &AMDGPU::SGPR_64RegClass);
+ case AMDGPUFunctionArgInfo::DISPATCH_PTR:
+ return std::make_pair(DispatchPtr ? &DispatchPtr : nullptr,
+ &AMDGPU::SGPR_64RegClass);
+ case AMDGPUFunctionArgInfo::QUEUE_PTR:
+ return std::make_pair(QueuePtr ? &QueuePtr : nullptr,
+ &AMDGPU::SGPR_64RegClass);
+ case AMDGPUFunctionArgInfo::WORKITEM_ID_X:
+ return std::make_pair(WorkItemIDX ? &WorkItemIDX : nullptr,
+ &AMDGPU::VGPR_32RegClass);
+ case AMDGPUFunctionArgInfo::WORKITEM_ID_Y:
+ return std::make_pair(WorkItemIDY ? &WorkItemIDY : nullptr,
+ &AMDGPU::VGPR_32RegClass);
+ case AMDGPUFunctionArgInfo::WORKITEM_ID_Z:
+ return std::make_pair(WorkItemIDZ ? &WorkItemIDZ : nullptr,
+ &AMDGPU::VGPR_32RegClass);
+ }
+ llvm_unreachable("unexpected preloaded value type");
+}
diff --git a/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h b/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h
new file mode 100644
index 000000000000..bf9635549a8c
--- /dev/null
+++ b/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h
@@ -0,0 +1,177 @@
+//==- AMDGPUArgumentrUsageInfo.h - Function Arg Usage Info -------*- C++ -*-==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUARGUMENTUSAGEINFO_H
+#define LLVM_LIB_TARGET_AMDGPU_AMDGPUARGUMENTUSAGEINFO_H
+
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/IR/Function.h"
+#include "llvm/Pass.h"
+
+namespace llvm {
+
+class Function;
+class raw_ostream;
+class SISubtarget;
+class TargetMachine;
+class TargetRegisterClass;
+class TargetRegisterInfo;
+
+struct ArgDescriptor {
+private:
+ friend struct AMDGPUFunctionArgInfo;
+ friend class AMDGPUArgumentUsageInfo;
+
+ union {
+ unsigned Register;
+ unsigned StackOffset;
+ };
+
+ bool IsStack : 1;
+ bool IsSet : 1;
+
+ ArgDescriptor(unsigned Val = 0, bool IsStack = false, bool IsSet = false)
+ : Register(Val), IsStack(IsStack), IsSet(IsSet) {}
+public:
+ static ArgDescriptor createRegister(unsigned Reg) {
+ return ArgDescriptor(Reg, false, true);
+ }
+
+ static ArgDescriptor createStack(unsigned Reg) {
+ return ArgDescriptor(Reg, true, true);
+ }
+
+ bool isSet() const {
+ return IsSet;
+ }
+
+ explicit operator bool() const {
+ return isSet();
+ }
+
+ bool isRegister() const {
+ return !IsStack;
+ }
+
+ unsigned getRegister() const {
+ assert(!IsStack);
+ return Register;
+ }
+
+ unsigned getStackOffset() const {
+ assert(IsStack);
+ return StackOffset;
+ }
+
+ void print(raw_ostream &OS, const TargetRegisterInfo *TRI = nullptr) const;
+};
+
+inline raw_ostream &operator<<(raw_ostream &OS, const ArgDescriptor &Arg) {
+ Arg.print(OS);
+ return OS;
+}
+
+struct AMDGPUFunctionArgInfo {
+ enum PreloadedValue {
+ // SGPRS:
+ PRIVATE_SEGMENT_BUFFER = 0,
+ DISPATCH_PTR = 1,
+ QUEUE_PTR = 2,
+ KERNARG_SEGMENT_PTR = 3,
+ DISPATCH_ID = 4,
+ FLAT_SCRATCH_INIT = 5,
+ WORKGROUP_ID_X = 10,
+ WORKGROUP_ID_Y = 11,
+ WORKGROUP_ID_Z = 12,
+ PRIVATE_SEGMENT_WAVE_BYTE_OFFSET = 14,
+ IMPLICIT_BUFFER_PTR = 15,
+ IMPLICIT_ARG_PTR = 16,
+
+ // VGPRS:
+ WORKITEM_ID_X = 17,
+ WORKITEM_ID_Y = 18,
+ WORKITEM_ID_Z = 19,
+ FIRST_VGPR_VALUE = WORKITEM_ID_X
+ };
+
+ // Kernel input registers setup for the HSA ABI in allocation order.
+
+ // User SGPRs in kernels
+ // XXX - Can these require argument spills?
+ ArgDescriptor PrivateSegmentBuffer;
+ ArgDescriptor DispatchPtr;
+ ArgDescriptor QueuePtr;
+ ArgDescriptor KernargSegmentPtr;
+ ArgDescriptor DispatchID;
+ ArgDescriptor FlatScratchInit;
+ ArgDescriptor PrivateSegmentSize;
+ ArgDescriptor GridWorkGroupCountX;
+ ArgDescriptor GridWorkGroupCountY;
+ ArgDescriptor GridWorkGroupCountZ;
+
+ // System SGPRs in kernels.
+ ArgDescriptor WorkGroupIDX;
+ ArgDescriptor WorkGroupIDY;
+ ArgDescriptor WorkGroupIDZ;
+ ArgDescriptor WorkGroupInfo;
+ ArgDescriptor PrivateSegmentWaveByteOffset;
+
+ // Pointer with offset from kernargsegmentptr to where special ABI arguments
+ // are passed to callable functions.
+ ArgDescriptor ImplicitArgPtr;
+
+ // Input registers for non-HSA ABI
+ ArgDescriptor ImplicitBufferPtr = 0;
+
+ // VGPRs inputs. These are always v0, v1 and v2 for entry functions.
+ ArgDescriptor WorkItemIDX;
+ ArgDescriptor WorkItemIDY;
+ ArgDescriptor WorkItemIDZ;
+
+ std::pair<const ArgDescriptor *, const TargetRegisterClass *>
+ getPreloadedValue(PreloadedValue Value) const;
+};
+
+class AMDGPUArgumentUsageInfo : public ImmutablePass {
+private:
+ static const AMDGPUFunctionArgInfo ExternFunctionInfo;
+ DenseMap<const Function *, AMDGPUFunctionArgInfo> ArgInfoMap;
+
+public:
+ static char ID;
+
+ AMDGPUArgumentUsageInfo() : ImmutablePass(ID) { }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesAll();
+ }
+
+ bool doInitialization(Module &M) override;
+ bool doFinalization(Module &M) override;
+
+ void print(raw_ostream &OS, const Module *M = nullptr) const override;
+
+ void setFuncArgInfo(const Function &F, const AMDGPUFunctionArgInfo &ArgInfo) {
+ ArgInfoMap[&F] = ArgInfo;
+ }
+
+ const AMDGPUFunctionArgInfo &lookupFuncArgInfo(const Function &F) const {
+ auto I = ArgInfoMap.find(&F);
+ if (I == ArgInfoMap.end()) {
+ assert(F.isDeclaration());
+ return ExternFunctionInfo;
+ }
+
+ return I->second;
+ }
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
index 2247814cfe55..bb628b8c558f 100644
--- a/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -32,15 +32,17 @@
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/BinaryFormat/ELF.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/TargetLoweringObjectFile.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCSectionELF.h"
#include "llvm/MC/MCStreamer.h"
+#include "llvm/Support/AMDGPUMetadata.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/TargetRegistry.h"
-#include "llvm/Target/TargetLoweringObjectFile.h"
using namespace llvm;
+using namespace llvm::AMDGPU;
// TODO: This should get the default rounding mode from the kernel. We just set
// the default here, but this could change if the OpenCL rounding mode pragmas
@@ -105,28 +107,71 @@ const MCSubtargetInfo* AMDGPUAsmPrinter::getSTI() const {
return TM.getMCSubtargetInfo();
}
-AMDGPUTargetStreamer& AMDGPUAsmPrinter::getTargetStreamer() const {
- return static_cast<AMDGPUTargetStreamer&>(*OutStreamer->getTargetStreamer());
+AMDGPUTargetStreamer* AMDGPUAsmPrinter::getTargetStreamer() const {
+ if (!OutStreamer)
+ return nullptr;
+ return static_cast<AMDGPUTargetStreamer*>(OutStreamer->getTargetStreamer());
}
void AMDGPUAsmPrinter::EmitStartOfAsmFile(Module &M) {
- if (TM.getTargetTriple().getOS() != Triple::AMDHSA)
+ if (TM.getTargetTriple().getArch() != Triple::amdgcn)
+ return;
+
+ if (TM.getTargetTriple().getOS() != Triple::AMDHSA &&
+ TM.getTargetTriple().getOS() != Triple::AMDPAL)
+ return;
+
+ if (TM.getTargetTriple().getOS() == Triple::AMDHSA)
+ HSAMetadataStream.begin(M);
+
+ if (TM.getTargetTriple().getOS() == Triple::AMDPAL)
+ readPALMetadata(M);
+
+ // Deprecated notes are not emitted for code object v3.
+ if (IsaInfo::hasCodeObjectV3(getSTI()->getFeatureBits()))
return;
- AMDGPU::IsaInfo::IsaVersion ISA =
- AMDGPU::IsaInfo::getIsaVersion(getSTI()->getFeatureBits());
+ // HSA emits NT_AMDGPU_HSA_CODE_OBJECT_VERSION for code objects v2.
+ if (TM.getTargetTriple().getOS() == Triple::AMDHSA)
+ getTargetStreamer()->EmitDirectiveHSACodeObjectVersion(2, 1);
- getTargetStreamer().EmitDirectiveHSACodeObjectVersion(2, 1);
- getTargetStreamer().EmitDirectiveHSACodeObjectISA(
+ // HSA and PAL emit NT_AMDGPU_HSA_ISA for code objects v2.
+ IsaInfo::IsaVersion ISA = IsaInfo::getIsaVersion(getSTI()->getFeatureBits());
+ getTargetStreamer()->EmitDirectiveHSACodeObjectISA(
ISA.Major, ISA.Minor, ISA.Stepping, "AMD", "AMDGPU");
- getTargetStreamer().EmitStartOfCodeObjectMetadata(M);
}
void AMDGPUAsmPrinter::EmitEndOfAsmFile(Module &M) {
- if (TM.getTargetTriple().getOS() != Triple::AMDHSA)
+ if (TM.getTargetTriple().getArch() != Triple::amdgcn)
+ return;
+
+ // Following code requires TargetStreamer to be present.
+ if (!getTargetStreamer())
return;
- getTargetStreamer().EmitEndOfCodeObjectMetadata();
+ // Emit ISA Version (NT_AMD_AMDGPU_ISA).
+ std::string ISAVersionString;
+ raw_string_ostream ISAVersionStream(ISAVersionString);
+ IsaInfo::streamIsaVersion(getSTI(), ISAVersionStream);
+ getTargetStreamer()->EmitISAVersion(ISAVersionStream.str());
+
+ // Emit HSA Metadata (NT_AMD_AMDGPU_HSA_METADATA).
+ if (TM.getTargetTriple().getOS() == Triple::AMDHSA) {
+ HSAMetadataStream.end();
+ getTargetStreamer()->EmitHSAMetadata(HSAMetadataStream.getHSAMetadata());
+ }
+
+ // Emit PAL Metadata (NT_AMD_AMDGPU_PAL_METADATA).
+ if (TM.getTargetTriple().getOS() == Triple::AMDPAL) {
+ // Copy the PAL metadata from the map where we collected it into a vector,
+ // then write it as a .note.
+ PALMD::Metadata PALMetadataVector;
+ for (auto i : PALMetadataMap) {
+ PALMetadataVector.push_back(i.first);
+ PALMetadataVector.push_back(i.second);
+ }
+ getTargetStreamer()->EmitPALMetadata(PALMetadataVector);
+ }
}
bool AMDGPUAsmPrinter::isBlockOnlyReachableByFallthrough(
@@ -154,13 +199,15 @@ void AMDGPUAsmPrinter::EmitFunctionBodyStart() {
getAmdKernelCode(KernelCode, CurrentProgramInfo, *MF);
OutStreamer->SwitchSection(getObjFileLowering().getTextSection());
- getTargetStreamer().EmitAMDKernelCodeT(KernelCode);
+ getTargetStreamer()->EmitAMDKernelCodeT(KernelCode);
}
if (TM.getTargetTriple().getOS() != Triple::AMDHSA)
return;
- getTargetStreamer().EmitKernelCodeObjectMetadata(*MF->getFunction(),
- KernelCode);
+
+ HSAMetadataStream.emitKernel(MF->getFunction(),
+ getHSACodeProps(*MF, CurrentProgramInfo),
+ getHSADebugProps(*MF, CurrentProgramInfo));
}
void AMDGPUAsmPrinter::EmitFunctionEntryLabel() {
@@ -168,18 +215,38 @@ void AMDGPUAsmPrinter::EmitFunctionEntryLabel() {
const AMDGPUSubtarget &STM = MF->getSubtarget<AMDGPUSubtarget>();
if (MFI->isEntryFunction() && STM.isAmdCodeObjectV2(*MF)) {
SmallString<128> SymbolName;
- getNameWithPrefix(SymbolName, MF->getFunction()),
- getTargetStreamer().EmitAMDGPUSymbolType(
+ getNameWithPrefix(SymbolName, &MF->getFunction()),
+ getTargetStreamer()->EmitAMDGPUSymbolType(
SymbolName, ELF::STT_AMDGPU_HSA_KERNEL);
}
+ const AMDGPUSubtarget &STI = MF->getSubtarget<AMDGPUSubtarget>();
+ if (STI.dumpCode()) {
+ // Disassemble function name label to text.
+ DisasmLines.push_back(MF->getName().str() + ":");
+ DisasmLineMaxLen = std::max(DisasmLineMaxLen, DisasmLines.back().size());
+ HexLines.push_back("");
+ }
AsmPrinter::EmitFunctionEntryLabel();
}
+void AMDGPUAsmPrinter::EmitBasicBlockStart(const MachineBasicBlock &MBB) const {
+ const AMDGPUSubtarget &STI = MBB.getParent()->getSubtarget<AMDGPUSubtarget>();
+ if (STI.dumpCode() && !isBlockOnlyReachableByFallthrough(&MBB)) {
+ // Write a line for the basic block label if it is not only fallthrough.
+ DisasmLines.push_back(
+ (Twine("BB") + Twine(getFunctionNumber())
+ + "_" + Twine(MBB.getNumber()) + ":").str());
+ DisasmLineMaxLen = std::max(DisasmLineMaxLen, DisasmLines.back().size());
+ HexLines.push_back("");
+ }
+ AsmPrinter::EmitBasicBlockStart(MBB);
+}
+
void AMDGPUAsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) {
// Group segment variables aren't emitted in HSA.
- if (AMDGPU::isGroupSegment(GV, AMDGPUASI))
+ if (AMDGPU::isGroupSegment(GV))
return;
AsmPrinter::EmitGlobalVariable(GV);
@@ -190,11 +257,32 @@ bool AMDGPUAsmPrinter::doFinalization(Module &M) {
return AsmPrinter::doFinalization(M);
}
+// For the amdpal OS type, read the amdgpu.pal.metadata supplied by the
+// frontend into our PALMetadataMap, ready for per-function modification. It
+// is a NamedMD containing an MDTuple containing a number of MDNodes each of
+// which is an integer value, and each two integer values forms a key=value
+// pair that we store as PALMetadataMap[key]=value in the map.
+void AMDGPUAsmPrinter::readPALMetadata(Module &M) {
+ auto NamedMD = M.getNamedMetadata("amdgpu.pal.metadata");
+ if (!NamedMD || !NamedMD->getNumOperands())
+ return;
+ auto Tuple = dyn_cast<MDTuple>(NamedMD->getOperand(0));
+ if (!Tuple)
+ return;
+ for (unsigned I = 0, E = Tuple->getNumOperands() & -2; I != E; I += 2) {
+ auto Key = mdconst::dyn_extract<ConstantInt>(Tuple->getOperand(I));
+ auto Val = mdconst::dyn_extract<ConstantInt>(Tuple->getOperand(I + 1));
+ if (!Key || !Val)
+ continue;
+ PALMetadataMap[Key->getZExtValue()] = Val->getZExtValue();
+ }
+}
+
// Print comments that apply to both callable functions and entry points.
void AMDGPUAsmPrinter::emitCommonFunctionComments(
uint32_t NumVGPR,
uint32_t NumSGPR,
- uint32_t ScratchSize,
+ uint64_t ScratchSize,
uint64_t CodeSize) {
OutStreamer->emitRawComment(" codeLenInByte = " + Twine(CodeSize), false);
OutStreamer->emitRawComment(" NumSgprs: " + Twine(NumSGPR), false);
@@ -226,12 +314,14 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
getSIProgramInfo(CurrentProgramInfo, MF);
} else {
auto I = CallGraphResourceInfo.insert(
- std::make_pair(MF.getFunction(), SIFunctionResourceInfo()));
+ std::make_pair(&MF.getFunction(), SIFunctionResourceInfo()));
SIFunctionResourceInfo &Info = I.first->second;
assert(I.second && "should only be called once per function");
Info = analyzeResourceUsage(MF);
}
+ if (STM.isAmdPalOS())
+ EmitPALMetadata(MF, CurrentProgramInfo);
if (!STM.isAmdHsaOS()) {
EmitProgramInfoSI(MF, CurrentProgramInfo);
}
@@ -253,7 +343,7 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
if (STM.getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {
if (!MFI->isEntryFunction()) {
OutStreamer->emitRawComment(" Function info:", false);
- SIFunctionResourceInfo &Info = CallGraphResourceInfo[MF.getFunction()];
+ SIFunctionResourceInfo &Info = CallGraphResourceInfo[&MF.getFunction()];
emitCommonFunctionComments(
Info.NumVGPR,
Info.getTotalNumSGPRs(MF.getSubtarget<SISubtarget>()),
@@ -336,8 +426,11 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
Context.getELFSection(".AMDGPU.disasm", ELF::SHT_NOTE, 0));
for (size_t i = 0; i < DisasmLines.size(); ++i) {
- std::string Comment(DisasmLineMaxLen - DisasmLines[i].size(), ' ');
- Comment += " ; " + HexLines[i] + "\n";
+ std::string Comment = "\n";
+ if (!HexLines[i].empty()) {
+ Comment = std::string(DisasmLineMaxLen - DisasmLines[i].size(), ' ');
+ Comment += " ; " + HexLines[i] + "\n";
+ }
OutStreamer->EmitBytes(StringRef(DisasmLines[i]));
OutStreamer->EmitBytes(StringRef(Comment));
@@ -376,7 +469,7 @@ void AMDGPUAsmPrinter::EmitProgramInfoR600(const MachineFunction &MF) {
unsigned RsrcReg;
if (STM.getGeneration() >= R600Subtarget::EVERGREEN) {
// Evergreen / Northern Islands
- switch (MF.getFunction()->getCallingConv()) {
+ switch (MF.getFunction().getCallingConv()) {
default: LLVM_FALLTHROUGH;
case CallingConv::AMDGPU_CS: RsrcReg = R_0288D4_SQ_PGM_RESOURCES_LS; break;
case CallingConv::AMDGPU_GS: RsrcReg = R_028878_SQ_PGM_RESOURCES_GS; break;
@@ -385,7 +478,7 @@ void AMDGPUAsmPrinter::EmitProgramInfoR600(const MachineFunction &MF) {
}
} else {
// R600 / R700
- switch (MF.getFunction()->getCallingConv()) {
+ switch (MF.getFunction().getCallingConv()) {
default: LLVM_FALLTHROUGH;
case CallingConv::AMDGPU_GS: LLVM_FALLTHROUGH;
case CallingConv::AMDGPU_CS: LLVM_FALLTHROUGH;
@@ -400,7 +493,7 @@ void AMDGPUAsmPrinter::EmitProgramInfoR600(const MachineFunction &MF) {
OutStreamer->EmitIntValue(R_02880C_DB_SHADER_CONTROL, 4);
OutStreamer->EmitIntValue(S_02880C_KILL_ENABLE(killPixel), 4);
- if (AMDGPU::isCompute(MF.getFunction()->getCallingConv())) {
+ if (AMDGPU::isCompute(MF.getFunction().getCallingConv())) {
OutStreamer->EmitIntValue(R_0288E8_SQ_LDS_ALLOC, 4);
OutStreamer->EmitIntValue(alignTo(MFI->getLDSSize(), 4) >> 2, 4);
}
@@ -500,29 +593,184 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage(
// If there are no calls, MachineRegisterInfo can tell us the used register
// count easily.
+ // A tail call isn't considered a call for MachineFrameInfo's purposes.
+ if (!FrameInfo.hasCalls() && !FrameInfo.hasTailCall()) {
+ MCPhysReg HighestVGPRReg = AMDGPU::NoRegister;
+ for (MCPhysReg Reg : reverse(AMDGPU::VGPR_32RegClass.getRegisters())) {
+ if (MRI.isPhysRegUsed(Reg)) {
+ HighestVGPRReg = Reg;
+ break;
+ }
+ }
- MCPhysReg HighestVGPRReg = AMDGPU::NoRegister;
- for (MCPhysReg Reg : reverse(AMDGPU::VGPR_32RegClass.getRegisters())) {
- if (MRI.isPhysRegUsed(Reg)) {
- HighestVGPRReg = Reg;
- break;
+ MCPhysReg HighestSGPRReg = AMDGPU::NoRegister;
+ for (MCPhysReg Reg : reverse(AMDGPU::SGPR_32RegClass.getRegisters())) {
+ if (MRI.isPhysRegUsed(Reg)) {
+ HighestSGPRReg = Reg;
+ break;
+ }
}
+
+ // We found the maximum register index. They start at 0, so add one to get the
+ // number of registers.
+ Info.NumVGPR = HighestVGPRReg == AMDGPU::NoRegister ? 0 :
+ TRI.getHWRegIndex(HighestVGPRReg) + 1;
+ Info.NumExplicitSGPR = HighestSGPRReg == AMDGPU::NoRegister ? 0 :
+ TRI.getHWRegIndex(HighestSGPRReg) + 1;
+
+ return Info;
}
- MCPhysReg HighestSGPRReg = AMDGPU::NoRegister;
- for (MCPhysReg Reg : reverse(AMDGPU::SGPR_32RegClass.getRegisters())) {
- if (MRI.isPhysRegUsed(Reg)) {
- HighestSGPRReg = Reg;
- break;
+ int32_t MaxVGPR = -1;
+ int32_t MaxSGPR = -1;
+ uint64_t CalleeFrameSize = 0;
+
+ for (const MachineBasicBlock &MBB : MF) {
+ for (const MachineInstr &MI : MBB) {
+ // TODO: Check regmasks? Do they occur anywhere except calls?
+ for (const MachineOperand &MO : MI.operands()) {
+ unsigned Width = 0;
+ bool IsSGPR = false;
+
+ if (!MO.isReg())
+ continue;
+
+ unsigned Reg = MO.getReg();
+ switch (Reg) {
+ case AMDGPU::EXEC:
+ case AMDGPU::EXEC_LO:
+ case AMDGPU::EXEC_HI:
+ case AMDGPU::SCC:
+ case AMDGPU::M0:
+ case AMDGPU::SRC_SHARED_BASE:
+ case AMDGPU::SRC_SHARED_LIMIT:
+ case AMDGPU::SRC_PRIVATE_BASE:
+ case AMDGPU::SRC_PRIVATE_LIMIT:
+ continue;
+
+ case AMDGPU::NoRegister:
+ assert(MI.isDebugValue());
+ continue;
+
+ case AMDGPU::VCC:
+ case AMDGPU::VCC_LO:
+ case AMDGPU::VCC_HI:
+ Info.UsesVCC = true;
+ continue;
+
+ case AMDGPU::FLAT_SCR:
+ case AMDGPU::FLAT_SCR_LO:
+ case AMDGPU::FLAT_SCR_HI:
+ continue;
+
+ case AMDGPU::TBA:
+ case AMDGPU::TBA_LO:
+ case AMDGPU::TBA_HI:
+ case AMDGPU::TMA:
+ case AMDGPU::TMA_LO:
+ case AMDGPU::TMA_HI:
+ llvm_unreachable("trap handler registers should not be used");
+
+ default:
+ break;
+ }
+
+ if (AMDGPU::SReg_32RegClass.contains(Reg)) {
+ assert(!AMDGPU::TTMP_32RegClass.contains(Reg) &&
+ "trap handler registers should not be used");
+ IsSGPR = true;
+ Width = 1;
+ } else if (AMDGPU::VGPR_32RegClass.contains(Reg)) {
+ IsSGPR = false;
+ Width = 1;
+ } else if (AMDGPU::SReg_64RegClass.contains(Reg)) {
+ assert(!AMDGPU::TTMP_64RegClass.contains(Reg) &&
+ "trap handler registers should not be used");
+ IsSGPR = true;
+ Width = 2;
+ } else if (AMDGPU::VReg_64RegClass.contains(Reg)) {
+ IsSGPR = false;
+ Width = 2;
+ } else if (AMDGPU::VReg_96RegClass.contains(Reg)) {
+ IsSGPR = false;
+ Width = 3;
+ } else if (AMDGPU::SReg_128RegClass.contains(Reg)) {
+ IsSGPR = true;
+ Width = 4;
+ } else if (AMDGPU::VReg_128RegClass.contains(Reg)) {
+ IsSGPR = false;
+ Width = 4;
+ } else if (AMDGPU::SReg_256RegClass.contains(Reg)) {
+ IsSGPR = true;
+ Width = 8;
+ } else if (AMDGPU::VReg_256RegClass.contains(Reg)) {
+ IsSGPR = false;
+ Width = 8;
+ } else if (AMDGPU::SReg_512RegClass.contains(Reg)) {
+ IsSGPR = true;
+ Width = 16;
+ } else if (AMDGPU::VReg_512RegClass.contains(Reg)) {
+ IsSGPR = false;
+ Width = 16;
+ } else {
+ llvm_unreachable("Unknown register class");
+ }
+ unsigned HWReg = TRI.getHWRegIndex(Reg);
+ int MaxUsed = HWReg + Width - 1;
+ if (IsSGPR) {
+ MaxSGPR = MaxUsed > MaxSGPR ? MaxUsed : MaxSGPR;
+ } else {
+ MaxVGPR = MaxUsed > MaxVGPR ? MaxUsed : MaxVGPR;
+ }
+ }
+
+ if (MI.isCall()) {
+ // Pseudo used just to encode the underlying global. Is there a better
+ // way to track this?
+
+ const MachineOperand *CalleeOp
+ = TII->getNamedOperand(MI, AMDGPU::OpName::callee);
+ const Function *Callee = cast<Function>(CalleeOp->getGlobal());
+ if (Callee->isDeclaration()) {
+ // If this is a call to an external function, we can't do much. Make
+ // conservative guesses.
+
+        // Conservatively assume 48 SGPRs, minus vcc, flat_scr and xnack.
+ int MaxSGPRGuess = 47 - getNumExtraSGPRs(ST, true,
+ ST.hasFlatAddressSpace());
+ MaxSGPR = std::max(MaxSGPR, MaxSGPRGuess);
+ MaxVGPR = std::max(MaxVGPR, 23);
+
+ CalleeFrameSize = std::max(CalleeFrameSize, UINT64_C(16384));
+ Info.UsesVCC = true;
+ Info.UsesFlatScratch = ST.hasFlatAddressSpace();
+ Info.HasDynamicallySizedStack = true;
+ } else {
+ // We force CodeGen to run in SCC order, so the callee's register
+ // usage etc. should be the cumulative usage of all callees.
+ auto I = CallGraphResourceInfo.find(Callee);
+ assert(I != CallGraphResourceInfo.end() &&
+ "callee should have been handled before caller");
+
+ MaxSGPR = std::max(I->second.NumExplicitSGPR - 1, MaxSGPR);
+ MaxVGPR = std::max(I->second.NumVGPR - 1, MaxVGPR);
+ CalleeFrameSize
+ = std::max(I->second.PrivateSegmentSize, CalleeFrameSize);
+ Info.UsesVCC |= I->second.UsesVCC;
+ Info.UsesFlatScratch |= I->second.UsesFlatScratch;
+ Info.HasDynamicallySizedStack |= I->second.HasDynamicallySizedStack;
+ Info.HasRecursion |= I->second.HasRecursion;
+ }
+
+ if (!Callee->doesNotRecurse())
+ Info.HasRecursion = true;
+ }
}
}
- // We found the maximum register index. They start at 0, so add one to get the
- // number of registers.
- Info.NumVGPR = HighestVGPRReg == AMDGPU::NoRegister ? 0 :
- TRI.getHWRegIndex(HighestVGPRReg) + 1;
- Info.NumExplicitSGPR = HighestSGPRReg == AMDGPU::NoRegister ? 0 :
- TRI.getHWRegIndex(HighestSGPRReg) + 1;
+ Info.NumExplicitSGPR = MaxSGPR + 1;
+ Info.NumVGPR = MaxVGPR + 1;
+ Info.PrivateSegmentSize += CalleeFrameSize;
return Info;
}
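
For reference, the accounting in analyzeResourceUsage reduces to a simple rule: each register operand covers hardware indices HWReg .. HWReg + Width - 1, the function-wide maximum index is tracked, and the reported count is that maximum plus one (or zero if nothing is used). A minimal standalone sketch of that arithmetic; the operand list here is an invented input, not data from the patch:

#include <algorithm>
#include <cstdint>
#include <utility>
#include <vector>

// Each entry is (hardware index of the first 32-bit register, width in 32-bit registers).
static int32_t countRegisters(const std::vector<std::pair<unsigned, unsigned>> &Uses) {
  int32_t MaxIdx = -1;                 // -1 means "nothing used", giving a count of 0.
  for (const auto &U : Uses)
    MaxIdx = std::max<int32_t>(MaxIdx, U.first + U.second - 1);
  return MaxIdx + 1;                   // indices are 0-based, so count = max index + 1.
}

int main() {
  // e.g. a 64-bit value in v[2:3] plus a lone v7 uses 8 VGPRs in total.
  return countRegisters({{2, 2}, {7, 1}}) == 8 ? 0 : 1;
}
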
@@ -538,6 +786,12 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
ProgInfo.FlatUsed = Info.UsesFlatScratch;
ProgInfo.DynamicCallStack = Info.HasDynamicallySizedStack || Info.HasRecursion;
+ if (!isUInt<32>(ProgInfo.ScratchSize)) {
+ DiagnosticInfoStackSize DiagStackSize(MF.getFunction(),
+ ProgInfo.ScratchSize, DS_Error);
+ MF.getFunction().getContext().diagnose(DiagStackSize);
+ }
+
const SISubtarget &STM = MF.getSubtarget<SISubtarget>();
const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
const SIInstrInfo *TII = STM.getInstrInfo();
@@ -554,8 +808,8 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
unsigned MaxAddressableNumSGPRs = STM.getAddressableNumSGPRs();
if (ProgInfo.NumSGPR > MaxAddressableNumSGPRs) {
// This can happen due to a compiler bug or when using inline asm.
- LLVMContext &Ctx = MF.getFunction()->getContext();
- DiagnosticInfoResourceLimit Diag(*MF.getFunction(),
+ LLVMContext &Ctx = MF.getFunction().getContext();
+ DiagnosticInfoResourceLimit Diag(MF.getFunction(),
"addressable scalar registers",
ProgInfo.NumSGPR, DS_Error,
DK_ResourceLimit,
@@ -582,8 +836,8 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
if (ProgInfo.NumSGPR > MaxAddressableNumSGPRs) {
// This can happen due to a compiler bug or when using inline asm to use
// the registers which are usually reserved for vcc etc.
- LLVMContext &Ctx = MF.getFunction()->getContext();
- DiagnosticInfoResourceLimit Diag(*MF.getFunction(),
+ LLVMContext &Ctx = MF.getFunction().getContext();
+ DiagnosticInfoResourceLimit Diag(MF.getFunction(),
"scalar registers",
ProgInfo.NumSGPR, DS_Error,
DK_ResourceLimit,
@@ -602,15 +856,15 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
}
if (MFI->getNumUserSGPRs() > STM.getMaxNumUserSGPRs()) {
- LLVMContext &Ctx = MF.getFunction()->getContext();
- DiagnosticInfoResourceLimit Diag(*MF.getFunction(), "user SGPRs",
+ LLVMContext &Ctx = MF.getFunction().getContext();
+ DiagnosticInfoResourceLimit Diag(MF.getFunction(), "user SGPRs",
MFI->getNumUserSGPRs(), DS_Error);
Ctx.diagnose(Diag);
}
if (MFI->getLDSSize() > static_cast<unsigned>(STM.getLocalMemorySize())) {
- LLVMContext &Ctx = MF.getFunction()->getContext();
- DiagnosticInfoResourceLimit Diag(*MF.getFunction(), "local memory",
+ LLVMContext &Ctx = MF.getFunction().getContext();
+ DiagnosticInfoResourceLimit Diag(MF.getFunction(), "local memory",
MFI->getLDSSize(), DS_Error);
Ctx.diagnose(Diag);
}
@@ -710,10 +964,12 @@ static unsigned getRsrcReg(CallingConv::ID CallConv) {
switch (CallConv) {
default: LLVM_FALLTHROUGH;
case CallingConv::AMDGPU_CS: return R_00B848_COMPUTE_PGM_RSRC1;
+ case CallingConv::AMDGPU_LS: return R_00B528_SPI_SHADER_PGM_RSRC1_LS;
case CallingConv::AMDGPU_HS: return R_00B428_SPI_SHADER_PGM_RSRC1_HS;
+ case CallingConv::AMDGPU_ES: return R_00B328_SPI_SHADER_PGM_RSRC1_ES;
case CallingConv::AMDGPU_GS: return R_00B228_SPI_SHADER_PGM_RSRC1_GS;
- case CallingConv::AMDGPU_PS: return R_00B028_SPI_SHADER_PGM_RSRC1_PS;
case CallingConv::AMDGPU_VS: return R_00B128_SPI_SHADER_PGM_RSRC1_VS;
+ case CallingConv::AMDGPU_PS: return R_00B028_SPI_SHADER_PGM_RSRC1_PS;
}
}
@@ -721,9 +977,9 @@ void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF,
const SIProgramInfo &CurrentProgramInfo) {
const SISubtarget &STM = MF.getSubtarget<SISubtarget>();
const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
- unsigned RsrcReg = getRsrcReg(MF.getFunction()->getCallingConv());
+ unsigned RsrcReg = getRsrcReg(MF.getFunction().getCallingConv());
- if (AMDGPU::isCompute(MF.getFunction()->getCallingConv())) {
+ if (AMDGPU::isCompute(MF.getFunction().getCallingConv())) {
OutStreamer->EmitIntValue(R_00B848_COMPUTE_PGM_RSRC1, 4);
OutStreamer->EmitIntValue(CurrentProgramInfo.ComputePGMRSrc1, 4);
@@ -740,19 +996,24 @@ void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF,
OutStreamer->EmitIntValue(RsrcReg, 4);
OutStreamer->EmitIntValue(S_00B028_VGPRS(CurrentProgramInfo.VGPRBlocks) |
S_00B028_SGPRS(CurrentProgramInfo.SGPRBlocks), 4);
- if (STM.isVGPRSpillingEnabled(*MF.getFunction())) {
+ unsigned Rsrc2Val = 0;
+ if (STM.isVGPRSpillingEnabled(MF.getFunction())) {
OutStreamer->EmitIntValue(R_0286E8_SPI_TMPRING_SIZE, 4);
OutStreamer->EmitIntValue(S_0286E8_WAVESIZE(CurrentProgramInfo.ScratchBlocks), 4);
+ if (TM.getTargetTriple().getOS() == Triple::AMDPAL)
+ Rsrc2Val = S_00B84C_SCRATCH_EN(CurrentProgramInfo.ScratchBlocks > 0);
+ }
+ if (MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS) {
+ OutStreamer->EmitIntValue(R_0286CC_SPI_PS_INPUT_ENA, 4);
+ OutStreamer->EmitIntValue(MFI->getPSInputEnable(), 4);
+ OutStreamer->EmitIntValue(R_0286D0_SPI_PS_INPUT_ADDR, 4);
+ OutStreamer->EmitIntValue(MFI->getPSInputAddr(), 4);
+ Rsrc2Val |= S_00B02C_EXTRA_LDS_SIZE(CurrentProgramInfo.LDSBlocks);
+ }
+ if (Rsrc2Val) {
+ OutStreamer->EmitIntValue(RsrcReg + 4 /*rsrc2*/, 4);
+ OutStreamer->EmitIntValue(Rsrc2Val, 4);
}
- }
-
- if (MF.getFunction()->getCallingConv() == CallingConv::AMDGPU_PS) {
- OutStreamer->EmitIntValue(R_00B02C_SPI_SHADER_PGM_RSRC2_PS, 4);
- OutStreamer->EmitIntValue(S_00B02C_EXTRA_LDS_SIZE(CurrentProgramInfo.LDSBlocks), 4);
- OutStreamer->EmitIntValue(R_0286CC_SPI_PS_INPUT_ENA, 4);
- OutStreamer->EmitIntValue(MFI->getPSInputEnable(), 4);
- OutStreamer->EmitIntValue(R_0286D0_SPI_PS_INPUT_ADDR, 4);
- OutStreamer->EmitIntValue(MFI->getPSInputAddr(), 4);
}
OutStreamer->EmitIntValue(R_SPILLED_SGPRS, 4);
@@ -761,6 +1022,75 @@ void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF,
OutStreamer->EmitIntValue(MFI->getNumSpilledVGPRs(), 4);
}
+// This is the equivalent of EmitProgramInfoSI above, but for when the OS type
+// is AMDPAL. It stores each compute/SPI register setting and other PAL
+// metadata items into the PALMetadataMap, combining with any provided by the
+// frontend as LLVM metadata. Once all functions are written, PALMetadataMap is
+// then written as a single block in the .note section.
+void AMDGPUAsmPrinter::EmitPALMetadata(const MachineFunction &MF,
+ const SIProgramInfo &CurrentProgramInfo) {
+ const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+ // Given the calling convention, calculate the register number for rsrc1. In
+ // principle the register number could change in future hardware, but we know
+ // it is the same for gfx6-9 (except that LS and ES don't exist on gfx9), so
+ // we can use the same fixed value that .AMDGPU.config has for Mesa. Note
+ // that we use a register number rather than a byte offset, so we need to
+ // divide by 4.
+ unsigned Rsrc1Reg = getRsrcReg(MF.getFunction().getCallingConv()) / 4;
+ unsigned Rsrc2Reg = Rsrc1Reg + 1;
+ // Also calculate the PAL metadata key for *S_SCRATCH_SIZE. It can be used
+ // with a constant offset to access any non-register shader-specific PAL
+ // metadata key.
+ unsigned ScratchSizeKey = PALMD::Key::CS_SCRATCH_SIZE;
+ switch (MF.getFunction().getCallingConv()) {
+ case CallingConv::AMDGPU_PS:
+ ScratchSizeKey = PALMD::Key::PS_SCRATCH_SIZE;
+ break;
+ case CallingConv::AMDGPU_VS:
+ ScratchSizeKey = PALMD::Key::VS_SCRATCH_SIZE;
+ break;
+ case CallingConv::AMDGPU_GS:
+ ScratchSizeKey = PALMD::Key::GS_SCRATCH_SIZE;
+ break;
+ case CallingConv::AMDGPU_ES:
+ ScratchSizeKey = PALMD::Key::ES_SCRATCH_SIZE;
+ break;
+ case CallingConv::AMDGPU_HS:
+ ScratchSizeKey = PALMD::Key::HS_SCRATCH_SIZE;
+ break;
+ case CallingConv::AMDGPU_LS:
+ ScratchSizeKey = PALMD::Key::LS_SCRATCH_SIZE;
+ break;
+ }
+ unsigned NumUsedVgprsKey = ScratchSizeKey +
+ PALMD::Key::VS_NUM_USED_VGPRS - PALMD::Key::VS_SCRATCH_SIZE;
+ unsigned NumUsedSgprsKey = ScratchSizeKey +
+ PALMD::Key::VS_NUM_USED_SGPRS - PALMD::Key::VS_SCRATCH_SIZE;
+ PALMetadataMap[NumUsedVgprsKey] = CurrentProgramInfo.NumVGPRsForWavesPerEU;
+ PALMetadataMap[NumUsedSgprsKey] = CurrentProgramInfo.NumSGPRsForWavesPerEU;
+ if (AMDGPU::isCompute(MF.getFunction().getCallingConv())) {
+ PALMetadataMap[Rsrc1Reg] |= CurrentProgramInfo.ComputePGMRSrc1;
+ PALMetadataMap[Rsrc2Reg] |= CurrentProgramInfo.ComputePGMRSrc2;
+ // ScratchSize is in bytes, 16 aligned.
+ PALMetadataMap[ScratchSizeKey] |=
+ alignTo(CurrentProgramInfo.ScratchSize, 16);
+ } else {
+ PALMetadataMap[Rsrc1Reg] |= S_00B028_VGPRS(CurrentProgramInfo.VGPRBlocks) |
+ S_00B028_SGPRS(CurrentProgramInfo.SGPRBlocks);
+ if (CurrentProgramInfo.ScratchBlocks > 0)
+ PALMetadataMap[Rsrc2Reg] |= S_00B84C_SCRATCH_EN(1);
+ // ScratchSize is in bytes, 16 aligned.
+ PALMetadataMap[ScratchSizeKey] |=
+ alignTo(CurrentProgramInfo.ScratchSize, 16);
+ }
+ if (MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS) {
+ PALMetadataMap[Rsrc2Reg] |=
+ S_00B02C_EXTRA_LDS_SIZE(CurrentProgramInfo.LDSBlocks);
+ PALMetadataMap[R_0286CC_SPI_PS_INPUT_ENA / 4] |= MFI->getPSInputEnable();
+ PALMetadataMap[R_0286D0_SPI_PS_INPUT_ADDR / 4] |= MFI->getPSInputAddr();
+ }
+}
+
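
The key arithmetic above works because the shader-stage-specific PAL metadata keys are laid out with a constant spacing, so adding (VS_NUM_USED_VGPRS - VS_SCRATCH_SIZE) to any stage's *_SCRATCH_SIZE key lands on that stage's VGPR-count key. A hedged sketch of the idea; the numeric key values below are invented purely to show the spacing and are not the real PALMD::Key constants:

#include <cassert>
#include <cstdint>
#include <map>

// Hypothetical key values with the same fixed per-stage spacing as the real table.
enum HypotheticalKey : uint32_t {
  PS_SCRATCH_SIZE   = 0x10000044, PS_NUM_USED_VGPRS = 0x10000045,
  VS_SCRATCH_SIZE   = 0x10000048, VS_NUM_USED_VGPRS = 0x10000049,
};

int main() {
  std::map<uint32_t, uint32_t> PALMetadataMap;
  uint32_t ScratchSizeKey = PS_SCRATCH_SIZE;  // chosen from the calling convention.
  // The same delta works for every stage because the spacing is constant.
  uint32_t NumUsedVgprsKey = ScratchSizeKey + VS_NUM_USED_VGPRS - VS_SCRATCH_SIZE;
  PALMetadataMap[NumUsedVgprsKey] = 12;       // e.g. 12 VGPRs in use.
  assert(NumUsedVgprsKey == PS_NUM_USED_VGPRS);
  return 0;
}
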
// This is supposed to be log2(Size)
static amd_element_byte_size_t getElementByteSizeValue(unsigned Size) {
switch (Size) {
@@ -862,23 +1192,81 @@ void AMDGPUAsmPrinter::getAmdKernelCode(amd_kernel_code_t &Out,
}
}
+AMDGPU::HSAMD::Kernel::CodeProps::Metadata AMDGPUAsmPrinter::getHSACodeProps(
+ const MachineFunction &MF,
+ const SIProgramInfo &ProgramInfo) const {
+ const SISubtarget &STM = MF.getSubtarget<SISubtarget>();
+ const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
+ HSAMD::Kernel::CodeProps::Metadata HSACodeProps;
+
+ HSACodeProps.mKernargSegmentSize =
+ STM.getKernArgSegmentSize(MF, MFI.getABIArgOffset());
+ HSACodeProps.mGroupSegmentFixedSize = ProgramInfo.LDSSize;
+ HSACodeProps.mPrivateSegmentFixedSize = ProgramInfo.ScratchSize;
+ HSACodeProps.mKernargSegmentAlign =
+ std::max(uint32_t(4), MFI.getMaxKernArgAlign());
+ HSACodeProps.mWavefrontSize = STM.getWavefrontSize();
+ HSACodeProps.mNumSGPRs = CurrentProgramInfo.NumSGPR;
+ HSACodeProps.mNumVGPRs = CurrentProgramInfo.NumVGPR;
+ HSACodeProps.mMaxFlatWorkGroupSize = MFI.getMaxFlatWorkGroupSize();
+ HSACodeProps.mIsDynamicCallStack = ProgramInfo.DynamicCallStack;
+ HSACodeProps.mIsXNACKEnabled = STM.isXNACKEnabled();
+ HSACodeProps.mNumSpilledSGPRs = MFI.getNumSpilledSGPRs();
+ HSACodeProps.mNumSpilledVGPRs = MFI.getNumSpilledVGPRs();
+
+ return HSACodeProps;
+}
+
+AMDGPU::HSAMD::Kernel::DebugProps::Metadata AMDGPUAsmPrinter::getHSADebugProps(
+ const MachineFunction &MF,
+ const SIProgramInfo &ProgramInfo) const {
+ const SISubtarget &STM = MF.getSubtarget<SISubtarget>();
+ HSAMD::Kernel::DebugProps::Metadata HSADebugProps;
+
+ if (!STM.debuggerSupported())
+ return HSADebugProps;
+
+ HSADebugProps.mDebuggerABIVersion.push_back(1);
+ HSADebugProps.mDebuggerABIVersion.push_back(0);
+ HSADebugProps.mReservedNumVGPRs = ProgramInfo.ReservedVGPRCount;
+ HSADebugProps.mReservedFirstVGPR = ProgramInfo.ReservedVGPRFirst;
+
+ if (STM.debuggerEmitPrologue()) {
+ HSADebugProps.mPrivateSegmentBufferSGPR =
+ ProgramInfo.DebuggerPrivateSegmentBufferSGPR;
+ HSADebugProps.mWavefrontPrivateSegmentOffsetSGPR =
+ ProgramInfo.DebuggerWavefrontPrivateSegmentOffsetSGPR;
+ }
+
+ return HSADebugProps;
+}
+
bool AMDGPUAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
unsigned AsmVariant,
const char *ExtraCode, raw_ostream &O) {
+ // First try the generic code, which knows about modifiers like 'c' and 'n'.
+ if (!AsmPrinter::PrintAsmOperand(MI, OpNo, AsmVariant, ExtraCode, O))
+ return false;
+
if (ExtraCode && ExtraCode[0]) {
if (ExtraCode[1] != 0)
return true; // Unknown modifier.
switch (ExtraCode[0]) {
- default:
- // See if this is a generic print operand
- return AsmPrinter::PrintAsmOperand(MI, OpNo, AsmVariant, ExtraCode, O);
case 'r':
break;
+ default:
+ return true;
}
}
- AMDGPUInstPrinter::printRegOperand(MI->getOperand(OpNo).getReg(), O,
- *TM.getSubtargetImpl(*MF->getFunction())->getRegisterInfo());
- return false;
+ // TODO: Should be able to support other operand types like globals.
+ const MachineOperand &MO = MI->getOperand(OpNo);
+ if (MO.isReg()) {
+ AMDGPUInstPrinter::printRegOperand(MO.getReg(), O,
+ *MF->getSubtarget().getRegisterInfo());
+ return false;
+ }
+
+ return true;
}
diff --git a/lib/Target/AMDGPU/AMDGPUAsmPrinter.h b/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
index 0a58ce06704d..51d48a0c7320 100644
--- a/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
+++ b/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
@@ -17,6 +17,7 @@
#include "AMDGPU.h"
#include "AMDKernelCodeT.h"
+#include "MCTargetDesc/AMDGPUHSAMetadataStreamer.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/CodeGen/AsmPrinter.h"
#include <cstddef>
@@ -40,7 +41,7 @@ private:
// the end are tracked separately.
int32_t NumVGPR = 0;
int32_t NumExplicitSGPR = 0;
- uint32_t PrivateSegmentSize = 0;
+ uint64_t PrivateSegmentSize = 0;
bool UsesVCC = false;
bool UsesFlatScratch = false;
bool HasDynamicallySizedStack = false;
@@ -60,7 +61,7 @@ private:
uint32_t DX10Clamp = 0;
uint32_t DebugMode = 0;
uint32_t IEEEMode = 0;
- uint32_t ScratchSize = 0;
+ uint64_t ScratchSize = 0;
uint64_t ComputePGMRSrc1 = 0;
@@ -113,9 +114,13 @@ private:
SIProgramInfo CurrentProgramInfo;
DenseMap<const Function *, SIFunctionResourceInfo> CallGraphResourceInfo;
+ AMDGPU::HSAMD::MetadataStreamer HSAMetadataStream;
+ std::map<uint32_t, uint32_t> PALMetadataMap;
+
uint64_t getFunctionCodeSize(const MachineFunction &MF) const;
SIFunctionResourceInfo analyzeResourceUsage(const MachineFunction &MF) const;
+ void readPALMetadata(Module &M);
void getSIProgramInfo(SIProgramInfo &Out, const MachineFunction &MF);
void getAmdKernelCode(amd_kernel_code_t &Out, const SIProgramInfo &KernelInfo,
const MachineFunction &MF) const;
@@ -123,13 +128,23 @@ private:
unsigned &NumSGPR,
unsigned &NumVGPR) const;
+ AMDGPU::HSAMD::Kernel::CodeProps::Metadata getHSACodeProps(
+ const MachineFunction &MF,
+ const SIProgramInfo &ProgramInfo) const;
+ AMDGPU::HSAMD::Kernel::DebugProps::Metadata getHSADebugProps(
+ const MachineFunction &MF,
+ const SIProgramInfo &ProgramInfo) const;
+
/// \brief Emit register usage information so that the GPU driver
/// can correctly setup the GPU state.
void EmitProgramInfoR600(const MachineFunction &MF);
- void EmitProgramInfoSI(const MachineFunction &MF, const SIProgramInfo &KernelInfo);
+ void EmitProgramInfoSI(const MachineFunction &MF,
+ const SIProgramInfo &KernelInfo);
+ void EmitPALMetadata(const MachineFunction &MF,
+ const SIProgramInfo &KernelInfo);
void emitCommonFunctionComments(uint32_t NumVGPR,
uint32_t NumSGPR,
- uint32_t ScratchSize,
+ uint64_t ScratchSize,
uint64_t CodeSize);
public:
@@ -140,7 +155,7 @@ public:
const MCSubtargetInfo* getSTI() const;
- AMDGPUTargetStreamer& getTargetStreamer() const;
+ AMDGPUTargetStreamer* getTargetStreamer() const;
bool doFinalization(Module &M) override;
bool runOnMachineFunction(MachineFunction &MF) override;
@@ -166,6 +181,8 @@ public:
void EmitFunctionEntryLabel() override;
+ void EmitBasicBlockStart(const MachineBasicBlock &MBB) const override;
+
void EmitGlobalVariable(const GlobalVariable *GV) override;
void EmitStartOfAsmFile(Module &M) override;
@@ -180,8 +197,8 @@ public:
raw_ostream &O) override;
protected:
- std::vector<std::string> DisasmLines, HexLines;
- size_t DisasmLineMaxLen;
+ mutable std::vector<std::string> DisasmLines, HexLines;
+ mutable size_t DisasmLineMaxLen;
AMDGPUAS AMDGPUASI;
};
diff --git a/lib/Target/AMDGPU/AMDGPUCallLowering.cpp b/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
index 515cc07dd449..5a9138731934 100644
--- a/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
+++ b/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
@@ -26,10 +26,6 @@
using namespace llvm;
-#ifndef LLVM_BUILD_GLOBAL_ISEL
-#error "This shouldn't be built without GISel"
-#endif
-
AMDGPUCallLowering::AMDGPUCallLowering(const AMDGPUTargetLowering &TLI)
: CallLowering(&TLI), AMDGPUASI(TLI.getAMDGPUAS()) {
}
@@ -45,15 +41,15 @@ unsigned AMDGPUCallLowering::lowerParameterPtr(MachineIRBuilder &MIRBuilder,
unsigned Offset) const {
MachineFunction &MF = MIRBuilder.getMF();
- const SIRegisterInfo *TRI = MF.getSubtarget<SISubtarget>().getRegisterInfo();
+ const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
MachineRegisterInfo &MRI = MF.getRegInfo();
- const Function &F = *MF.getFunction();
+ const Function &F = MF.getFunction();
const DataLayout &DL = F.getParent()->getDataLayout();
PointerType *PtrTy = PointerType::get(ParamTy, AMDGPUASI.CONSTANT_ADDRESS);
LLT PtrType = getLLTForType(*PtrTy, DL);
unsigned DstReg = MRI.createGenericVirtualRegister(PtrType);
unsigned KernArgSegmentPtr =
- TRI->getPreloadedValue(MF, SIRegisterInfo::KERNARG_SEGMENT_PTR);
+ MFI->getPreloadedReg(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
unsigned KernArgSegmentVReg = MRI.getLiveInVirtReg(KernArgSegmentPtr);
unsigned OffsetReg = MRI.createGenericVirtualRegister(LLT::scalar(64));
@@ -68,7 +64,7 @@ void AMDGPUCallLowering::lowerParameter(MachineIRBuilder &MIRBuilder,
Type *ParamTy, unsigned Offset,
unsigned DstReg) const {
MachineFunction &MF = MIRBuilder.getMF();
- const Function &F = *MF.getFunction();
+ const Function &F = MF.getFunction();
const DataLayout &DL = F.getParent()->getDataLayout();
PointerType *PtrTy = PointerType::get(ParamTy, AMDGPUASI.CONSTANT_ADDRESS);
MachinePointerInfo PtrInfo(UndefValue::get(PtrTy));
@@ -144,18 +140,38 @@ bool AMDGPUCallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder,
Function::const_arg_iterator CurOrigArg = F.arg_begin();
const AMDGPUTargetLowering &TLI = *getTLI<AMDGPUTargetLowering>();
for (unsigned i = 0; i != NumArgs; ++i, ++CurOrigArg) {
- MVT ValVT = TLI.getValueType(DL, CurOrigArg->getType()).getSimpleVT();
+ EVT ValEVT = TLI.getValueType(DL, CurOrigArg->getType());
+
+    // We can only handle simple value types at the moment.
+ if (!ValEVT.isSimple())
+ return false;
+ MVT ValVT = ValEVT.getSimpleVT();
ISD::ArgFlagsTy Flags;
+ ArgInfo OrigArg{VRegs[i], CurOrigArg->getType()};
+ setArgFlags(OrigArg, i + 1, DL, F);
Flags.setOrigAlign(DL.getABITypeAlignment(CurOrigArg->getType()));
CCAssignFn *AssignFn = CCAssignFnForCall(F.getCallingConv(),
/*IsVarArg=*/false);
bool Res =
- AssignFn(i, ValVT, ValVT, CCValAssign::Full, Flags, CCInfo);
- assert(!Res && "Call operand has unhandled type");
- (void)Res;
+ AssignFn(i, ValVT, ValVT, CCValAssign::Full, OrigArg.Flags, CCInfo);
+
+ // Fail if we don't know how to handle this type.
+ if (Res)
+ return false;
}
Function::const_arg_iterator Arg = F.arg_begin();
+
+ if (F.getCallingConv() == CallingConv::AMDGPU_VS) {
+ for (unsigned i = 0; i != NumArgs; ++i, ++Arg) {
+ CCValAssign &VA = ArgLocs[i];
+ MRI.addLiveIn(VA.getLocReg(), VRegs[i]);
+ MIRBuilder.getMBB().addLiveIn(VA.getLocReg());
+ MIRBuilder.buildCopy(VRegs[i], VA.getLocReg());
+ }
+ return true;
+ }
+
for (unsigned i = 0; i != NumArgs; ++i, ++Arg) {
// FIXME: We should be getting DebugInfo from the arguments some how.
CCValAssign &VA = ArgLocs[i];
diff --git a/lib/Target/AMDGPU/AMDGPUCallingConv.td b/lib/Target/AMDGPU/AMDGPUCallingConv.td
index 4bef7a89bfe3..c1c066fd1404 100644
--- a/lib/Target/AMDGPU/AMDGPUCallingConv.td
+++ b/lib/Target/AMDGPU/AMDGPUCallingConv.td
@@ -163,6 +163,10 @@ def CC_AMDGPU : CallingConv<[
"AMDGPUSubtarget::SOUTHERN_ISLANDS",
CCDelegateTo<CC_SI>>,
CCIf<"static_cast<const AMDGPUSubtarget&>"
+ "(State.getMachineFunction().getSubtarget()).getGeneration() >= "
+ "AMDGPUSubtarget::SOUTHERN_ISLANDS && State.getCallingConv() == CallingConv::C",
+ CCDelegateTo<CC_AMDGPU_Func>>,
+ CCIf<"static_cast<const AMDGPUSubtarget&>"
"(State.getMachineFunction().getSubtarget()).getGeneration() < "
"AMDGPUSubtarget::SOUTHERN_ISLANDS",
CCDelegateTo<CC_R600>>
diff --git a/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
index 31ee9206ae27..b17b67167666 100644
--- a/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
+++ b/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
@@ -18,6 +18,7 @@
#include "AMDGPUTargetMachine.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/DivergenceAnalysis.h"
+#include "llvm/Analysis/Loads.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/Attributes.h"
@@ -53,6 +54,7 @@ class AMDGPUCodeGenPrepare : public FunctionPass,
DivergenceAnalysis *DA = nullptr;
Module *Mod = nullptr;
bool HasUnsafeFPMath = false;
+ AMDGPUAS AMDGPUASI;
/// \brief Copies exact/nsw/nuw flags (if any) from binary operation \p I to
/// binary operation \p V.
@@ -123,6 +125,15 @@ class AMDGPUCodeGenPrepare : public FunctionPass,
///
/// \returns True.
bool promoteUniformBitreverseToI32(IntrinsicInst &I) const;
+  /// \brief Widen a scalar load.
+  ///
+  /// \details Widen a sub-32-bit, uniform load from constant memory to a
+  /// full 32 bits and then truncate the result, allowing a scalar load to
+  /// be used instead of a vector load.
+  ///
+  /// \returns True.
+
+  bool canWidenScalarExtLoad(LoadInst &I) const;
public:
static char ID;
@@ -133,6 +144,7 @@ public:
bool visitInstruction(Instruction &I) { return false; }
bool visitBinaryOperator(BinaryOperator &I);
+ bool visitLoadInst(LoadInst &I);
bool visitICmpInst(ICmpInst &I);
bool visitSelectInst(SelectInst &I);
@@ -223,6 +235,16 @@ static bool promotedOpIsNUW(const Instruction &I) {
}
}
+bool AMDGPUCodeGenPrepare::canWidenScalarExtLoad(LoadInst &I) const {
+ Type *Ty = I.getType();
+ const DataLayout &DL = Mod->getDataLayout();
+ int TySize = DL.getTypeSizeInBits(Ty);
+ unsigned Align = I.getAlignment() ?
+ I.getAlignment() : DL.getABITypeAlignment(Ty);
+
+ return I.isSimple() && TySize < 32 && Align >= 4 && DA->isUniform(&I);
+}
+
bool AMDGPUCodeGenPrepare::promoteUniformOpToI32(BinaryOperator &I) const {
assert(needsPromotionToI32(I.getType()) &&
"I does not need promotion to i32");
@@ -378,7 +400,7 @@ bool AMDGPUCodeGenPrepare::visitFDiv(BinaryOperator &FDiv) {
return false;
FastMathFlags FMF = FPOp->getFastMathFlags();
- bool UnsafeDiv = HasUnsafeFPMath || FMF.unsafeAlgebra() ||
+ bool UnsafeDiv = HasUnsafeFPMath || FMF.isFast() ||
FMF.allowReciprocal();
// With UnsafeDiv node will be optimized to just rcp and mul.
@@ -443,6 +465,29 @@ bool AMDGPUCodeGenPrepare::visitBinaryOperator(BinaryOperator &I) {
return Changed;
}
+bool AMDGPUCodeGenPrepare::visitLoadInst(LoadInst &I) {
+ if (I.getPointerAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS &&
+ canWidenScalarExtLoad(I)) {
+ IRBuilder<> Builder(&I);
+ Builder.SetCurrentDebugLocation(I.getDebugLoc());
+
+ Type *I32Ty = Builder.getInt32Ty();
+ Type *PT = PointerType::get(I32Ty, I.getPointerAddressSpace());
+    Value *BitCast = Builder.CreateBitCast(I.getPointerOperand(), PT);
+ Value *WidenLoad = Builder.CreateLoad(BitCast);
+
+ int TySize = Mod->getDataLayout().getTypeSizeInBits(I.getType());
+ Type *IntNTy = Builder.getIntNTy(TySize);
+ Value *ValTrunc = Builder.CreateTrunc(WidenLoad, IntNTy);
+ Value *ValOrig = Builder.CreateBitCast(ValTrunc, I.getType());
+ I.replaceAllUsesWith(ValOrig);
+ I.eraseFromParent();
+ return true;
+ }
+
+ return false;
+}
+
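
The net effect of the transform, informally: a uniform load of a type narrower than 32 bits from suitably aligned constant memory is replaced by one 32-bit load plus a truncate, which lets the backend use a scalar load instead of a vector one. A rough source-level analogue, assuming a little-endian target, a 4-byte-aligned pointer, and at least four readable bytes behind it (illustration only, not code from the patch):

#include <cstdint>
#include <cstring>

// Widened form of "return *p;" for a uniform uint16_t load from constant memory.
uint16_t widenedLoad(const uint16_t *p) {
  uint32_t Wide;
  std::memcpy(&Wide, p, sizeof(Wide));   // one aligned 32-bit load
  return static_cast<uint16_t>(Wide);    // truncate back to the original 16 bits
}
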
bool AMDGPUCodeGenPrepare::visitICmpInst(ICmpInst &I) {
bool Changed = false;
diff --git a/lib/Target/AMDGPU/AMDGPUFrameLowering.h b/lib/Target/AMDGPU/AMDGPUFrameLowering.h
index 8e187c7e56c1..91fe921bfeec 100644
--- a/lib/Target/AMDGPU/AMDGPUFrameLowering.h
+++ b/lib/Target/AMDGPU/AMDGPUFrameLowering.h
@@ -15,7 +15,7 @@
#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUFRAMELOWERING_H
#define LLVM_LIB_TARGET_AMDGPU_AMDGPUFRAMELOWERING_H
-#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
namespace llvm {
@@ -33,10 +33,6 @@ public:
/// \returns The number of 32-bit sub-registers that are used when storing
/// values to the stack.
unsigned getStackWidth(const MachineFunction &MF) const;
-
- bool hasFP(const MachineFunction &MF) const override {
- return false;
- }
};
} // end namespace llvm
diff --git a/lib/Target/AMDGPU/AMDGPUGenRegisterBankInfo.def b/lib/Target/AMDGPU/AMDGPUGenRegisterBankInfo.def
index 5cb9036f4823..bf7deb500d1a 100644
--- a/lib/Target/AMDGPU/AMDGPUGenRegisterBankInfo.def
+++ b/lib/Target/AMDGPU/AMDGPUGenRegisterBankInfo.def
@@ -11,10 +11,6 @@
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//
-#ifndef LLVM_BUILD_GLOBAL_ISEL
-#error "You shouldn't build this"
-#endif
-
namespace llvm {
namespace AMDGPU {
diff --git a/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index f235313e4853..f4776adb069c 100644
--- a/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -13,10 +13,12 @@
//===----------------------------------------------------------------------===//
#include "AMDGPU.h"
+#include "AMDGPUArgumentUsageInfo.h"
#include "AMDGPUISelLowering.h" // For AMDGPUISD
#include "AMDGPUInstrInfo.h"
#include "AMDGPURegisterInfo.h"
#include "AMDGPUSubtarget.h"
+#include "AMDGPUTargetMachine.h"
#include "SIDefines.h"
#include "SIISelLowering.h"
#include "SIInstrInfo.h"
@@ -68,19 +70,30 @@ class AMDGPUDAGToDAGISel : public SelectionDAGISel {
// make the right decision when generating code for different targets.
const AMDGPUSubtarget *Subtarget;
AMDGPUAS AMDGPUASI;
+ bool EnableLateStructurizeCFG;
public:
- explicit AMDGPUDAGToDAGISel(TargetMachine &TM, CodeGenOpt::Level OptLevel)
- : SelectionDAGISel(TM, OptLevel){
- AMDGPUASI = AMDGPU::getAMDGPUAS(TM);
+ explicit AMDGPUDAGToDAGISel(TargetMachine *TM = nullptr,
+ CodeGenOpt::Level OptLevel = CodeGenOpt::Default)
+ : SelectionDAGISel(*TM, OptLevel) {
+ AMDGPUASI = AMDGPU::getAMDGPUAS(*TM);
+ EnableLateStructurizeCFG = AMDGPUTargetMachine::EnableLateStructurizeCFG;
}
~AMDGPUDAGToDAGISel() override = default;
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<AMDGPUArgumentUsageInfo>();
+ SelectionDAGISel::getAnalysisUsage(AU);
+ }
+
bool runOnMachineFunction(MachineFunction &MF) override;
void Select(SDNode *N) override;
StringRef getPassName() const override;
void PostprocessISelDAG() override;
+protected:
+ void SelectBuildVector(SDNode *N, unsigned RegClassID);
+
private:
std::pair<SDValue, SDValue> foldFrameIndex(SDValue N) const;
bool isNoNanSrc(SDValue N) const;
@@ -99,8 +112,8 @@ private:
bool SelectGlobalValueConstantOffset(SDValue Addr, SDValue& IntPtr);
bool SelectGlobalValueVariableOffset(SDValue Addr, SDValue &BaseReg,
SDValue& Offset);
- bool SelectADDRVTX_READ(SDValue Addr, SDValue &Base, SDValue &Offset);
- bool SelectADDRIndirect(SDValue Addr, SDValue &Base, SDValue &Offset);
+ virtual bool SelectADDRVTX_READ(SDValue Addr, SDValue &Base, SDValue &Offset);
+ virtual bool SelectADDRIndirect(SDValue Addr, SDValue &Base, SDValue &Offset);
bool isDSOffsetLegal(const SDValue &Base, unsigned Offset,
unsigned OffsetBits) const;
bool SelectDS1Addr1Offset(SDValue Ptr, SDValue &Base, SDValue &Offset) const;
@@ -116,10 +129,10 @@ private:
bool SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc,
SDValue &VAddr, SDValue &SOffset, SDValue &Offset,
SDValue &SLC) const;
- bool SelectMUBUFScratchOffen(SDNode *Root,
+ bool SelectMUBUFScratchOffen(SDNode *Parent,
SDValue Addr, SDValue &RSrc, SDValue &VAddr,
SDValue &SOffset, SDValue &ImmOffset) const;
- bool SelectMUBUFScratchOffset(SDNode *Root,
+ bool SelectMUBUFScratchOffset(SDNode *Parent,
SDValue Addr, SDValue &SRsrc, SDValue &Soffset,
SDValue &Offset) const;
@@ -140,6 +153,10 @@ private:
bool SelectFlatAtomic(SDValue Addr, SDValue &VAddr,
SDValue &Offset, SDValue &SLC) const;
+ bool SelectFlatAtomicSigned(SDValue Addr, SDValue &VAddr,
+ SDValue &Offset, SDValue &SLC) const;
+
+ template <bool IsSigned>
bool SelectFlatOffset(SDValue Addr, SDValue &VAddr,
SDValue &Offset, SDValue &SLC) const;
@@ -152,10 +169,10 @@ private:
bool SelectSMRDSgpr(SDValue Addr, SDValue &SBase, SDValue &Offset) const;
bool SelectSMRDBufferImm(SDValue Addr, SDValue &Offset) const;
bool SelectSMRDBufferImm32(SDValue Addr, SDValue &Offset) const;
- bool SelectSMRDBufferSgpr(SDValue Addr, SDValue &Offset) const;
bool SelectMOVRELOffset(SDValue Index, SDValue &Base, SDValue &Offset) const;
bool SelectVOP3Mods_NNaN(SDValue In, SDValue &Src, SDValue &SrcMods) const;
+ bool SelectVOP3ModsImpl(SDValue In, SDValue &Src, unsigned &SrcMods) const;
bool SelectVOP3Mods(SDValue In, SDValue &Src, SDValue &SrcMods) const;
bool SelectVOP3NoMods(SDValue In, SDValue &Src) const;
bool SelectVOP3Mods0(SDValue In, SDValue &Src, SDValue &SrcMods,
@@ -174,9 +191,22 @@ private:
bool SelectVOP3PMods0(SDValue In, SDValue &Src, SDValue &SrcMods,
SDValue &Clamp) const;
+ bool SelectVOP3OpSel(SDValue In, SDValue &Src, SDValue &SrcMods) const;
+ bool SelectVOP3OpSel0(SDValue In, SDValue &Src, SDValue &SrcMods,
+ SDValue &Clamp) const;
+
+ bool SelectVOP3OpSelMods(SDValue In, SDValue &Src, SDValue &SrcMods) const;
+ bool SelectVOP3OpSelMods0(SDValue In, SDValue &Src, SDValue &SrcMods,
+ SDValue &Clamp) const;
+ bool SelectVOP3PMadMixModsImpl(SDValue In, SDValue &Src, unsigned &Mods) const;
+ bool SelectVOP3PMadMixMods(SDValue In, SDValue &Src, SDValue &SrcMods) const;
+
+ bool SelectHi16Elt(SDValue In, SDValue &Src) const;
+
void SelectADD_SUB_I64(SDNode *N);
void SelectUADDO_USUBO(SDNode *N);
void SelectDIV_SCALE(SDNode *N);
+ void SelectMAD_64_32(SDNode *N);
void SelectFMA_W_CHAIN(SDNode *N);
void SelectFMUL_W_CHAIN(SDNode *N);
@@ -186,21 +216,49 @@ private:
void SelectS_BFE(SDNode *N);
bool isCBranchSCC(const SDNode *N) const;
void SelectBRCOND(SDNode *N);
+ void SelectFMAD(SDNode *N);
void SelectATOMIC_CMP_SWAP(SDNode *N);
+protected:
// Include the pieces autogenerated from the target description.
#include "AMDGPUGenDAGISel.inc"
};
+class R600DAGToDAGISel : public AMDGPUDAGToDAGISel {
+public:
+ explicit R600DAGToDAGISel(TargetMachine *TM, CodeGenOpt::Level OptLevel) :
+ AMDGPUDAGToDAGISel(TM, OptLevel) {}
+
+ void Select(SDNode *N) override;
+
+ bool SelectADDRIndirect(SDValue Addr, SDValue &Base,
+ SDValue &Offset) override;
+ bool SelectADDRVTX_READ(SDValue Addr, SDValue &Base,
+ SDValue &Offset) override;
+};
+
} // end anonymous namespace
+INITIALIZE_PASS_BEGIN(AMDGPUDAGToDAGISel, "isel",
+ "AMDGPU DAG->DAG Pattern Instruction Selection", false, false)
+INITIALIZE_PASS_DEPENDENCY(AMDGPUArgumentUsageInfo)
+INITIALIZE_PASS_END(AMDGPUDAGToDAGISel, "isel",
+ "AMDGPU DAG->DAG Pattern Instruction Selection", false, false)
+
 /// \brief This pass converts a legalized DAG into an AMDGPU-specific
// DAG, ready for instruction scheduling.
-FunctionPass *llvm::createAMDGPUISelDag(TargetMachine &TM,
+FunctionPass *llvm::createAMDGPUISelDag(TargetMachine *TM,
CodeGenOpt::Level OptLevel) {
return new AMDGPUDAGToDAGISel(TM, OptLevel);
}
+/// \brief This pass converts a legalized DAG into an R600-specific
+// DAG, ready for instruction scheduling.
+FunctionPass *llvm::createR600ISelDag(TargetMachine *TM,
+ CodeGenOpt::Level OptLevel) {
+ return new R600DAGToDAGISel(TM, OptLevel);
+}
+
bool AMDGPUDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) {
Subtarget = &MF.getSubtarget<AMDGPUSubtarget>();
return SelectionDAGISel::runOnMachineFunction(MF);
@@ -279,8 +337,8 @@ const TargetRegisterClass *AMDGPUDAGToDAGISel::getOperandRegClass(SDNode *N,
}
SDNode *AMDGPUDAGToDAGISel::glueCopyToM0(SDNode *N) const {
- if (Subtarget->getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS ||
- cast<MemSDNode>(N)->getAddressSpace() != AMDGPUASI.LOCAL_ADDRESS)
+ if (cast<MemSDNode>(N)->getAddressSpace() != AMDGPUASI.LOCAL_ADDRESS ||
+ !Subtarget->ldsRequiresM0Init())
return N;
const SITargetLowering& Lowering =
@@ -298,9 +356,7 @@ SDNode *AMDGPUDAGToDAGISel::glueCopyToM0(SDNode *N) const {
Ops.push_back(N->getOperand(i));
}
Ops.push_back(Glue);
- CurDAG->MorphNodeTo(N, N->getOpcode(), N->getVTList(), Ops);
-
- return N;
+ return CurDAG->MorphNodeTo(N, N->getOpcode(), N->getVTList(), Ops);
}
static unsigned selectSGPRVectorRegClassID(unsigned NumVectorElts) {
@@ -334,6 +390,58 @@ static bool getConstantValue(SDValue N, uint32_t &Out) {
return false;
}
+void AMDGPUDAGToDAGISel::SelectBuildVector(SDNode *N, unsigned RegClassID) {
+ EVT VT = N->getValueType(0);
+ unsigned NumVectorElts = VT.getVectorNumElements();
+ EVT EltVT = VT.getVectorElementType();
+ const AMDGPURegisterInfo *TRI = Subtarget->getRegisterInfo();
+ SDLoc DL(N);
+ SDValue RegClass = CurDAG->getTargetConstant(RegClassID, DL, MVT::i32);
+
+ if (NumVectorElts == 1) {
+ CurDAG->SelectNodeTo(N, AMDGPU::COPY_TO_REGCLASS, EltVT, N->getOperand(0),
+ RegClass);
+ return;
+ }
+
+ assert(NumVectorElts <= 16 && "Vectors with more than 16 elements not "
+ "supported yet");
+ // 16 = Max Num Vector Elements
+ // 2 = 2 REG_SEQUENCE operands per element (value, subreg index)
+ // 1 = Vector Register Class
+ SmallVector<SDValue, 16 * 2 + 1> RegSeqArgs(NumVectorElts * 2 + 1);
+
+ RegSeqArgs[0] = CurDAG->getTargetConstant(RegClassID, DL, MVT::i32);
+ bool IsRegSeq = true;
+ unsigned NOps = N->getNumOperands();
+ for (unsigned i = 0; i < NOps; i++) {
+ // XXX: Why is this here?
+ if (isa<RegisterSDNode>(N->getOperand(i))) {
+ IsRegSeq = false;
+ break;
+ }
+ RegSeqArgs[1 + (2 * i)] = N->getOperand(i);
+ RegSeqArgs[1 + (2 * i) + 1] =
+ CurDAG->getTargetConstant(TRI->getSubRegFromChannel(i), DL,
+ MVT::i32);
+ }
+ if (NOps != NumVectorElts) {
+ // Fill in the missing undef elements if this was a scalar_to_vector.
+ assert(N->getOpcode() == ISD::SCALAR_TO_VECTOR && NOps < NumVectorElts);
+ MachineSDNode *ImpDef = CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF,
+ DL, EltVT);
+ for (unsigned i = NOps; i < NumVectorElts; ++i) {
+ RegSeqArgs[1 + (2 * i)] = SDValue(ImpDef, 0);
+ RegSeqArgs[1 + (2 * i) + 1] =
+ CurDAG->getTargetConstant(TRI->getSubRegFromChannel(i), DL, MVT::i32);
+ }
+ }
+
+ if (!IsRegSeq)
+ SelectCode(N);
+ CurDAG->SelectNodeTo(N, AMDGPU::REG_SEQUENCE, N->getVTList(), RegSeqArgs);
+}
+
void AMDGPUDAGToDAGISel::Select(SDNode *N) {
unsigned int Opc = N->getOpcode();
if (N->isMachineOpcode()) {
@@ -346,18 +454,16 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) {
N = glueCopyToM0(N);
switch (Opc) {
- default: break;
+ default:
+ break;
// We are selecting i64 ADD here instead of custom lower it during
// DAG legalization, so we can fold some i64 ADDs used for address
// calculation into the LOAD and STORE instructions.
- case ISD::ADD:
case ISD::ADDC:
case ISD::ADDE:
- case ISD::SUB:
case ISD::SUBC:
case ISD::SUBE: {
- if (N->getValueType(0) != MVT::i64 ||
- Subtarget->getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS)
+ if (N->getValueType(0) != MVT::i64)
break;
SelectADD_SUB_I64(N);
@@ -378,13 +484,9 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) {
}
case ISD::SCALAR_TO_VECTOR:
- case AMDGPUISD::BUILD_VERTICAL_VECTOR:
case ISD::BUILD_VECTOR: {
- unsigned RegClassID;
- const AMDGPURegisterInfo *TRI = Subtarget->getRegisterInfo();
EVT VT = N->getValueType(0);
unsigned NumVectorElts = VT.getVectorNumElements();
- EVT EltVT = VT.getVectorElementType();
if (VT == MVT::v2i16 || VT == MVT::v2f16) {
if (Opc == ISD::BUILD_VECTOR) {
@@ -401,81 +503,13 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) {
break;
}
- assert(EltVT.bitsEq(MVT::i32));
-
- if (Subtarget->getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {
- RegClassID = selectSGPRVectorRegClassID(NumVectorElts);
- } else {
- // BUILD_VECTOR was lowered into an IMPLICIT_DEF + 4 INSERT_SUBREG
- // that adds a 128 bits reg copy when going through TwoAddressInstructions
- // pass. We want to avoid 128 bits copies as much as possible because they
- // can't be bundled by our scheduler.
- switch(NumVectorElts) {
- case 2: RegClassID = AMDGPU::R600_Reg64RegClassID; break;
- case 4:
- if (Opc == AMDGPUISD::BUILD_VERTICAL_VECTOR)
- RegClassID = AMDGPU::R600_Reg128VerticalRegClassID;
- else
- RegClassID = AMDGPU::R600_Reg128RegClassID;
- break;
- default: llvm_unreachable("Do not know how to lower this BUILD_VECTOR");
- }
- }
-
- SDLoc DL(N);
- SDValue RegClass = CurDAG->getTargetConstant(RegClassID, DL, MVT::i32);
-
- if (NumVectorElts == 1) {
- CurDAG->SelectNodeTo(N, AMDGPU::COPY_TO_REGCLASS, EltVT, N->getOperand(0),
- RegClass);
- return;
- }
-
- assert(NumVectorElts <= 16 && "Vectors with more than 16 elements not "
- "supported yet");
- // 16 = Max Num Vector Elements
- // 2 = 2 REG_SEQUENCE operands per element (value, subreg index)
- // 1 = Vector Register Class
- SmallVector<SDValue, 16 * 2 + 1> RegSeqArgs(NumVectorElts * 2 + 1);
-
- RegSeqArgs[0] = CurDAG->getTargetConstant(RegClassID, DL, MVT::i32);
- bool IsRegSeq = true;
- unsigned NOps = N->getNumOperands();
- for (unsigned i = 0; i < NOps; i++) {
- // XXX: Why is this here?
- if (isa<RegisterSDNode>(N->getOperand(i))) {
- IsRegSeq = false;
- break;
- }
- RegSeqArgs[1 + (2 * i)] = N->getOperand(i);
- RegSeqArgs[1 + (2 * i) + 1] =
- CurDAG->getTargetConstant(TRI->getSubRegFromChannel(i), DL,
- MVT::i32);
- }
-
- if (NOps != NumVectorElts) {
- // Fill in the missing undef elements if this was a scalar_to_vector.
- assert(Opc == ISD::SCALAR_TO_VECTOR && NOps < NumVectorElts);
-
- MachineSDNode *ImpDef = CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF,
- DL, EltVT);
- for (unsigned i = NOps; i < NumVectorElts; ++i) {
- RegSeqArgs[1 + (2 * i)] = SDValue(ImpDef, 0);
- RegSeqArgs[1 + (2 * i) + 1] =
- CurDAG->getTargetConstant(TRI->getSubRegFromChannel(i), DL, MVT::i32);
- }
- }
-
- if (!IsRegSeq)
- break;
- CurDAG->SelectNodeTo(N, AMDGPU::REG_SEQUENCE, N->getVTList(), RegSeqArgs);
+ assert(VT.getVectorElementType().bitsEq(MVT::i32));
+ unsigned RegClassID = selectSGPRVectorRegClassID(NumVectorElts);
+ SelectBuildVector(N, RegClassID);
return;
}
case ISD::BUILD_PAIR: {
SDValue RC, SubReg0, SubReg1;
- if (Subtarget->getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
- break;
- }
SDLoc DL(N);
if (N->getValueType(0) == MVT::i128) {
RC = CurDAG->getTargetConstant(AMDGPU::SReg_128RegClassID, DL, MVT::i32);
@@ -497,8 +531,7 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) {
case ISD::Constant:
case ISD::ConstantFP: {
- if (Subtarget->getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS ||
- N->getValueType(0).getSizeInBits() != 64 || isInlineImmediate(N))
+ if (N->getValueType(0).getSizeInBits() != 64 || isInlineImmediate(N))
break;
uint64_t Imm;
@@ -533,9 +566,6 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) {
case AMDGPUISD::BFE_I32:
case AMDGPUISD::BFE_U32: {
- if (Subtarget->getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS)
- break;
-
// There is a scalar version available, but unlike the vector version which
// has a separate operand for the offset and width, the scalar version packs
// the width and offset into a single operand. Try to move to the scalar
@@ -565,6 +595,11 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) {
SelectDIV_SCALE(N);
return;
}
+ case AMDGPUISD::MAD_I64_I32:
+ case AMDGPUISD::MAD_U64_U32: {
+ SelectMAD_64_32(N);
+ return;
+ }
case ISD::CopyToReg: {
const SITargetLowering& Lowering =
*static_cast<const SITargetLowering*>(getTargetLowering());
@@ -575,8 +610,7 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) {
case ISD::SRL:
case ISD::SRA:
case ISD::SIGN_EXTEND_INREG:
- if (N->getValueType(0) != MVT::i32 ||
- Subtarget->getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS)
+ if (N->getValueType(0) != MVT::i32)
break;
SelectS_BFE(N);
@@ -584,7 +618,9 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) {
case ISD::BRCOND:
SelectBRCOND(N);
return;
-
+ case ISD::FMAD:
+ SelectFMAD(N);
+ return;
case AMDGPUISD::ATOMIC_CMP_SWAP:
SelectATOMIC_CMP_SWAP(N);
return;
@@ -638,32 +674,8 @@ bool AMDGPUDAGToDAGISel::SelectGlobalValueVariableOffset(SDValue Addr,
}
bool AMDGPUDAGToDAGISel::SelectADDRVTX_READ(SDValue Addr, SDValue &Base,
- SDValue &Offset) {
- ConstantSDNode *IMMOffset;
-
- if (Addr.getOpcode() == ISD::ADD
- && (IMMOffset = dyn_cast<ConstantSDNode>(Addr.getOperand(1)))
- && isInt<16>(IMMOffset->getZExtValue())) {
-
- Base = Addr.getOperand(0);
- Offset = CurDAG->getTargetConstant(IMMOffset->getZExtValue(), SDLoc(Addr),
- MVT::i32);
- return true;
- // If the pointer address is constant, we can move it to the offset field.
- } else if ((IMMOffset = dyn_cast<ConstantSDNode>(Addr))
- && isInt<16>(IMMOffset->getZExtValue())) {
- Base = CurDAG->getCopyFromReg(CurDAG->getEntryNode(),
- SDLoc(CurDAG->getEntryNode()),
- AMDGPU::ZERO, MVT::i32);
- Offset = CurDAG->getTargetConstant(IMMOffset->getZExtValue(), SDLoc(Addr),
- MVT::i32);
- return true;
- }
-
- // Default case, no offset
- Base = Addr;
- Offset = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i32);
- return true;
+ SDValue &Offset) {
+ return false;
}
bool AMDGPUDAGToDAGISel::SelectADDRIndirect(SDValue Addr, SDValue &Base,
@@ -690,6 +702,7 @@ bool AMDGPUDAGToDAGISel::SelectADDRIndirect(SDValue Addr, SDValue &Base,
return true;
}
+// FIXME: Should only handle addcarry/subcarry
void AMDGPUDAGToDAGISel::SelectADD_SUB_I64(SDNode *N) {
SDLoc DL(N);
SDValue LHS = N->getOperand(0);
@@ -699,8 +712,7 @@ void AMDGPUDAGToDAGISel::SelectADD_SUB_I64(SDNode *N) {
bool ConsumeCarry = (Opcode == ISD::ADDE || Opcode == ISD::SUBE);
bool ProduceCarry =
ConsumeCarry || Opcode == ISD::ADDC || Opcode == ISD::SUBC;
- bool IsAdd =
- (Opcode == ISD::ADD || Opcode == ISD::ADDC || Opcode == ISD::ADDE);
+ bool IsAdd = Opcode == ISD::ADD || Opcode == ISD::ADDC || Opcode == ISD::ADDE;
SDValue Sub0 = CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32);
SDValue Sub1 = CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32);
@@ -782,7 +794,7 @@ void AMDGPUDAGToDAGISel::SelectFMA_W_CHAIN(SDNode *N) {
void AMDGPUDAGToDAGISel::SelectFMUL_W_CHAIN(SDNode *N) {
SDLoc SL(N);
- // src0_modifiers, src0, src1_modifiers, src1, clamp, omod
+ // src0_modifiers, src0, src1_modifiers, src1, clamp, omod
SDValue Ops[8];
SelectVOP3Mods0(N->getOperand(1), Ops[1], Ops[0], Ops[4], Ops[5]);
@@ -808,6 +820,19 @@ void AMDGPUDAGToDAGISel::SelectDIV_SCALE(SDNode *N) {
CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
}
+// We need to handle this here because tablegen doesn't support matching
+// instructions with multiple outputs.
+void AMDGPUDAGToDAGISel::SelectMAD_64_32(SDNode *N) {
+ SDLoc SL(N);
+ bool Signed = N->getOpcode() == AMDGPUISD::MAD_I64_I32;
+ unsigned Opc = Signed ? AMDGPU::V_MAD_I64_I32 : AMDGPU::V_MAD_U64_U32;
+
+ SDValue Clamp = CurDAG->getTargetConstant(0, SL, MVT::i1);
+ SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2),
+ Clamp };
+ CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
+}
+
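
For context, MAD_I64_I32 / MAD_U64_U32 model a 32 x 32 -> 64 multiply-add: the two 32-bit sources are multiplied into a 64-bit product and the 64-bit addend is added; the clamp operand is simply tied to 0 here. A plain C++ statement of that arithmetic (illustrative sketch, ignoring the instruction's carry-out):

#include <cstdint>

// Semantics of the signed form: d = (int64_t)a * b + c.
int64_t mad_i64_i32(int32_t a, int32_t b, int64_t c) {
  return static_cast<int64_t>(a) * b + c;
}

// Semantics of the unsigned form: d = (uint64_t)a * b + c.
uint64_t mad_u64_u32(uint32_t a, uint32_t b, uint64_t c) {
  return static_cast<uint64_t>(a) * b + c;
}
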
bool AMDGPUDAGToDAGISel::isDSOffsetLegal(const SDValue &Base, unsigned Offset,
unsigned OffsetBits) const {
if ((OffsetBits == 16 && !isUInt<16>(Offset)) ||
@@ -850,8 +875,12 @@ bool AMDGPUDAGToDAGISel::SelectDS1Addr1Offset(SDValue Addr, SDValue &Base,
Zero, Addr.getOperand(1));
if (isDSOffsetLegal(Sub, ByteOffset, 16)) {
+ // FIXME: Select to VOP3 version for with-carry.
+ unsigned SubOp = Subtarget->hasAddNoCarry() ?
+ AMDGPU::V_SUB_U32_e64 : AMDGPU::V_SUB_I32_e32;
+
MachineSDNode *MachineSub
- = CurDAG->getMachineNode(AMDGPU::V_SUB_I32_e32, DL, MVT::i32,
+ = CurDAG->getMachineNode(SubOp, DL, MVT::i32,
Zero, Addr.getOperand(1));
Base = SDValue(MachineSub, 0);
@@ -920,8 +949,11 @@ bool AMDGPUDAGToDAGISel::SelectDS64Bit4ByteAligned(SDValue Addr, SDValue &Base,
Zero, Addr.getOperand(1));
if (isDSOffsetLegal(Sub, DWordOffset1, 8)) {
+ unsigned SubOp = Subtarget->hasAddNoCarry() ?
+ AMDGPU::V_SUB_U32_e64 : AMDGPU::V_SUB_I32_e32;
+
MachineSDNode *MachineSub
- = CurDAG->getMachineNode(AMDGPU::V_SUB_I32_e32, DL, MVT::i32,
+ = CurDAG->getMachineNode(SubOp, DL, MVT::i32,
Zero, Addr.getOperand(1));
Base = SDValue(MachineSub, 0);
@@ -958,14 +990,6 @@ bool AMDGPUDAGToDAGISel::SelectDS64Bit4ByteAligned(SDValue Addr, SDValue &Base,
return true;
}
-static bool isLegalMUBUFImmOffset(unsigned Imm) {
- return isUInt<12>(Imm);
-}
-
-static bool isLegalMUBUFImmOffset(const ConstantSDNode *Imm) {
- return isLegalMUBUFImmOffset(Imm->getZExtValue());
-}
-
bool AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr,
SDValue &VAddr, SDValue &SOffset,
SDValue &Offset, SDValue &Offen,
@@ -1007,7 +1031,7 @@ bool AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr,
Ptr = N0;
}
- if (isLegalMUBUFImmOffset(C1)) {
+ if (SIInstrInfo::isLegalMUBUFImmOffset(C1->getZExtValue())) {
Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i16);
return true;
}
@@ -1104,7 +1128,7 @@ std::pair<SDValue, SDValue> AMDGPUDAGToDAGISel::foldFrameIndex(SDValue N) const
MVT::i32));
}
-bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffen(SDNode *Root,
+bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffen(SDNode *Parent,
SDValue Addr, SDValue &Rsrc,
SDValue &VAddr, SDValue &SOffset,
SDValue &ImmOffset) const {
@@ -1117,8 +1141,6 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffen(SDNode *Root,
if (ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr)) {
unsigned Imm = CAddr->getZExtValue();
- assert(!isLegalMUBUFImmOffset(Imm) &&
- "should have been selected by other pattern");
SDValue HighBits = CurDAG->getTargetConstant(Imm & ~4095, DL, MVT::i32);
MachineSDNode *MovHighBits = CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32,
@@ -1127,7 +1149,7 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffen(SDNode *Root,
// In a call sequence, stores to the argument stack area are relative to the
// stack pointer.
- const MachinePointerInfo &PtrInfo = cast<MemSDNode>(Root)->getPointerInfo();
+ const MachinePointerInfo &PtrInfo = cast<MemSDNode>(Parent)->getPointerInfo();
unsigned SOffsetReg = isStackPtrRelative(PtrInfo) ?
Info->getStackPtrOffsetReg() : Info->getScratchWaveOffsetReg();
@@ -1142,9 +1164,25 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffen(SDNode *Root,
SDValue N0 = Addr.getOperand(0);
SDValue N1 = Addr.getOperand(1);
- // Offsets in vaddr must be positive.
+ // Offsets in vaddr must be positive if range checking is enabled.
+ //
+ // The total computation of vaddr + soffset + offset must not overflow. If
+ // vaddr is negative, even if offset is 0 the sgpr offset add will end up
+ // overflowing.
+ //
+ // Prior to gfx9, MUBUF instructions with the vaddr offset enabled would
+ // always perform a range check. If a negative vaddr base index was used,
+ // this would fail the range check. The overall address computation would
+ // compute a valid address, but this doesn't happen due to the range
+ // check. For out-of-bounds MUBUF loads, a 0 is returned.
+ //
+ // Therefore it should be safe to fold any VGPR offset on gfx9 into the
+ // MUBUF vaddr, but not on older subtargets which can only do this if the
+ // sign bit is known 0.
ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
- if (isLegalMUBUFImmOffset(C1)) {
+ if (SIInstrInfo::isLegalMUBUFImmOffset(C1->getZExtValue()) &&
+ (!Subtarget->privateMemoryResourceIsRangeChecked() ||
+ CurDAG->SignBitIsZero(N0))) {
std::tie(VAddr, SOffset) = foldFrameIndex(N0);
ImmOffset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i16);
return true;
@@ -1157,13 +1195,13 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffen(SDNode *Root,
return true;
}
-bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffset(SDNode *Root,
+bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffset(SDNode *Parent,
SDValue Addr,
SDValue &SRsrc,
SDValue &SOffset,
SDValue &Offset) const {
ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr);
- if (!CAddr || !isLegalMUBUFImmOffset(CAddr))
+ if (!CAddr || !SIInstrInfo::isLegalMUBUFImmOffset(CAddr->getZExtValue()))
return false;
SDLoc DL(Addr);
@@ -1172,7 +1210,7 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffset(SDNode *Root,
SRsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32);
- const MachinePointerInfo &PtrInfo = cast<MemSDNode>(Root)->getPointerInfo();
+ const MachinePointerInfo &PtrInfo = cast<MemSDNode>(Parent)->getPointerInfo();
unsigned SOffsetReg = isStackPtrRelative(PtrInfo) ?
Info->getStackPtrOffsetReg() : Info->getScratchWaveOffsetReg();
@@ -1231,24 +1269,30 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFConstant(SDValue Constant,
SDValue &SOffset,
SDValue &ImmOffset) const {
SDLoc DL(Constant);
+ const uint32_t Align = 4;
+ const uint32_t MaxImm = alignDown(4095, Align);
uint32_t Imm = cast<ConstantSDNode>(Constant)->getZExtValue();
uint32_t Overflow = 0;
- if (Imm >= 4096) {
- if (Imm <= 4095 + 64) {
- // Use an SOffset inline constant for 1..64
- Overflow = Imm - 4095;
- Imm = 4095;
+ if (Imm > MaxImm) {
+ if (Imm <= MaxImm + 64) {
+ // Use an SOffset inline constant for 4..64
+ Overflow = Imm - MaxImm;
+ Imm = MaxImm;
} else {
// Try to keep the same value in SOffset for adjacent loads, so that
// the corresponding register contents can be re-used.
//
- // Load values with all low-bits set into SOffset, so that a larger
- // range of values can be covered using s_movk_i32
- uint32_t High = (Imm + 1) & ~4095;
- uint32_t Low = (Imm + 1) & 4095;
+ // Load values with all low-bits (except for alignment bits) set into
+ // SOffset, so that a larger range of values can be covered using
+ // s_movk_i32.
+ //
+ // Atomic operations fail to work correctly when individual address
+ // components are unaligned, even if their sum is aligned.
+ uint32_t High = (Imm + Align) & ~4095;
+ uint32_t Low = (Imm + Align) & 4095;
Imm = Low;
- Overflow = High - 1;
+ Overflow = High - Align;
}
}
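
Worked through on a concrete value: with Align = 4 the immediate field tops out at MaxImm = 4092, so a constant offset of 5000 does not fit and splits into Imm = 908 and SOffset = 4092; with an aligned starting offset both halves stay 4-byte aligned and they sum back to 5000. A compilable sketch mirroring the split above (the helper name is made up):

#include <cassert>
#include <cstdint>
#include <utility>

// Split a buffer offset into (Imm, SOffset). When the incoming offset is 4-byte
// aligned, both parts stay aligned, so atomics never see a misaligned component.
static std::pair<uint32_t, uint32_t> splitMUBUFOffset(uint32_t Imm) {
  const uint32_t Align = 4;
  const uint32_t MaxImm = 4095 & ~(Align - 1);  // alignDown(4095, 4) == 4092
  uint32_t Overflow = 0;
  if (Imm > MaxImm) {
    if (Imm <= MaxImm + 64) {
      Overflow = Imm - MaxImm;     // small spill fits an inline SOffset constant
      Imm = MaxImm;
    } else {
      uint32_t High = (Imm + Align) & ~4095u;
      uint32_t Low = (Imm + Align) & 4095u;
      Imm = Low;
      Overflow = High - Align;     // SOffset keeps its low bits set for s_movk_i32 reuse
    }
  }
  return {Imm, Overflow};
}

int main() {
  auto R = splitMUBUFOffset(5000);
  assert(R.first == 908 && R.second == 4092 && R.first + R.second == 5000);
  return 0;
}
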
@@ -1316,6 +1360,7 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFIntrinsicVOffset(SDValue Offset,
return true;
}
+template <bool IsSigned>
bool AMDGPUDAGToDAGISel::SelectFlatOffset(SDValue Addr,
SDValue &VAddr,
SDValue &Offset,
@@ -1326,8 +1371,10 @@ bool AMDGPUDAGToDAGISel::SelectFlatOffset(SDValue Addr,
CurDAG->isBaseWithConstantOffset(Addr)) {
SDValue N0 = Addr.getOperand(0);
SDValue N1 = Addr.getOperand(1);
- uint64_t COffsetVal = cast<ConstantSDNode>(N1)->getZExtValue();
- if (isUInt<12>(COffsetVal)) {
+ int64_t COffsetVal = cast<ConstantSDNode>(N1)->getSExtValue();
+
+ if ((IsSigned && isInt<13>(COffsetVal)) ||
+ (!IsSigned && isUInt<12>(COffsetVal))) {
Addr = N0;
OffsetVal = COffsetVal;
}
@@ -1344,7 +1391,14 @@ bool AMDGPUDAGToDAGISel::SelectFlatAtomic(SDValue Addr,
SDValue &VAddr,
SDValue &Offset,
SDValue &SLC) const {
- return SelectFlatOffset(Addr, VAddr, Offset, SLC);
+ return SelectFlatOffset<false>(Addr, VAddr, Offset, SLC);
+}
+
+bool AMDGPUDAGToDAGISel::SelectFlatAtomicSigned(SDValue Addr,
+ SDValue &VAddr,
+ SDValue &Offset,
+ SDValue &SLC) const {
+ return SelectFlatOffset<true>(Addr, VAddr, Offset, SLC);
}
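
The two entry points differ only in the legal immediate range: the signed form accepts a 13-bit signed offset (isInt<13>), the unsigned form a 12-bit unsigned one (isUInt<12>). A minimal standalone check mirroring that test (the helper name is made up):

#include <cstdint>

// True if Off fits the FLAT offset field for the given signedness.
bool isLegalFlatOffset(int64_t Off, bool IsSigned) {
  if (IsSigned)
    return Off >= -4096 && Off <= 4095;   // isInt<13>
  return Off >= 0 && Off <= 4095;         // isUInt<12>
}
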
bool AMDGPUDAGToDAGISel::SelectSMRDOffset(SDValue ByteOffsetNode,
@@ -1443,13 +1497,6 @@ bool AMDGPUDAGToDAGISel::SelectSMRDBufferImm32(SDValue Addr,
return !Imm && isa<ConstantSDNode>(Offset);
}
-bool AMDGPUDAGToDAGISel::SelectSMRDBufferSgpr(SDValue Addr,
- SDValue &Offset) const {
- bool Imm;
- return SelectSMRDOffset(Addr, Offset, Imm) && !Imm &&
- !isa<ConstantSDNode>(Offset);
-}
-
bool AMDGPUDAGToDAGISel::SelectMOVRELOffset(SDValue Index,
SDValue &Base,
SDValue &Offset) const {
@@ -1622,18 +1669,55 @@ void AMDGPUDAGToDAGISel::SelectBRCOND(SDNode *N) {
return;
}
- if (isCBranchSCC(N)) {
- // This brcond will use S_CBRANCH_SCC*, so let tablegen handle it.
+ bool UseSCCBr = isCBranchSCC(N) && isUniformBr(N);
+ unsigned BrOp = UseSCCBr ? AMDGPU::S_CBRANCH_SCC1 : AMDGPU::S_CBRANCH_VCCNZ;
+ unsigned CondReg = UseSCCBr ? AMDGPU::SCC : AMDGPU::VCC;
+ SDLoc SL(N);
+
+ SDValue VCC = CurDAG->getCopyToReg(N->getOperand(0), SL, CondReg, Cond);
+ CurDAG->SelectNodeTo(N, BrOp, MVT::Other,
+ N->getOperand(2), // Basic Block
+ VCC.getValue(0));
+}
+
+void AMDGPUDAGToDAGISel::SelectFMAD(SDNode *N) {
+ MVT VT = N->getSimpleValueType(0);
+ if (VT != MVT::f32 || !Subtarget->hasMadMixInsts()) {
SelectCode(N);
return;
}
- SDLoc SL(N);
+ SDValue Src0 = N->getOperand(0);
+ SDValue Src1 = N->getOperand(1);
+ SDValue Src2 = N->getOperand(2);
+ unsigned Src0Mods, Src1Mods, Src2Mods;
+
+ // Avoid using v_mad_mix_f32 unless there is actually an operand using the
+ // conversion from f16.
+ bool Sel0 = SelectVOP3PMadMixModsImpl(Src0, Src0, Src0Mods);
+ bool Sel1 = SelectVOP3PMadMixModsImpl(Src1, Src1, Src1Mods);
+ bool Sel2 = SelectVOP3PMadMixModsImpl(Src2, Src2, Src2Mods);
+
+ assert(!Subtarget->hasFP32Denormals() &&
+ "fmad selected with denormals enabled");
+ // TODO: We can select this with f32 denormals enabled if all the sources are
+ // converted from f16 (in which case fmad isn't legal).
+
+ if (Sel0 || Sel1 || Sel2) {
+ // For dummy operands.
+ SDValue Zero = CurDAG->getTargetConstant(0, SDLoc(), MVT::i32);
+ SDValue Ops[] = {
+ CurDAG->getTargetConstant(Src0Mods, SDLoc(), MVT::i32), Src0,
+ CurDAG->getTargetConstant(Src1Mods, SDLoc(), MVT::i32), Src1,
+ CurDAG->getTargetConstant(Src2Mods, SDLoc(), MVT::i32), Src2,
+ CurDAG->getTargetConstant(0, SDLoc(), MVT::i1),
+ Zero, Zero
+ };
- SDValue VCC = CurDAG->getCopyToReg(N->getOperand(0), SL, AMDGPU::VCC, Cond);
- CurDAG->SelectNodeTo(N, AMDGPU::S_CBRANCH_VCCNZ, MVT::Other,
- N->getOperand(2), // Basic Block
- VCC.getValue(0));
+ CurDAG->SelectNodeTo(N, AMDGPU::V_MAD_MIX_F32, MVT::f32, Ops);
+ } else {
+ SelectCode(N);
+ }
}
// This is here because there isn't a way to use the generated sub0_sub1 as the
@@ -1652,11 +1736,11 @@ void AMDGPUDAGToDAGISel::SelectATOMIC_CMP_SWAP(SDNode *N) {
MachineSDNode *CmpSwap = nullptr;
if (Subtarget->hasAddr64()) {
- SDValue SRsrc, VAddr, SOffset, Offset, GLC, SLC;
+ SDValue SRsrc, VAddr, SOffset, Offset, SLC;
if (SelectMUBUFAddr64(Mem->getBasePtr(), SRsrc, VAddr, SOffset, Offset, SLC)) {
- unsigned Opcode = Is32 ? AMDGPU::BUFFER_ATOMIC_CMPSWAP_RTN_ADDR64 :
- AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_RTN_ADDR64;
+ unsigned Opcode = Is32 ? AMDGPU::BUFFER_ATOMIC_CMPSWAP_ADDR64_RTN :
+ AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_ADDR64_RTN;
SDValue CmpVal = Mem->getOperand(2);
// XXX - Do we care about glue operands?
@@ -1672,8 +1756,8 @@ void AMDGPUDAGToDAGISel::SelectATOMIC_CMP_SWAP(SDNode *N) {
if (!CmpSwap) {
SDValue SRsrc, SOffset, Offset, SLC;
if (SelectMUBUFOffset(Mem->getBasePtr(), SRsrc, SOffset, Offset, SLC)) {
- unsigned Opcode = Is32 ? AMDGPU::BUFFER_ATOMIC_CMPSWAP_RTN_OFFSET :
- AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_RTN_OFFSET;
+ unsigned Opcode = Is32 ? AMDGPU::BUFFER_ATOMIC_CMPSWAP_OFFSET_RTN :
+ AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_OFFSET_RTN;
SDValue CmpVal = Mem->getOperand(2);
SDValue Ops[] = {
@@ -1702,9 +1786,9 @@ void AMDGPUDAGToDAGISel::SelectATOMIC_CMP_SWAP(SDNode *N) {
CurDAG->RemoveDeadNode(N);
}
-bool AMDGPUDAGToDAGISel::SelectVOP3Mods(SDValue In, SDValue &Src,
- SDValue &SrcMods) const {
- unsigned Mods = 0;
+bool AMDGPUDAGToDAGISel::SelectVOP3ModsImpl(SDValue In, SDValue &Src,
+ unsigned &Mods) const {
+ Mods = 0;
Src = In;
if (Src.getOpcode() == ISD::FNEG) {
@@ -1717,10 +1801,20 @@ bool AMDGPUDAGToDAGISel::SelectVOP3Mods(SDValue In, SDValue &Src,
Src = Src.getOperand(0);
}
- SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
return true;
}
+bool AMDGPUDAGToDAGISel::SelectVOP3Mods(SDValue In, SDValue &Src,
+ SDValue &SrcMods) const {
+ unsigned Mods;
+ if (SelectVOP3ModsImpl(In, Src, Mods)) {
+ SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
+ return true;
+ }
+
+ return false;
+}
+
bool AMDGPUDAGToDAGISel::SelectVOP3Mods_NNaN(SDValue In, SDValue &Src,
SDValue &SrcMods) const {
SelectVOP3Mods(In, Src, SrcMods);
@@ -1864,24 +1958,234 @@ bool AMDGPUDAGToDAGISel::SelectVOP3PMods0(SDValue In, SDValue &Src,
return SelectVOP3PMods(In, Src, SrcMods);
}
+bool AMDGPUDAGToDAGISel::SelectVOP3OpSel(SDValue In, SDValue &Src,
+ SDValue &SrcMods) const {
+ Src = In;
+ // FIXME: Handle op_sel
+ SrcMods = CurDAG->getTargetConstant(0, SDLoc(In), MVT::i32);
+ return true;
+}
+
+bool AMDGPUDAGToDAGISel::SelectVOP3OpSel0(SDValue In, SDValue &Src,
+ SDValue &SrcMods,
+ SDValue &Clamp) const {
+ SDLoc SL(In);
+
+ // FIXME: Handle clamp
+ Clamp = CurDAG->getTargetConstant(0, SL, MVT::i32);
+
+ return SelectVOP3OpSel(In, Src, SrcMods);
+}
+
+bool AMDGPUDAGToDAGISel::SelectVOP3OpSelMods(SDValue In, SDValue &Src,
+ SDValue &SrcMods) const {
+ // FIXME: Handle op_sel
+ return SelectVOP3Mods(In, Src, SrcMods);
+}
+
+bool AMDGPUDAGToDAGISel::SelectVOP3OpSelMods0(SDValue In, SDValue &Src,
+ SDValue &SrcMods,
+ SDValue &Clamp) const {
+ SDLoc SL(In);
+
+ // FIXME: Handle clamp
+ Clamp = CurDAG->getTargetConstant(0, SL, MVT::i32);
+
+ return SelectVOP3OpSelMods(In, Src, SrcMods);
+}
+
+// The return value is not whether the match is possible (which it always is),
+// but whether or not a conversion is really used.
+bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixModsImpl(SDValue In, SDValue &Src,
+ unsigned &Mods) const {
+ Mods = 0;
+ SelectVOP3ModsImpl(In, Src, Mods);
+
+ if (Src.getOpcode() == ISD::FP_EXTEND) {
+ Src = Src.getOperand(0);
+ assert(Src.getValueType() == MVT::f16);
+ Src = stripBitcast(Src);
+
+ // Be careful about folding modifiers if we already have an abs. fneg is
+ // applied last, so we don't want to apply an earlier fneg.
+ if ((Mods & SISrcMods::ABS) == 0) {
+ unsigned ModsTmp;
+ SelectVOP3ModsImpl(Src, Src, ModsTmp);
+
+ if ((ModsTmp & SISrcMods::NEG) != 0)
+ Mods ^= SISrcMods::NEG;
+
+ if ((ModsTmp & SISrcMods::ABS) != 0)
+ Mods |= SISrcMods::ABS;
+ }
+
+ // op_sel/op_sel_hi decide the source type and source.
+ // If the source's op_sel_hi is set, it indicates to do a conversion from fp16.
+  // If the source's op_sel is set, it picks the high half of the source
+ // register.
+
+ Mods |= SISrcMods::OP_SEL_1;
+ if (isExtractHiElt(Src, Src)) {
+ Mods |= SISrcMods::OP_SEL_0;
+
+ // TODO: Should we try to look for neg/abs here?
+ }
+
+ return true;
+ }
+
+ return false;
+}
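A minimal sketch of the modifier-folding rule spelled out in the comments above, assuming only the usual meaning of the NEG/ABS source-modifier flags (the enum values below are placeholders, not the real SISrcMods constants):

#include <cassert>

enum : unsigned { NEG = 1u << 0, ABS = 1u << 1 }; // placeholder flag values

// Fold an inner fneg/fabs into the outer modifiers. fneg is applied last, so
// we refuse to fold through an existing abs; otherwise an inner NEG toggles
// the outer NEG and an inner ABS sets ABS.
unsigned foldInnerMods(unsigned Outer, unsigned Inner) {
  if (Outer & ABS)
    return Outer;                       // don't fold past an existing abs
  if (Inner & NEG)
    Outer ^= NEG;
  if (Inner & ABS)
    Outer |= ABS;
  return Outer;
}

int main() {
  assert(foldInnerMods(NEG, NEG) == 0);               // -(-x) == x
  assert(foldInnerMods(0, NEG | ABS) == (NEG | ABS)); // -|x| keeps both flags
  assert(foldInnerMods(ABS, NEG) == ABS);             // |-x| == |x|
}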
+
+bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixMods(SDValue In, SDValue &Src,
+ SDValue &SrcMods) const {
+ unsigned Mods = 0;
+ SelectVOP3PMadMixModsImpl(In, Src, Mods);
+ SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
+ return true;
+}
+
+// TODO: Can we identify things like v_mad_mixhi_f16?
+bool AMDGPUDAGToDAGISel::SelectHi16Elt(SDValue In, SDValue &Src) const {
+ if (In.isUndef()) {
+ Src = In;
+ return true;
+ }
+
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(In)) {
+ SDLoc SL(In);
+ SDValue K = CurDAG->getTargetConstant(C->getZExtValue() << 16, SL, MVT::i32);
+ MachineSDNode *MovK = CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32,
+ SL, MVT::i32, K);
+ Src = SDValue(MovK, 0);
+ return true;
+ }
+
+ if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(In)) {
+ SDLoc SL(In);
+ SDValue K = CurDAG->getTargetConstant(
+ C->getValueAPF().bitcastToAPInt().getZExtValue() << 16, SL, MVT::i32);
+ MachineSDNode *MovK = CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32,
+ SL, MVT::i32, K);
+ Src = SDValue(MovK, 0);
+ return true;
+ }
+
+ return isExtractHiElt(In, Src);
+}
+
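For the two constant cases above, materializing the value already shifted into the high 16 bits is plain integer packing; a small standalone illustration (not the ISel code itself), using the half-precision bit pattern of 1.0 as the payload:

#include <cassert>
#include <cstdint>

int main() {
  // A 16-bit payload destined for the high half of a 32-bit register is
  // shifted left by 16; the low half stays zero here.
  uint16_t HalfOne = 0x3C00;                 // IEEE half-precision 1.0
  uint32_t Packed = uint32_t(HalfOne) << 16; // what the V_MOV_B32 immediate holds
  assert(Packed == 0x3C000000u);
  assert(uint16_t(Packed >> 16) == HalfOne); // the high half extracts back out
}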
void AMDGPUDAGToDAGISel::PostprocessISelDAG() {
const AMDGPUTargetLowering& Lowering =
*static_cast<const AMDGPUTargetLowering*>(getTargetLowering());
bool IsModified = false;
do {
IsModified = false;
+
// Go over all selected nodes and try to fold them a bit more
- for (SDNode &Node : CurDAG->allnodes()) {
- MachineSDNode *MachineNode = dyn_cast<MachineSDNode>(&Node);
+ SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_begin();
+ while (Position != CurDAG->allnodes_end()) {
+ SDNode *Node = &*Position++;
+ MachineSDNode *MachineNode = dyn_cast<MachineSDNode>(Node);
if (!MachineNode)
continue;
SDNode *ResNode = Lowering.PostISelFolding(MachineNode, *CurDAG);
- if (ResNode != &Node) {
- ReplaceUses(&Node, ResNode);
+ if (ResNode != Node) {
+ if (ResNode)
+ ReplaceUses(Node, ResNode);
IsModified = true;
}
}
CurDAG->RemoveDeadNodes();
} while (IsModified);
}
+
+void R600DAGToDAGISel::Select(SDNode *N) {
+ unsigned int Opc = N->getOpcode();
+ if (N->isMachineOpcode()) {
+ N->setNodeId(-1);
+ return; // Already selected.
+ }
+
+ switch (Opc) {
+ default: break;
+ case AMDGPUISD::BUILD_VERTICAL_VECTOR:
+ case ISD::SCALAR_TO_VECTOR:
+ case ISD::BUILD_VECTOR: {
+ EVT VT = N->getValueType(0);
+ unsigned NumVectorElts = VT.getVectorNumElements();
+ unsigned RegClassID;
+    // BUILD_VECTOR was lowered into an IMPLICIT_DEF + 4 INSERT_SUBREG
+    // that adds a 128-bit reg copy when going through the
+    // TwoAddressInstructions pass. We want to avoid 128-bit copies as much as
+    // possible because they can't be bundled by our scheduler.
+ switch(NumVectorElts) {
+ case 2: RegClassID = AMDGPU::R600_Reg64RegClassID; break;
+ case 4:
+ if (Opc == AMDGPUISD::BUILD_VERTICAL_VECTOR)
+ RegClassID = AMDGPU::R600_Reg128VerticalRegClassID;
+ else
+ RegClassID = AMDGPU::R600_Reg128RegClassID;
+ break;
+ default: llvm_unreachable("Do not know how to lower this BUILD_VECTOR");
+ }
+ SelectBuildVector(N, RegClassID);
+ return;
+ }
+ }
+
+ SelectCode(N);
+}
+
+bool R600DAGToDAGISel::SelectADDRIndirect(SDValue Addr, SDValue &Base,
+ SDValue &Offset) {
+ ConstantSDNode *C;
+ SDLoc DL(Addr);
+
+ if ((C = dyn_cast<ConstantSDNode>(Addr))) {
+ Base = CurDAG->getRegister(AMDGPU::INDIRECT_BASE_ADDR, MVT::i32);
+ Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32);
+ } else if ((Addr.getOpcode() == AMDGPUISD::DWORDADDR) &&
+ (C = dyn_cast<ConstantSDNode>(Addr.getOperand(0)))) {
+ Base = CurDAG->getRegister(AMDGPU::INDIRECT_BASE_ADDR, MVT::i32);
+ Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32);
+ } else if ((Addr.getOpcode() == ISD::ADD || Addr.getOpcode() == ISD::OR) &&
+ (C = dyn_cast<ConstantSDNode>(Addr.getOperand(1)))) {
+ Base = Addr.getOperand(0);
+ Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32);
+ } else {
+ Base = Addr;
+ Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
+ }
+
+ return true;
+}
+
+bool R600DAGToDAGISel::SelectADDRVTX_READ(SDValue Addr, SDValue &Base,
+ SDValue &Offset) {
+ ConstantSDNode *IMMOffset;
+
+ if (Addr.getOpcode() == ISD::ADD
+ && (IMMOffset = dyn_cast<ConstantSDNode>(Addr.getOperand(1)))
+ && isInt<16>(IMMOffset->getZExtValue())) {
+
+ Base = Addr.getOperand(0);
+ Offset = CurDAG->getTargetConstant(IMMOffset->getZExtValue(), SDLoc(Addr),
+ MVT::i32);
+ return true;
+ // If the pointer address is constant, we can move it to the offset field.
+ } else if ((IMMOffset = dyn_cast<ConstantSDNode>(Addr))
+ && isInt<16>(IMMOffset->getZExtValue())) {
+ Base = CurDAG->getCopyFromReg(CurDAG->getEntryNode(),
+ SDLoc(CurDAG->getEntryNode()),
+ AMDGPU::ZERO, MVT::i32);
+ Offset = CurDAG->getTargetConstant(IMMOffset->getZExtValue(), SDLoc(Addr),
+ MVT::i32);
+ return true;
+ }
+
+ // Default case, no offset
+ Base = Addr;
+ Offset = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i32);
+ return true;
+}
diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 258b1737deb3..49929441ef21 100644
--- a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -13,6 +13,10 @@
//
//===----------------------------------------------------------------------===//
+#define AMDGPU_LOG2E_F 1.44269504088896340735992468100189214f
+#define AMDGPU_LN2_F 0.693147180559945309417232121458176568f
+#define AMDGPU_LN10_F 2.30258509299404568401799145468436421f
+
#include "AMDGPUISelLowering.h"
#include "AMDGPU.h"
#include "AMDGPUCallLowering.h"
@@ -20,6 +24,7 @@
#include "AMDGPUIntrinsicInfo.h"
#include "AMDGPURegisterInfo.h"
#include "AMDGPUSubtarget.h"
+#include "AMDGPUTargetMachine.h"
#include "R600MachineFunctionInfo.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
@@ -127,27 +132,20 @@ EVT AMDGPUTargetLowering::getEquivalentMemType(LLVMContext &Ctx, EVT VT) {
return EVT::getVectorVT(Ctx, MVT::i32, StoreSize / 32);
}
-bool AMDGPUTargetLowering::isOrEquivalentToAdd(SelectionDAG &DAG, SDValue Op)
-{
- assert(Op.getOpcode() == ISD::OR);
-
- SDValue N0 = Op->getOperand(0);
- SDValue N1 = Op->getOperand(1);
- EVT VT = N0.getValueType();
-
- if (VT.isInteger() && !VT.isVector()) {
- KnownBits LHSKnown, RHSKnown;
- DAG.computeKnownBits(N0, LHSKnown);
+unsigned AMDGPUTargetLowering::numBitsUnsigned(SDValue Op, SelectionDAG &DAG) {
+ KnownBits Known;
+ EVT VT = Op.getValueType();
+ DAG.computeKnownBits(Op, Known);
- if (LHSKnown.Zero.getBoolValue()) {
- DAG.computeKnownBits(N1, RHSKnown);
+ return VT.getSizeInBits() - Known.countMinLeadingZeros();
+}
- if (!(~RHSKnown.Zero & ~LHSKnown.Zero))
- return true;
- }
- }
+unsigned AMDGPUTargetLowering::numBitsSigned(SDValue Op, SelectionDAG &DAG) {
+ EVT VT = Op.getValueType();
- return false;
+  // In order for this to be a signed 24-bit value, bit 23 must
+  // be a sign bit.
+ return VT.getSizeInBits() - DAG.ComputeNumSignBits(Op);
}
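A standalone sketch of what these two helpers compute, with plain loops standing in for computeKnownBits/ComputeNumSignBits; the thresholds in the asserts match the isU24/isI24 uses further down:

#include <cassert>
#include <cstdint>

// Bits needed to hold V as an unsigned value (width minus leading zeros).
unsigned numBitsUnsigned(uint32_t V) {
  unsigned N = 0;
  while (V) { ++N; V >>= 1; }
  return N;
}

// Bits that differ from the sign (width minus redundant sign bits).
unsigned numBitsSigned(int32_t V) {
  uint32_t U = uint32_t(V);
  unsigned SignBits = 1;
  for (int B = 30; B >= 0 && ((U >> B) & 1) == (U >> 31); --B)
    ++SignBits;
  return 32 - SignBits;
}

int main() {
  assert(numBitsUnsigned(0x00FFFFFF) == 24); // fits u24 (<= 24)
  assert(numBitsUnsigned(0x01000000) == 25); // does not fit u24
  assert(numBitsSigned(-5) == 3);            // fits i24 (< 24)
  assert(numBitsSigned(-(1 << 23)) == 23);   // still fits i24
  assert(numBitsSigned(1 << 23) == 24);      // does not fit i24
}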
AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
@@ -323,6 +321,14 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::FROUND, MVT::f32, Custom);
setOperationAction(ISD::FROUND, MVT::f64, Custom);
+ setOperationAction(ISD::FLOG, MVT::f32, Custom);
+ setOperationAction(ISD::FLOG10, MVT::f32, Custom);
+
+ if (Subtarget->has16BitInsts()) {
+ setOperationAction(ISD::FLOG, MVT::f16, Custom);
+ setOperationAction(ISD::FLOG10, MVT::f16, Custom);
+ }
+
setOperationAction(ISD::FNEARBYINT, MVT::f32, Custom);
setOperationAction(ISD::FNEARBYINT, MVT::f64, Custom);
@@ -399,8 +405,6 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::MUL, MVT::i64, Expand);
setOperationAction(ISD::MULHU, MVT::i64, Expand);
setOperationAction(ISD::MULHS, MVT::i64, Expand);
- setOperationAction(ISD::UDIV, MVT::i32, Expand);
- setOperationAction(ISD::UREM, MVT::i32, Expand);
setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);
setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
@@ -416,8 +420,10 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Custom);
if (Subtarget->hasFFBL())
- setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Legal);
+ setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Custom);
+ setOperationAction(ISD::CTTZ, MVT::i64, Custom);
+ setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Custom);
setOperationAction(ISD::CTLZ, MVT::i64, Custom);
setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom);
@@ -475,6 +481,7 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::CTTZ, VT, Expand);
setOperationAction(ISD::CTLZ, VT, Expand);
setOperationAction(ISD::VECTOR_SHUFFLE, VT, Expand);
+ setOperationAction(ISD::SETCC, VT, Expand);
}
static const MVT::SimpleValueType FloatVectorTypes[] = {
@@ -492,6 +499,8 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::FEXP2, VT, Expand);
setOperationAction(ISD::FLOG2, VT, Expand);
setOperationAction(ISD::FREM, VT, Expand);
+ setOperationAction(ISD::FLOG, VT, Expand);
+ setOperationAction(ISD::FLOG10, VT, Expand);
setOperationAction(ISD::FPOW, VT, Expand);
setOperationAction(ISD::FFLOOR, VT, Expand);
setOperationAction(ISD::FTRUNC, VT, Expand);
@@ -507,6 +516,7 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::SELECT_CC, VT, Expand);
setOperationAction(ISD::FCOPYSIGN, VT, Expand);
setOperationAction(ISD::VECTOR_SHUFFLE, VT, Expand);
+ setOperationAction(ISD::SETCC, VT, Expand);
}
// This causes using an unrolled select operation rather than expansion with
@@ -822,6 +832,17 @@ bool AMDGPUTargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
return isZExtFree(Val.getValueType(), VT2);
}
+// v_mad_mix* support a conversion from f16 to f32.
+//
+// There is only one special case where this would be OK to use with denormals
+// enabled, which we don't currently handle.
+bool AMDGPUTargetLowering::isFPExtFoldable(unsigned Opcode,
+ EVT DestVT, EVT SrcVT) const {
+ return Opcode == ISD::FMAD && Subtarget->hasMadMixInsts() &&
+ DestVT.getScalarType() == MVT::f32 && !Subtarget->hasFP32Denormals() &&
+ SrcVT.getScalarType() == MVT::f16;
+}
+
bool AMDGPUTargetLowering::isNarrowingProfitable(EVT SrcVT, EVT DestVT) const {
// There aren't really 64-bit registers, but pairs of 32-bit ones and only a
// limited number of native 64-bit operations. Shrinking an operation to fit
@@ -847,9 +868,12 @@ CCAssignFn *AMDGPUCallLowering::CCAssignFnForCall(CallingConv::ID CC,
case CallingConv::AMDGPU_PS:
case CallingConv::AMDGPU_CS:
case CallingConv::AMDGPU_HS:
+ case CallingConv::AMDGPU_ES:
+ case CallingConv::AMDGPU_LS:
return CC_AMDGPU;
case CallingConv::C:
case CallingConv::Fast:
+ case CallingConv::Cold:
return CC_AMDGPU_Func;
default:
report_fatal_error("Unsupported calling convention.");
@@ -867,9 +891,12 @@ CCAssignFn *AMDGPUCallLowering::CCAssignFnForReturn(CallingConv::ID CC,
case CallingConv::AMDGPU_PS:
case CallingConv::AMDGPU_CS:
case CallingConv::AMDGPU_HS:
+ case CallingConv::AMDGPU_ES:
+ case CallingConv::AMDGPU_LS:
return RetCC_SI_Shader;
case CallingConv::C:
case CallingConv::Fast:
+ case CallingConv::Cold:
return RetCC_AMDGPU_Func;
default:
report_fatal_error("Unsupported calling convention.");
@@ -1000,12 +1027,49 @@ CCAssignFn *AMDGPUTargetLowering::CCAssignFnForReturn(CallingConv::ID CC,
return AMDGPUCallLowering::CCAssignFnForReturn(CC, IsVarArg);
}
-SDValue AMDGPUTargetLowering::LowerCall(CallLoweringInfo &CLI,
- SmallVectorImpl<SDValue> &InVals) const {
+SDValue AMDGPUTargetLowering::addTokenForArgument(SDValue Chain,
+ SelectionDAG &DAG,
+ MachineFrameInfo &MFI,
+ int ClobberedFI) const {
+ SmallVector<SDValue, 8> ArgChains;
+ int64_t FirstByte = MFI.getObjectOffset(ClobberedFI);
+ int64_t LastByte = FirstByte + MFI.getObjectSize(ClobberedFI) - 1;
+
+ // Include the original chain at the beginning of the list. When this is
+ // used by target LowerCall hooks, this helps legalize find the
+ // CALLSEQ_BEGIN node.
+ ArgChains.push_back(Chain);
+
+  // Add a chain value for each incoming stack-argument load that overlaps
+  // the clobbered frame index.
+ for (SDNode::use_iterator U = DAG.getEntryNode().getNode()->use_begin(),
+ UE = DAG.getEntryNode().getNode()->use_end();
+ U != UE; ++U) {
+ if (LoadSDNode *L = dyn_cast<LoadSDNode>(*U)) {
+ if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(L->getBasePtr())) {
+ if (FI->getIndex() < 0) {
+ int64_t InFirstByte = MFI.getObjectOffset(FI->getIndex());
+ int64_t InLastByte = InFirstByte;
+ InLastByte += MFI.getObjectSize(FI->getIndex()) - 1;
+
+ if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) ||
+ (FirstByte <= InFirstByte && InFirstByte <= LastByte))
+ ArgChains.push_back(SDValue(L, 1));
+ }
+ }
+ }
+ }
+
+ // Build a tokenfactor for all the chains.
+ return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ArgChains);
+}
+
+SDValue AMDGPUTargetLowering::lowerUnhandledCall(CallLoweringInfo &CLI,
+ SmallVectorImpl<SDValue> &InVals,
+ StringRef Reason) const {
SDValue Callee = CLI.Callee;
SelectionDAG &DAG = CLI.DAG;
- const Function &Fn = *DAG.getMachineFunction().getFunction();
+ const Function &Fn = DAG.getMachineFunction().getFunction();
StringRef FuncName("<unknown>");
@@ -1015,7 +1079,7 @@ SDValue AMDGPUTargetLowering::LowerCall(CallLoweringInfo &CLI,
FuncName = G->getGlobal()->getName();
DiagnosticInfoUnsupported NoCalls(
- Fn, "unsupported call to function " + FuncName, CLI.DL.getDebugLoc());
+ Fn, Reason + FuncName, CLI.DL.getDebugLoc());
DAG.getContext()->diagnose(NoCalls);
if (!CLI.IsTailCall) {
@@ -1026,9 +1090,14 @@ SDValue AMDGPUTargetLowering::LowerCall(CallLoweringInfo &CLI,
return DAG.getEntryNode();
}
+SDValue AMDGPUTargetLowering::LowerCall(CallLoweringInfo &CLI,
+ SmallVectorImpl<SDValue> &InVals) const {
+ return lowerUnhandledCall(CLI, InVals, "unsupported call to function ");
+}
+
SDValue AMDGPUTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
SelectionDAG &DAG) const {
- const Function &Fn = *DAG.getMachineFunction().getFunction();
+ const Function &Fn = DAG.getMachineFunction().getFunction();
DiagnosticInfoUnsupported NoDynamicAlloca(Fn, "unsupported dynamic alloca",
SDLoc(Op).getDebugLoc());
@@ -1057,14 +1126,20 @@ SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op,
case ISD::FNEARBYINT: return LowerFNEARBYINT(Op, DAG);
case ISD::FROUND: return LowerFROUND(Op, DAG);
case ISD::FFLOOR: return LowerFFLOOR(Op, DAG);
+ case ISD::FLOG:
+ return LowerFLOG(Op, DAG, 1 / AMDGPU_LOG2E_F);
+ case ISD::FLOG10:
+ return LowerFLOG(Op, DAG, AMDGPU_LN2_F / AMDGPU_LN10_F);
case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
case ISD::FP_TO_FP16: return LowerFP_TO_FP16(Op, DAG);
case ISD::FP_TO_SINT: return LowerFP_TO_SINT(Op, DAG);
case ISD::FP_TO_UINT: return LowerFP_TO_UINT(Op, DAG);
+ case ISD::CTTZ:
+ case ISD::CTTZ_ZERO_UNDEF:
case ISD::CTLZ:
case ISD::CTLZ_ZERO_UNDEF:
- return LowerCTLZ(Op, DAG);
+ return LowerCTLZ_CTTZ(Op, DAG);
case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
}
return Op;
@@ -1115,7 +1190,7 @@ SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI,
}
}
- const Function &Fn = *DAG.getMachineFunction().getFunction();
+ const Function &Fn = DAG.getMachineFunction().getFunction();
DiagnosticInfoUnsupported BadInit(
Fn, "unsupported initializer for address space", SDLoc(Op).getDebugLoc());
DAG.getContext()->diagnose(BadInit);
@@ -1261,7 +1336,6 @@ SDValue AMDGPUTargetLowering::SplitVectorLoad(const SDValue Op,
return scalarizeVectorLoad(Load, DAG);
SDValue BasePtr = Load->getBasePtr();
- EVT PtrVT = BasePtr.getValueType();
EVT MemVT = Load->getMemoryVT();
SDLoc SL(Op);
@@ -1282,8 +1356,7 @@ SDValue AMDGPUTargetLowering::SplitVectorLoad(const SDValue Op,
SDValue LoLoad = DAG.getExtLoad(Load->getExtensionType(), SL, LoVT,
Load->getChain(), BasePtr, SrcValue, LoMemVT,
BaseAlign, Load->getMemOperand()->getFlags());
- SDValue HiPtr = DAG.getNode(ISD::ADD, SL, PtrVT, BasePtr,
- DAG.getConstant(Size, SL, PtrVT));
+ SDValue HiPtr = DAG.getObjectPtrOffset(SL, BasePtr, Size);
SDValue HiLoad =
DAG.getExtLoad(Load->getExtensionType(), SL, HiVT, Load->getChain(),
HiPtr, SrcValue.getWithOffset(LoMemVT.getStoreSize()),
@@ -1322,10 +1395,7 @@ SDValue AMDGPUTargetLowering::SplitVectorStore(SDValue Op,
std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemVT);
std::tie(Lo, Hi) = DAG.SplitVector(Val, SL, LoVT, HiVT);
- EVT PtrVT = BasePtr.getValueType();
- SDValue HiPtr = DAG.getNode(ISD::ADD, SL, PtrVT, BasePtr,
- DAG.getConstant(LoMemVT.getStoreSize(), SL,
- PtrVT));
+ SDValue HiPtr = DAG.getObjectPtrOffset(SL, BasePtr, LoMemVT.getStoreSize());
const MachinePointerInfo &SrcValue = Store->getMemOperand()->getPointerInfo();
unsigned BaseAlign = Store->getAlignment();
@@ -1454,49 +1524,181 @@ SDValue AMDGPUTargetLowering::LowerDIVREM24(SDValue Op, SelectionDAG &DAG,
void AMDGPUTargetLowering::LowerUDIVREM64(SDValue Op,
SelectionDAG &DAG,
SmallVectorImpl<SDValue> &Results) const {
- assert(Op.getValueType() == MVT::i64);
-
SDLoc DL(Op);
EVT VT = Op.getValueType();
+
+ assert(VT == MVT::i64 && "LowerUDIVREM64 expects an i64");
+
EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext());
- SDValue one = DAG.getConstant(1, DL, HalfVT);
- SDValue zero = DAG.getConstant(0, DL, HalfVT);
+ SDValue One = DAG.getConstant(1, DL, HalfVT);
+ SDValue Zero = DAG.getConstant(0, DL, HalfVT);
//HiLo split
SDValue LHS = Op.getOperand(0);
- SDValue LHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, zero);
- SDValue LHS_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, one);
+ SDValue LHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, Zero);
+ SDValue LHS_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, One);
SDValue RHS = Op.getOperand(1);
- SDValue RHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, zero);
- SDValue RHS_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, one);
+ SDValue RHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, Zero);
+ SDValue RHS_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, One);
- if (VT == MVT::i64 &&
- DAG.MaskedValueIsZero(RHS, APInt::getHighBitsSet(64, 32)) &&
- DAG.MaskedValueIsZero(LHS, APInt::getHighBitsSet(64, 32))) {
+ if (DAG.MaskedValueIsZero(RHS, APInt::getHighBitsSet(64, 32)) &&
+ DAG.MaskedValueIsZero(LHS, APInt::getHighBitsSet(64, 32))) {
SDValue Res = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(HalfVT, HalfVT),
LHS_Lo, RHS_Lo);
- SDValue DIV = DAG.getBuildVector(MVT::v2i32, DL, {Res.getValue(0), zero});
- SDValue REM = DAG.getBuildVector(MVT::v2i32, DL, {Res.getValue(1), zero});
+ SDValue DIV = DAG.getBuildVector(MVT::v2i32, DL, {Res.getValue(0), Zero});
+ SDValue REM = DAG.getBuildVector(MVT::v2i32, DL, {Res.getValue(1), Zero});
Results.push_back(DAG.getNode(ISD::BITCAST, DL, MVT::i64, DIV));
Results.push_back(DAG.getNode(ISD::BITCAST, DL, MVT::i64, REM));
return;
}
+ if (isTypeLegal(MVT::i64)) {
+ // Compute denominator reciprocal.
+ unsigned FMAD = Subtarget->hasFP32Denormals() ?
+ (unsigned)AMDGPUISD::FMAD_FTZ :
+ (unsigned)ISD::FMAD;
+
+ SDValue Cvt_Lo = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, RHS_Lo);
+ SDValue Cvt_Hi = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, RHS_Hi);
+ SDValue Mad1 = DAG.getNode(FMAD, DL, MVT::f32, Cvt_Hi,
+ DAG.getConstantFP(APInt(32, 0x4f800000).bitsToFloat(), DL, MVT::f32),
+ Cvt_Lo);
+ SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, DL, MVT::f32, Mad1);
+ SDValue Mul1 = DAG.getNode(ISD::FMUL, DL, MVT::f32, Rcp,
+ DAG.getConstantFP(APInt(32, 0x5f7ffffc).bitsToFloat(), DL, MVT::f32));
+ SDValue Mul2 = DAG.getNode(ISD::FMUL, DL, MVT::f32, Mul1,
+ DAG.getConstantFP(APInt(32, 0x2f800000).bitsToFloat(), DL, MVT::f32));
+ SDValue Trunc = DAG.getNode(ISD::FTRUNC, DL, MVT::f32, Mul2);
+ SDValue Mad2 = DAG.getNode(FMAD, DL, MVT::f32, Trunc,
+ DAG.getConstantFP(APInt(32, 0xcf800000).bitsToFloat(), DL, MVT::f32),
+ Mul1);
+ SDValue Rcp_Lo = DAG.getNode(ISD::FP_TO_UINT, DL, HalfVT, Mad2);
+ SDValue Rcp_Hi = DAG.getNode(ISD::FP_TO_UINT, DL, HalfVT, Trunc);
+ SDValue Rcp64 = DAG.getBitcast(VT,
+ DAG.getBuildVector(MVT::v2i32, DL, {Rcp_Lo, Rcp_Hi}));
+
+ SDValue Zero64 = DAG.getConstant(0, DL, VT);
+ SDValue One64 = DAG.getConstant(1, DL, VT);
+ SDValue Zero1 = DAG.getConstant(0, DL, MVT::i1);
+ SDVTList HalfCarryVT = DAG.getVTList(HalfVT, MVT::i1);
+
+ SDValue Neg_RHS = DAG.getNode(ISD::SUB, DL, VT, Zero64, RHS);
+ SDValue Mullo1 = DAG.getNode(ISD::MUL, DL, VT, Neg_RHS, Rcp64);
+ SDValue Mulhi1 = DAG.getNode(ISD::MULHU, DL, VT, Rcp64, Mullo1);
+ SDValue Mulhi1_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mulhi1,
+ Zero);
+ SDValue Mulhi1_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mulhi1,
+ One);
+
+ SDValue Add1_Lo = DAG.getNode(ISD::ADDCARRY, DL, HalfCarryVT, Rcp_Lo,
+ Mulhi1_Lo, Zero1);
+ SDValue Add1_Hi = DAG.getNode(ISD::ADDCARRY, DL, HalfCarryVT, Rcp_Hi,
+ Mulhi1_Hi, Add1_Lo.getValue(1));
+ SDValue Add1_HiNc = DAG.getNode(ISD::ADD, DL, HalfVT, Rcp_Hi, Mulhi1_Hi);
+ SDValue Add1 = DAG.getBitcast(VT,
+ DAG.getBuildVector(MVT::v2i32, DL, {Add1_Lo, Add1_Hi}));
+
+ SDValue Mullo2 = DAG.getNode(ISD::MUL, DL, VT, Neg_RHS, Add1);
+ SDValue Mulhi2 = DAG.getNode(ISD::MULHU, DL, VT, Add1, Mullo2);
+ SDValue Mulhi2_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mulhi2,
+ Zero);
+ SDValue Mulhi2_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mulhi2,
+ One);
+
+ SDValue Add2_Lo = DAG.getNode(ISD::ADDCARRY, DL, HalfCarryVT, Add1_Lo,
+ Mulhi2_Lo, Zero1);
+ SDValue Add2_HiC = DAG.getNode(ISD::ADDCARRY, DL, HalfCarryVT, Add1_HiNc,
+ Mulhi2_Hi, Add1_Lo.getValue(1));
+ SDValue Add2_Hi = DAG.getNode(ISD::ADDCARRY, DL, HalfCarryVT, Add2_HiC,
+ Zero, Add2_Lo.getValue(1));
+ SDValue Add2 = DAG.getBitcast(VT,
+ DAG.getBuildVector(MVT::v2i32, DL, {Add2_Lo, Add2_Hi}));
+ SDValue Mulhi3 = DAG.getNode(ISD::MULHU, DL, VT, LHS, Add2);
+
+ SDValue Mul3 = DAG.getNode(ISD::MUL, DL, VT, RHS, Mulhi3);
+
+ SDValue Mul3_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mul3, Zero);
+ SDValue Mul3_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mul3, One);
+ SDValue Sub1_Lo = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, LHS_Lo,
+ Mul3_Lo, Zero1);
+ SDValue Sub1_Hi = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, LHS_Hi,
+ Mul3_Hi, Sub1_Lo.getValue(1));
+ SDValue Sub1_Mi = DAG.getNode(ISD::SUB, DL, HalfVT, LHS_Hi, Mul3_Hi);
+ SDValue Sub1 = DAG.getBitcast(VT,
+ DAG.getBuildVector(MVT::v2i32, DL, {Sub1_Lo, Sub1_Hi}));
+
+ SDValue MinusOne = DAG.getConstant(0xffffffffu, DL, HalfVT);
+ SDValue C1 = DAG.getSelectCC(DL, Sub1_Hi, RHS_Hi, MinusOne, Zero,
+ ISD::SETUGE);
+ SDValue C2 = DAG.getSelectCC(DL, Sub1_Lo, RHS_Lo, MinusOne, Zero,
+ ISD::SETUGE);
+ SDValue C3 = DAG.getSelectCC(DL, Sub1_Hi, RHS_Hi, C2, C1, ISD::SETEQ);
+
+  // TODO: Here and below, portions of the code can be enclosed in if/endif
+  // blocks. Currently control flow is unconditional and we have 4 selects
+  // after the potential endif to substitute PHIs.
+
+ // if C3 != 0 ...
+ SDValue Sub2_Lo = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub1_Lo,
+ RHS_Lo, Zero1);
+ SDValue Sub2_Mi = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub1_Mi,
+ RHS_Hi, Sub1_Lo.getValue(1));
+ SDValue Sub2_Hi = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub2_Mi,
+ Zero, Sub2_Lo.getValue(1));
+ SDValue Sub2 = DAG.getBitcast(VT,
+ DAG.getBuildVector(MVT::v2i32, DL, {Sub2_Lo, Sub2_Hi}));
+
+ SDValue Add3 = DAG.getNode(ISD::ADD, DL, VT, Mulhi3, One64);
+
+ SDValue C4 = DAG.getSelectCC(DL, Sub2_Hi, RHS_Hi, MinusOne, Zero,
+ ISD::SETUGE);
+ SDValue C5 = DAG.getSelectCC(DL, Sub2_Lo, RHS_Lo, MinusOne, Zero,
+ ISD::SETUGE);
+ SDValue C6 = DAG.getSelectCC(DL, Sub2_Hi, RHS_Hi, C5, C4, ISD::SETEQ);
+
+ // if (C6 != 0)
+ SDValue Add4 = DAG.getNode(ISD::ADD, DL, VT, Add3, One64);
+
+ SDValue Sub3_Lo = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub2_Lo,
+ RHS_Lo, Zero1);
+ SDValue Sub3_Mi = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub2_Mi,
+ RHS_Hi, Sub2_Lo.getValue(1));
+ SDValue Sub3_Hi = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub3_Mi,
+ Zero, Sub3_Lo.getValue(1));
+ SDValue Sub3 = DAG.getBitcast(VT,
+ DAG.getBuildVector(MVT::v2i32, DL, {Sub3_Lo, Sub3_Hi}));
+
+ // endif C6
+ // endif C3
+
+ SDValue Sel1 = DAG.getSelectCC(DL, C6, Zero, Add4, Add3, ISD::SETNE);
+ SDValue Div = DAG.getSelectCC(DL, C3, Zero, Sel1, Mulhi3, ISD::SETNE);
+
+ SDValue Sel2 = DAG.getSelectCC(DL, C6, Zero, Sub3, Sub2, ISD::SETNE);
+ SDValue Rem = DAG.getSelectCC(DL, C3, Zero, Sel2, Sub1, ISD::SETNE);
+
+ Results.push_back(Div);
+ Results.push_back(Rem);
+
+ return;
+ }
+
+  // r600 expansion.
// Get Speculative values
SDValue DIV_Part = DAG.getNode(ISD::UDIV, DL, HalfVT, LHS_Hi, RHS_Lo);
SDValue REM_Part = DAG.getNode(ISD::UREM, DL, HalfVT, LHS_Hi, RHS_Lo);
- SDValue REM_Lo = DAG.getSelectCC(DL, RHS_Hi, zero, REM_Part, LHS_Hi, ISD::SETEQ);
- SDValue REM = DAG.getBuildVector(MVT::v2i32, DL, {REM_Lo, zero});
+ SDValue REM_Lo = DAG.getSelectCC(DL, RHS_Hi, Zero, REM_Part, LHS_Hi, ISD::SETEQ);
+ SDValue REM = DAG.getBuildVector(MVT::v2i32, DL, {REM_Lo, Zero});
REM = DAG.getNode(ISD::BITCAST, DL, MVT::i64, REM);
- SDValue DIV_Hi = DAG.getSelectCC(DL, RHS_Hi, zero, DIV_Part, zero, ISD::SETEQ);
- SDValue DIV_Lo = zero;
+ SDValue DIV_Hi = DAG.getSelectCC(DL, RHS_Hi, Zero, DIV_Part, Zero, ISD::SETEQ);
+ SDValue DIV_Lo = Zero;
const unsigned halfBitWidth = HalfVT.getSizeInBits();
@@ -1505,7 +1707,7 @@ void AMDGPUTargetLowering::LowerUDIVREM64(SDValue Op,
SDValue POS = DAG.getConstant(bitPos, DL, HalfVT);
// Get value of high bit
SDValue HBit = DAG.getNode(ISD::SRL, DL, HalfVT, LHS_Lo, POS);
- HBit = DAG.getNode(ISD::AND, DL, HalfVT, HBit, one);
+ HBit = DAG.getNode(ISD::AND, DL, HalfVT, HBit, One);
HBit = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, HBit);
// Shift
@@ -1514,7 +1716,7 @@ void AMDGPUTargetLowering::LowerUDIVREM64(SDValue Op,
REM = DAG.getNode(ISD::OR, DL, VT, REM, HBit);
SDValue BIT = DAG.getConstant(1ULL << bitPos, DL, HalfVT);
- SDValue realBIT = DAG.getSelectCC(DL, REM, RHS, BIT, zero, ISD::SETUGE);
+ SDValue realBIT = DAG.getSelectCC(DL, REM, RHS, BIT, Zero, ISD::SETUGE);
DIV_Lo = DAG.getNode(ISD::OR, DL, HalfVT, DIV_Lo, realBIT);
@@ -1971,13 +2173,45 @@ SDValue AMDGPUTargetLowering::LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const {
return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add);
}
-SDValue AMDGPUTargetLowering::LowerCTLZ(SDValue Op, SelectionDAG &DAG) const {
+SDValue AMDGPUTargetLowering::LowerFLOG(SDValue Op, SelectionDAG &DAG,
+ double Log2BaseInverted) const {
+ EVT VT = Op.getValueType();
+
+ SDLoc SL(Op);
+ SDValue Operand = Op.getOperand(0);
+ SDValue Log2Operand = DAG.getNode(ISD::FLOG2, SL, VT, Operand);
+ SDValue Log2BaseInvertedOperand = DAG.getConstantFP(Log2BaseInverted, SL, VT);
+
+ return DAG.getNode(ISD::FMUL, SL, VT, Log2Operand, Log2BaseInvertedOperand);
+}
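LowerFLOG is a change of base: ln(x) = log2(x) * (1/log2(e)) and log10(x) = log2(x) * (ln(2)/ln(10)), which is what the AMDGPU_LOG2E_F, AMDGPU_LN2_F and AMDGPU_LN10_F constants defined at the top of this file feed into. A quick standalone check of the identities and constants, assuming only <cmath>:

#include <cassert>
#include <cmath>

int main() {
  const float Log2E = 1.44269504088896340735992468100189214f;  // log2(e)
  const float Ln2   = 0.693147180559945309417232121458176568f; // ln(2)
  const float Ln10  = 2.30258509299404568401799145468436421f;  // ln(10)
  float X = 42.0f;
  // ln(x)    == log2(x) * (1 / log2(e))
  assert(std::fabs(std::log(X)   - std::log2(X) * (1.0f / Log2E)) < 1e-5f);
  // log10(x) == log2(x) * (ln(2) / ln(10))
  assert(std::fabs(std::log10(X) - std::log2(X) * (Ln2 / Ln10))   < 1e-5f);
}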
+
+static bool isCtlzOpc(unsigned Opc) {
+ return Opc == ISD::CTLZ || Opc == ISD::CTLZ_ZERO_UNDEF;
+}
+
+static bool isCttzOpc(unsigned Opc) {
+ return Opc == ISD::CTTZ || Opc == ISD::CTTZ_ZERO_UNDEF;
+}
+
+SDValue AMDGPUTargetLowering::LowerCTLZ_CTTZ(SDValue Op, SelectionDAG &DAG) const {
SDLoc SL(Op);
SDValue Src = Op.getOperand(0);
- bool ZeroUndef = Op.getOpcode() == ISD::CTLZ_ZERO_UNDEF;
+ bool ZeroUndef = Op.getOpcode() == ISD::CTTZ_ZERO_UNDEF ||
+ Op.getOpcode() == ISD::CTLZ_ZERO_UNDEF;
+
+ unsigned ISDOpc, NewOpc;
+ if (isCtlzOpc(Op.getOpcode())) {
+ ISDOpc = ISD::CTLZ_ZERO_UNDEF;
+ NewOpc = AMDGPUISD::FFBH_U32;
+ } else if (isCttzOpc(Op.getOpcode())) {
+ ISDOpc = ISD::CTTZ_ZERO_UNDEF;
+ NewOpc = AMDGPUISD::FFBL_B32;
+ } else
+    llvm_unreachable("Unexpected opcode");
+
if (ZeroUndef && Src.getValueType() == MVT::i32)
- return DAG.getNode(AMDGPUISD::FFBH_U32, SL, MVT::i32, Src);
+ return DAG.getNode(NewOpc, SL, MVT::i32, Src);
SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Src);
@@ -1990,24 +2224,32 @@ SDValue AMDGPUTargetLowering::LowerCTLZ(SDValue Op, SelectionDAG &DAG) const {
EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(),
*DAG.getContext(), MVT::i32);
- SDValue Hi0 = DAG.getSetCC(SL, SetCCVT, Hi, Zero, ISD::SETEQ);
+ SDValue HiOrLo = isCtlzOpc(Op.getOpcode()) ? Hi : Lo;
+ SDValue Hi0orLo0 = DAG.getSetCC(SL, SetCCVT, HiOrLo, Zero, ISD::SETEQ);
- SDValue CtlzLo = DAG.getNode(ISD::CTLZ_ZERO_UNDEF, SL, MVT::i32, Lo);
- SDValue CtlzHi = DAG.getNode(ISD::CTLZ_ZERO_UNDEF, SL, MVT::i32, Hi);
+ SDValue OprLo = DAG.getNode(ISDOpc, SL, MVT::i32, Lo);
+ SDValue OprHi = DAG.getNode(ISDOpc, SL, MVT::i32, Hi);
const SDValue Bits32 = DAG.getConstant(32, SL, MVT::i32);
- SDValue Add = DAG.getNode(ISD::ADD, SL, MVT::i32, CtlzLo, Bits32);
-
- // ctlz(x) = hi_32(x) == 0 ? ctlz(lo_32(x)) + 32 : ctlz(hi_32(x))
- SDValue NewCtlz = DAG.getNode(ISD::SELECT, SL, MVT::i32, Hi0, Add, CtlzHi);
+ SDValue Add, NewOpr;
+ if (isCtlzOpc(Op.getOpcode())) {
+ Add = DAG.getNode(ISD::ADD, SL, MVT::i32, OprLo, Bits32);
+ // ctlz(x) = hi_32(x) == 0 ? ctlz(lo_32(x)) + 32 : ctlz(hi_32(x))
+ NewOpr = DAG.getNode(ISD::SELECT, SL, MVT::i32, Hi0orLo0, Add, OprHi);
+ } else {
+ Add = DAG.getNode(ISD::ADD, SL, MVT::i32, OprHi, Bits32);
+ // cttz(x) = lo_32(x) == 0 ? cttz(hi_32(x)) + 32 : cttz(lo_32(x))
+ NewOpr = DAG.getNode(ISD::SELECT, SL, MVT::i32, Hi0orLo0, Add, OprLo);
+ }
if (!ZeroUndef) {
// Test if the full 64-bit input is zero.
// FIXME: DAG combines turn what should be an s_and_b64 into a v_or_b32,
// which we probably don't want.
- SDValue Lo0 = DAG.getSetCC(SL, SetCCVT, Lo, Zero, ISD::SETEQ);
- SDValue SrcIsZero = DAG.getNode(ISD::AND, SL, SetCCVT, Lo0, Hi0);
+ SDValue LoOrHi = isCtlzOpc(Op.getOpcode()) ? Lo : Hi;
+ SDValue Lo0OrHi0 = DAG.getSetCC(SL, SetCCVT, LoOrHi, Zero, ISD::SETEQ);
+ SDValue SrcIsZero = DAG.getNode(ISD::AND, SL, SetCCVT, Lo0OrHi0, Hi0orLo0);
// TODO: If i64 setcc is half rate, it can result in 1 fewer instruction
// with the same cycles, otherwise it is slower.
@@ -2018,11 +2260,11 @@ SDValue AMDGPUTargetLowering::LowerCTLZ(SDValue Op, SelectionDAG &DAG) const {
// The instruction returns -1 for 0 input, but the defined intrinsic
// behavior is to return the number of bits.
- NewCtlz = DAG.getNode(ISD::SELECT, SL, MVT::i32,
- SrcIsZero, Bits32, NewCtlz);
+ NewOpr = DAG.getNode(ISD::SELECT, SL, MVT::i32,
+ SrcIsZero, Bits32, NewOpr);
}
- return DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i64, NewCtlz);
+ return DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i64, NewOpr);
}
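A standalone sketch of the hi/lo split used above for the 64-bit zero-undef variants, with GCC/Clang builtins standing in for the 32-bit ffbh/ffbl instructions (inputs must be non-zero, mirroring the *_ZERO_UNDEF semantics):

#include <cassert>
#include <cstdint>

unsigned Ctlz64(uint64_t X) { // X != 0
  uint32_t Lo = uint32_t(X), Hi = uint32_t(X >> 32);
  // ctlz(x) = hi_32(x) == 0 ? ctlz(lo_32(x)) + 32 : ctlz(hi_32(x))
  return Hi == 0 ? __builtin_clz(Lo) + 32 : __builtin_clz(Hi);
}

unsigned Cttz64(uint64_t X) { // X != 0
  uint32_t Lo = uint32_t(X), Hi = uint32_t(X >> 32);
  // cttz(x) = lo_32(x) == 0 ? cttz(hi_32(x)) + 32 : cttz(lo_32(x))
  return Lo == 0 ? __builtin_ctz(Hi) + 32 : __builtin_ctz(Lo);
}

int main() {
  assert(Ctlz64(UINT64_C(1) << 32) == 31);
  assert(Cttz64(UINT64_C(1) << 32) == 32);
  assert(Ctlz64(1) == 63 && Cttz64(1) == 0);
}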
SDValue AMDGPUTargetLowering::LowerINT_TO_FP32(SDValue Op, SelectionDAG &DAG,
@@ -2389,21 +2631,14 @@ SDValue AMDGPUTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
//===----------------------------------------------------------------------===//
static bool isU24(SDValue Op, SelectionDAG &DAG) {
- KnownBits Known;
- EVT VT = Op.getValueType();
- DAG.computeKnownBits(Op, Known);
-
- return (VT.getSizeInBits() - Known.countMinLeadingZeros()) <= 24;
+ return AMDGPUTargetLowering::numBitsUnsigned(Op, DAG) <= 24;
}
static bool isI24(SDValue Op, SelectionDAG &DAG) {
EVT VT = Op.getValueType();
-
- // In order for this to be a signed 24-bit value, bit 23, must
- // be a sign bit.
return VT.getSizeInBits() >= 24 && // Types less than 24-bit should be treated
// as unsigned 24-bit values.
- (VT.getSizeInBits() - DAG.ComputeNumSignBits(Op)) < 24;
+ AMDGPUTargetLowering::numBitsSigned(Op, DAG) < 24;
}
static bool simplifyI24(SDNode *Node24, unsigned OpIdx,
@@ -2665,11 +2900,21 @@ SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N,
case ISD::ZERO_EXTEND:
case ISD::SIGN_EXTEND:
case ISD::ANY_EXTEND: {
+ SDValue X = LHS->getOperand(0);
+
+ if (VT == MVT::i32 && RHSVal == 16 && X.getValueType() == MVT::i16 &&
+ isTypeLegal(MVT::v2i16)) {
+ // Prefer build_vector as the canonical form if packed types are legal.
+ // (shl ([asz]ext i16:x), 16 -> build_vector 0, x
+ SDValue Vec = DAG.getBuildVector(MVT::v2i16, SL,
+ { DAG.getConstant(0, SL, MVT::i16), LHS->getOperand(0) });
+ return DAG.getNode(ISD::BITCAST, SL, MVT::i32, Vec);
+ }
+
// shl (ext x) => zext (shl x), if shift does not overflow int
if (VT != MVT::i64)
break;
KnownBits Known;
- SDValue X = LHS->getOperand(0);
DAG.computeKnownBits(X, Known);
unsigned LZ = Known.countMinLeadingZeros();
if (LZ < RHSVal)
@@ -2678,21 +2923,6 @@ SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N,
SDValue Shl = DAG.getNode(ISD::SHL, SL, XVT, X, SDValue(RHS, 0));
return DAG.getZExtOrTrunc(Shl, SL, VT);
}
- case ISD::OR:
- if (!isOrEquivalentToAdd(DAG, LHS))
- break;
- LLVM_FALLTHROUGH;
- case ISD::ADD: {
- // shl (or|add x, c2), c1 => or|add (shl x, c1), (c2 << c1)
- if (ConstantSDNode *C2 = dyn_cast<ConstantSDNode>(LHS->getOperand(1))) {
- SDValue Shl = DAG.getNode(ISD::SHL, SL, VT, LHS->getOperand(0),
- SDValue(RHS, 0));
- SDValue C2V = DAG.getConstant(C2->getAPIntValue() << RHSVal,
- SDLoc(C2), VT);
- return DAG.getNode(LHS->getOpcode(), SL, VT, Shl, C2V);
- }
- break;
- }
}
if (VT != MVT::i64)
@@ -2924,13 +3154,10 @@ static bool isNegativeOne(SDValue Val) {
return false;
}
-static bool isCtlzOpc(unsigned Opc) {
- return Opc == ISD::CTLZ || Opc == ISD::CTLZ_ZERO_UNDEF;
-}
-
-SDValue AMDGPUTargetLowering::getFFBH_U32(SelectionDAG &DAG,
+SDValue AMDGPUTargetLowering::getFFBX_U32(SelectionDAG &DAG,
SDValue Op,
- const SDLoc &DL) const {
+ const SDLoc &DL,
+ unsigned Opc) const {
EVT VT = Op.getValueType();
EVT LegalVT = getTypeToTransformTo(*DAG.getContext(), VT);
if (LegalVT != MVT::i32 && (Subtarget->has16BitInsts() &&
@@ -2940,11 +3167,11 @@ SDValue AMDGPUTargetLowering::getFFBH_U32(SelectionDAG &DAG,
if (VT != MVT::i32)
Op = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Op);
- SDValue FFBH = DAG.getNode(AMDGPUISD::FFBH_U32, DL, MVT::i32, Op);
+ SDValue FFBX = DAG.getNode(Opc, DL, MVT::i32, Op);
if (VT != MVT::i32)
- FFBH = DAG.getNode(ISD::TRUNCATE, DL, VT, FFBH);
+ FFBX = DAG.getNode(ISD::TRUNCATE, DL, VT, FFBX);
- return FFBH;
+ return FFBX;
}
// The native instructions return -1 on 0 input. Optimize out a select that
@@ -2954,7 +3181,7 @@ SDValue AMDGPUTargetLowering::getFFBH_U32(SelectionDAG &DAG,
// against the bitwidth.
//
// TODO: Should probably combine against FFBH_U32 instead of ctlz directly.
-SDValue AMDGPUTargetLowering::performCtlzCombine(const SDLoc &SL, SDValue Cond,
+SDValue AMDGPUTargetLowering::performCtlz_CttzCombine(const SDLoc &SL, SDValue Cond,
SDValue LHS, SDValue RHS,
DAGCombinerInfo &DCI) const {
ConstantSDNode *CmpRhs = dyn_cast<ConstantSDNode>(Cond.getOperand(1));
@@ -2965,20 +3192,25 @@ SDValue AMDGPUTargetLowering::performCtlzCombine(const SDLoc &SL, SDValue Cond,
ISD::CondCode CCOpcode = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
SDValue CmpLHS = Cond.getOperand(0);
+ unsigned Opc = isCttzOpc(RHS.getOpcode()) ? AMDGPUISD::FFBL_B32 :
+ AMDGPUISD::FFBH_U32;
+
// select (setcc x, 0, eq), -1, (ctlz_zero_undef x) -> ffbh_u32 x
+ // select (setcc x, 0, eq), -1, (cttz_zero_undef x) -> ffbl_u32 x
if (CCOpcode == ISD::SETEQ &&
- isCtlzOpc(RHS.getOpcode()) &&
+ (isCtlzOpc(RHS.getOpcode()) || isCttzOpc(RHS.getOpcode())) &&
RHS.getOperand(0) == CmpLHS &&
isNegativeOne(LHS)) {
- return getFFBH_U32(DAG, CmpLHS, SL);
+ return getFFBX_U32(DAG, CmpLHS, SL, Opc);
}
// select (setcc x, 0, ne), (ctlz_zero_undef x), -1 -> ffbh_u32 x
+ // select (setcc x, 0, ne), (cttz_zero_undef x), -1 -> ffbl_u32 x
if (CCOpcode == ISD::SETNE &&
- isCtlzOpc(LHS.getOpcode()) &&
+      (isCtlzOpc(LHS.getOpcode()) || isCttzOpc(LHS.getOpcode())) &&
LHS.getOperand(0) == CmpLHS &&
isNegativeOne(RHS)) {
- return getFFBH_U32(DAG, CmpLHS, SL);
+ return getFFBX_U32(DAG, CmpLHS, SL, Opc);
}
return SDValue();
@@ -3111,7 +3343,7 @@ SDValue AMDGPUTargetLowering::performSelectCombine(SDNode *N,
}
// There's no reason to not do this if the condition has other uses.
- return performCtlzCombine(SDLoc(N), Cond, True, False, DCI);
+ return performCtlz_CttzCombine(SDLoc(N), Cond, True, False, DCI);
}
static bool isConstantFPZero(SDValue N) {
@@ -3581,6 +3813,48 @@ SDValue AMDGPUTargetLowering::CreateLiveInRegister(SelectionDAG &DAG,
return DAG.getCopyFromReg(DAG.getEntryNode(), SL, VReg, VT);
}
+SDValue AMDGPUTargetLowering::loadStackInputValue(SelectionDAG &DAG,
+ EVT VT,
+ const SDLoc &SL,
+ int64_t Offset) const {
+ MachineFunction &MF = DAG.getMachineFunction();
+ MachineFrameInfo &MFI = MF.getFrameInfo();
+
+ int FI = MFI.CreateFixedObject(VT.getStoreSize(), Offset, true);
+ auto SrcPtrInfo = MachinePointerInfo::getStack(MF, Offset);
+ SDValue Ptr = DAG.getFrameIndex(FI, MVT::i32);
+
+ return DAG.getLoad(VT, SL, DAG.getEntryNode(), Ptr, SrcPtrInfo, 4,
+ MachineMemOperand::MODereferenceable |
+ MachineMemOperand::MOInvariant);
+}
+
+SDValue AMDGPUTargetLowering::storeStackInputValue(SelectionDAG &DAG,
+ const SDLoc &SL,
+ SDValue Chain,
+ SDValue StackPtr,
+ SDValue ArgVal,
+ int64_t Offset) const {
+ MachineFunction &MF = DAG.getMachineFunction();
+ MachinePointerInfo DstInfo = MachinePointerInfo::getStack(MF, Offset);
+
+ SDValue Ptr = DAG.getObjectPtrOffset(SL, StackPtr, Offset);
+ SDValue Store = DAG.getStore(Chain, SL, ArgVal, Ptr, DstInfo, 4,
+ MachineMemOperand::MODereferenceable);
+ return Store;
+}
+
+SDValue AMDGPUTargetLowering::loadInputValue(SelectionDAG &DAG,
+ const TargetRegisterClass *RC,
+ EVT VT, const SDLoc &SL,
+ const ArgDescriptor &Arg) const {
+ assert(Arg && "Attempting to load missing argument");
+
+ if (Arg.isRegister())
+ return CreateLiveInRegister(DAG, RC, Arg.getRegister(), VT, SL);
+ return loadStackInputValue(DAG, VT, SL, Arg.getStackOffset());
+}
+
uint32_t AMDGPUTargetLowering::getImplicitParameterOffset(
const AMDGPUMachineFunction *MFI, const ImplicitParameter Param) const {
unsigned Alignment = Subtarget->getAlignmentForImplicitArgPtr();
@@ -3608,6 +3882,7 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(ELSE)
NODE_NAME_CASE(LOOP)
NODE_NAME_CASE(CALL)
+ NODE_NAME_CASE(TC_RETURN)
NODE_NAME_CASE(TRAP)
NODE_NAME_CASE(RET_FLAG)
NODE_NAME_CASE(RETURN_TO_EPILOG)
@@ -3655,6 +3930,7 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(BFM)
NODE_NAME_CASE(FFBH_U32)
NODE_NAME_CASE(FFBH_I32)
+ NODE_NAME_CASE(FFBL_B32)
NODE_NAME_CASE(MUL_U24)
NODE_NAME_CASE(MUL_I24)
NODE_NAME_CASE(MULHI_U24)
@@ -3663,6 +3939,8 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(MUL_LOHI_I24)
NODE_NAME_CASE(MAD_U24)
NODE_NAME_CASE(MAD_I24)
+ NODE_NAME_CASE(MAD_I64_I32)
+ NODE_NAME_CASE(MAD_U64_U32)
NODE_NAME_CASE(TEXTURE_FETCH)
NODE_NAME_CASE(EXPORT)
NODE_NAME_CASE(EXPORT_DONE)
@@ -3704,6 +3982,19 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(ATOMIC_DEC)
NODE_NAME_CASE(BUFFER_LOAD)
NODE_NAME_CASE(BUFFER_LOAD_FORMAT)
+ NODE_NAME_CASE(BUFFER_STORE)
+ NODE_NAME_CASE(BUFFER_STORE_FORMAT)
+ NODE_NAME_CASE(BUFFER_ATOMIC_SWAP)
+ NODE_NAME_CASE(BUFFER_ATOMIC_ADD)
+ NODE_NAME_CASE(BUFFER_ATOMIC_SUB)
+ NODE_NAME_CASE(BUFFER_ATOMIC_SMIN)
+ NODE_NAME_CASE(BUFFER_ATOMIC_UMIN)
+ NODE_NAME_CASE(BUFFER_ATOMIC_SMAX)
+ NODE_NAME_CASE(BUFFER_ATOMIC_UMAX)
+ NODE_NAME_CASE(BUFFER_ATOMIC_AND)
+ NODE_NAME_CASE(BUFFER_ATOMIC_OR)
+ NODE_NAME_CASE(BUFFER_ATOMIC_XOR)
+ NODE_NAME_CASE(BUFFER_ATOMIC_CMPSWAP)
case AMDGPUISD::LAST_AMDGPU_ISD_NUMBER: break;
}
return nullptr;
@@ -3754,7 +4045,6 @@ void AMDGPUTargetLowering::computeKnownBitsForTargetNode(
Known.resetAll(); // Don't know anything.
- KnownBits Known2;
unsigned Opc = Op.getOpcode();
switch (Opc) {
@@ -3787,6 +4077,51 @@ void AMDGPUTargetLowering::computeKnownBitsForTargetNode(
Known.Zero = APInt::getHighBitsSet(BitWidth, BitWidth - 16);
break;
}
+ case AMDGPUISD::MUL_U24:
+ case AMDGPUISD::MUL_I24: {
+ KnownBits LHSKnown, RHSKnown;
+ DAG.computeKnownBits(Op.getOperand(0), LHSKnown, Depth + 1);
+ DAG.computeKnownBits(Op.getOperand(1), RHSKnown, Depth + 1);
+
+ unsigned TrailZ = LHSKnown.countMinTrailingZeros() +
+ RHSKnown.countMinTrailingZeros();
+ Known.Zero.setLowBits(std::min(TrailZ, 32u));
+
+ unsigned LHSValBits = 32 - std::max(LHSKnown.countMinSignBits(), 8u);
+ unsigned RHSValBits = 32 - std::max(RHSKnown.countMinSignBits(), 8u);
+ unsigned MaxValBits = std::min(LHSValBits + RHSValBits, 32u);
+ if (MaxValBits >= 32)
+ break;
+ bool Negative = false;
+ if (Opc == AMDGPUISD::MUL_I24) {
+ bool LHSNegative = !!(LHSKnown.One & (1 << 23));
+ bool LHSPositive = !!(LHSKnown.Zero & (1 << 23));
+ bool RHSNegative = !!(RHSKnown.One & (1 << 23));
+ bool RHSPositive = !!(RHSKnown.Zero & (1 << 23));
+ if ((!LHSNegative && !LHSPositive) || (!RHSNegative && !RHSPositive))
+ break;
+ Negative = (LHSNegative && RHSPositive) || (LHSPositive && RHSNegative);
+ }
+ if (Negative)
+ Known.One.setHighBits(32 - MaxValBits);
+ else
+ Known.Zero.setHighBits(32 - MaxValBits);
+ break;
+ }
+ case ISD::INTRINSIC_WO_CHAIN: {
+ unsigned IID = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+ switch (IID) {
+ case Intrinsic::amdgcn_mbcnt_lo:
+ case Intrinsic::amdgcn_mbcnt_hi: {
+ // These return at most the wavefront size - 1.
+ unsigned Size = Op.getValueType().getSizeInBits();
+ Known.Zero.setHighBits(Size - Subtarget->getWavefrontSizeLog2());
+ break;
+ }
+ default:
+ break;
+ }
+ }
}
}
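The MUL_U24/MUL_I24 case above bounds the number of significant product bits by the sum of the operands' significant bits; everything above that bound is known zero (or known one for a provably negative signed product). A tiny standalone illustration of the unsigned bound, with the operand widths below chosen purely as an example:

#include <cassert>
#include <cstdint>

int main() {
  uint32_t A = 0x000000FF;           // at most 8 significant bits
  uint32_t B = 0x000003FF;           // at most 10 significant bits
  uint32_t P = (A * B) & 0x00FFFFFF; // a 24-bit multiply keeps the low 24 bits
  assert((P >> 18) == 0);            // 8 + 10 = 18, so bits [31:18] are zero
}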
diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.h b/lib/Target/AMDGPU/AMDGPUISelLowering.h
index d85aada6053a..3f8a9b1964ca 100644
--- a/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ b/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -18,13 +18,13 @@
#include "AMDGPU.h"
#include "llvm/CodeGen/CallingConvLower.h"
-#include "llvm/Target/TargetLowering.h"
+#include "llvm/CodeGen/TargetLowering.h"
namespace llvm {
class AMDGPUMachineFunction;
class AMDGPUSubtarget;
-class MachineRegisterInfo;
+struct ArgDescriptor;
class AMDGPUTargetLowering : public TargetLowering {
private:
@@ -32,10 +32,11 @@ private:
/// legalized from a smaller type VT. Need to match pre-legalized type because
/// the generic legalization inserts the add/sub between the select and
/// compare.
- SDValue getFFBH_U32(SelectionDAG &DAG, SDValue Op, const SDLoc &DL) const;
+ SDValue getFFBX_U32(SelectionDAG &DAG, SDValue Op, const SDLoc &DL, unsigned Opc) const;
public:
- static bool isOrEquivalentToAdd(SelectionDAG &DAG, SDValue Op);
+ static unsigned numBitsUnsigned(SDValue Op, SelectionDAG &DAG);
+ static unsigned numBitsSigned(SDValue Op, SelectionDAG &DAG);
protected:
const AMDGPUSubtarget *Subtarget;
@@ -56,8 +57,10 @@ protected:
SDValue LowerFROUND64(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFROUND(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerFLOG(SDValue Op, SelectionDAG &Dag,
+ double Log2BaseInverted) const;
- SDValue LowerCTLZ(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerCTLZ_CTTZ(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerINT_TO_FP32(SDValue Op, SelectionDAG &DAG, bool Signed) const;
SDValue LowerINT_TO_FP64(SDValue Op, SelectionDAG &DAG, bool Signed) const;
@@ -88,7 +91,7 @@ protected:
SDValue performMulhsCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performMulhuCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performMulLoHi24Combine(SDNode *N, DAGCombinerInfo &DCI) const;
- SDValue performCtlzCombine(const SDLoc &SL, SDValue Cond, SDValue LHS,
+ SDValue performCtlz_CttzCombine(const SDLoc &SL, SDValue Cond, SDValue LHS,
SDValue RHS, DAGCombinerInfo &DCI) const;
SDValue performSelectCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performFNegCombine(SDNode *N, DAGCombinerInfo &DCI) const;
@@ -143,6 +146,7 @@ public:
bool isZExtFree(Type *Src, Type *Dest) const override;
bool isZExtFree(EVT Src, EVT Dest) const override;
bool isZExtFree(SDValue Val, EVT VT2) const override;
+ bool isFPExtFoldable(unsigned Opcode, EVT DestVT, EVT SrcVT) const override;
bool isNarrowingProfitable(EVT VT1, EVT VT2) const override;
@@ -171,6 +175,15 @@ public:
const SmallVectorImpl<ISD::OutputArg> &Outs,
const SmallVectorImpl<SDValue> &OutVals, const SDLoc &DL,
SelectionDAG &DAG) const override;
+
+ SDValue addTokenForArgument(SDValue Chain,
+ SelectionDAG &DAG,
+ MachineFrameInfo &MFI,
+ int ClobberedFI) const;
+
+ SDValue lowerUnhandledCall(CallLoweringInfo &CLI,
+ SmallVectorImpl<SDValue> &InVals,
+ StringRef Reason) const;
SDValue LowerCall(CallLoweringInfo &CLI,
SmallVectorImpl<SDValue> &InVals) const override;
@@ -237,6 +250,25 @@ public:
return CreateLiveInRegister(DAG, RC, Reg, VT, SDLoc(DAG.getEntryNode()), true);
}
+  /// Similar to CreateLiveInRegister, except the value may be loaded from a
+  /// stack slot rather than passed in a register.
+ SDValue loadStackInputValue(SelectionDAG &DAG,
+ EVT VT,
+ const SDLoc &SL,
+ int64_t Offset) const;
+
+ SDValue storeStackInputValue(SelectionDAG &DAG,
+ const SDLoc &SL,
+ SDValue Chain,
+ SDValue StackPtr,
+ SDValue ArgVal,
+ int64_t Offset) const;
+
+ SDValue loadInputValue(SelectionDAG &DAG,
+ const TargetRegisterClass *RC,
+ EVT VT, const SDLoc &SL,
+ const ArgDescriptor &Arg) const;
+
enum ImplicitParameter {
FIRST_IMPLICIT,
GRID_DIM = FIRST_IMPLICIT,
@@ -268,6 +300,7 @@ enum NodeType : unsigned {
// Function call.
CALL,
+ TC_RETURN,
TRAP,
// Masked control flow nodes.
@@ -342,12 +375,15 @@ enum NodeType : unsigned {
BFM, // Insert a range of bits into a 32-bit word.
FFBH_U32, // ctlz with -1 if input is zero.
FFBH_I32,
+ FFBL_B32, // cttz with -1 if input is zero.
MUL_U24,
MUL_I24,
MULHI_U24,
MULHI_I24,
MAD_U24,
MAD_I24,
+ MAD_U64_U32,
+ MAD_I64_I32,
MUL_LOHI_I24,
MUL_LOHI_U24,
TEXTURE_FETCH,
@@ -411,6 +447,19 @@ enum NodeType : unsigned {
ATOMIC_DEC,
BUFFER_LOAD,
BUFFER_LOAD_FORMAT,
+ BUFFER_STORE,
+ BUFFER_STORE_FORMAT,
+ BUFFER_ATOMIC_SWAP,
+ BUFFER_ATOMIC_ADD,
+ BUFFER_ATOMIC_SUB,
+ BUFFER_ATOMIC_SMIN,
+ BUFFER_ATOMIC_UMIN,
+ BUFFER_ATOMIC_SMAX,
+ BUFFER_ATOMIC_UMAX,
+ BUFFER_ATOMIC_AND,
+ BUFFER_ATOMIC_OR,
+ BUFFER_ATOMIC_XOR,
+ BUFFER_ATOMIC_CMPSWAP,
LAST_AMDGPU_ISD_NUMBER
};
diff --git a/lib/Target/AMDGPU/AMDGPUInline.cpp b/lib/Target/AMDGPU/AMDGPUInline.cpp
new file mode 100644
index 000000000000..ff9e7b50ed5c
--- /dev/null
+++ b/lib/Target/AMDGPU/AMDGPUInline.cpp
@@ -0,0 +1,208 @@
+//===- AMDGPUInline.cpp - Code to perform simple function inlining --------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief This is the AMDGPU-specific replacement for the standard inliner.
+/// The main purpose is to account for the fact that calls are not only
+/// expensive on AMDGPU, but much more expensive if a private memory pointer
+/// is passed to a function as an argument. In this situation, we are unable
+/// to eliminate private memory in the caller unless the callee is inlined,
+/// and end up with slow and expensive scratch access. Thus, we boost the
+/// inline threshold for such functions here.
+///
+//===----------------------------------------------------------------------===//
+
+
+#include "AMDGPU.h"
+#include "llvm/Transforms/IPO.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/CallGraph.h"
+#include "llvm/Analysis/InlineCost.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/CallSite.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Transforms/IPO/Inliner.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "inline"
+
+static cl::opt<int>
+ArgAllocaCost("amdgpu-inline-arg-alloca-cost", cl::Hidden, cl::init(2200),
+ cl::desc("Cost of alloca argument"));
+
+// If the amount of scratch memory to eliminate exceeds our ability to allocate
+// it into registers, we gain nothing by aggressively inlining functions for
+// that heuristic.
+static cl::opt<unsigned>
+ArgAllocaCutoff("amdgpu-inline-arg-alloca-cutoff", cl::Hidden, cl::init(256),
+ cl::desc("Maximum alloca size to use for inline cost"));
+
+namespace {
+
+class AMDGPUInliner : public LegacyInlinerBase {
+
+public:
+ AMDGPUInliner() : LegacyInlinerBase(ID) {
+ initializeAMDGPUInlinerPass(*PassRegistry::getPassRegistry());
+ Params = getInlineParams();
+ }
+
+ static char ID; // Pass identification, replacement for typeid
+
+ unsigned getInlineThreshold(CallSite CS) const;
+
+ InlineCost getInlineCost(CallSite CS) override;
+
+ bool runOnSCC(CallGraphSCC &SCC) override;
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override;
+
+private:
+ TargetTransformInfoWrapperPass *TTIWP;
+
+ InlineParams Params;
+};
+
+} // end anonymous namespace
+
+char AMDGPUInliner::ID = 0;
+INITIALIZE_PASS_BEGIN(AMDGPUInliner, "amdgpu-inline",
+ "AMDGPU Function Integration/Inlining", false, false)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_DEPENDENCY(CallGraphWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_END(AMDGPUInliner, "amdgpu-inline",
+ "AMDGPU Function Integration/Inlining", false, false)
+
+Pass *llvm::createAMDGPUFunctionInliningPass() { return new AMDGPUInliner(); }
+
+bool AMDGPUInliner::runOnSCC(CallGraphSCC &SCC) {
+ TTIWP = &getAnalysis<TargetTransformInfoWrapperPass>();
+ return LegacyInlinerBase::runOnSCC(SCC);
+}
+
+void AMDGPUInliner::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.addRequired<TargetTransformInfoWrapperPass>();
+ LegacyInlinerBase::getAnalysisUsage(AU);
+}
+
+unsigned AMDGPUInliner::getInlineThreshold(CallSite CS) const {
+ int Thres = Params.DefaultThreshold;
+
+ Function *Caller = CS.getCaller();
+ // Listen to the inlinehint attribute when it would increase the threshold
+ // and the caller does not need to minimize its size.
+ Function *Callee = CS.getCalledFunction();
+ bool InlineHint = Callee && !Callee->isDeclaration() &&
+ Callee->hasFnAttribute(Attribute::InlineHint);
+ if (InlineHint && Params.HintThreshold && Params.HintThreshold > Thres
+ && !Caller->hasFnAttribute(Attribute::MinSize))
+ Thres = Params.HintThreshold.getValue();
+
+ const DataLayout &DL = Caller->getParent()->getDataLayout();
+ if (!Callee)
+ return (unsigned)Thres;
+
+ const AMDGPUAS AS = AMDGPU::getAMDGPUAS(*Caller->getParent());
+
+  // If we have a pointer to a private array passed into a function,
+  // it will not be optimized out, leaving scratch usage.
+  // Increase the inline threshold to allow inlining in this case.
+ uint64_t AllocaSize = 0;
+ SmallPtrSet<const AllocaInst *, 8> AIVisited;
+ for (Value *PtrArg : CS.args()) {
+ Type *Ty = PtrArg->getType();
+ if (!Ty->isPointerTy() ||
+ Ty->getPointerAddressSpace() != AS.PRIVATE_ADDRESS)
+ continue;
+ PtrArg = GetUnderlyingObject(PtrArg, DL);
+ if (const AllocaInst *AI = dyn_cast<AllocaInst>(PtrArg)) {
+ if (!AI->isStaticAlloca() || !AIVisited.insert(AI).second)
+ continue;
+ AllocaSize += DL.getTypeAllocSize(AI->getAllocatedType());
+ // If the amount of stack memory is excessive we will not be able
+ // to get rid of the scratch anyway, bail out.
+ if (AllocaSize > ArgAllocaCutoff) {
+ AllocaSize = 0;
+ break;
+ }
+ }
+ }
+ if (AllocaSize)
+ Thres += ArgAllocaCost;
+
+ return (unsigned)Thres;
+}
+
+// Check if a call is just a wrapper around another call.
+// In this case we only have call and ret instructions.
+static bool isWrapperOnlyCall(CallSite CS) {
+ Function *Callee = CS.getCalledFunction();
+ if (!Callee || Callee->size() != 1)
+ return false;
+ const BasicBlock &BB = Callee->getEntryBlock();
+ if (const Instruction *I = BB.getFirstNonPHI()) {
+ if (!isa<CallInst>(I)) {
+ return false;
+ }
+ if (isa<ReturnInst>(*std::next(I->getIterator()))) {
+ DEBUG(dbgs() << " Wrapper only call detected: "
+ << Callee->getName() << '\n');
+ return true;
+ }
+ }
+ return false;
+}
+
+InlineCost AMDGPUInliner::getInlineCost(CallSite CS) {
+ Function *Callee = CS.getCalledFunction();
+ Function *Caller = CS.getCaller();
+ TargetTransformInfo &TTI = TTIWP->getTTI(*Callee);
+
+ if (!Callee || Callee->isDeclaration() || CS.isNoInline() ||
+ !TTI.areInlineCompatible(Caller, Callee))
+ return llvm::InlineCost::getNever();
+
+ if (CS.hasFnAttr(Attribute::AlwaysInline)) {
+ if (isInlineViable(*Callee))
+ return llvm::InlineCost::getAlways();
+ return llvm::InlineCost::getNever();
+ }
+
+ if (isWrapperOnlyCall(CS))
+ return llvm::InlineCost::getAlways();
+
+ InlineParams LocalParams = Params;
+ LocalParams.DefaultThreshold = (int)getInlineThreshold(CS);
+ bool RemarksEnabled = false;
+ const auto &BBs = Caller->getBasicBlockList();
+ if (!BBs.empty()) {
+ auto DI = OptimizationRemark(DEBUG_TYPE, "", DebugLoc(), &BBs.front());
+ if (DI.isEnabled())
+ RemarksEnabled = true;
+ }
+
+ OptimizationRemarkEmitter ORE(Caller);
+ std::function<AssumptionCache &(Function &)> GetAssumptionCache =
+ [this](Function &F) -> AssumptionCache & {
+ return ACT->getAssumptionCache(F);
+ };
+
+ return llvm::getInlineCost(CS, Callee, LocalParams, TTI, GetAssumptionCache,
+ None, PSI, RemarksEnabled ? &ORE : nullptr);
+}
diff --git a/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp b/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp
index 69dc52986172..8156599528c2 100644
--- a/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp
+++ b/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp
@@ -23,14 +23,15 @@
using namespace llvm;
#define GET_INSTRINFO_CTOR_DTOR
-#define GET_INSTRMAP_INFO
#include "AMDGPUGenInstrInfo.inc"
// Pin the vtable to this file.
void AMDGPUInstrInfo::anchor() {}
AMDGPUInstrInfo::AMDGPUInstrInfo(const AMDGPUSubtarget &ST)
- : AMDGPUGenInstrInfo(-1, -1), ST(ST), AMDGPUASI(ST.getAMDGPUAS()) {}
+ : AMDGPUGenInstrInfo(AMDGPU::ADJCALLSTACKUP, AMDGPU::ADJCALLSTACKDOWN),
+ ST(ST),
+ AMDGPUASI(ST.getAMDGPUAS()) {}
// FIXME: This behaves strangely. If, for example, you have 32 load + stores,
// the first 16 loads will be interleaved with the stores, and the next 16 will
@@ -54,34 +55,15 @@ bool AMDGPUInstrInfo::shouldScheduleLoadsNear(SDNode *Load0, SDNode *Load1,
return (NumLoads <= 16 && (Offset1 - Offset0) < 64);
}
-int AMDGPUInstrInfo::getMaskedMIMGOp(uint16_t Opcode, unsigned Channels) const {
- switch (Channels) {
- default: return Opcode;
- case 1: return AMDGPU::getMaskedMIMGOp(Opcode, AMDGPU::Channels_1);
- case 2: return AMDGPU::getMaskedMIMGOp(Opcode, AMDGPU::Channels_2);
- case 3: return AMDGPU::getMaskedMIMGOp(Opcode, AMDGPU::Channels_3);
- }
-}
-
// This must be kept in sync with the SIEncodingFamily class in SIInstrInfo.td
enum SIEncodingFamily {
SI = 0,
VI = 1,
SDWA = 2,
- SDWA9 = 3
+ SDWA9 = 3,
+ GFX9 = 4
};
-// Wrapper for Tablegen'd function. enum Subtarget is not defined in any
-// header files, so we need to wrap it in a function that takes unsigned
-// instead.
-namespace llvm {
-namespace AMDGPU {
-static int getMCOpcode(uint16_t Opcode, unsigned Gen) {
- return getMCOpcodeGen(Opcode, static_cast<Subtarget>(Gen));
-}
-}
-}
-
static SIEncodingFamily subtargetEncodingFamily(const AMDGPUSubtarget &ST) {
switch (ST.getGeneration()) {
case AMDGPUSubtarget::SOUTHERN_ISLANDS:
@@ -104,6 +86,11 @@ static SIEncodingFamily subtargetEncodingFamily(const AMDGPUSubtarget &ST) {
int AMDGPUInstrInfo::pseudoToMCOpcode(int Opcode) const {
SIEncodingFamily Gen = subtargetEncodingFamily(ST);
+
+ if ((get(Opcode).TSFlags & SIInstrFlags::renamedInGFX9) != 0 &&
+ ST.getGeneration() >= AMDGPUSubtarget::GFX9)
+ Gen = SIEncodingFamily::GFX9;
+
if (get(Opcode).TSFlags & SIInstrFlags::SDWA)
Gen = ST.getGeneration() == AMDGPUSubtarget::GFX9 ? SIEncodingFamily::SDWA9
: SIEncodingFamily::SDWA;
diff --git a/lib/Target/AMDGPU/AMDGPUInstrInfo.h b/lib/Target/AMDGPU/AMDGPUInstrInfo.h
index 41cc7d7093ec..a9fcd4834638 100644
--- a/lib/Target/AMDGPU/AMDGPUInstrInfo.h
+++ b/lib/Target/AMDGPU/AMDGPUInstrInfo.h
@@ -18,10 +18,11 @@
#include "AMDGPU.h"
#include "Utils/AMDGPUBaseInfo.h"
-#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
#define GET_INSTRINFO_HEADER
#include "AMDGPUGenInstrInfo.inc"
+#undef GET_INSTRINFO_HEADER
namespace llvm {
@@ -49,10 +50,6 @@ public:
/// Return -1 if the target-specific opcode for the pseudo instruction does
/// not exist. If Opcode is not a pseudo instruction, this is identity.
int pseudoToMCOpcode(int Opcode) const;
-
- /// \brief Given a MIMG \p Opcode that writes all 4 channels, return the
- /// equivalent opcode that writes \p Channels Channels.
- int getMaskedMIMGOp(uint16_t Opcode, unsigned Channels) const;
};
} // End llvm namespace
diff --git a/lib/Target/AMDGPU/AMDGPUInstrInfo.td b/lib/Target/AMDGPU/AMDGPUInstrInfo.td
index bcf89bb78ad6..c024010f3e96 100644
--- a/lib/Target/AMDGPU/AMDGPUInstrInfo.td
+++ b/lib/Target/AMDGPU/AMDGPUInstrInfo.td
@@ -74,6 +74,8 @@ def AMDGPUAddeSubeOp : SDTypeProfile<2, 3,
[SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>, SDTCisVT<0, i32>, SDTCisVT<1, i1>, SDTCisVT<4, i1>]
>;
+def SDT_AMDGPUTCRET : SDTypeProfile<0, 2, [SDTCisPtrTy<0>]>;
+
//===----------------------------------------------------------------------===//
// AMDGPU DAG Nodes
//
@@ -82,6 +84,26 @@ def AMDGPUif : SDNode<"AMDGPUISD::IF", AMDGPUIfOp, [SDNPHasChain]>;
def AMDGPUelse : SDNode<"AMDGPUISD::ELSE", AMDGPUElseOp, [SDNPHasChain]>;
def AMDGPUloop : SDNode<"AMDGPUISD::LOOP", AMDGPULoopOp, [SDNPHasChain]>;
+def callseq_start : SDNode<"ISD::CALLSEQ_START",
+ SDCallSeqStart<[ SDTCisVT<0, i32>, SDTCisVT<1, i32> ]>,
+ [SDNPHasChain, SDNPOutGlue]
+>;
+
+def callseq_end : SDNode<"ISD::CALLSEQ_END",
+ SDCallSeqEnd<[ SDTCisVT<0, i32>, SDTCisVT<1, i32> ]>,
+ [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]
+>;
+
+def AMDGPUcall : SDNode<"AMDGPUISD::CALL",
+ SDTypeProfile<0, -1, [SDTCisPtrTy<0>]>,
+ [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue,
+ SDNPVariadic]
+>;
+
+def AMDGPUtc_return: SDNode<"AMDGPUISD::TC_RETURN", SDT_AMDGPUTCRET,
+ [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]
+>;
+
def AMDGPUtrap : SDNode<"AMDGPUISD::TRAP",
SDTypeProfile<0, -1, [SDTCisVT<0, i16>]>,
[SDNPHasChain, SDNPVariadic, SDNPSideEffect, SDNPInGlue]
@@ -276,6 +298,8 @@ def AMDGPUbfm : SDNode<"AMDGPUISD::BFM", SDTIntBinOp>;
def AMDGPUffbh_u32 : SDNode<"AMDGPUISD::FFBH_U32", SDTIntUnaryOp>;
def AMDGPUffbh_i32 : SDNode<"AMDGPUISD::FFBH_I32", SDTIntUnaryOp>;
+def AMDGPUffbl_b32 : SDNode<"AMDGPUISD::FFBL_B32", SDTIntUnaryOp>;
+
// Signed and unsigned 24-bit multiply. The highest 8 bits are ignored
// when performing the multiply. The result is a 32-bit value.
def AMDGPUmul_u24 : SDNode<"AMDGPUISD::MUL_U24", SDTIntBinOp,
diff --git a/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index e54c887d6090..16d240e96196 100644
--- a/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -402,7 +402,8 @@ bool AMDGPUInstructionSelector::selectG_LOAD(MachineInstr &I) const {
return Ret;
}
-bool AMDGPUInstructionSelector::select(MachineInstr &I) const {
+bool AMDGPUInstructionSelector::select(MachineInstr &I,
+ CodeGenCoverage &CoverageInfo) const {
if (!isPreISelGenericOpcode(I.getOpcode()))
return true;
diff --git a/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
index ef845f44d365..715c4882f380 100644
--- a/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
+++ b/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
@@ -35,7 +35,8 @@ public:
AMDGPUInstructionSelector(const SISubtarget &STI,
const AMDGPURegisterBankInfo &RBI);
- bool select(MachineInstr &I) const override;
+ bool select(MachineInstr &I, CodeGenCoverage &CoverageInfo) const override;
+
private:
struct GEPInfo {
const MachineInstr &GEP;
diff --git a/lib/Target/AMDGPU/AMDGPUInstructions.td b/lib/Target/AMDGPU/AMDGPUInstructions.td
index 4e688ab0b105..31f728b0c22f 100644
--- a/lib/Target/AMDGPU/AMDGPUInstructions.td
+++ b/lib/Target/AMDGPU/AMDGPUInstructions.td
@@ -42,10 +42,14 @@ class AMDGPUShaderInst <dag outs, dag ins, string asm = "",
field bits<32> Inst = 0xffffffff;
}
-def FP16Denormals : Predicate<"Subtarget.hasFP16Denormals()">;
-def FP32Denormals : Predicate<"Subtarget.hasFP32Denormals()">;
-def FP64Denormals : Predicate<"Subtarget.hasFP64Denormals()">;
+def FP16Denormals : Predicate<"Subtarget->hasFP16Denormals()">;
+def FP32Denormals : Predicate<"Subtarget->hasFP32Denormals()">;
+def FP64Denormals : Predicate<"Subtarget->hasFP64Denormals()">;
+def NoFP16Denormals : Predicate<"!Subtarget->hasFP16Denormals()">;
+def NoFP32Denormals : Predicate<"!Subtarget->hasFP32Denormals()">;
+def NoFP64Denormals : Predicate<"!Subtarget->hasFP64Denormals()">;
def UnsafeFPMath : Predicate<"TM.Options.UnsafeFPMath">;
+def FMA : Predicate<"Subtarget->hasFMA()">;
def InstFlag : OperandWithDefaultOps <i32, (ops (i32 0))>;
def ADDRIndirect : ComplexPattern<iPTR, 2, "SelectADDRIndirect", [], []>;
@@ -130,6 +134,29 @@ def shl_oneuse : HasOneUseBinOp<shl>;
def select_oneuse : HasOneUseTernaryOp<select>;
+def srl_16 : PatFrag<
+ (ops node:$src0), (srl_oneuse node:$src0, (i32 16))
+>;
+
+
+def hi_i16_elt : PatFrag<
+ (ops node:$src0), (i16 (trunc (i32 (srl_16 node:$src0))))
+>;
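+// hi_i16_elt matches the high half of a 32-bit value as an i16, i.e.
+// (i16 (trunc (srl $src0, 16))), where the shift has a single use.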
+
+
+def hi_f16_elt : PatLeaf<
+ (vt), [{
+ if (N->getOpcode() != ISD::BITCAST)
+ return false;
+ SDValue Tmp = N->getOperand(0);
+
+ if (Tmp.getOpcode() != ISD::SRL)
+ return false;
+ if (const auto *RHS = dyn_cast<ConstantSDNode>(Tmp.getOperand(1)))
+ return RHS->getZExtValue() == 16;
+ return false;
+}]>;
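+// hi_f16_elt accepts a bitcast of a value shifted right by a constant 16,
+// i.e. the upper 16 bits reinterpreted as the high f16 element.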
+
//===----------------------------------------------------------------------===//
// PatLeafs for floating-point comparisons
//===----------------------------------------------------------------------===//
@@ -164,7 +191,6 @@ def COND_OLE : PatLeaf <
[{return N->get() == ISD::SETOLE || N->get() == ISD::SETLE;}]
>;
-
def COND_O : PatLeaf <(cond), [{return N->get() == ISD::SETO;}]>;
def COND_UO : PatLeaf <(cond), [{return N->get() == ISD::SETUO;}]>;
@@ -219,75 +245,53 @@ def COND_NULL : PatLeaf <
// Load/Store Pattern Fragments
//===----------------------------------------------------------------------===//
-class PrivateMemOp <dag ops, dag frag> : PatFrag <ops, frag, [{
- return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUASI.PRIVATE_ADDRESS;
+class Aligned8Bytes <dag ops, dag frag> : PatFrag <ops, frag, [{
+ return cast<MemSDNode>(N)->getAlignment() % 8 == 0;
}]>;
-class PrivateLoad <SDPatternOperator op> : PrivateMemOp <
- (ops node:$ptr), (op node:$ptr)
->;
+class LoadFrag <SDPatternOperator op> : PatFrag<(ops node:$ptr), (op node:$ptr)>;
-class PrivateStore <SDPatternOperator op> : PrivateMemOp <
+class StoreFrag<SDPatternOperator op> : PatFrag <
(ops node:$value, node:$ptr), (op node:$value, node:$ptr)
>;
-def load_private : PrivateLoad <load>;
-
-def truncstorei8_private : PrivateStore <truncstorei8>;
-def truncstorei16_private : PrivateStore <truncstorei16>;
-def store_private : PrivateStore <store>;
-
-class GlobalMemOp <dag ops, dag frag> : PatFrag <ops, frag, [{
- return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUASI.GLOBAL_ADDRESS;
-}]>;
-
-// Global address space loads
-class GlobalLoad <SDPatternOperator op> : GlobalMemOp <
- (ops node:$ptr), (op node:$ptr)
+class StoreHi16<SDPatternOperator op> : PatFrag <
+ (ops node:$value, node:$ptr), (op (srl node:$value, (i32 16)), node:$ptr)
>;
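+// StoreHi16 matches a store of the high 16 bits of a 32-bit value, i.e.
+// (op (srl $value, 16), $ptr).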
-def global_load : GlobalLoad <load>;
-
-// Global address space stores
-class GlobalStore <SDPatternOperator op> : GlobalMemOp <
- (ops node:$value, node:$ptr), (op node:$value, node:$ptr)
->;
-
-def global_store : GlobalStore <store>;
-def global_store_atomic : GlobalStore<atomic_store>;
-
+class PrivateAddress : CodePatPred<[{
+ return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUASI.PRIVATE_ADDRESS;
+}]>;
-class ConstantMemOp <dag ops, dag frag> : PatFrag <ops, frag, [{
+class ConstantAddress : CodePatPred<[{
return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS;
}]>;
-// Constant address space loads
-class ConstantLoad <SDPatternOperator op> : ConstantMemOp <
- (ops node:$ptr), (op node:$ptr)
->;
-
-def constant_load : ConstantLoad<load>;
-
-class LocalMemOp <dag ops, dag frag> : PatFrag <ops, frag, [{
+class LocalAddress : CodePatPred<[{
return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUASI.LOCAL_ADDRESS;
}]>;
-// Local address space loads
-class LocalLoad <SDPatternOperator op> : LocalMemOp <
- (ops node:$ptr), (op node:$ptr)
->;
+class GlobalAddress : CodePatPred<[{
+ return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUASI.GLOBAL_ADDRESS;
+}]>;
-class LocalStore <SDPatternOperator op> : LocalMemOp <
- (ops node:$value, node:$ptr), (op node:$value, node:$ptr)
->;
+class GlobalLoadAddress : CodePatPred<[{
+ auto AS = cast<MemSDNode>(N)->getAddressSpace();
+ return AS == AMDGPUASI.GLOBAL_ADDRESS || AS == AMDGPUASI.CONSTANT_ADDRESS;
+}]>;
-class FlatMemOp <dag ops, dag frag> : PatFrag <ops, frag, [{
- return cast<MemSDNode>(N)->getAddressSPace() == AMDGPUASI.FLAT_ADDRESS;
+class FlatLoadAddress : CodePatPred<[{
+ const auto AS = cast<MemSDNode>(N)->getAddressSpace();
+ return AS == AMDGPUASI.FLAT_ADDRESS ||
+ AS == AMDGPUASI.GLOBAL_ADDRESS ||
+ AS == AMDGPUASI.CONSTANT_ADDRESS;
}]>;
-class FlatLoad <SDPatternOperator op> : FlatMemOp <
- (ops node:$ptr), (op node:$ptr)
->;
+class FlatStoreAddress : CodePatPred<[{
+ const auto AS = cast<MemSDNode>(N)->getAddressSpace();
+ return AS == AMDGPUASI.FLAT_ADDRESS ||
+ AS == AMDGPUASI.GLOBAL_ADDRESS;
+}]>;
class AZExtLoadBase <SDPatternOperator ld_node>: PatFrag<(ops node:$ptr),
(ld_node node:$ptr), [{
@@ -302,72 +306,105 @@ def az_extloadi8 : PatFrag<(ops node:$ptr), (az_extload node:$ptr), [{
return cast<LoadSDNode>(N)->getMemoryVT() == MVT::i8;
}]>;
-def az_extloadi8_global : GlobalLoad <az_extloadi8>;
-def sextloadi8_global : GlobalLoad <sextloadi8>;
+def az_extloadi16 : PatFrag<(ops node:$ptr), (az_extload node:$ptr), [{
+ return cast<LoadSDNode>(N)->getMemoryVT() == MVT::i16;
+}]>;
-def az_extloadi8_constant : ConstantLoad <az_extloadi8>;
-def sextloadi8_constant : ConstantLoad <sextloadi8>;
+def az_extloadi32 : PatFrag<(ops node:$ptr), (az_extload node:$ptr), [{
+ return cast<LoadSDNode>(N)->getMemoryVT() == MVT::i32;
+}]>;
-def az_extloadi8_local : LocalLoad <az_extloadi8>;
-def sextloadi8_local : LocalLoad <sextloadi8>;
+class PrivateLoad <SDPatternOperator op> : LoadFrag <op>, PrivateAddress;
+class PrivateStore <SDPatternOperator op> : StoreFrag <op>, PrivateAddress;
-def extloadi8_private : PrivateLoad <az_extloadi8>;
-def sextloadi8_private : PrivateLoad <sextloadi8>;
+class LocalLoad <SDPatternOperator op> : LoadFrag <op>, LocalAddress;
+class LocalStore <SDPatternOperator op> : StoreFrag <op>, LocalAddress;
-def az_extloadi16 : PatFrag<(ops node:$ptr), (az_extload node:$ptr), [{
- return cast<LoadSDNode>(N)->getMemoryVT() == MVT::i16;
-}]>;
+class GlobalLoad <SDPatternOperator op> : LoadFrag<op>, GlobalLoadAddress;
+class GlobalStore <SDPatternOperator op> : StoreFrag<op>, GlobalAddress;
-def az_extloadi16_global : GlobalLoad <az_extloadi16>;
-def sextloadi16_global : GlobalLoad <sextloadi16>;
+class FlatLoad <SDPatternOperator op> : LoadFrag <op>, FlatLoadAddress;
+class FlatStore <SDPatternOperator op> : StoreFrag <op>, FlatStoreAddress;
-def az_extloadi16_constant : ConstantLoad <az_extloadi16>;
-def sextloadi16_constant : ConstantLoad <sextloadi16>;
+class ConstantLoad <SDPatternOperator op> : LoadFrag <op>, ConstantAddress;
-def az_extloadi16_local : LocalLoad <az_extloadi16>;
-def sextloadi16_local : LocalLoad <sextloadi16>;
-def extloadi16_private : PrivateLoad <az_extloadi16>;
+def load_private : PrivateLoad <load>;
+def az_extloadi8_private : PrivateLoad <az_extloadi8>;
+def sextloadi8_private : PrivateLoad <sextloadi8>;
+def az_extloadi16_private : PrivateLoad <az_extloadi16>;
def sextloadi16_private : PrivateLoad <sextloadi16>;
-def az_extloadi32 : PatFrag<(ops node:$ptr), (az_extload node:$ptr), [{
- return cast<LoadSDNode>(N)->getMemoryVT() == MVT::i32;
-}]>;
-
-def az_extloadi32_global : GlobalLoad <az_extloadi32>;
+def store_private : PrivateStore <store>;
+def truncstorei8_private : PrivateStore<truncstorei8>;
+def truncstorei16_private : PrivateStore <truncstorei16>;
+def store_hi16_private : StoreHi16 <truncstorei16>, PrivateAddress;
+def truncstorei8_hi16_private : StoreHi16<truncstorei8>, PrivateAddress;
-def az_extloadi32_flat : FlatLoad <az_extloadi32>;
-def az_extloadi32_constant : ConstantLoad <az_extloadi32>;
+def load_global : GlobalLoad <load>;
+def sextloadi8_global : GlobalLoad <sextloadi8>;
+def az_extloadi8_global : GlobalLoad <az_extloadi8>;
+def sextloadi16_global : GlobalLoad <sextloadi16>;
+def az_extloadi16_global : GlobalLoad <az_extloadi16>;
+def atomic_load_global : GlobalLoad<atomic_load>;
+def store_global : GlobalStore <store>;
def truncstorei8_global : GlobalStore <truncstorei8>;
def truncstorei16_global : GlobalStore <truncstorei16>;
+def store_atomic_global : GlobalStore<atomic_store>;
+def truncstorei8_hi16_global : StoreHi16 <truncstorei8>, GlobalAddress;
+def truncstorei16_hi16_global : StoreHi16 <truncstorei16>, GlobalAddress;
-def local_store : LocalStore <store>;
+def load_local : LocalLoad <load>;
+def az_extloadi8_local : LocalLoad <az_extloadi8>;
+def sextloadi8_local : LocalLoad <sextloadi8>;
+def az_extloadi16_local : LocalLoad <az_extloadi16>;
+def sextloadi16_local : LocalLoad <sextloadi16>;
+
+def store_local : LocalStore <store>;
def truncstorei8_local : LocalStore <truncstorei8>;
def truncstorei16_local : LocalStore <truncstorei16>;
+def store_local_hi16 : StoreHi16 <truncstorei16>, LocalAddress;
+def truncstorei8_local_hi16 : StoreHi16<truncstorei8>, LocalAddress;
-def local_load : LocalLoad <load>;
-
-class Aligned8Bytes <dag ops, dag frag> : PatFrag <ops, frag, [{
- return cast<MemSDNode>(N)->getAlignment() % 8 == 0;
-}]>;
-
-def local_load_aligned8bytes : Aligned8Bytes <
- (ops node:$ptr), (local_load node:$ptr)
+def load_align8_local : Aligned8Bytes <
+ (ops node:$ptr), (load_local node:$ptr)
>;
-def local_store_aligned8bytes : Aligned8Bytes <
- (ops node:$val, node:$ptr), (local_store node:$val, node:$ptr)
+def store_align8_local : Aligned8Bytes <
+ (ops node:$val, node:$ptr), (store_local node:$val, node:$ptr)
>;
+
+def load_flat : FlatLoad <load>;
+def az_extloadi8_flat : FlatLoad <az_extloadi8>;
+def sextloadi8_flat : FlatLoad <sextloadi8>;
+def az_extloadi16_flat : FlatLoad <az_extloadi16>;
+def sextloadi16_flat : FlatLoad <sextloadi16>;
+def atomic_load_flat : FlatLoad<atomic_load>;
+
+def store_flat : FlatStore <store>;
+def truncstorei8_flat : FlatStore <truncstorei8>;
+def truncstorei16_flat : FlatStore <truncstorei16>;
+def atomic_store_flat : FlatStore <atomic_store>;
+def truncstorei8_hi16_flat : StoreHi16<truncstorei8>, FlatStoreAddress;
+def truncstorei16_hi16_flat : StoreHi16<truncstorei16>, FlatStoreAddress;
+
+
+def constant_load : ConstantLoad<load>;
+def sextloadi8_constant : ConstantLoad <sextloadi8>;
+def az_extloadi8_constant : ConstantLoad <az_extloadi8>;
+def sextloadi16_constant : ConstantLoad <sextloadi16>;
+def az_extloadi16_constant : ConstantLoad <az_extloadi16>;
+
+
class local_binary_atomic_op<SDNode atomic_op> :
PatFrag<(ops node:$ptr, node:$value),
(atomic_op node:$ptr, node:$value), [{
return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUASI.LOCAL_ADDRESS;
}]>;
-
def atomic_swap_local : local_binary_atomic_op<atomic_swap>;
def atomic_load_add_local : local_binary_atomic_op<atomic_load_add>;
def atomic_load_sub_local : local_binary_atomic_op<atomic_load_sub>;
@@ -385,26 +422,14 @@ def mskor_global : PatFrag<(ops node:$val, node:$ptr),
return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUASI.GLOBAL_ADDRESS;
}]>;
-multiclass AtomicCmpSwapLocal <SDNode cmp_swap_node> {
-
- def _32_local : PatFrag <
- (ops node:$ptr, node:$cmp, node:$swap),
- (cmp_swap_node node:$ptr, node:$cmp, node:$swap), [{
- AtomicSDNode *AN = cast<AtomicSDNode>(N);
- return AN->getMemoryVT() == MVT::i32 &&
- AN->getAddressSpace() == AMDGPUASI.LOCAL_ADDRESS;
- }]>;
-
- def _64_local : PatFrag<
+class AtomicCmpSwapLocal <SDNode cmp_swap_node> : PatFrag<
(ops node:$ptr, node:$cmp, node:$swap),
(cmp_swap_node node:$ptr, node:$cmp, node:$swap), [{
AtomicSDNode *AN = cast<AtomicSDNode>(N);
- return AN->getMemoryVT() == MVT::i64 &&
- AN->getAddressSpace() == AMDGPUASI.LOCAL_ADDRESS;
- }]>;
-}
+ return AN->getAddressSpace() == AMDGPUASI.LOCAL_ADDRESS;
+}]>;
-defm atomic_cmp_swap : AtomicCmpSwapLocal <atomic_cmp_swap>;
+def atomic_cmp_swap_local : AtomicCmpSwapLocal <atomic_cmp_swap>;
multiclass global_binary_atomic_op<SDNode atomic_op> {
def "" : PatFrag<
@@ -434,26 +459,25 @@ defm atomic_umax_global : global_binary_atomic_op<atomic_load_umax>;
defm atomic_umin_global : global_binary_atomic_op<atomic_load_umin>;
defm atomic_xor_global : global_binary_atomic_op<atomic_load_xor>;
-//legacy
+// Legacy.
def AMDGPUatomic_cmp_swap_global : PatFrag<
- (ops node:$ptr, node:$value),
- (AMDGPUatomic_cmp_swap node:$ptr, node:$value),
- [{return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUASI.GLOBAL_ADDRESS;}]>;
+ (ops node:$ptr, node:$value),
+ (AMDGPUatomic_cmp_swap node:$ptr, node:$value)>, GlobalAddress;
def atomic_cmp_swap_global : PatFrag<
- (ops node:$ptr, node:$cmp, node:$value),
- (atomic_cmp_swap node:$ptr, node:$cmp, node:$value),
- [{return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUASI.GLOBAL_ADDRESS;}]>;
+ (ops node:$ptr, node:$cmp, node:$value),
+ (atomic_cmp_swap node:$ptr, node:$cmp, node:$value)>, GlobalAddress;
+
def atomic_cmp_swap_global_noret : PatFrag<
- (ops node:$ptr, node:$cmp, node:$value),
- (atomic_cmp_swap node:$ptr, node:$cmp, node:$value),
- [{return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUASI.GLOBAL_ADDRESS && (SDValue(N, 0).use_empty());}]>;
+ (ops node:$ptr, node:$cmp, node:$value),
+ (atomic_cmp_swap node:$ptr, node:$cmp, node:$value),
+ [{return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUASI.GLOBAL_ADDRESS && (SDValue(N, 0).use_empty());}]>;
def atomic_cmp_swap_global_ret : PatFrag<
- (ops node:$ptr, node:$cmp, node:$value),
- (atomic_cmp_swap node:$ptr, node:$cmp, node:$value),
- [{return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUASI.GLOBAL_ADDRESS && (!SDValue(N, 0).use_empty());}]>;
+ (ops node:$ptr, node:$cmp, node:$value),
+ (atomic_cmp_swap node:$ptr, node:$cmp, node:$value),
+ [{return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUASI.GLOBAL_ADDRESS && (!SDValue(N, 0).use_empty());}]>;
//===----------------------------------------------------------------------===//
// Misc Pattern Fragments
@@ -488,64 +512,11 @@ def FP_HALF : PatLeaf <
[{return N->isExactlyValue(0.5);}]
>;
-let isCodeGenOnly = 1, isPseudo = 1 in {
-
-let usesCustomInserter = 1 in {
-
-class CLAMP <RegisterClass rc> : AMDGPUShaderInst <
- (outs rc:$dst),
- (ins rc:$src0),
- "CLAMP $dst, $src0",
- [(set f32:$dst, (AMDGPUclamp f32:$src0))]
->;
-
-class FABS <RegisterClass rc> : AMDGPUShaderInst <
- (outs rc:$dst),
- (ins rc:$src0),
- "FABS $dst, $src0",
- [(set f32:$dst, (fabs f32:$src0))]
->;
-
-class FNEG <RegisterClass rc> : AMDGPUShaderInst <
- (outs rc:$dst),
- (ins rc:$src0),
- "FNEG $dst, $src0",
- [(set f32:$dst, (fneg f32:$src0))]
->;
-
-} // usesCustomInserter = 1
-
-multiclass RegisterLoadStore <RegisterClass dstClass, Operand addrClass,
- ComplexPattern addrPat> {
-let UseNamedOperandTable = 1 in {
-
- def RegisterLoad : AMDGPUShaderInst <
- (outs dstClass:$dst),
- (ins addrClass:$addr, i32imm:$chan),
- "RegisterLoad $dst, $addr",
- [(set i32:$dst, (AMDGPUregister_load addrPat:$addr, (i32 timm:$chan)))]
- > {
- let isRegisterLoad = 1;
- }
-
- def RegisterStore : AMDGPUShaderInst <
- (outs),
- (ins dstClass:$val, addrClass:$addr, i32imm:$chan),
- "RegisterStore $val, $addr",
- [(AMDGPUregister_store i32:$val, addrPat:$addr, (i32 timm:$chan))]
- > {
- let isRegisterStore = 1;
- }
-}
-}
-
-} // End isCodeGenOnly = 1, isPseudo = 1
-
/* Generic helper patterns for intrinsics */
/* -------------------------------------- */
class POW_Common <AMDGPUInst log_ieee, AMDGPUInst exp_ieee, AMDGPUInst mul>
- : Pat <
+ : AMDGPUPat <
(fpow f32:$src0, f32:$src1),
(exp_ieee (mul f32:$src1, (log_ieee f32:$src0)))
>;
@@ -556,30 +527,34 @@ class POW_Common <AMDGPUInst log_ieee, AMDGPUInst exp_ieee, AMDGPUInst mul>
/* Extract element pattern */
class Extract_Element <ValueType sub_type, ValueType vec_type, int sub_idx,
SubRegIndex sub_reg>
- : Pat<
+ : AMDGPUPat<
(sub_type (extractelt vec_type:$src, sub_idx)),
(EXTRACT_SUBREG $src, sub_reg)
->;
+> {
+ let SubtargetPredicate = TruePredicate;
+}
/* Insert element pattern */
class Insert_Element <ValueType elem_type, ValueType vec_type,
int sub_idx, SubRegIndex sub_reg>
- : Pat <
+ : AMDGPUPat <
(insertelt vec_type:$vec, elem_type:$elem, sub_idx),
(INSERT_SUBREG $vec, $elem, sub_reg)
->;
+> {
+ let SubtargetPredicate = TruePredicate;
+}
// XXX: Convert to new syntax and use COPY_TO_REG, once the DFAPacketizer
// can handle COPY instructions.
// bitconvert pattern
-class BitConvert <ValueType dt, ValueType st, RegisterClass rc> : Pat <
+class BitConvert <ValueType dt, ValueType st, RegisterClass rc> : AMDGPUPat <
(dt (bitconvert (st rc:$src0))),
(dt rc:$src0)
>;
// XXX: Convert to new syntax and use COPY_TO_REG, once the DFAPacketizer
// can handle COPY instructions.
-class DwordAddrPat<ValueType vt, RegisterClass rc> : Pat <
+class DwordAddrPat<ValueType vt, RegisterClass rc> : AMDGPUPat <
(vt (AMDGPUdwordaddr (vt rc:$addr))),
(vt rc:$addr)
>;
@@ -591,30 +566,30 @@ multiclass BFIPatterns <Instruction BFI_INT,
RegisterClass RC64> {
// Definition from ISA doc:
// (y & x) | (z & ~x)
- def : Pat <
+ def : AMDGPUPat <
(or (and i32:$y, i32:$x), (and i32:$z, (not i32:$x))),
(BFI_INT $x, $y, $z)
>;
// SHA-256 Ch function
// z ^ (x & (y ^ z))
- def : Pat <
+ def : AMDGPUPat <
(xor i32:$z, (and i32:$x, (xor i32:$y, i32:$z))),
(BFI_INT $x, $y, $z)
>;
- def : Pat <
+ def : AMDGPUPat <
(fcopysign f32:$src0, f32:$src1),
(BFI_INT (LoadImm32 (i32 0x7fffffff)), $src0, $src1)
>;
- def : Pat <
+ def : AMDGPUPat <
(f32 (fcopysign f32:$src0, f64:$src1)),
(BFI_INT (LoadImm32 (i32 0x7fffffff)), $src0,
(i32 (EXTRACT_SUBREG $src1, sub1)))
>;
- def : Pat <
+ def : AMDGPUPat <
(f64 (fcopysign f64:$src0, f64:$src1)),
(REG_SEQUENCE RC64,
(i32 (EXTRACT_SUBREG $src0, sub0)), sub0,
@@ -623,7 +598,7 @@ multiclass BFIPatterns <Instruction BFI_INT,
(i32 (EXTRACT_SUBREG $src1, sub1))), sub1)
>;
- def : Pat <
+ def : AMDGPUPat <
(f64 (fcopysign f64:$src0, f32:$src1)),
(REG_SEQUENCE RC64,
(i32 (EXTRACT_SUBREG $src0, sub0)), sub0,
@@ -636,7 +611,7 @@ multiclass BFIPatterns <Instruction BFI_INT,
// SHA-256 Ma patterns
// ((x & z) | (y & (x | z))) -> BFI_INT (XOR x, y), z, y
-class SHA256MaPattern <Instruction BFI_INT, Instruction XOR> : Pat <
+class SHA256MaPattern <Instruction BFI_INT, Instruction XOR> : AMDGPUPat <
(or (and i32:$x, i32:$z), (and i32:$y, (or i32:$x, i32:$z))),
(BFI_INT (XOR i32:$x, i32:$y), i32:$z, i32:$y)
>;
@@ -653,24 +628,24 @@ def IMMPopCount : SDNodeXForm<imm, [{
}]>;
multiclass BFEPattern <Instruction UBFE, Instruction SBFE, Instruction MOV> {
- def : Pat <
+ def : AMDGPUPat <
(i32 (and (i32 (srl i32:$src, i32:$rshift)), IMMZeroBasedBitfieldMask:$mask)),
(UBFE $src, $rshift, (MOV (i32 (IMMPopCount $mask))))
>;
- def : Pat <
+ def : AMDGPUPat <
(srl (shl_oneuse i32:$src, (sub 32, i32:$width)), (sub 32, i32:$width)),
(UBFE $src, (i32 0), $width)
>;
- def : Pat <
+ def : AMDGPUPat <
(sra (shl_oneuse i32:$src, (sub 32, i32:$width)), (sub 32, i32:$width)),
(SBFE $src, (i32 0), $width)
>;
}
// rotr pattern
-class ROTRPattern <Instruction BIT_ALIGN> : Pat <
+class ROTRPattern <Instruction BIT_ALIGN> : AMDGPUPat <
(rotr i32:$src0, i32:$src1),
(BIT_ALIGN $src0, $src0, $src1)
>;
@@ -681,7 +656,7 @@ class IntMed3Pat<Instruction med3Inst,
SDPatternOperator max,
SDPatternOperator max_oneuse,
SDPatternOperator min_oneuse,
- ValueType vt = i32> : Pat<
+ ValueType vt = i32> : AMDGPUPat<
(max (min_oneuse vt:$src0, vt:$src1),
(min_oneuse (max_oneuse vt:$src0, vt:$src1), vt:$src2)),
(med3Inst $src0, $src1, $src2)
@@ -701,22 +676,24 @@ def cvt_flr_i32_f32 : PatFrag <
[{ (void)N; return TM.Options.NoNaNsFPMath; }]
>;
-class IMad24Pat<Instruction Inst> : Pat <
+class IMad24Pat<Instruction Inst, bit HasClamp = 0> : AMDGPUPat <
(add (AMDGPUmul_i24 i32:$src0, i32:$src1), i32:$src2),
- (Inst $src0, $src1, $src2)
+ !if(HasClamp, (Inst $src0, $src1, $src2, (i1 0)),
+ (Inst $src0, $src1, $src2))
>;
-class UMad24Pat<Instruction Inst> : Pat <
+class UMad24Pat<Instruction Inst, bit HasClamp = 0> : AMDGPUPat <
(add (AMDGPUmul_u24 i32:$src0, i32:$src1), i32:$src2),
- (Inst $src0, $src1, $src2)
+ !if(HasClamp, (Inst $src0, $src1, $src2, (i1 0)),
+ (Inst $src0, $src1, $src2))
>;
-class RcpPat<Instruction RcpInst, ValueType vt> : Pat <
+class RcpPat<Instruction RcpInst, ValueType vt> : AMDGPUPat <
(fdiv FP_ONE, vt:$src),
(RcpInst $src)
>;
-class RsqPat<Instruction RsqInst, ValueType vt> : Pat <
+class RsqPat<Instruction RsqInst, ValueType vt> : AMDGPUPat <
(AMDGPUrcp (fsqrt vt:$src)),
(RsqInst $src)
>;
diff --git a/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index cc56216c355b..b4704f6feb92 100644
--- a/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -13,18 +13,14 @@
//===----------------------------------------------------------------------===//
#include "AMDGPULegalizerInfo.h"
+#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/Debug.h"
-#include "llvm/Target/TargetOpcodes.h"
using namespace llvm;
-#ifndef LLVM_BUILD_GLOBAL_ISEL
-#error "You shouldn't build this"
-#endif
-
AMDGPULegalizerInfo::AMDGPULegalizerInfo() {
using namespace TargetOpcode;
@@ -53,6 +49,10 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo() {
setAction({G_FCONSTANT, S32}, Legal);
+ setAction({G_FADD, S32}, Legal);
+
+ setAction({G_FMUL, S32}, Legal);
+
setAction({G_GEP, P1}, Legal);
setAction({G_GEP, P2}, Legal);
setAction({G_GEP, 1, S64}, Legal);
@@ -66,6 +66,8 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo() {
setAction({G_LOAD, 1, P1}, Legal);
setAction({G_LOAD, 1, P2}, Legal);
+ setAction({G_OR, S32}, Legal);
+
setAction({G_SELECT, S32}, Legal);
setAction({G_SELECT, 1, S1}, Legal);
diff --git a/lib/Target/AMDGPU/AMDGPULibCalls.cpp b/lib/Target/AMDGPU/AMDGPULibCalls.cpp
new file mode 100644
index 000000000000..f594767c8edb
--- /dev/null
+++ b/lib/Target/AMDGPU/AMDGPULibCalls.cpp
@@ -0,0 +1,1770 @@
+//===- AMDGPULibCalls.cpp -------------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief This file does AMD library function optimizations.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "amdgpu-simplifylib"
+
+#include "AMDGPU.h"
+#include "AMDGPULibFunc.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/Loads.h"
+#include "llvm/ADT/StringSet.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/ValueSymbolTable.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetOptions.h"
+#include <vector>
+#include <cmath>
+
+using namespace llvm;
+
+static cl::opt<bool> EnablePreLink("amdgpu-prelink",
+ cl::desc("Enable pre-link mode optimizations"),
+ cl::init(false),
+ cl::Hidden);
+
+static cl::list<std::string> UseNative("amdgpu-use-native",
+ cl::desc("Comma separated list of functions to replace with native, or all"),
+ cl::CommaSeparated, cl::ValueOptional,
+ cl::Hidden);
+
+#define MATH_PI 3.14159265358979323846264338327950288419716939937511
+#define MATH_E 2.71828182845904523536028747135266249775724709369996
+#define MATH_SQRT2 1.41421356237309504880168872420969807856967187537695
+
+#define MATH_LOG2E 1.4426950408889634073599246810018921374266459541529859
+#define MATH_LOG10E 0.4342944819032518276511289189166050822943970058036665
+// Value of log2(10)
+#define MATH_LOG2_10 3.3219280948873623478703194294893901758648313930245806
+// Value of 1 / log2(10)
+#define MATH_RLOG2_10 0.3010299956639811952137388947244930267681898814621085
+// Value of 1 / M_LOG2E_F = 1 / log2(e)
+#define MATH_RLOG2_E 0.6931471805599453094172321214581765680755001343602552
+
+namespace llvm {
+
+class AMDGPULibCalls {
+private:
+
+ typedef llvm::AMDGPULibFunc FuncInfo;
+
+ // -fuse-native.
+ bool AllNative = false;
+
+ bool useNativeFunc(const StringRef F) const;
+
+ // Return a pointer (pointer expr) to the function if a function definition with
+ // "FuncName" exists. It may create a new function prototype in pre-link mode.
+ Constant *getFunction(Module *M, const FuncInfo& fInfo);
+
+ // Replace a normal function with its native version.
+ bool replaceWithNative(CallInst *CI, const FuncInfo &FInfo);
+
+ bool parseFunctionName(const StringRef& FMangledName,
+ FuncInfo *FInfo=nullptr /*out*/);
+
+ bool TDOFold(CallInst *CI, const FuncInfo &FInfo);
+
+ /* Specialized optimizations */
+
+ // recip (half or native)
+ bool fold_recip(CallInst *CI, IRBuilder<> &B, const FuncInfo &FInfo);
+
+ // divide (half or native)
+ bool fold_divide(CallInst *CI, IRBuilder<> &B, const FuncInfo &FInfo);
+
+ // pow/powr/pown
+ bool fold_pow(CallInst *CI, IRBuilder<> &B, const FuncInfo &FInfo);
+
+ // rootn
+ bool fold_rootn(CallInst *CI, IRBuilder<> &B, const FuncInfo &FInfo);
+
+ // fma/mad
+ bool fold_fma_mad(CallInst *CI, IRBuilder<> &B, const FuncInfo &FInfo);
+
+ // -fuse-native for sincos
+ bool sincosUseNative(CallInst *aCI, const FuncInfo &FInfo);
+
+ // evaluate calls if calls' arguments are constants.
+ bool evaluateScalarMathFunc(FuncInfo &FInfo, double& Res0,
+ double& Res1, Constant *copr0, Constant *copr1, Constant *copr2);
+ bool evaluateCall(CallInst *aCI, FuncInfo &FInfo);
+
+ // exp
+ bool fold_exp(CallInst *CI, IRBuilder<> &B, const FuncInfo &FInfo);
+
+ // exp2
+ bool fold_exp2(CallInst *CI, IRBuilder<> &B, const FuncInfo &FInfo);
+
+ // exp10
+ bool fold_exp10(CallInst *CI, IRBuilder<> &B, const FuncInfo &FInfo);
+
+ // log
+ bool fold_log(CallInst *CI, IRBuilder<> &B, const FuncInfo &FInfo);
+
+ // log2
+ bool fold_log2(CallInst *CI, IRBuilder<> &B, const FuncInfo &FInfo);
+
+ // log10
+ bool fold_log10(CallInst *CI, IRBuilder<> &B, const FuncInfo &FInfo);
+
+ // sqrt
+ bool fold_sqrt(CallInst *CI, IRBuilder<> &B, const FuncInfo &FInfo);
+
+ // sin/cos
+ bool fold_sincos(CallInst * CI, IRBuilder<> &B, AliasAnalysis * AA);
+
+ // __read_pipe/__write_pipe
+ bool fold_read_write_pipe(CallInst *CI, IRBuilder<> &B, FuncInfo &FInfo);
+
+ // Get insertion point at entry.
+ BasicBlock::iterator getEntryIns(CallInst * UI);
+ // Insert an alloca instruction.
+ AllocaInst* insertAlloca(CallInst * UI, IRBuilder<> &B, const char *prefix);
+ // Get a scalar native builtin single-argument FP function.
+ Constant* getNativeFunction(Module* M, const FuncInfo &FInfo);
+
+protected:
+ CallInst *CI;
+
+ bool isUnsafeMath(const CallInst *CI) const;
+
+ void replaceCall(Value *With) {
+ CI->replaceAllUsesWith(With);
+ CI->eraseFromParent();
+ }
+
+public:
+ bool fold(CallInst *CI, AliasAnalysis *AA = nullptr);
+
+ void initNativeFuncs();
+
+ // Replace a normal math function call with its native version
+ bool useNative(CallInst *CI);
+};
+
+} // end llvm namespace
+
+namespace {
+
+ class AMDGPUSimplifyLibCalls : public FunctionPass {
+
+ AMDGPULibCalls Simplifier;
+
+ const TargetOptions Options;
+
+ public:
+ static char ID; // Pass identification
+
+ AMDGPUSimplifyLibCalls(const TargetOptions &Opt = TargetOptions())
+ : FunctionPass(ID), Options(Opt) {
+ initializeAMDGPUSimplifyLibCallsPass(*PassRegistry::getPassRegistry());
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<AAResultsWrapperPass>();
+ }
+
+ bool runOnFunction(Function &F) override;
+ };
+
+ class AMDGPUUseNativeCalls : public FunctionPass {
+
+ AMDGPULibCalls Simplifier;
+
+ public:
+ static char ID; // Pass identification
+
+ AMDGPUUseNativeCalls() : FunctionPass(ID) {
+ initializeAMDGPUUseNativeCallsPass(*PassRegistry::getPassRegistry());
+ Simplifier.initNativeFuncs();
+ }
+
+ bool runOnFunction(Function &F) override;
+ };
+
+} // end anonymous namespace.
+
+char AMDGPUSimplifyLibCalls::ID = 0;
+char AMDGPUUseNativeCalls::ID = 0;
+
+INITIALIZE_PASS_BEGIN(AMDGPUSimplifyLibCalls, "amdgpu-simplifylib",
+ "Simplify well-known AMD library calls", false, false)
+INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
+INITIALIZE_PASS_END(AMDGPUSimplifyLibCalls, "amdgpu-simplifylib",
+ "Simplify well-known AMD library calls", false, false)
+
+INITIALIZE_PASS(AMDGPUUseNativeCalls, "amdgpu-usenative",
+ "Replace builtin math calls with that native versions.",
+ false, false)
+
+template <typename IRB>
+static CallInst *CreateCallEx(IRB &B, Value *Callee, Value *Arg,
+ const Twine &Name = "") {
+ CallInst *R = B.CreateCall(Callee, Arg, Name);
+ if (Function* F = dyn_cast<Function>(Callee))
+ R->setCallingConv(F->getCallingConv());
+ return R;
+}
+
+template <typename IRB>
+static CallInst *CreateCallEx2(IRB &B, Value *Callee, Value *Arg1, Value *Arg2,
+ const Twine &Name = "") {
+ CallInst *R = B.CreateCall(Callee, {Arg1, Arg2}, Name);
+ if (Function* F = dyn_cast<Function>(Callee))
+ R->setCallingConv(F->getCallingConv());
+ return R;
+}
+
+// Data structures for table-driven optimizations.
+// FuncTbl works for both f32 and f64 functions with 1 input argument
+
+struct TableEntry {
+ double result;
+ double input;
+};
+
+/* a list of {result, input} */
+static const TableEntry tbl_acos[] = {
+ {MATH_PI/2.0, 0.0},
+ {MATH_PI/2.0, -0.0},
+ {0.0, 1.0},
+ {MATH_PI, -1.0}
+};
+static const TableEntry tbl_acosh[] = {
+ {0.0, 1.0}
+};
+static const TableEntry tbl_acospi[] = {
+ {0.5, 0.0},
+ {0.5, -0.0},
+ {0.0, 1.0},
+ {1.0, -1.0}
+};
+static const TableEntry tbl_asin[] = {
+ {0.0, 0.0},
+ {-0.0, -0.0},
+ {MATH_PI/2.0, 1.0},
+ {-MATH_PI/2.0, -1.0}
+};
+static const TableEntry tbl_asinh[] = {
+ {0.0, 0.0},
+ {-0.0, -0.0}
+};
+static const TableEntry tbl_asinpi[] = {
+ {0.0, 0.0},
+ {-0.0, -0.0},
+ {0.5, 1.0},
+ {-0.5, -1.0}
+};
+static const TableEntry tbl_atan[] = {
+ {0.0, 0.0},
+ {-0.0, -0.0},
+ {MATH_PI/4.0, 1.0},
+ {-MATH_PI/4.0, -1.0}
+};
+static const TableEntry tbl_atanh[] = {
+ {0.0, 0.0},
+ {-0.0, -0.0}
+};
+static const TableEntry tbl_atanpi[] = {
+ {0.0, 0.0},
+ {-0.0, -0.0},
+ {0.25, 1.0},
+ {-0.25, -1.0}
+};
+static const TableEntry tbl_cbrt[] = {
+ {0.0, 0.0},
+ {-0.0, -0.0},
+ {1.0, 1.0},
+ {-1.0, -1.0},
+};
+static const TableEntry tbl_cos[] = {
+ {1.0, 0.0},
+ {1.0, -0.0}
+};
+static const TableEntry tbl_cosh[] = {
+ {1.0, 0.0},
+ {1.0, -0.0}
+};
+static const TableEntry tbl_cospi[] = {
+ {1.0, 0.0},
+ {1.0, -0.0}
+};
+static const TableEntry tbl_erfc[] = {
+ {1.0, 0.0},
+ {1.0, -0.0}
+};
+static const TableEntry tbl_erf[] = {
+ {0.0, 0.0},
+ {-0.0, -0.0}
+};
+static const TableEntry tbl_exp[] = {
+ {1.0, 0.0},
+ {1.0, -0.0},
+ {MATH_E, 1.0}
+};
+static const TableEntry tbl_exp2[] = {
+ {1.0, 0.0},
+ {1.0, -0.0},
+ {2.0, 1.0}
+};
+static const TableEntry tbl_exp10[] = {
+ {1.0, 0.0},
+ {1.0, -0.0},
+ {10.0, 1.0}
+};
+static const TableEntry tbl_expm1[] = {
+ {0.0, 0.0},
+ {-0.0, -0.0}
+};
+static const TableEntry tbl_log[] = {
+ {0.0, 1.0},
+ {1.0, MATH_E}
+};
+static const TableEntry tbl_log2[] = {
+ {0.0, 1.0},
+ {1.0, 2.0}
+};
+static const TableEntry tbl_log10[] = {
+ {0.0, 1.0},
+ {1.0, 10.0}
+};
+static const TableEntry tbl_rsqrt[] = {
+ {1.0, 1.0},
+ {1.0/MATH_SQRT2, 2.0}
+};
+static const TableEntry tbl_sin[] = {
+ {0.0, 0.0},
+ {-0.0, -0.0}
+};
+static const TableEntry tbl_sinh[] = {
+ {0.0, 0.0},
+ {-0.0, -0.0}
+};
+static const TableEntry tbl_sinpi[] = {
+ {0.0, 0.0},
+ {-0.0, -0.0}
+};
+static const TableEntry tbl_sqrt[] = {
+ {0.0, 0.0},
+ {1.0, 1.0},
+ {MATH_SQRT2, 2.0}
+};
+static const TableEntry tbl_tan[] = {
+ {0.0, 0.0},
+ {-0.0, -0.0}
+};
+static const TableEntry tbl_tanh[] = {
+ {0.0, 0.0},
+ {-0.0, -0.0}
+};
+static const TableEntry tbl_tanpi[] = {
+ {0.0, 0.0},
+ {-0.0, -0.0}
+};
+static const TableEntry tbl_tgamma[] = {
+ {1.0, 1.0},
+ {1.0, 2.0},
+ {2.0, 3.0},
+ {6.0, 4.0}
+};
+
+static bool HasNative(AMDGPULibFunc::EFuncId id) {
+ switch(id) {
+ case AMDGPULibFunc::EI_DIVIDE:
+ case AMDGPULibFunc::EI_COS:
+ case AMDGPULibFunc::EI_EXP:
+ case AMDGPULibFunc::EI_EXP2:
+ case AMDGPULibFunc::EI_EXP10:
+ case AMDGPULibFunc::EI_LOG:
+ case AMDGPULibFunc::EI_LOG2:
+ case AMDGPULibFunc::EI_LOG10:
+ case AMDGPULibFunc::EI_POWR:
+ case AMDGPULibFunc::EI_RECIP:
+ case AMDGPULibFunc::EI_RSQRT:
+ case AMDGPULibFunc::EI_SIN:
+ case AMDGPULibFunc::EI_SINCOS:
+ case AMDGPULibFunc::EI_SQRT:
+ case AMDGPULibFunc::EI_TAN:
+ return true;
+ default:;
+ }
+ return false;
+}
+
+struct TableRef {
+ size_t size;
+ const TableEntry *table; // variable size: from 0 to (size - 1)
+
+ TableRef() : size(0), table(nullptr) {}
+
+ template <size_t N>
+ TableRef(const TableEntry (&tbl)[N]) : size(N), table(&tbl[0]) {}
+};
+
+static TableRef getOptTable(AMDGPULibFunc::EFuncId id) {
+ switch(id) {
+ case AMDGPULibFunc::EI_ACOS: return TableRef(tbl_acos);
+ case AMDGPULibFunc::EI_ACOSH: return TableRef(tbl_acosh);
+ case AMDGPULibFunc::EI_ACOSPI: return TableRef(tbl_acospi);
+ case AMDGPULibFunc::EI_ASIN: return TableRef(tbl_asin);
+ case AMDGPULibFunc::EI_ASINH: return TableRef(tbl_asinh);
+ case AMDGPULibFunc::EI_ASINPI: return TableRef(tbl_asinpi);
+ case AMDGPULibFunc::EI_ATAN: return TableRef(tbl_atan);
+ case AMDGPULibFunc::EI_ATANH: return TableRef(tbl_atanh);
+ case AMDGPULibFunc::EI_ATANPI: return TableRef(tbl_atanpi);
+ case AMDGPULibFunc::EI_CBRT: return TableRef(tbl_cbrt);
+ case AMDGPULibFunc::EI_NCOS:
+ case AMDGPULibFunc::EI_COS: return TableRef(tbl_cos);
+ case AMDGPULibFunc::EI_COSH: return TableRef(tbl_cosh);
+ case AMDGPULibFunc::EI_COSPI: return TableRef(tbl_cospi);
+ case AMDGPULibFunc::EI_ERFC: return TableRef(tbl_erfc);
+ case AMDGPULibFunc::EI_ERF: return TableRef(tbl_erf);
+ case AMDGPULibFunc::EI_EXP: return TableRef(tbl_exp);
+ case AMDGPULibFunc::EI_NEXP2:
+ case AMDGPULibFunc::EI_EXP2: return TableRef(tbl_exp2);
+ case AMDGPULibFunc::EI_EXP10: return TableRef(tbl_exp10);
+ case AMDGPULibFunc::EI_EXPM1: return TableRef(tbl_expm1);
+ case AMDGPULibFunc::EI_LOG: return TableRef(tbl_log);
+ case AMDGPULibFunc::EI_NLOG2:
+ case AMDGPULibFunc::EI_LOG2: return TableRef(tbl_log2);
+ case AMDGPULibFunc::EI_LOG10: return TableRef(tbl_log10);
+ case AMDGPULibFunc::EI_NRSQRT:
+ case AMDGPULibFunc::EI_RSQRT: return TableRef(tbl_rsqrt);
+ case AMDGPULibFunc::EI_NSIN:
+ case AMDGPULibFunc::EI_SIN: return TableRef(tbl_sin);
+ case AMDGPULibFunc::EI_SINH: return TableRef(tbl_sinh);
+ case AMDGPULibFunc::EI_SINPI: return TableRef(tbl_sinpi);
+ case AMDGPULibFunc::EI_NSQRT:
+ case AMDGPULibFunc::EI_SQRT: return TableRef(tbl_sqrt);
+ case AMDGPULibFunc::EI_TAN: return TableRef(tbl_tan);
+ case AMDGPULibFunc::EI_TANH: return TableRef(tbl_tanh);
+ case AMDGPULibFunc::EI_TANPI: return TableRef(tbl_tanpi);
+ case AMDGPULibFunc::EI_TGAMMA: return TableRef(tbl_tgamma);
+ default:;
+ }
+ return TableRef();
+}
+
+static inline int getVecSize(const AMDGPULibFunc& FInfo) {
+ return FInfo.getLeads()[0].VectorSize;
+}
+
+static inline AMDGPULibFunc::EType getArgType(const AMDGPULibFunc& FInfo) {
+ return (AMDGPULibFunc::EType)FInfo.getLeads()[0].ArgType;
+}
+
+Constant *AMDGPULibCalls::getFunction(Module *M, const FuncInfo& fInfo) {
+ // If we are doing PreLinkOpt, the function is external. So it is safe to
+ // use getOrInsertFunction() at this stage.
+
+ return EnablePreLink ? AMDGPULibFunc::getOrInsertFunction(M, fInfo)
+ : AMDGPULibFunc::getFunction(M, fInfo);
+}
+
+bool AMDGPULibCalls::parseFunctionName(const StringRef& FMangledName,
+ FuncInfo *FInfo) {
+ return AMDGPULibFunc::parse(FMangledName, *FInfo);
+}
+
+bool AMDGPULibCalls::isUnsafeMath(const CallInst *CI) const {
+ if (auto Op = dyn_cast<FPMathOperator>(CI))
+ if (Op->isFast())
+ return true;
+ const Function *F = CI->getParent()->getParent();
+ Attribute Attr = F->getFnAttribute("unsafe-fp-math");
+ return Attr.getValueAsString() == "true";
+}
+
+bool AMDGPULibCalls::useNativeFunc(const StringRef F) const {
+ return AllNative ||
+ std::find(UseNative.begin(), UseNative.end(), F) != UseNative.end();
+}
+
+void AMDGPULibCalls::initNativeFuncs() {
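+ // Both -amdgpu-use-native with no value and -amdgpu-use-native=all enable
+ // native replacement for every function; otherwise only the names listed
+ // on the option are replaced.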
+ AllNative = useNativeFunc("all") ||
+ (UseNative.getNumOccurrences() && UseNative.size() == 1 &&
+ UseNative.begin()->empty());
+}
+
+bool AMDGPULibCalls::sincosUseNative(CallInst *aCI, const FuncInfo &FInfo) {
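+ // When both sin and cos are requested as native, sincos(x, pcos) is split
+ // into native_sin(x) and native_cos(x): the cos result is stored through
+ // the pointer argument and the sin result replaces the original call.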
+ bool native_sin = useNativeFunc("sin");
+ bool native_cos = useNativeFunc("cos");
+
+ if (native_sin && native_cos) {
+ Module *M = aCI->getModule();
+ Value *opr0 = aCI->getArgOperand(0);
+
+ AMDGPULibFunc nf;
+ nf.getLeads()[0].ArgType = FInfo.getLeads()[0].ArgType;
+ nf.getLeads()[0].VectorSize = FInfo.getLeads()[0].VectorSize;
+
+ nf.setPrefix(AMDGPULibFunc::NATIVE);
+ nf.setId(AMDGPULibFunc::EI_SIN);
+ Constant *sinExpr = getFunction(M, nf);
+
+ nf.setPrefix(AMDGPULibFunc::NATIVE);
+ nf.setId(AMDGPULibFunc::EI_COS);
+ Constant *cosExpr = getFunction(M, nf);
+ if (sinExpr && cosExpr) {
+ Value *sinval = CallInst::Create(sinExpr, opr0, "splitsin", aCI);
+ Value *cosval = CallInst::Create(cosExpr, opr0, "splitcos", aCI);
+ new StoreInst(cosval, aCI->getArgOperand(1), aCI);
+
+ DEBUG_WITH_TYPE("usenative", dbgs() << "<useNative> replace " << *aCI
+ << " with native version of sin/cos");
+
+ replaceCall(sinval);
+ return true;
+ }
+ }
+ return false;
+}
+
+bool AMDGPULibCalls::useNative(CallInst *aCI) {
+ CI = aCI;
+ Function *Callee = aCI->getCalledFunction();
+
+ FuncInfo FInfo;
+ if (!parseFunctionName(Callee->getName(), &FInfo) || !FInfo.isMangled() ||
+ FInfo.getPrefix() != AMDGPULibFunc::NOPFX ||
+ getArgType(FInfo) == AMDGPULibFunc::F64 || !HasNative(FInfo.getId()) ||
+ !(AllNative || useNativeFunc(FInfo.getName()))) {
+ return false;
+ }
+
+ if (FInfo.getId() == AMDGPULibFunc::EI_SINCOS)
+ return sincosUseNative(aCI, FInfo);
+
+ FInfo.setPrefix(AMDGPULibFunc::NATIVE);
+ Constant *F = getFunction(aCI->getModule(), FInfo);
+ if (!F)
+ return false;
+
+ aCI->setCalledFunction(F);
+ DEBUG_WITH_TYPE("usenative", dbgs() << "<useNative> replace " << *aCI
+ << " with native version");
+ return true;
+}
+
+// Clang emits a call to __read_pipe_2 or __read_pipe_4 for the OpenCL read_pipe
+// builtin, with appended type size and alignment arguments, where 2 or 4
+// indicates the original number of arguments. The library has optimized versions
+// of __read_pipe_2/__read_pipe_4 for the case where the type size and alignment
+// are the same power-of-2 value. This function transforms __read_pipe_2 into
+// __read_pipe_2_N for such cases, where N is the size in bytes of the type
+// (N = 1, 2, 4, 8, ..., 128). The same applies to __read_pipe_4, __write_pipe_2,
+// and __write_pipe_4.
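+// For example, a __read_pipe_2 call whose packet size and alignment operands
+// are both the constant 4 is rewritten to __read_pipe_2_4, with the packet
+// pointer argument cast to an i32 pointer in its original address space.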
+bool AMDGPULibCalls::fold_read_write_pipe(CallInst *CI, IRBuilder<> &B,
+ FuncInfo &FInfo) {
+ auto *Callee = CI->getCalledFunction();
+ if (!Callee->isDeclaration())
+ return false;
+
+ assert(Callee->hasName() && "Invalid read_pipe/write_pipe function");
+ auto *M = Callee->getParent();
+ auto &Ctx = M->getContext();
+ std::string Name = Callee->getName();
+ auto NumArg = CI->getNumArgOperands();
+ if (NumArg != 4 && NumArg != 6)
+ return false;
+ auto *PacketSize = CI->getArgOperand(NumArg - 2);
+ auto *PacketAlign = CI->getArgOperand(NumArg - 1);
+ if (!isa<ConstantInt>(PacketSize) || !isa<ConstantInt>(PacketAlign))
+ return false;
+ unsigned Size = cast<ConstantInt>(PacketSize)->getZExtValue();
+ unsigned Align = cast<ConstantInt>(PacketAlign)->getZExtValue();
+ if (Size != Align || !isPowerOf2_32(Size))
+ return false;
+
+ Type *PtrElemTy;
+ if (Size <= 8)
+ PtrElemTy = Type::getIntNTy(Ctx, Size * 8);
+ else
+ PtrElemTy = VectorType::get(Type::getInt64Ty(Ctx), Size / 8);
+ unsigned PtrArgLoc = CI->getNumArgOperands() - 3;
+ auto PtrArg = CI->getArgOperand(PtrArgLoc);
+ unsigned PtrArgAS = PtrArg->getType()->getPointerAddressSpace();
+ auto *PtrTy = llvm::PointerType::get(PtrElemTy, PtrArgAS);
+
+ SmallVector<llvm::Type *, 6> ArgTys;
+ for (unsigned I = 0; I != PtrArgLoc; ++I)
+ ArgTys.push_back(CI->getArgOperand(I)->getType());
+ ArgTys.push_back(PtrTy);
+
+ Name = Name + "_" + std::to_string(Size);
+ auto *FTy = FunctionType::get(Callee->getReturnType(),
+ ArrayRef<Type *>(ArgTys), false);
+ AMDGPULibFunc NewLibFunc(Name, FTy);
+ auto *F = AMDGPULibFunc::getOrInsertFunction(M, NewLibFunc);
+ if (!F)
+ return false;
+
+ auto *BCast = B.CreatePointerCast(PtrArg, PtrTy);
+ SmallVector<Value *, 6> Args;
+ for (unsigned I = 0; I != PtrArgLoc; ++I)
+ Args.push_back(CI->getArgOperand(I));
+ Args.push_back(BCast);
+
+ auto *NCI = B.CreateCall(F, Args);
+ NCI->setAttributes(CI->getAttributes());
+ CI->replaceAllUsesWith(NCI);
+ CI->dropAllReferences();
+ CI->eraseFromParent();
+
+ return true;
+}
+
+// This function returns false if there is no change; otherwise it returns true.
+bool AMDGPULibCalls::fold(CallInst *CI, AliasAnalysis *AA) {
+ this->CI = CI;
+ Function *Callee = CI->getCalledFunction();
+
+ // Ignore indirect calls.
+ if (!Callee) return false;
+
+ FuncInfo FInfo;
+ if (!parseFunctionName(Callee->getName(), &FInfo))
+ return false;
+
+ // Further check the number of arguments to see if they match.
+ if (CI->getNumArgOperands() != FInfo.getNumArgs())
+ return false;
+
+ BasicBlock *BB = CI->getParent();
+ LLVMContext &Context = CI->getParent()->getContext();
+ IRBuilder<> B(Context);
+
+ // Set the builder to the instruction after the call.
+ B.SetInsertPoint(BB, CI->getIterator());
+
+ // Copy fast flags from the original call.
+ if (const FPMathOperator *FPOp = dyn_cast<const FPMathOperator>(CI))
+ B.setFastMathFlags(FPOp->getFastMathFlags());
+
+ if (TDOFold(CI, FInfo))
+ return true;
+
+ // Under unsafe-math, evaluate calls if possible.
+ // According to Brian Sumner, we can do this for all f32 function calls
+ // using the host's double versions of those functions.
+ if (isUnsafeMath(CI) && evaluateCall(CI, FInfo))
+ return true;
+
+ // Specialized optimizations for each function call
+ switch (FInfo.getId()) {
+ case AMDGPULibFunc::EI_RECIP:
+ // skip vector function
+ assert ((FInfo.getPrefix() == AMDGPULibFunc::NATIVE ||
+ FInfo.getPrefix() == AMDGPULibFunc::HALF) &&
+ "recip must be an either native or half function");
+ return (getVecSize(FInfo) != 1) ? false : fold_recip(CI, B, FInfo);
+
+ case AMDGPULibFunc::EI_DIVIDE:
+ // skip vector function
+ assert ((FInfo.getPrefix() == AMDGPULibFunc::NATIVE ||
+ FInfo.getPrefix() == AMDGPULibFunc::HALF) &&
+ "divide must be an either native or half function");
+ return (getVecSize(FInfo) != 1) ? false : fold_divide(CI, B, FInfo);
+
+ case AMDGPULibFunc::EI_POW:
+ case AMDGPULibFunc::EI_POWR:
+ case AMDGPULibFunc::EI_POWN:
+ return fold_pow(CI, B, FInfo);
+
+ case AMDGPULibFunc::EI_ROOTN:
+ // skip vector function
+ return (getVecSize(FInfo) != 1) ? false : fold_rootn(CI, B, FInfo);
+
+ case AMDGPULibFunc::EI_FMA:
+ case AMDGPULibFunc::EI_MAD:
+ case AMDGPULibFunc::EI_NFMA:
+ // skip vector function
+ return (getVecSize(FInfo) != 1) ? false : fold_fma_mad(CI, B, FInfo);
+
+ case AMDGPULibFunc::EI_SQRT:
+ return isUnsafeMath(CI) && fold_sqrt(CI, B, FInfo);
+ case AMDGPULibFunc::EI_COS:
+ case AMDGPULibFunc::EI_SIN:
+ if ((getArgType(FInfo) == AMDGPULibFunc::F32 ||
+ getArgType(FInfo) == AMDGPULibFunc::F64)
+ && (FInfo.getPrefix() == AMDGPULibFunc::NOPFX))
+ return fold_sincos(CI, B, AA);
+
+ break;
+ case AMDGPULibFunc::EI_READ_PIPE_2:
+ case AMDGPULibFunc::EI_READ_PIPE_4:
+ case AMDGPULibFunc::EI_WRITE_PIPE_2:
+ case AMDGPULibFunc::EI_WRITE_PIPE_4:
+ return fold_read_write_pipe(CI, B, FInfo);
+
+ default:
+ break;
+ }
+
+ return false;
+}
+
+bool AMDGPULibCalls::TDOFold(CallInst *CI, const FuncInfo &FInfo) {
+ // Table-Driven optimization
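+ // For example, cos(0.0) folds to the constant 1.0 via tbl_cos; constant
+ // vector arguments fold only if every element appears in the table.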
+ const TableRef tr = getOptTable(FInfo.getId());
+ if (tr.size==0)
+ return false;
+
+ int const sz = (int)tr.size;
+ const TableEntry * const ftbl = tr.table;
+ Value *opr0 = CI->getArgOperand(0);
+
+ if (getVecSize(FInfo) > 1) {
+ if (ConstantDataVector *CV = dyn_cast<ConstantDataVector>(opr0)) {
+ SmallVector<double, 0> DVal;
+ for (int eltNo = 0; eltNo < getVecSize(FInfo); ++eltNo) {
+ ConstantFP *eltval = dyn_cast<ConstantFP>(
+ CV->getElementAsConstant((unsigned)eltNo));
+ assert(eltval && "Non-FP arguments in math function!");
+ bool found = false;
+ for (int i=0; i < sz; ++i) {
+ if (eltval->isExactlyValue(ftbl[i].input)) {
+ DVal.push_back(ftbl[i].result);
+ found = true;
+ break;
+ }
+ }
+ if (!found) {
+ // This vector constant is not handled yet.
+ return false;
+ }
+ }
+ LLVMContext &context = CI->getParent()->getParent()->getContext();
+ Constant *nval;
+ if (getArgType(FInfo) == AMDGPULibFunc::F32) {
+ SmallVector<float, 0> FVal;
+ for (unsigned i = 0; i < DVal.size(); ++i) {
+ FVal.push_back((float)DVal[i]);
+ }
+ ArrayRef<float> tmp(FVal);
+ nval = ConstantDataVector::get(context, tmp);
+ } else { // F64
+ ArrayRef<double> tmp(DVal);
+ nval = ConstantDataVector::get(context, tmp);
+ }
+ DEBUG(errs() << "AMDIC: " << *CI
+ << " ---> " << *nval << "\n");
+ replaceCall(nval);
+ return true;
+ }
+ } else {
+ // Scalar version
+ if (ConstantFP *CF = dyn_cast<ConstantFP>(opr0)) {
+ for (int i = 0; i < sz; ++i) {
+ if (CF->isExactlyValue(ftbl[i].input)) {
+ Value *nval = ConstantFP::get(CF->getType(), ftbl[i].result);
+ DEBUG(errs() << "AMDIC: " << *CI
+ << " ---> " << *nval << "\n");
+ replaceCall(nval);
+ return true;
+ }
+ }
+ }
+ }
+
+ return false;
+}
+
+bool AMDGPULibCalls::replaceWithNative(CallInst *CI, const FuncInfo &FInfo) {
+ Module *M = CI->getModule();
+ if (getArgType(FInfo) != AMDGPULibFunc::F32 ||
+ FInfo.getPrefix() != AMDGPULibFunc::NOPFX ||
+ !HasNative(FInfo.getId()))
+ return false;
+
+ AMDGPULibFunc nf = FInfo;
+ nf.setPrefix(AMDGPULibFunc::NATIVE);
+ if (Constant *FPExpr = getFunction(M, nf)) {
+ DEBUG(dbgs() << "AMDIC: " << *CI << " ---> ");
+
+ CI->setCalledFunction(FPExpr);
+
+ DEBUG(dbgs() << *CI << '\n');
+
+ return true;
+ }
+ return false;
+}
+
+// [native_]half_recip(c) ==> 1.0/c
+bool AMDGPULibCalls::fold_recip(CallInst *CI, IRBuilder<> &B,
+ const FuncInfo &FInfo) {
+ Value *opr0 = CI->getArgOperand(0);
+ if (ConstantFP *CF = dyn_cast<ConstantFP>(opr0)) {
+ // Just create a normal div. Later, InstCombine will be able
+ // to fold the divide into a constant (this avoids having to check for
+ // float infinity or subnormal values at this point).
+ Value *nval = B.CreateFDiv(ConstantFP::get(CF->getType(), 1.0),
+ opr0,
+ "recip2div");
+ DEBUG(errs() << "AMDIC: " << *CI
+ << " ---> " << *nval << "\n");
+ replaceCall(nval);
+ return true;
+ }
+ return false;
+}
+
+// [native_]half_divide(x, c) ==> x/c
+bool AMDGPULibCalls::fold_divide(CallInst *CI, IRBuilder<> &B,
+ const FuncInfo &FInfo) {
+ Value *opr0 = CI->getArgOperand(0);
+ Value *opr1 = CI->getArgOperand(1);
+ ConstantFP *CF0 = dyn_cast<ConstantFP>(opr0);
+ ConstantFP *CF1 = dyn_cast<ConstantFP>(opr1);
+
+ if ((CF0 && CF1) || // both are constants
+ (CF1 && (getArgType(FInfo) == AMDGPULibFunc::F32)))
+ // CF1 is constant && f32 divide
+ {
+ Value *nval1 = B.CreateFDiv(ConstantFP::get(opr1->getType(), 1.0),
+ opr1, "__div2recip");
+ Value *nval = B.CreateFMul(opr0, nval1, "__div2mul");
+ replaceCall(nval);
+ return true;
+ }
+ return false;
+}
+
+namespace llvm {
+static double log2(double V) {
+#if _XOPEN_SOURCE >= 600 || _ISOC99_SOURCE || _POSIX_C_SOURCE >= 200112L
+ return ::log2(V);
+#else
+ return log(V) / 0.693147180559945309417;
+#endif
+}
+}
+
+bool AMDGPULibCalls::fold_pow(CallInst *CI, IRBuilder<> &B,
+ const FuncInfo &FInfo) {
+ assert((FInfo.getId() == AMDGPULibFunc::EI_POW ||
+ FInfo.getId() == AMDGPULibFunc::EI_POWR ||
+ FInfo.getId() == AMDGPULibFunc::EI_POWN) &&
+ "fold_pow: encounter a wrong function call");
+
+ Value *opr0, *opr1;
+ ConstantFP *CF;
+ ConstantInt *CINT;
+ ConstantAggregateZero *CZero;
+ Type *eltType;
+
+ opr0 = CI->getArgOperand(0);
+ opr1 = CI->getArgOperand(1);
+ CZero = dyn_cast<ConstantAggregateZero>(opr1);
+ if (getVecSize(FInfo) == 1) {
+ eltType = opr0->getType();
+ CF = dyn_cast<ConstantFP>(opr1);
+ CINT = dyn_cast<ConstantInt>(opr1);
+ } else {
+ VectorType *VTy = dyn_cast<VectorType>(opr0->getType());
+ assert(VTy && "Operand of a vector function should be of vector type");
+ eltType = VTy->getElementType();
+ ConstantDataVector *CDV = dyn_cast<ConstantDataVector>(opr1);
+
+ // For now, only handle vector constants whose elements all have the same value.
+ CF = CDV ? dyn_cast_or_null<ConstantFP>(CDV->getSplatValue()) : nullptr;
+ CINT = CDV ? dyn_cast_or_null<ConstantInt>(CDV->getSplatValue()) : nullptr;
+ }
+
+ // No unsafe math, no constant argument: do nothing.
+ if (!isUnsafeMath(CI) && !CF && !CINT && !CZero)
+ return false;
+
+ // 0x1111111 means that we don't do anything for this call.
+ int ci_opr1 = (CINT ? (int)CINT->getSExtValue() : 0x1111111);
+
+ if ((CF && CF->isZero()) || (CINT && ci_opr1 == 0) || CZero) {
+ // pow/powr/pown(x, 0) == 1
+ DEBUG(errs() << "AMDIC: " << *CI << " ---> 1\n");
+ Constant *cnval = ConstantFP::get(eltType, 1.0);
+ if (getVecSize(FInfo) > 1) {
+ cnval = ConstantDataVector::getSplat(getVecSize(FInfo), cnval);
+ }
+ replaceCall(cnval);
+ return true;
+ }
+ if ((CF && CF->isExactlyValue(1.0)) || (CINT && ci_opr1 == 1)) {
+ // pow/powr/pown(x, 1.0) = x
+ DEBUG(errs() << "AMDIC: " << *CI
+ << " ---> " << *opr0 << "\n");
+ replaceCall(opr0);
+ return true;
+ }
+ if ((CF && CF->isExactlyValue(2.0)) || (CINT && ci_opr1 == 2)) {
+ // pow/powr/pown(x, 2.0) = x*x
+ DEBUG(errs() << "AMDIC: " << *CI
+ << " ---> " << *opr0 << " * " << *opr0 << "\n");
+ Value *nval = B.CreateFMul(opr0, opr0, "__pow2");
+ replaceCall(nval);
+ return true;
+ }
+ if ((CF && CF->isExactlyValue(-1.0)) || (CINT && ci_opr1 == -1)) {
+ // pow/powr/pown(x, -1.0) = 1.0/x
+ DEBUG(errs() << "AMDIC: " << *CI
+ << " ---> 1 / " << *opr0 << "\n");
+ Constant *cnval = ConstantFP::get(eltType, 1.0);
+ if (getVecSize(FInfo) > 1) {
+ cnval = ConstantDataVector::getSplat(getVecSize(FInfo), cnval);
+ }
+ Value *nval = B.CreateFDiv(cnval, opr0, "__powrecip");
+ replaceCall(nval);
+ return true;
+ }
+
+ Module *M = CI->getModule();
+ if (CF && (CF->isExactlyValue(0.5) || CF->isExactlyValue(-0.5))) {
+ // pow[r](x, [-]0.5) = sqrt(x)
+ bool issqrt = CF->isExactlyValue(0.5);
+ if (Constant *FPExpr = getFunction(M,
+ AMDGPULibFunc(issqrt ? AMDGPULibFunc::EI_SQRT
+ : AMDGPULibFunc::EI_RSQRT, FInfo))) {
+ DEBUG(errs() << "AMDIC: " << *CI << " ---> "
+ << FInfo.getName().c_str() << "(" << *opr0 << ")\n");
+ Value *nval = CreateCallEx(B,FPExpr, opr0, issqrt ? "__pow2sqrt"
+ : "__pow2rsqrt");
+ replaceCall(nval);
+ return true;
+ }
+ }
+
+ if (!isUnsafeMath(CI))
+ return false;
+
+ // Unsafe Math optimization
+
+ // Remember that ci_opr1 is set if opr1 is integral
+ if (CF) {
+ double dval = (getArgType(FInfo) == AMDGPULibFunc::F32)
+ ? (double)CF->getValueAPF().convertToFloat()
+ : CF->getValueAPF().convertToDouble();
+ int ival = (int)dval;
+ if ((double)ival == dval) {
+ ci_opr1 = ival;
+ } else
+ ci_opr1 = 0x11111111;
+ }
+
+  // pow/powr/pown(x, c) ==> [1/](x*x*...*x), where
+  // trunc(c) == c, the number of x factors is |c|, and |c| <= 12.
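+  // For example, |c| = 11 (binary 1011) yields x * x^2 * x^8 = x^11 using
+  // three squarings; a negative c takes the reciprocal of the product below.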
+ unsigned abs_opr1 = (ci_opr1 < 0) ? -ci_opr1 : ci_opr1;
+ if (abs_opr1 <= 12) {
+ Constant *cnval;
+ Value *nval;
+ if (abs_opr1 == 0) {
+ cnval = ConstantFP::get(eltType, 1.0);
+ if (getVecSize(FInfo) > 1) {
+ cnval = ConstantDataVector::getSplat(getVecSize(FInfo), cnval);
+ }
+ nval = cnval;
+ } else {
+ Value *valx2 = nullptr;
+ nval = nullptr;
+ while (abs_opr1 > 0) {
+ valx2 = valx2 ? B.CreateFMul(valx2, valx2, "__powx2") : opr0;
+ if (abs_opr1 & 1) {
+ nval = nval ? B.CreateFMul(nval, valx2, "__powprod") : valx2;
+ }
+ abs_opr1 >>= 1;
+ }
+ }
+
+ if (ci_opr1 < 0) {
+ cnval = ConstantFP::get(eltType, 1.0);
+ if (getVecSize(FInfo) > 1) {
+ cnval = ConstantDataVector::getSplat(getVecSize(FInfo), cnval);
+ }
+ nval = B.CreateFDiv(cnval, nval, "__1powprod");
+ }
+ DEBUG(errs() << "AMDIC: " << *CI << " ---> "
+ << ((ci_opr1 < 0) ? "1/prod(" : "prod(") << *opr0 << ")\n");
+ replaceCall(nval);
+ return true;
+ }
+
+ // powr ---> exp2(y * log2(x))
+ // pown/pow ---> powr(fabs(x), y) | (x & ((int)y << 31))
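+  // For pown with negative x and an odd exponent the sign must be preserved,
+  // e.g. pown(-2.0, 3) = -8.0: the needcopysign path below ORs the sign bit of
+  // x back into the exp2(y * log2(|x|)) result when y is odd.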
+ Constant *ExpExpr = getFunction(M, AMDGPULibFunc(AMDGPULibFunc::EI_EXP2,
+ FInfo));
+ if (!ExpExpr)
+ return false;
+
+ bool needlog = false;
+ bool needabs = false;
+ bool needcopysign = false;
+ Constant *cnval = nullptr;
+ if (getVecSize(FInfo) == 1) {
+ CF = dyn_cast<ConstantFP>(opr0);
+
+ if (CF) {
+ double V = (getArgType(FInfo) == AMDGPULibFunc::F32)
+ ? (double)CF->getValueAPF().convertToFloat()
+ : CF->getValueAPF().convertToDouble();
+
+ V = log2(std::abs(V));
+ cnval = ConstantFP::get(eltType, V);
+ needcopysign = (FInfo.getId() != AMDGPULibFunc::EI_POWR) &&
+ CF->isNegative();
+ } else {
+ needlog = true;
+ needcopysign = needabs = FInfo.getId() != AMDGPULibFunc::EI_POWR &&
+ (!CF || CF->isNegative());
+ }
+ } else {
+ ConstantDataVector *CDV = dyn_cast<ConstantDataVector>(opr0);
+
+ if (!CDV) {
+ needlog = true;
+ needcopysign = needabs = FInfo.getId() != AMDGPULibFunc::EI_POWR;
+ } else {
+ assert ((int)CDV->getNumElements() == getVecSize(FInfo) &&
+ "Wrong vector size detected");
+
+ SmallVector<double, 0> DVal;
+ for (int i=0; i < getVecSize(FInfo); ++i) {
+ double V = (getArgType(FInfo) == AMDGPULibFunc::F32)
+ ? (double)CDV->getElementAsFloat(i)
+ : CDV->getElementAsDouble(i);
+ if (V < 0.0) needcopysign = true;
+ V = log2(std::abs(V));
+ DVal.push_back(V);
+ }
+ if (getArgType(FInfo) == AMDGPULibFunc::F32) {
+ SmallVector<float, 0> FVal;
+ for (unsigned i=0; i < DVal.size(); ++i) {
+ FVal.push_back((float)DVal[i]);
+ }
+ ArrayRef<float> tmp(FVal);
+ cnval = ConstantDataVector::get(M->getContext(), tmp);
+ } else {
+ ArrayRef<double> tmp(DVal);
+ cnval = ConstantDataVector::get(M->getContext(), tmp);
+ }
+ }
+ }
+
+ if (needcopysign && (FInfo.getId() == AMDGPULibFunc::EI_POW)) {
+ // We cannot handle corner cases for a general pow() function, give up
+ // unless y is a constant integral value. Then proceed as if it were pown.
+ if (getVecSize(FInfo) == 1) {
+ if (const ConstantFP *CF = dyn_cast<ConstantFP>(opr1)) {
+ double y = (getArgType(FInfo) == AMDGPULibFunc::F32)
+ ? (double)CF->getValueAPF().convertToFloat()
+ : CF->getValueAPF().convertToDouble();
+ if (y != (double)(int64_t)y)
+ return false;
+ } else
+ return false;
+ } else {
+ if (const ConstantDataVector *CDV = dyn_cast<ConstantDataVector>(opr1)) {
+ for (int i=0; i < getVecSize(FInfo); ++i) {
+ double y = (getArgType(FInfo) == AMDGPULibFunc::F32)
+ ? (double)CDV->getElementAsFloat(i)
+ : CDV->getElementAsDouble(i);
+ if (y != (double)(int64_t)y)
+ return false;
+ }
+ } else
+ return false;
+ }
+ }
+
+ Value *nval;
+ if (needabs) {
+ Constant *AbsExpr = getFunction(M, AMDGPULibFunc(AMDGPULibFunc::EI_FABS,
+ FInfo));
+ if (!AbsExpr)
+ return false;
+ nval = CreateCallEx(B, AbsExpr, opr0, "__fabs");
+ } else {
+ nval = cnval ? cnval : opr0;
+ }
+ if (needlog) {
+ Constant *LogExpr = getFunction(M, AMDGPULibFunc(AMDGPULibFunc::EI_LOG2,
+ FInfo));
+ if (!LogExpr)
+ return false;
+ nval = CreateCallEx(B,LogExpr, nval, "__log2");
+ }
+
+ if (FInfo.getId() == AMDGPULibFunc::EI_POWN) {
+ // convert int(32) to fp(f32 or f64)
+ opr1 = B.CreateSIToFP(opr1, nval->getType(), "pownI2F");
+ }
+ nval = B.CreateFMul(opr1, nval, "__ylogx");
+ nval = CreateCallEx(B,ExpExpr, nval, "__exp2");
+
+ if (needcopysign) {
+ Value *opr_n;
+ Type* rTy = opr0->getType();
+ Type* nTyS = eltType->isDoubleTy() ? B.getInt64Ty() : B.getInt32Ty();
+ Type *nTy = nTyS;
+ if (const VectorType *vTy = dyn_cast<VectorType>(rTy))
+ nTy = VectorType::get(nTyS, vTy->getNumElements());
+ unsigned size = nTy->getScalarSizeInBits();
+ opr_n = CI->getArgOperand(1);
+ if (opr_n->getType()->isIntegerTy())
+ opr_n = B.CreateZExtOrBitCast(opr_n, nTy, "__ytou");
+ else
+ opr_n = B.CreateFPToSI(opr1, nTy, "__ytou");
+
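+    // The low bit of y is shifted into the sign position; ANDing with x keeps
+    // it only when x is negative, so the result gets x's sign for odd y.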
+ Value *sign = B.CreateShl(opr_n, size-1, "__yeven");
+ sign = B.CreateAnd(B.CreateBitCast(opr0, nTy), sign, "__pow_sign");
+ nval = B.CreateOr(B.CreateBitCast(nval, nTy), sign);
+ nval = B.CreateBitCast(nval, opr0->getType());
+ }
+
+ DEBUG(errs() << "AMDIC: " << *CI << " ---> "
+ << "exp2(" << *opr1 << " * log2(" << *opr0 << "))\n");
+ replaceCall(nval);
+
+ return true;
+}
+
+bool AMDGPULibCalls::fold_rootn(CallInst *CI, IRBuilder<> &B,
+ const FuncInfo &FInfo) {
+ Value *opr0 = CI->getArgOperand(0);
+ Value *opr1 = CI->getArgOperand(1);
+
+ ConstantInt *CINT = dyn_cast<ConstantInt>(opr1);
+ if (!CINT) {
+ return false;
+ }
+ int ci_opr1 = (int)CINT->getSExtValue();
+ if (ci_opr1 == 1) { // rootn(x, 1) = x
+ DEBUG(errs() << "AMDIC: " << *CI
+ << " ---> " << *opr0 << "\n");
+ replaceCall(opr0);
+ return true;
+ }
+ if (ci_opr1 == 2) { // rootn(x, 2) = sqrt(x)
+ std::vector<const Type*> ParamsTys;
+ ParamsTys.push_back(opr0->getType());
+ Module *M = CI->getModule();
+ if (Constant *FPExpr = getFunction(M, AMDGPULibFunc(AMDGPULibFunc::EI_SQRT,
+ FInfo))) {
+ DEBUG(errs() << "AMDIC: " << *CI << " ---> sqrt(" << *opr0 << ")\n");
+ Value *nval = CreateCallEx(B,FPExpr, opr0, "__rootn2sqrt");
+ replaceCall(nval);
+ return true;
+ }
+ } else if (ci_opr1 == 3) { // rootn(x, 3) = cbrt(x)
+ Module *M = CI->getModule();
+ if (Constant *FPExpr = getFunction(M, AMDGPULibFunc(AMDGPULibFunc::EI_CBRT,
+ FInfo))) {
+ DEBUG(errs() << "AMDIC: " << *CI << " ---> cbrt(" << *opr0 << ")\n");
+ Value *nval = CreateCallEx(B,FPExpr, opr0, "__rootn2cbrt");
+ replaceCall(nval);
+ return true;
+ }
+ } else if (ci_opr1 == -1) { // rootn(x, -1) = 1.0/x
+ DEBUG(errs() << "AMDIC: " << *CI << " ---> 1.0 / " << *opr0 << "\n");
+ Value *nval = B.CreateFDiv(ConstantFP::get(opr0->getType(), 1.0),
+ opr0,
+ "__rootn2div");
+ replaceCall(nval);
+ return true;
+ } else if (ci_opr1 == -2) { // rootn(x, -2) = rsqrt(x)
+ std::vector<const Type*> ParamsTys;
+ ParamsTys.push_back(opr0->getType());
+ Module *M = CI->getModule();
+ if (Constant *FPExpr = getFunction(M, AMDGPULibFunc(AMDGPULibFunc::EI_RSQRT,
+ FInfo))) {
+ DEBUG(errs() << "AMDIC: " << *CI << " ---> rsqrt(" << *opr0 << ")\n");
+ Value *nval = CreateCallEx(B,FPExpr, opr0, "__rootn2rsqrt");
+ replaceCall(nval);
+ return true;
+ }
+ }
+ return false;
+}
+
+bool AMDGPULibCalls::fold_fma_mad(CallInst *CI, IRBuilder<> &B,
+ const FuncInfo &FInfo) {
+ Value *opr0 = CI->getArgOperand(0);
+ Value *opr1 = CI->getArgOperand(1);
+ Value *opr2 = CI->getArgOperand(2);
+
+ ConstantFP *CF0 = dyn_cast<ConstantFP>(opr0);
+ ConstantFP *CF1 = dyn_cast<ConstantFP>(opr1);
+ if ((CF0 && CF0->isZero()) || (CF1 && CF1->isZero())) {
+ // fma/mad(a, b, c) = c if a=0 || b=0
+ DEBUG(errs() << "AMDIC: " << *CI << " ---> " << *opr2 << "\n");
+ replaceCall(opr2);
+ return true;
+ }
+ if (CF0 && CF0->isExactlyValue(1.0f)) {
+ // fma/mad(a, b, c) = b+c if a=1
+ DEBUG(errs() << "AMDIC: " << *CI << " ---> "
+ << *opr1 << " + " << *opr2 << "\n");
+ Value *nval = B.CreateFAdd(opr1, opr2, "fmaadd");
+ replaceCall(nval);
+ return true;
+ }
+ if (CF1 && CF1->isExactlyValue(1.0f)) {
+ // fma/mad(a, b, c) = a+c if b=1
+ DEBUG(errs() << "AMDIC: " << *CI << " ---> "
+ << *opr0 << " + " << *opr2 << "\n");
+ Value *nval = B.CreateFAdd(opr0, opr2, "fmaadd");
+ replaceCall(nval);
+ return true;
+ }
+ if (ConstantFP *CF = dyn_cast<ConstantFP>(opr2)) {
+ if (CF->isZero()) {
+ // fma/mad(a, b, c) = a*b if c=0
+ DEBUG(errs() << "AMDIC: " << *CI << " ---> "
+ << *opr0 << " * " << *opr1 << "\n");
+ Value *nval = B.CreateFMul(opr0, opr1, "fmamul");
+ replaceCall(nval);
+ return true;
+ }
+ }
+
+ return false;
+}
+
+// Get a scalar native builtin single-argument FP function.
+Constant* AMDGPULibCalls::getNativeFunction(Module* M, const FuncInfo& FInfo) {
+ if (getArgType(FInfo) == AMDGPULibFunc::F64 || !HasNative(FInfo.getId()))
+ return nullptr;
+ FuncInfo nf = FInfo;
+ nf.setPrefix(AMDGPULibFunc::NATIVE);
+ return getFunction(M, nf);
+}
+
+// fold sqrt -> native_sqrt (x)
+bool AMDGPULibCalls::fold_sqrt(CallInst *CI, IRBuilder<> &B,
+ const FuncInfo &FInfo) {
+ if (getArgType(FInfo) == AMDGPULibFunc::F32 && (getVecSize(FInfo) == 1) &&
+ (FInfo.getPrefix() != AMDGPULibFunc::NATIVE)) {
+ if (Constant *FPExpr = getNativeFunction(
+ CI->getModule(), AMDGPULibFunc(AMDGPULibFunc::EI_SQRT, FInfo))) {
+ Value *opr0 = CI->getArgOperand(0);
+ DEBUG(errs() << "AMDIC: " << *CI << " ---> "
+ << "sqrt(" << *opr0 << ")\n");
+ Value *nval = CreateCallEx(B,FPExpr, opr0, "__sqrt");
+ replaceCall(nval);
+ return true;
+ }
+ }
+ return false;
+}
+
+// fold sin, cos -> sincos.
+bool AMDGPULibCalls::fold_sincos(CallInst *CI, IRBuilder<> &B,
+ AliasAnalysis *AA) {
+ AMDGPULibFunc fInfo;
+ if (!AMDGPULibFunc::parse(CI->getCalledFunction()->getName(), fInfo))
+ return false;
+
+ assert(fInfo.getId() == AMDGPULibFunc::EI_SIN ||
+ fInfo.getId() == AMDGPULibFunc::EI_COS);
+ bool const isSin = fInfo.getId() == AMDGPULibFunc::EI_SIN;
+
+ Value *CArgVal = CI->getArgOperand(0);
+ BasicBlock * const CBB = CI->getParent();
+
+ int const MaxScan = 30;
+
+ { // fold in load value.
+ LoadInst *LI = dyn_cast<LoadInst>(CArgVal);
+ if (LI && LI->getParent() == CBB) {
+ BasicBlock::iterator BBI = LI->getIterator();
+ Value *AvailableVal = FindAvailableLoadedValue(LI, CBB, BBI, MaxScan, AA);
+ if (AvailableVal) {
+ CArgVal->replaceAllUsesWith(AvailableVal);
+ if (CArgVal->getNumUses() == 0)
+ LI->eraseFromParent();
+ CArgVal = CI->getArgOperand(0);
+ }
+ }
+ }
+
+ Module *M = CI->getModule();
+ fInfo.setId(isSin ? AMDGPULibFunc::EI_COS : AMDGPULibFunc::EI_SIN);
+ std::string const PairName = fInfo.mangle();
+
+ CallInst *UI = nullptr;
+ for (User* U : CArgVal->users()) {
+ CallInst *XI = dyn_cast_or_null<CallInst>(U);
+ if (!XI || XI == CI || XI->getParent() != CBB)
+ continue;
+
+ Function *UCallee = XI->getCalledFunction();
+ if (!UCallee || !UCallee->getName().equals(PairName))
+ continue;
+
+ BasicBlock::iterator BBI = CI->getIterator();
+ if (BBI == CI->getParent()->begin())
+ break;
+ --BBI;
+ for (int I = MaxScan; I > 0 && BBI != CBB->begin(); --BBI, --I) {
+ if (cast<Instruction>(BBI) == XI) {
+ UI = XI;
+ break;
+ }
+ }
+ if (UI) break;
+ }
+
+ if (!UI) return false;
+
+ // Merge the sin and cos.
+
+  // For OpenCL 2.0 we only have a generic implementation of the sincos
+  // function.
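+  // For example, the pair  s = sin(x); c = cos(x);  in one basic block is
+  // rewritten (roughly) as  s = sincos(x, &tmp); c = tmp;  with tmp placed in
+  // an alloca and cast to the pointer type sincos expects.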
+ AMDGPULibFunc nf(AMDGPULibFunc::EI_SINCOS, fInfo);
+ const AMDGPUAS AS = AMDGPU::getAMDGPUAS(*M);
+ nf.getLeads()[0].PtrKind = AMDGPULibFunc::getEPtrKindFromAddrSpace(AS.FLAT_ADDRESS);
+ Function *Fsincos = dyn_cast_or_null<Function>(getFunction(M, nf));
+ if (!Fsincos) return false;
+
+ BasicBlock::iterator ItOld = B.GetInsertPoint();
+ AllocaInst *Alloc = insertAlloca(UI, B, "__sincos_");
+ B.SetInsertPoint(UI);
+
+ Value *P = Alloc;
+ Type *PTy = Fsincos->getFunctionType()->getParamType(1);
+  // The AllocaInst allocates memory in the private address space. This needs
+  // to be cast to the address space of the sincos pointer parameter.
+  // In OpenCL 2.0 that is generic, while in 1.2 it is private.
+ if (PTy->getPointerAddressSpace() != AS.PRIVATE_ADDRESS)
+ P = B.CreateAddrSpaceCast(Alloc, PTy);
+ CallInst *Call = CreateCallEx2(B, Fsincos, UI->getArgOperand(0), P);
+
+ DEBUG(errs() << "AMDIC: fold_sincos (" << *CI << ", " << *UI
+ << ") with " << *Call << "\n");
+
+ if (!isSin) { // CI->cos, UI->sin
+ B.SetInsertPoint(&*ItOld);
+ UI->replaceAllUsesWith(&*Call);
+ Instruction *Reload = B.CreateLoad(Alloc);
+ CI->replaceAllUsesWith(Reload);
+ UI->eraseFromParent();
+ CI->eraseFromParent();
+ } else { // CI->sin, UI->cos
+ Instruction *Reload = B.CreateLoad(Alloc);
+ UI->replaceAllUsesWith(Reload);
+ CI->replaceAllUsesWith(Call);
+ UI->eraseFromParent();
+ CI->eraseFromParent();
+ }
+ return true;
+}
+
+// Get insertion point at entry.
+BasicBlock::iterator AMDGPULibCalls::getEntryIns(CallInst * UI) {
+ Function * Func = UI->getParent()->getParent();
+ BasicBlock * BB = &Func->getEntryBlock();
+ assert(BB && "Entry block not found!");
+ BasicBlock::iterator ItNew = BB->begin();
+ return ItNew;
+}
+
+// Insert an AllocaInst at the beginning of the function entry block.
+AllocaInst* AMDGPULibCalls::insertAlloca(CallInst *UI, IRBuilder<> &B,
+ const char *prefix) {
+ BasicBlock::iterator ItNew = getEntryIns(UI);
+ Function *UCallee = UI->getCalledFunction();
+ Type *RetType = UCallee->getReturnType();
+ B.SetInsertPoint(&*ItNew);
+ AllocaInst *Alloc = B.CreateAlloca(RetType, 0,
+ std::string(prefix) + UI->getName());
+ Alloc->setAlignment(UCallee->getParent()->getDataLayout()
+ .getTypeAllocSize(RetType));
+ return Alloc;
+}
+
+bool AMDGPULibCalls::evaluateScalarMathFunc(FuncInfo &FInfo,
+ double& Res0, double& Res1,
+ Constant *copr0, Constant *copr1,
+ Constant *copr2) {
+  // By default, opr0/opr1/opr2 hold values of float/double type.
+  // If they are not float/double, each function has to handle its
+  // operands separately.
+ double opr0=0.0, opr1=0.0, opr2=0.0;
+ ConstantFP *fpopr0 = dyn_cast_or_null<ConstantFP>(copr0);
+ ConstantFP *fpopr1 = dyn_cast_or_null<ConstantFP>(copr1);
+ ConstantFP *fpopr2 = dyn_cast_or_null<ConstantFP>(copr2);
+ if (fpopr0) {
+ opr0 = (getArgType(FInfo) == AMDGPULibFunc::F64)
+ ? fpopr0->getValueAPF().convertToDouble()
+ : (double)fpopr0->getValueAPF().convertToFloat();
+ }
+
+ if (fpopr1) {
+ opr1 = (getArgType(FInfo) == AMDGPULibFunc::F64)
+ ? fpopr1->getValueAPF().convertToDouble()
+ : (double)fpopr1->getValueAPF().convertToFloat();
+ }
+
+ if (fpopr2) {
+ opr2 = (getArgType(FInfo) == AMDGPULibFunc::F64)
+ ? fpopr2->getValueAPF().convertToDouble()
+ : (double)fpopr2->getValueAPF().convertToFloat();
+ }
+
+ switch (FInfo.getId()) {
+ default : return false;
+
+ case AMDGPULibFunc::EI_ACOS:
+ Res0 = acos(opr0);
+ return true;
+
+ case AMDGPULibFunc::EI_ACOSH:
+ // acosh(x) == log(x + sqrt(x*x - 1))
+ Res0 = log(opr0 + sqrt(opr0*opr0 - 1.0));
+ return true;
+
+ case AMDGPULibFunc::EI_ACOSPI:
+ Res0 = acos(opr0) / MATH_PI;
+ return true;
+
+ case AMDGPULibFunc::EI_ASIN:
+ Res0 = asin(opr0);
+ return true;
+
+ case AMDGPULibFunc::EI_ASINH:
+ // asinh(x) == log(x + sqrt(x*x + 1))
+ Res0 = log(opr0 + sqrt(opr0*opr0 + 1.0));
+ return true;
+
+ case AMDGPULibFunc::EI_ASINPI:
+ Res0 = asin(opr0) / MATH_PI;
+ return true;
+
+ case AMDGPULibFunc::EI_ATAN:
+ Res0 = atan(opr0);
+ return true;
+
+ case AMDGPULibFunc::EI_ATANH:
+ // atanh(x) == (log(x+1) - log(x-1))/2;
+ Res0 = (log(opr0 + 1.0) - log(opr0 - 1.0))/2.0;
+ return true;
+
+ case AMDGPULibFunc::EI_ATANPI:
+ Res0 = atan(opr0) / MATH_PI;
+ return true;
+
+ case AMDGPULibFunc::EI_CBRT:
+ Res0 = (opr0 < 0.0) ? -pow(-opr0, 1.0/3.0) : pow(opr0, 1.0/3.0);
+ return true;
+
+ case AMDGPULibFunc::EI_COS:
+ Res0 = cos(opr0);
+ return true;
+
+ case AMDGPULibFunc::EI_COSH:
+ Res0 = cosh(opr0);
+ return true;
+
+ case AMDGPULibFunc::EI_COSPI:
+ Res0 = cos(MATH_PI * opr0);
+ return true;
+
+ case AMDGPULibFunc::EI_EXP:
+ Res0 = exp(opr0);
+ return true;
+
+ case AMDGPULibFunc::EI_EXP2:
+ Res0 = pow(2.0, opr0);
+ return true;
+
+ case AMDGPULibFunc::EI_EXP10:
+ Res0 = pow(10.0, opr0);
+ return true;
+
+ case AMDGPULibFunc::EI_EXPM1:
+ Res0 = exp(opr0) - 1.0;
+ return true;
+
+ case AMDGPULibFunc::EI_LOG:
+ Res0 = log(opr0);
+ return true;
+
+ case AMDGPULibFunc::EI_LOG2:
+ Res0 = log(opr0) / log(2.0);
+ return true;
+
+ case AMDGPULibFunc::EI_LOG10:
+ Res0 = log(opr0) / log(10.0);
+ return true;
+
+ case AMDGPULibFunc::EI_RSQRT:
+ Res0 = 1.0 / sqrt(opr0);
+ return true;
+
+ case AMDGPULibFunc::EI_SIN:
+ Res0 = sin(opr0);
+ return true;
+
+ case AMDGPULibFunc::EI_SINH:
+ Res0 = sinh(opr0);
+ return true;
+
+ case AMDGPULibFunc::EI_SINPI:
+ Res0 = sin(MATH_PI * opr0);
+ return true;
+
+ case AMDGPULibFunc::EI_SQRT:
+ Res0 = sqrt(opr0);
+ return true;
+
+ case AMDGPULibFunc::EI_TAN:
+ Res0 = tan(opr0);
+ return true;
+
+ case AMDGPULibFunc::EI_TANH:
+ Res0 = tanh(opr0);
+ return true;
+
+ case AMDGPULibFunc::EI_TANPI:
+ Res0 = tan(MATH_PI * opr0);
+ return true;
+
+ case AMDGPULibFunc::EI_RECIP:
+ Res0 = 1.0 / opr0;
+ return true;
+
+ // two-arg functions
+ case AMDGPULibFunc::EI_DIVIDE:
+ Res0 = opr0 / opr1;
+ return true;
+
+ case AMDGPULibFunc::EI_POW:
+ case AMDGPULibFunc::EI_POWR:
+ Res0 = pow(opr0, opr1);
+ return true;
+
+ case AMDGPULibFunc::EI_POWN: {
+ if (ConstantInt *iopr1 = dyn_cast_or_null<ConstantInt>(copr1)) {
+ double val = (double)iopr1->getSExtValue();
+ Res0 = pow(opr0, val);
+ return true;
+ }
+ return false;
+ }
+
+ case AMDGPULibFunc::EI_ROOTN: {
+ if (ConstantInt *iopr1 = dyn_cast_or_null<ConstantInt>(copr1)) {
+ double val = (double)iopr1->getSExtValue();
+ Res0 = pow(opr0, 1.0 / val);
+ return true;
+ }
+ return false;
+ }
+
+ // with ptr arg
+ case AMDGPULibFunc::EI_SINCOS:
+ Res0 = sin(opr0);
+ Res1 = cos(opr0);
+ return true;
+
+ // three-arg functions
+ case AMDGPULibFunc::EI_FMA:
+ case AMDGPULibFunc::EI_MAD:
+ Res0 = opr0 * opr1 + opr2;
+ return true;
+ }
+
+ return false;
+}
+
+bool AMDGPULibCalls::evaluateCall(CallInst *aCI, FuncInfo &FInfo) {
+ int numArgs = (int)aCI->getNumArgOperands();
+ if (numArgs > 3)
+ return false;
+
+ Constant *copr0 = nullptr;
+ Constant *copr1 = nullptr;
+ Constant *copr2 = nullptr;
+ if (numArgs > 0) {
+ if ((copr0 = dyn_cast<Constant>(aCI->getArgOperand(0))) == nullptr)
+ return false;
+ }
+
+ if (numArgs > 1) {
+ if ((copr1 = dyn_cast<Constant>(aCI->getArgOperand(1))) == nullptr) {
+ if (FInfo.getId() != AMDGPULibFunc::EI_SINCOS)
+ return false;
+ }
+ }
+
+ if (numArgs > 2) {
+ if ((copr2 = dyn_cast<Constant>(aCI->getArgOperand(2))) == nullptr)
+ return false;
+ }
+
+ // At this point, all arguments to aCI are constants.
+
+ // max vector size is 16, and sincos will generate two results.
+ double DVal0[16], DVal1[16];
+ bool hasTwoResults = (FInfo.getId() == AMDGPULibFunc::EI_SINCOS);
+ if (getVecSize(FInfo) == 1) {
+ if (!evaluateScalarMathFunc(FInfo, DVal0[0],
+ DVal1[0], copr0, copr1, copr2)) {
+ return false;
+ }
+ } else {
+ ConstantDataVector *CDV0 = dyn_cast_or_null<ConstantDataVector>(copr0);
+ ConstantDataVector *CDV1 = dyn_cast_or_null<ConstantDataVector>(copr1);
+ ConstantDataVector *CDV2 = dyn_cast_or_null<ConstantDataVector>(copr2);
+ for (int i=0; i < getVecSize(FInfo); ++i) {
+ Constant *celt0 = CDV0 ? CDV0->getElementAsConstant(i) : nullptr;
+ Constant *celt1 = CDV1 ? CDV1->getElementAsConstant(i) : nullptr;
+ Constant *celt2 = CDV2 ? CDV2->getElementAsConstant(i) : nullptr;
+ if (!evaluateScalarMathFunc(FInfo, DVal0[i],
+ DVal1[i], celt0, celt1, celt2)) {
+ return false;
+ }
+ }
+ }
+
+ LLVMContext &context = CI->getParent()->getParent()->getContext();
+ Constant *nval0, *nval1;
+ if (getVecSize(FInfo) == 1) {
+ nval0 = ConstantFP::get(CI->getType(), DVal0[0]);
+ if (hasTwoResults)
+ nval1 = ConstantFP::get(CI->getType(), DVal1[0]);
+ } else {
+ if (getArgType(FInfo) == AMDGPULibFunc::F32) {
+ SmallVector <float, 0> FVal0, FVal1;
+ for (int i=0; i < getVecSize(FInfo); ++i)
+ FVal0.push_back((float)DVal0[i]);
+ ArrayRef<float> tmp0(FVal0);
+ nval0 = ConstantDataVector::get(context, tmp0);
+ if (hasTwoResults) {
+ for (int i=0; i < getVecSize(FInfo); ++i)
+ FVal1.push_back((float)DVal1[i]);
+ ArrayRef<float> tmp1(FVal1);
+ nval1 = ConstantDataVector::get(context, tmp1);
+ }
+ } else {
+ ArrayRef<double> tmp0(DVal0);
+ nval0 = ConstantDataVector::get(context, tmp0);
+ if (hasTwoResults) {
+ ArrayRef<double> tmp1(DVal1);
+ nval1 = ConstantDataVector::get(context, tmp1);
+ }
+ }
+ }
+
+ if (hasTwoResults) {
+ // sincos
+ assert(FInfo.getId() == AMDGPULibFunc::EI_SINCOS &&
+ "math function with ptr arg not supported yet");
+ new StoreInst(nval1, aCI->getArgOperand(1), aCI);
+ }
+
+ replaceCall(nval0);
+ return true;
+}
+
+// Public interface to the Simplify LibCalls pass.
+FunctionPass *llvm::createAMDGPUSimplifyLibCallsPass(const TargetOptions &Opt) {
+ return new AMDGPUSimplifyLibCalls(Opt);
+}
+
+FunctionPass *llvm::createAMDGPUUseNativeCallsPass() {
+ return new AMDGPUUseNativeCalls();
+}
+
+static bool setFastFlags(Function &F, const TargetOptions &Options) {
+ AttrBuilder B;
+
+ if (Options.UnsafeFPMath || Options.NoInfsFPMath)
+ B.addAttribute("no-infs-fp-math", "true");
+ if (Options.UnsafeFPMath || Options.NoNaNsFPMath)
+ B.addAttribute("no-nans-fp-math", "true");
+ if (Options.UnsafeFPMath) {
+ B.addAttribute("less-precise-fpmad", "true");
+ B.addAttribute("unsafe-fp-math", "true");
+ }
+
+ if (!B.hasAttributes())
+ return false;
+
+ F.addAttributes(AttributeList::FunctionIndex, B);
+
+ return true;
+}
+
+bool AMDGPUSimplifyLibCalls::runOnFunction(Function &F) {
+ if (skipFunction(F))
+ return false;
+
+ bool Changed = false;
+ auto AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
+
+ DEBUG(dbgs() << "AMDIC: process function ";
+ F.printAsOperand(dbgs(), false, F.getParent());
+ dbgs() << '\n';);
+
+ if (!EnablePreLink)
+ Changed |= setFastFlags(F, Options);
+
+ for (auto &BB : F) {
+ for (BasicBlock::iterator I = BB.begin(), E = BB.end(); I != E; ) {
+ // Ignore non-calls.
+ CallInst *CI = dyn_cast<CallInst>(I);
+ ++I;
+ if (!CI) continue;
+
+ // Ignore indirect calls.
+ Function *Callee = CI->getCalledFunction();
+ if (Callee == 0) continue;
+
+ DEBUG(dbgs() << "AMDIC: try folding " << *CI << "\n";
+ dbgs().flush());
+ if(Simplifier.fold(CI, AA))
+ Changed = true;
+ }
+ }
+ return Changed;
+}
+
+bool AMDGPUUseNativeCalls::runOnFunction(Function &F) {
+ if (skipFunction(F) || UseNative.empty())
+ return false;
+
+ bool Changed = false;
+ for (auto &BB : F) {
+ for (BasicBlock::iterator I = BB.begin(), E = BB.end(); I != E; ) {
+ // Ignore non-calls.
+ CallInst *CI = dyn_cast<CallInst>(I);
+ ++I;
+ if (!CI) continue;
+
+ // Ignore indirect calls.
+ Function *Callee = CI->getCalledFunction();
+ if (Callee == 0) continue;
+
+ if(Simplifier.useNative(CI))
+ Changed = true;
+ }
+ }
+ return Changed;
+}
diff --git a/lib/Target/AMDGPU/AMDGPULibFunc.cpp b/lib/Target/AMDGPU/AMDGPULibFunc.cpp
new file mode 100644
index 000000000000..4671273d61f9
--- /dev/null
+++ b/lib/Target/AMDGPU/AMDGPULibFunc.cpp
@@ -0,0 +1,1054 @@
+//===-- AMDGPULibFunc.cpp -------------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains utility functions to work with Itanium mangled names
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "AMDGPULibFunc.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringMap.h"
+#include "llvm/ADT/StringSwitch.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/ValueSymbolTable.h"
+#include "llvm/Support/raw_ostream.h"
+#include <string>
+
+using namespace llvm;
+
+namespace {
+
+enum EManglingParam {
+ E_NONE,
+ EX_EVENT,
+ EX_FLOAT4,
+ EX_INTV4,
+ EX_RESERVEDID,
+ EX_SAMPLER,
+ EX_SIZET,
+ EX_UINT,
+ EX_UINTV4,
+ E_ANY,
+ E_CONSTPTR_ANY,
+ E_CONSTPTR_SWAPGL,
+ E_COPY,
+ E_IMAGECOORDS,
+ E_POINTEE,
+ E_SETBASE_I32,
+ E_SETBASE_U32,
+ E_MAKEBASE_UNS,
+ E_V16_OF_POINTEE,
+ E_V2_OF_POINTEE,
+ E_V3_OF_POINTEE,
+ E_V4_OF_POINTEE,
+ E_V8_OF_POINTEE,
+ E_VLTLPTR_ANY,
+};
+
+struct ManglingRule {
+ StringRef const Name;
+ unsigned char Lead[2];
+ unsigned char Param[5];
+
+ int maxLeadIndex() const { return (std::max)(Lead[0], Lead[1]); }
+ int getNumLeads() const { return (Lead[0] ? 1 : 0) + (Lead[1] ? 1 : 0); }
+
+ unsigned getNumArgs() const;
+};
+
+// Information about library functions with unmangled names.
+class UnmangledFuncInfo {
+ StringRef const Name;
+ unsigned NumArgs;
+
+ // Table for all lib functions with unmangled names.
+ static const UnmangledFuncInfo Table[];
+
+ // Number of entries in Table.
+ static const unsigned TableSize;
+
+ // Map function name to index.
+ class NameMap : public StringMap<unsigned> {
+ public:
+ NameMap() {
+ for (unsigned I = 0; I != TableSize; ++I)
+ (*this)[Table[I].Name] = I;
+ }
+ };
+ friend class NameMap;
+ static NameMap Map;
+
+public:
+ using ID = AMDGPULibFunc::EFuncId;
+ UnmangledFuncInfo() = default;
+ UnmangledFuncInfo(StringRef _Name, unsigned _NumArgs)
+ : Name(_Name), NumArgs(_NumArgs) {}
+ // Get index to Table by function name.
+ static bool lookup(StringRef Name, ID &Id);
+ static unsigned toIndex(ID Id) {
+ assert(static_cast<unsigned>(Id) >
+ static_cast<unsigned>(AMDGPULibFunc::EI_LAST_MANGLED) &&
+ "Invalid unmangled library function");
+ return static_cast<unsigned>(Id) - 1 -
+ static_cast<unsigned>(AMDGPULibFunc::EI_LAST_MANGLED);
+ }
+ static ID toFuncId(unsigned Index) {
+ assert(Index < TableSize && "Invalid unmangled library function");
+ return static_cast<ID>(
+ Index + 1 + static_cast<unsigned>(AMDGPULibFunc::EI_LAST_MANGLED));
+ }
+ static unsigned getNumArgs(ID Id) { return Table[toIndex(Id)].NumArgs; }
+ static StringRef getName(ID Id) { return Table[toIndex(Id)].Name; }
+};
+
+unsigned ManglingRule::getNumArgs() const {
+ unsigned I=0;
+ while (I < (sizeof Param/sizeof Param[0]) && Param[I]) ++I;
+ return I;
+}
+
+// This table describes the formal argument type rules for library functions.
+// The order of the rules corresponds to the EFuncId enum in AMDGPULibFunc.h.
+//
+// "<func name>", { <leads> }, { <param rules> }
+// where:
+// <leads> - list of integers that are one-based indexes of the formal
+// arguments used to mangle a function name. Other argument types are derived
+// from the types of these 'leads'. The order of integers in this list
+// corresponds to the order in which these arguments are mangled in the EDG
+// mangling scheme. The same order should be preserved for arguments in the
+// AMDGPULibFunc structure when it is used for mangling. For example:
+//    { "vstorea_half", {3,1}, {E_ANY,EX_SIZET,E_ANY}},
+// will be mangled in the EDG scheme as vstorea_half_<3rdparam>_<1stparam>.
+// When mangling from code use:
+//    AMDGPULibFunc insc;
+//    insc.param[0] = ... // describe the 3rd parameter
+//    insc.param[1] = ... // describe the 1st parameter
+//
+// <param rules> - list of rules used to derive all of the function's formal
+// argument types. EX_-prefixed entries are simple types; the others are
+// derived from the latest 'lead' argument type, in the order of encoding from
+// first to last. E_ANY - use the previous lead type, E_CONSTPTR_ANY - make a
+// const pointer out of the previous lead type, etc.; see
+// ParamIterator::getNextParam() for details.
+
+static const ManglingRule manglingRules[] = {
+{ StringRef(), {0}, {0} },
+{ "abs" , {1}, {E_ANY}},
+{ "abs_diff" , {1}, {E_ANY,E_COPY}},
+{ "acos" , {1}, {E_ANY}},
+{ "acosh" , {1}, {E_ANY}},
+{ "acospi" , {1}, {E_ANY}},
+{ "add_sat" , {1}, {E_ANY,E_COPY}},
+{ "all" , {1}, {E_ANY}},
+{ "any" , {1}, {E_ANY}},
+{ "asin" , {1}, {E_ANY}},
+{ "asinh" , {1}, {E_ANY}},
+{ "asinpi" , {1}, {E_ANY}},
+{ "async_work_group_copy" , {1}, {E_ANY,E_CONSTPTR_SWAPGL,EX_SIZET,EX_EVENT}},
+{ "async_work_group_strided_copy" , {1}, {E_ANY,E_CONSTPTR_SWAPGL,EX_SIZET,EX_SIZET,EX_EVENT}},
+{ "atan" , {1}, {E_ANY}},
+{ "atan2" , {1}, {E_ANY,E_COPY}},
+{ "atan2pi" , {1}, {E_ANY,E_COPY}},
+{ "atanh" , {1}, {E_ANY}},
+{ "atanpi" , {1}, {E_ANY}},
+{ "atomic_add" , {1}, {E_VLTLPTR_ANY,E_POINTEE}},
+{ "atomic_and" , {1}, {E_VLTLPTR_ANY,E_POINTEE}},
+{ "atomic_cmpxchg" , {1}, {E_VLTLPTR_ANY,E_POINTEE,E_POINTEE}},
+{ "atomic_dec" , {1}, {E_VLTLPTR_ANY}},
+{ "atomic_inc" , {1}, {E_VLTLPTR_ANY}},
+{ "atomic_max" , {1}, {E_VLTLPTR_ANY,E_POINTEE}},
+{ "atomic_min" , {1}, {E_VLTLPTR_ANY,E_POINTEE}},
+{ "atomic_or" , {1}, {E_VLTLPTR_ANY,E_POINTEE}},
+{ "atomic_sub" , {1}, {E_VLTLPTR_ANY,E_POINTEE}},
+{ "atomic_xchg" , {1}, {E_VLTLPTR_ANY,E_POINTEE}},
+{ "atomic_xor" , {1}, {E_VLTLPTR_ANY,E_POINTEE}},
+{ "bitselect" , {1}, {E_ANY,E_COPY,E_COPY}},
+{ "cbrt" , {1}, {E_ANY}},
+{ "ceil" , {1}, {E_ANY}},
+{ "clamp" , {1}, {E_ANY,E_COPY,E_COPY}},
+{ "clz" , {1}, {E_ANY}},
+{ "commit_read_pipe" , {1}, {E_ANY,EX_RESERVEDID}},
+{ "commit_write_pipe" , {1}, {E_ANY,EX_RESERVEDID}},
+{ "copysign" , {1}, {E_ANY,E_COPY}},
+{ "cos" , {1}, {E_ANY}},
+{ "cosh" , {1}, {E_ANY}},
+{ "cospi" , {1}, {E_ANY}},
+{ "cross" , {1}, {E_ANY,E_COPY}},
+{ "ctz" , {1}, {E_ANY}},
+{ "degrees" , {1}, {E_ANY}},
+{ "distance" , {1}, {E_ANY,E_COPY}},
+{ "divide" , {1}, {E_ANY,E_COPY}},
+{ "dot" , {1}, {E_ANY,E_COPY}},
+{ "erf" , {1}, {E_ANY}},
+{ "erfc" , {1}, {E_ANY}},
+{ "exp" , {1}, {E_ANY}},
+{ "exp10" , {1}, {E_ANY}},
+{ "exp2" , {1}, {E_ANY}},
+{ "expm1" , {1}, {E_ANY}},
+{ "fabs" , {1}, {E_ANY}},
+{ "fast_distance" , {1}, {E_ANY,E_COPY}},
+{ "fast_length" , {1}, {E_ANY}},
+{ "fast_normalize" , {1}, {E_ANY}},
+{ "fdim" , {1}, {E_ANY,E_COPY}},
+{ "floor" , {1}, {E_ANY}},
+{ "fma" , {1}, {E_ANY,E_COPY,E_COPY}},
+{ "fmax" , {1}, {E_ANY,E_COPY}},
+{ "fmin" , {1}, {E_ANY,E_COPY}},
+{ "fmod" , {1}, {E_ANY,E_COPY}},
+{ "fract" , {2}, {E_POINTEE,E_ANY}},
+{ "frexp" , {1,2}, {E_ANY,E_ANY}},
+{ "get_image_array_size" , {1}, {E_ANY}},
+{ "get_image_channel_data_type" , {1}, {E_ANY}},
+{ "get_image_channel_order" , {1}, {E_ANY}},
+{ "get_image_dim" , {1}, {E_ANY}},
+{ "get_image_height" , {1}, {E_ANY}},
+{ "get_image_width" , {1}, {E_ANY}},
+{ "get_pipe_max_packets" , {1}, {E_ANY}},
+{ "get_pipe_num_packets" , {1}, {E_ANY}},
+{ "hadd" , {1}, {E_ANY,E_COPY}},
+{ "hypot" , {1}, {E_ANY,E_COPY}},
+{ "ilogb" , {1}, {E_ANY}},
+{ "isequal" , {1}, {E_ANY,E_COPY}},
+{ "isfinite" , {1}, {E_ANY}},
+{ "isgreater" , {1}, {E_ANY,E_COPY}},
+{ "isgreaterequal" , {1}, {E_ANY,E_COPY}},
+{ "isinf" , {1}, {E_ANY}},
+{ "isless" , {1}, {E_ANY,E_COPY}},
+{ "islessequal" , {1}, {E_ANY,E_COPY}},
+{ "islessgreater" , {1}, {E_ANY,E_COPY}},
+{ "isnan" , {1}, {E_ANY}},
+{ "isnormal" , {1}, {E_ANY}},
+{ "isnotequal" , {1}, {E_ANY,E_COPY}},
+{ "isordered" , {1}, {E_ANY,E_COPY}},
+{ "isunordered" , {1}, {E_ANY,E_COPY}},
+{ "ldexp" , {1}, {E_ANY,E_SETBASE_I32}},
+{ "length" , {1}, {E_ANY}},
+{ "lgamma" , {1}, {E_ANY}},
+{ "lgamma_r" , {1,2}, {E_ANY,E_ANY}},
+{ "log" , {1}, {E_ANY}},
+{ "log10" , {1}, {E_ANY}},
+{ "log1p" , {1}, {E_ANY}},
+{ "log2" , {1}, {E_ANY}},
+{ "logb" , {1}, {E_ANY}},
+{ "mad" , {1}, {E_ANY,E_COPY,E_COPY}},
+{ "mad24" , {1}, {E_ANY,E_COPY,E_COPY}},
+{ "mad_hi" , {1}, {E_ANY,E_COPY,E_COPY}},
+{ "mad_sat" , {1}, {E_ANY,E_COPY,E_COPY}},
+{ "max" , {1}, {E_ANY,E_COPY}},
+{ "maxmag" , {1}, {E_ANY,E_COPY}},
+{ "min" , {1}, {E_ANY,E_COPY}},
+{ "minmag" , {1}, {E_ANY,E_COPY}},
+{ "mix" , {1}, {E_ANY,E_COPY,E_COPY}},
+{ "modf" , {2}, {E_POINTEE,E_ANY}},
+{ "mul24" , {1}, {E_ANY,E_COPY}},
+{ "mul_hi" , {1}, {E_ANY,E_COPY}},
+{ "nan" , {1}, {E_ANY}},
+{ "nextafter" , {1}, {E_ANY,E_COPY}},
+{ "normalize" , {1}, {E_ANY}},
+{ "popcount" , {1}, {E_ANY}},
+{ "pow" , {1}, {E_ANY,E_COPY}},
+{ "pown" , {1}, {E_ANY,E_SETBASE_I32}},
+{ "powr" , {1}, {E_ANY,E_COPY}},
+{ "prefetch" , {1}, {E_CONSTPTR_ANY,EX_SIZET}},
+{ "radians" , {1}, {E_ANY}},
+{ "recip" , {1}, {E_ANY}},
+{ "remainder" , {1}, {E_ANY,E_COPY}},
+{ "remquo" , {1,3}, {E_ANY,E_COPY,E_ANY}},
+{ "reserve_read_pipe" , {1}, {E_ANY,EX_UINT}},
+{ "reserve_write_pipe" , {1}, {E_ANY,EX_UINT}},
+{ "rhadd" , {1}, {E_ANY,E_COPY}},
+{ "rint" , {1}, {E_ANY}},
+{ "rootn" , {1}, {E_ANY,E_SETBASE_I32}},
+{ "rotate" , {1}, {E_ANY,E_COPY}},
+{ "round" , {1}, {E_ANY}},
+{ "rsqrt" , {1}, {E_ANY}},
+{ "select" , {1,3}, {E_ANY,E_COPY,E_ANY}},
+{ "shuffle" , {1,2}, {E_ANY,E_ANY}},
+{ "shuffle2" , {1,3}, {E_ANY,E_COPY,E_ANY}},
+{ "sign" , {1}, {E_ANY}},
+{ "signbit" , {1}, {E_ANY}},
+{ "sin" , {1}, {E_ANY}},
+{ "sincos" , {2}, {E_POINTEE,E_ANY}},
+{ "sinh" , {1}, {E_ANY}},
+{ "sinpi" , {1}, {E_ANY}},
+{ "smoothstep" , {1}, {E_ANY,E_COPY,E_COPY}},
+{ "sqrt" , {1}, {E_ANY}},
+{ "step" , {1}, {E_ANY,E_COPY}},
+{ "sub_group_broadcast" , {1}, {E_ANY,EX_UINT}},
+{ "sub_group_commit_read_pipe" , {1}, {E_ANY,EX_RESERVEDID}},
+{ "sub_group_commit_write_pipe" , {1}, {E_ANY,EX_RESERVEDID}},
+{ "sub_group_reduce_add" , {1}, {E_ANY}},
+{ "sub_group_reduce_max" , {1}, {E_ANY}},
+{ "sub_group_reduce_min" , {1}, {E_ANY}},
+{ "sub_group_reserve_read_pipe" , {1}, {E_ANY,EX_UINT}},
+{ "sub_group_reserve_write_pipe" , {1}, {E_ANY,EX_UINT}},
+{ "sub_group_scan_exclusive_add" , {1}, {E_ANY}},
+{ "sub_group_scan_exclusive_max" , {1}, {E_ANY}},
+{ "sub_group_scan_exclusive_min" , {1}, {E_ANY}},
+{ "sub_group_scan_inclusive_add" , {1}, {E_ANY}},
+{ "sub_group_scan_inclusive_max" , {1}, {E_ANY}},
+{ "sub_group_scan_inclusive_min" , {1}, {E_ANY}},
+{ "sub_sat" , {1}, {E_ANY,E_COPY}},
+{ "tan" , {1}, {E_ANY}},
+{ "tanh" , {1}, {E_ANY}},
+{ "tanpi" , {1}, {E_ANY}},
+{ "tgamma" , {1}, {E_ANY}},
+{ "trunc" , {1}, {E_ANY}},
+{ "upsample" , {1}, {E_ANY,E_MAKEBASE_UNS}},
+{ "vec_step" , {1}, {E_ANY}},
+{ "vstore" , {3}, {E_POINTEE,EX_SIZET,E_ANY}},
+{ "vstore16" , {3}, {E_V16_OF_POINTEE,EX_SIZET,E_ANY}},
+{ "vstore2" , {3}, {E_V2_OF_POINTEE,EX_SIZET,E_ANY}},
+{ "vstore3" , {3}, {E_V3_OF_POINTEE,EX_SIZET,E_ANY}},
+{ "vstore4" , {3}, {E_V4_OF_POINTEE,EX_SIZET,E_ANY}},
+{ "vstore8" , {3}, {E_V8_OF_POINTEE,EX_SIZET,E_ANY}},
+{ "work_group_commit_read_pipe" , {1}, {E_ANY,EX_RESERVEDID}},
+{ "work_group_commit_write_pipe" , {1}, {E_ANY,EX_RESERVEDID}},
+{ "work_group_reduce_add" , {1}, {E_ANY}},
+{ "work_group_reduce_max" , {1}, {E_ANY}},
+{ "work_group_reduce_min" , {1}, {E_ANY}},
+{ "work_group_reserve_read_pipe" , {1}, {E_ANY,EX_UINT}},
+{ "work_group_reserve_write_pipe" , {1}, {E_ANY,EX_UINT}},
+{ "work_group_scan_exclusive_add" , {1}, {E_ANY}},
+{ "work_group_scan_exclusive_max" , {1}, {E_ANY}},
+{ "work_group_scan_exclusive_min" , {1}, {E_ANY}},
+{ "work_group_scan_inclusive_add" , {1}, {E_ANY}},
+{ "work_group_scan_inclusive_max" , {1}, {E_ANY}},
+{ "work_group_scan_inclusive_min" , {1}, {E_ANY}},
+{ "write_imagef" , {1}, {E_ANY,E_IMAGECOORDS,EX_FLOAT4}},
+{ "write_imagei" , {1}, {E_ANY,E_IMAGECOORDS,EX_INTV4}},
+{ "write_imageui" , {1}, {E_ANY,E_IMAGECOORDS,EX_UINTV4}},
+{ "ncos" , {1}, {E_ANY} },
+{ "nexp2" , {1}, {E_ANY} },
+{ "nfma" , {1}, {E_ANY, E_COPY, E_COPY} },
+{ "nlog2" , {1}, {E_ANY} },
+{ "nrcp" , {1}, {E_ANY} },
+{ "nrsqrt" , {1}, {E_ANY} },
+{ "nsin" , {1}, {E_ANY} },
+{ "nsqrt" , {1}, {E_ANY} },
+{ "ftz" , {1}, {E_ANY} },
+{ "fldexp" , {1}, {E_ANY, EX_UINT} },
+{ "class" , {1}, {E_ANY, EX_UINT} },
+{ "rcbrt" , {1}, {E_ANY} },
+};
+
+// Library functions with unmangled names.
+const UnmangledFuncInfo UnmangledFuncInfo::Table[] = {
+ {"__read_pipe_2", 4},
+ {"__read_pipe_4", 6},
+ {"__write_pipe_2", 4},
+ {"__write_pipe_4", 6},
+};
+
+const unsigned UnmangledFuncInfo::TableSize =
+ sizeof(UnmangledFuncInfo::Table) / sizeof(UnmangledFuncInfo::Table[0]);
+
+UnmangledFuncInfo::NameMap UnmangledFuncInfo::Map;
+
+static const struct ManglingRulesMap : public StringMap<int> {
+ ManglingRulesMap()
+ : StringMap<int>(sizeof(manglingRules)/sizeof(manglingRules[0])) {
+ int Id = 0;
+ for (auto Rule : manglingRules)
+ insert({ Rule.Name, Id++ });
+ }
+} manglingRulesMap;
+
+static AMDGPULibFunc::Param getRetType(AMDGPULibFunc::EFuncId id,
+ const AMDGPULibFunc::Param (&Leads)[2]) {
+ AMDGPULibFunc::Param Res = Leads[0];
+  // TBD - This switch may need to be extended for other intrinsics.
+ switch (id) {
+ case AMDGPULibFunc::EI_SINCOS:
+ Res.PtrKind = AMDGPULibFunc::BYVALUE;
+ break;
+ default:
+ break;
+ }
+ return Res;
+}
+
+class ParamIterator {
+ const AMDGPULibFunc::Param (&Leads)[2];
+ const ManglingRule& Rule;
+ int Index;
+public:
+ ParamIterator(const AMDGPULibFunc::Param (&leads)[2],
+ const ManglingRule& rule)
+ : Leads(leads), Rule(rule), Index(0) {}
+
+ AMDGPULibFunc::Param getNextParam();
+};
+
+AMDGPULibFunc::Param ParamIterator::getNextParam() {
+ AMDGPULibFunc::Param P;
+ if (Index >= int(sizeof Rule.Param/sizeof Rule.Param[0])) return P;
+
+ const char R = Rule.Param[Index];
+ switch (R) {
+ case E_NONE: break;
+ case EX_UINT:
+ P.ArgType = AMDGPULibFunc::U32; break;
+ case EX_INTV4:
+ P.ArgType = AMDGPULibFunc::I32; P.VectorSize = 4; break;
+ case EX_UINTV4:
+ P.ArgType = AMDGPULibFunc::U32; P.VectorSize = 4; break;
+ case EX_FLOAT4:
+ P.ArgType = AMDGPULibFunc::F32; P.VectorSize = 4; break;
+ case EX_SIZET:
+ P.ArgType = AMDGPULibFunc::U64; break;
+ case EX_EVENT:
+ P.ArgType = AMDGPULibFunc::EVENT; break;
+ case EX_SAMPLER:
+ P.ArgType = AMDGPULibFunc::SAMPLER; break;
+ case EX_RESERVEDID: break; // TBD
+ default:
+ if (Index == (Rule.Lead[1] - 1)) P = Leads[1];
+ else P = Leads[0];
+
+ switch (R) {
+ case E_ANY:
+ case E_COPY: break;
+
+ case E_POINTEE:
+ P.PtrKind = AMDGPULibFunc::BYVALUE; break;
+ case E_V2_OF_POINTEE:
+ P.VectorSize = 2; P.PtrKind = AMDGPULibFunc::BYVALUE; break;
+ case E_V3_OF_POINTEE:
+ P.VectorSize = 3; P.PtrKind = AMDGPULibFunc::BYVALUE; break;
+ case E_V4_OF_POINTEE:
+ P.VectorSize = 4; P.PtrKind = AMDGPULibFunc::BYVALUE; break;
+ case E_V8_OF_POINTEE:
+ P.VectorSize = 8; P.PtrKind = AMDGPULibFunc::BYVALUE; break;
+ case E_V16_OF_POINTEE:
+ P.VectorSize = 16; P.PtrKind = AMDGPULibFunc::BYVALUE; break;
+ case E_CONSTPTR_ANY:
+ P.PtrKind |= AMDGPULibFunc::CONST; break;
+ case E_VLTLPTR_ANY:
+ P.PtrKind |= AMDGPULibFunc::VOLATILE; break;
+ case E_SETBASE_I32:
+ P.ArgType = AMDGPULibFunc::I32; break;
+ case E_SETBASE_U32:
+ P.ArgType = AMDGPULibFunc::U32; break;
+
+ case E_MAKEBASE_UNS:
+ P.ArgType &= ~AMDGPULibFunc::BASE_TYPE_MASK;
+ P.ArgType |= AMDGPULibFunc::UINT;
+ break;
+
+ case E_IMAGECOORDS:
+ switch (P.ArgType) {
+ case AMDGPULibFunc::IMG1DA: P.VectorSize = 2; break;
+ case AMDGPULibFunc::IMG1DB: P.VectorSize = 1; break;
+ case AMDGPULibFunc::IMG2DA: P.VectorSize = 4; break;
+ case AMDGPULibFunc::IMG1D: P.VectorSize = 1; break;
+ case AMDGPULibFunc::IMG2D: P.VectorSize = 2; break;
+ case AMDGPULibFunc::IMG3D: P.VectorSize = 4; break;
+ }
+ P.PtrKind = AMDGPULibFunc::BYVALUE;
+ P.ArgType = AMDGPULibFunc::I32;
+ break;
+
+ case E_CONSTPTR_SWAPGL: {
+ unsigned AS = AMDGPULibFunc::getAddrSpaceFromEPtrKind(P.PtrKind);
+ switch (AS) {
+ case AMDGPUAS::GLOBAL_ADDRESS: AS = AMDGPUAS::LOCAL_ADDRESS; break;
+ case AMDGPUAS::LOCAL_ADDRESS: AS = AMDGPUAS::GLOBAL_ADDRESS; break;
+ }
+ P.PtrKind = AMDGPULibFunc::getEPtrKindFromAddrSpace(AS);
+ P.PtrKind |= AMDGPULibFunc::CONST;
+ break;
+ }
+
+    default: llvm_unreachable("Unhandled param rule");
+ }
+ }
+ ++Index;
+ return P;
+}
+
+inline static void drop_front(StringRef& str, size_t n = 1) {
+ str = str.drop_front(n);
+}
+
+static bool eatTerm(StringRef& mangledName, const char c) {
+ if (mangledName.front() == c) {
+ drop_front(mangledName);
+ return true;
+ }
+ return false;
+}
+
+template <size_t N>
+static bool eatTerm(StringRef& mangledName, const char (&str)[N]) {
+ if (mangledName.startswith(StringRef(str, N-1))) {
+ drop_front(mangledName, N-1);
+ return true;
+ }
+ return false;
+}
+
+static inline bool isDigit(char c) { return c >= '0' && c <= '9'; }
+
+static int eatNumber(StringRef& s) {
+ size_t const savedSize = s.size();
+ int n = 0;
+ while (!s.empty() && isDigit(s.front())) {
+ n = n*10 + s.front() - '0';
+ drop_front(s);
+ }
+ return s.size() < savedSize ? n : -1;
+}
+
+static StringRef eatLengthPrefixedName(StringRef& mangledName) {
+ int const Len = eatNumber(mangledName);
+ if (Len <= 0 || static_cast<size_t>(Len) > mangledName.size())
+ return StringRef();
+ StringRef Res = mangledName.substr(0, Len);
+ drop_front(mangledName, Len);
+ return Res;
+}
+
+} // end anonymous namespace
+
+AMDGPUMangledLibFunc::AMDGPUMangledLibFunc() {
+ FuncId = EI_NONE;
+ FKind = NOPFX;
+ Leads[0].reset();
+ Leads[1].reset();
+ Name.clear();
+}
+
+AMDGPUUnmangledLibFunc::AMDGPUUnmangledLibFunc() {
+ FuncId = EI_NONE;
+ FuncTy = nullptr;
+}
+
+AMDGPUMangledLibFunc::AMDGPUMangledLibFunc(
+ EFuncId id, const AMDGPUMangledLibFunc &copyFrom) {
+ FuncId = id;
+ FKind = copyFrom.FKind;
+ Leads[0] = copyFrom.Leads[0];
+ Leads[1] = copyFrom.Leads[1];
+}
+
+///////////////////////////////////////////////////////////////////////////////
+// Demangling
+
+static int parseVecSize(StringRef& mangledName) {
+ size_t const Len = eatNumber(mangledName);
+ switch (Len) {
+ case 2: case 3: case 4: case 8: case 16:
+ return Len;
+ default:
+ break;
+ }
+ return 1;
+}
+
+static AMDGPULibFunc::ENamePrefix parseNamePrefix(StringRef& mangledName) {
+ std::pair<StringRef, StringRef> const P = mangledName.split('_');
+ AMDGPULibFunc::ENamePrefix Pfx =
+ StringSwitch<AMDGPULibFunc::ENamePrefix>(P.first)
+ .Case("native", AMDGPULibFunc::NATIVE)
+ .Case("half" , AMDGPULibFunc::HALF)
+ .Default(AMDGPULibFunc::NOPFX);
+
+ if (Pfx != AMDGPULibFunc::NOPFX)
+ mangledName = P.second;
+
+ return Pfx;
+}
+
+bool AMDGPUMangledLibFunc::parseUnmangledName(StringRef FullName) {
+ FuncId = static_cast<EFuncId>(manglingRulesMap.lookup(FullName));
+ return FuncId != EI_NONE;
+}
+
+///////////////////////////////////////////////////////////////////////////////
+// Itanium Demangling
+
+namespace {
+struct ItaniumParamParser {
+ AMDGPULibFunc::Param Prev;
+ bool parseItaniumParam(StringRef& param, AMDGPULibFunc::Param &res);
+};
+} // namespace
+
+bool ItaniumParamParser::parseItaniumParam(StringRef& param,
+ AMDGPULibFunc::Param &res) {
+ res.reset();
+ if (param.empty()) return false;
+
+ // parse pointer prefix
+ if (eatTerm(param, 'P')) {
+ if (eatTerm(param, 'K')) res.PtrKind |= AMDGPULibFunc::CONST;
+ if (eatTerm(param, 'V')) res.PtrKind |= AMDGPULibFunc::VOLATILE;
+ unsigned AS;
+ if (!eatTerm(param, "U3AS")) {
+ AS = 0;
+ } else {
+ AS = param.front() - '0';
+ drop_front(param, 1);
+ }
+ res.PtrKind |= AMDGPULibFuncBase::getEPtrKindFromAddrSpace(AS);
+ } else {
+ res.PtrKind = AMDGPULibFunc::BYVALUE;
+ }
+
+ // parse vector size
+ if (eatTerm(param,"Dv")) {
+ res.VectorSize = parseVecSize(param);
+ if (res.VectorSize==1 || !eatTerm(param, '_')) return false;
+ }
+
+ // parse type
+ char const TC = param.front();
+ if (::isDigit(TC)) {
+ res.ArgType = StringSwitch<AMDGPULibFunc::EType>
+ (eatLengthPrefixedName(param))
+ .Case("ocl_image1darray" , AMDGPULibFunc::IMG1DA)
+ .Case("ocl_image1dbuffer", AMDGPULibFunc::IMG1DB)
+ .Case("ocl_image2darray" , AMDGPULibFunc::IMG2DA)
+ .Case("ocl_image1d" , AMDGPULibFunc::IMG1D)
+ .Case("ocl_image2d" , AMDGPULibFunc::IMG2D)
+ .Case("ocl_image3d" , AMDGPULibFunc::IMG3D)
+ .Case("ocl_event" , AMDGPULibFunc::DUMMY)
+ .Case("ocl_sampler" , AMDGPULibFunc::DUMMY)
+ .Default(AMDGPULibFunc::DUMMY);
+ } else {
+ drop_front(param);
+ switch (TC) {
+ case 'h': res.ArgType = AMDGPULibFunc::U8; break;
+ case 't': res.ArgType = AMDGPULibFunc::U16; break;
+ case 'j': res.ArgType = AMDGPULibFunc::U32; break;
+ case 'm': res.ArgType = AMDGPULibFunc::U64; break;
+ case 'c': res.ArgType = AMDGPULibFunc::I8; break;
+ case 's': res.ArgType = AMDGPULibFunc::I16; break;
+ case 'i': res.ArgType = AMDGPULibFunc::I32; break;
+ case 'l': res.ArgType = AMDGPULibFunc::I64; break;
+ case 'f': res.ArgType = AMDGPULibFunc::F32; break;
+ case 'd': res.ArgType = AMDGPULibFunc::F64; break;
+ case 'D': if (!eatTerm(param, 'h')) return false;
+ res.ArgType = AMDGPULibFunc::F16; break;
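+    // 'S_' / 'S<n>_' are Itanium substitution references; this simplified
+    // parser just reuses the most recently parsed parameter type.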
+ case 'S':
+ if (!eatTerm(param, '_')) {
+ eatNumber(param);
+ if (!eatTerm(param, '_')) return false;
+ }
+ res.VectorSize = Prev.VectorSize;
+ res.ArgType = Prev.ArgType;
+ break;
+ default:;
+ }
+ }
+ if (res.ArgType == 0) return false;
+ Prev.VectorSize = res.VectorSize;
+ Prev.ArgType = res.ArgType;
+ return true;
+}
+
+bool AMDGPUMangledLibFunc::parseFuncName(StringRef &mangledName) {
+ StringRef Name = eatLengthPrefixedName(mangledName);
+ FKind = parseNamePrefix(Name);
+ if (!parseUnmangledName(Name))
+ return false;
+
+ const ManglingRule& Rule = manglingRules[FuncId];
+ ItaniumParamParser Parser;
+ for (int I=0; I < Rule.maxLeadIndex(); ++I) {
+ Param P;
+ if (!Parser.parseItaniumParam(mangledName, P))
+ return false;
+
+ if ((I + 1) == Rule.Lead[0]) Leads[0] = P;
+ if ((I + 1) == Rule.Lead[1]) Leads[1] = P;
+ }
+ return true;
+}
+
+bool AMDGPUUnmangledLibFunc::parseFuncName(StringRef &Name) {
+ if (!UnmangledFuncInfo::lookup(Name, FuncId))
+ return false;
+ setName(Name);
+ return true;
+}
+
+bool AMDGPULibFunc::parse(StringRef FuncName, AMDGPULibFunc &F) {
+ if (FuncName.empty()) {
+ F.Impl = std::unique_ptr<AMDGPULibFuncImpl>();
+ return false;
+ }
+
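+  // Names that start with the Itanium prefix "_Z" are parsed as mangled
+  // library functions; all other names are looked up in the unmangled table.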
+ if (eatTerm(FuncName, "_Z"))
+ F.Impl = make_unique<AMDGPUMangledLibFunc>();
+ else
+ F.Impl = make_unique<AMDGPUUnmangledLibFunc>();
+ if (F.Impl->parseFuncName(FuncName))
+ return true;
+
+ F.Impl = std::unique_ptr<AMDGPULibFuncImpl>();
+ return false;
+}
+
+StringRef AMDGPUMangledLibFunc::getUnmangledName(StringRef mangledName) {
+ StringRef S = mangledName;
+ if (eatTerm(S, "_Z"))
+ return eatLengthPrefixedName(S);
+ return StringRef();
+}
+
+///////////////////////////////////////////////////////////////////////////////
+// Mangling
+
+template <typename Stream>
+void AMDGPUMangledLibFunc::writeName(Stream &OS) const {
+ const char *Pfx = "";
+ switch (FKind) {
+ case NATIVE: Pfx = "native_"; break;
+ case HALF: Pfx = "half_"; break;
+ default: break;
+ }
+ if (!Name.empty()) {
+ OS << Pfx << Name;
+ } else if (FuncId != EI_NONE) {
+ OS << Pfx;
+ const StringRef& S = manglingRules[FuncId].Name;
+ OS.write(S.data(), S.size());
+ }
+}
+
+std::string AMDGPUMangledLibFunc::mangle() const { return mangleNameItanium(); }
+
+///////////////////////////////////////////////////////////////////////////////
+// Itanium Mangling
+
+static const char *getItaniumTypeName(AMDGPULibFunc::EType T) {
+ switch (T) {
+ case AMDGPULibFunc::U8: return "h";
+ case AMDGPULibFunc::U16: return "t";
+ case AMDGPULibFunc::U32: return "j";
+ case AMDGPULibFunc::U64: return "m";
+ case AMDGPULibFunc::I8: return "c";
+ case AMDGPULibFunc::I16: return "s";
+ case AMDGPULibFunc::I32: return "i";
+ case AMDGPULibFunc::I64: return "l";
+ case AMDGPULibFunc::F16: return "Dh";
+ case AMDGPULibFunc::F32: return "f";
+ case AMDGPULibFunc::F64: return "d";
+ case AMDGPULibFunc::IMG1DA: return "16ocl_image1darray";
+ case AMDGPULibFunc::IMG1DB: return "17ocl_image1dbuffer";
+ case AMDGPULibFunc::IMG2DA: return "16ocl_image2darray";
+ case AMDGPULibFunc::IMG1D: return "11ocl_image1d";
+ case AMDGPULibFunc::IMG2D: return "11ocl_image2d";
+ case AMDGPULibFunc::IMG3D: return "11ocl_image3d";
+ case AMDGPULibFunc::SAMPLER: return "11ocl_sampler";
+ case AMDGPULibFunc::EVENT: return "9ocl_event";
+  default: llvm_unreachable("Unhandled param type");
+ }
+ return nullptr;
+}
+
+namespace {
+// Itanium mangling ABI says:
+// "5.1.8. Compression
+// ... Each non-terminal in the grammar for which <substitution> appears on the
+// right-hand side is both a source of future substitutions and a candidate
+// for being substituted. There are two exceptions that appear to be
+// substitution candidates from the grammar, but are explicitly excluded:
+// 1. <builtin-type> other than vendor extended types ..."
+
+// For the purposes of function mangling the following productions make sense
+// for substitution:
+//   <type> ::= <builtin-type>
+//          ::= <class-enum-type>
+//          ::= <array-type>
+//          ::= <CV-qualifiers> <type>
+//          ::= P <type>   # pointer-to
+//          ::= <substitution>
+//
+// Note that while types like images, samplers and events are encoded by the
+// ABI using the <class-enum-type> production rule, they are not used for
+// substitution because clang considers them builtin types.
+//
+// The DvNN_ type is a GCC extension for vectors and is a subject for
+// substitution.
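+// For example, cross(float4, float4) mangles as _Z5crossDv4_fS_ where the
+// second float4 argument repeats the first Dv4_f component and is therefore
+// emitted as the substitution S_.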
+
+
+class ItaniumMangler {
+  SmallVector<AMDGPULibFunc::Param, 10> Str; // list of accumulated substitutions
+ bool UseAddrSpace;
+
+ int findSubst(const AMDGPULibFunc::Param& P) const {
+ for(unsigned I = 0; I < Str.size(); ++I) {
+ const AMDGPULibFunc::Param& T = Str[I];
+ if (P.PtrKind == T.PtrKind &&
+ P.VectorSize == T.VectorSize &&
+ P.ArgType == T.ArgType) {
+ return I;
+ }
+ }
+ return -1;
+ }
+
+ template <typename Stream>
+ bool trySubst(Stream& os, const AMDGPULibFunc::Param& p) {
+ int const subst = findSubst(p);
+ if (subst < 0) return false;
+ // Substitutions are mangled as S(XX)?_ where XX is a hexadecimal number
+ // 0 1 2
+ // S_ S0_ S1_
+ if (subst == 0) os << "S_";
+ else os << 'S' << (subst-1) << '_';
+ return true;
+ }
+
+public:
+ ItaniumMangler(bool useAddrSpace)
+ : UseAddrSpace(useAddrSpace) {}
+
+ template <typename Stream>
+ void operator()(Stream& os, AMDGPULibFunc::Param p) {
+
+ // Itanium mangling ABI 5.1.8. Compression:
+ // Logically, the substitutable components of a mangled name are considered
+ // left-to-right, components before the composite structure of which they
+ // are a part. If a component has been encountered before, it is substituted
+ // as described below. This decision is independent of whether its components
+ // have been substituted, so an implementation may optimize by considering
+ // large structures for substitution before their components. If a component
+ // has not been encountered before, its mangling is identified, and it is
+ // added to a dictionary of substitution candidates. No entity is added to
+ // the dictionary twice.
+ AMDGPULibFunc::Param Ptr;
+
+ if (p.PtrKind) {
+ if (trySubst(os, p)) return;
+ os << 'P';
+ if (p.PtrKind & AMDGPULibFunc::CONST) os << 'K';
+ if (p.PtrKind & AMDGPULibFunc::VOLATILE) os << 'V';
+ unsigned AS = UseAddrSpace
+ ? AMDGPULibFuncBase::getAddrSpaceFromEPtrKind(p.PtrKind)
+ : 0;
+ if (AS != 0) os << "U3AS" << AS;
+ Ptr = p;
+ p.PtrKind = 0;
+ }
+
+ if (p.VectorSize > 1) {
+ if (trySubst(os, p)) goto exit;
+ Str.push_back(p);
+ os << "Dv" << static_cast<unsigned>(p.VectorSize) << '_';
+ }
+
+ os << getItaniumTypeName((AMDGPULibFunc::EType)p.ArgType);
+
+ exit:
+ if (Ptr.ArgType) Str.push_back(Ptr);
+ }
+};
+} // namespace
+
+std::string AMDGPUMangledLibFunc::mangleNameItanium() const {
+ SmallString<128> Buf;
+ raw_svector_ostream S(Buf);
+ SmallString<128> NameBuf;
+ raw_svector_ostream Name(NameBuf);
+ writeName(Name);
+ const StringRef& NameStr = Name.str();
+ S << "_Z" << static_cast<int>(NameStr.size()) << NameStr;
+
+ ItaniumMangler Mangler(true);
+ ParamIterator I(Leads, manglingRules[FuncId]);
+ Param P;
+ while ((P = I.getNextParam()).ArgType != 0)
+ Mangler(S, P);
+ return S.str();
+}
+
+///////////////////////////////////////////////////////////////////////////////
+// Misc
+
+static Type* getIntrinsicParamType(
+ LLVMContext& C,
+ const AMDGPULibFunc::Param& P,
+ bool useAddrSpace) {
+ Type* T = nullptr;
+ switch (P.ArgType) {
+ case AMDGPULibFunc::U8:
+ case AMDGPULibFunc::I8: T = Type::getInt8Ty(C); break;
+ case AMDGPULibFunc::U16:
+ case AMDGPULibFunc::I16: T = Type::getInt16Ty(C); break;
+ case AMDGPULibFunc::U32:
+ case AMDGPULibFunc::I32: T = Type::getInt32Ty(C); break;
+ case AMDGPULibFunc::U64:
+ case AMDGPULibFunc::I64: T = Type::getInt64Ty(C); break;
+ case AMDGPULibFunc::F16: T = Type::getHalfTy(C); break;
+ case AMDGPULibFunc::F32: T = Type::getFloatTy(C); break;
+ case AMDGPULibFunc::F64: T = Type::getDoubleTy(C); break;
+
+ case AMDGPULibFunc::IMG1DA:
+ case AMDGPULibFunc::IMG1DB:
+ case AMDGPULibFunc::IMG2DA:
+ case AMDGPULibFunc::IMG1D:
+ case AMDGPULibFunc::IMG2D:
+ case AMDGPULibFunc::IMG3D:
+ T = StructType::create(C,"ocl_image")->getPointerTo(); break;
+ case AMDGPULibFunc::SAMPLER:
+ T = StructType::create(C,"ocl_sampler")->getPointerTo(); break;
+ case AMDGPULibFunc::EVENT:
+ T = StructType::create(C,"ocl_event")->getPointerTo(); break;
+ default:
+    llvm_unreachable("Unhandled param type");
+ return nullptr;
+ }
+ if (P.VectorSize > 1)
+ T = VectorType::get(T, P.VectorSize);
+ if (P.PtrKind != AMDGPULibFunc::BYVALUE)
+ T = useAddrSpace ? T->getPointerTo((P.PtrKind & AMDGPULibFunc::ADDR_SPACE)
+ - 1)
+ : T->getPointerTo();
+ return T;
+}
+
+FunctionType *AMDGPUMangledLibFunc::getFunctionType(Module &M) const {
+ LLVMContext& C = M.getContext();
+ std::vector<Type*> Args;
+ ParamIterator I(Leads, manglingRules[FuncId]);
+ Param P;
+ while ((P=I.getNextParam()).ArgType != 0)
+ Args.push_back(getIntrinsicParamType(C, P, true));
+
+ return FunctionType::get(
+ getIntrinsicParamType(C, getRetType(FuncId, Leads), true),
+ Args, false);
+}
+
+unsigned AMDGPUMangledLibFunc::getNumArgs() const {
+ return manglingRules[FuncId].getNumArgs();
+}
+
+unsigned AMDGPUUnmangledLibFunc::getNumArgs() const {
+ return UnmangledFuncInfo::getNumArgs(FuncId);
+}
+
+std::string AMDGPUMangledLibFunc::getName() const {
+ SmallString<128> Buf;
+ raw_svector_ostream OS(Buf);
+ writeName(OS);
+ return OS.str();
+}
+
+Function *AMDGPULibFunc::getFunction(Module *M, const AMDGPULibFunc &fInfo) {
+ std::string FuncName = fInfo.mangle();
+ Function *F = dyn_cast_or_null<Function>(
+ M->getValueSymbolTable().lookup(FuncName));
+
+  // Check that the formal and actual argument types conform.
+ if (F && !F->isDeclaration()
+ && !F->isVarArg()
+ && F->arg_size() == fInfo.getNumArgs()) {
+ return F;
+ }
+ return nullptr;
+}
+
+Function *AMDGPULibFunc::getOrInsertFunction(Module *M,
+ const AMDGPULibFunc &fInfo) {
+ std::string const FuncName = fInfo.mangle();
+ Function *F = dyn_cast_or_null<Function>(
+ M->getValueSymbolTable().lookup(FuncName));
+
+  // Check that the formal and actual argument types conform.
+ if (F && !F->isDeclaration()
+ && !F->isVarArg()
+ && F->arg_size() == fInfo.getNumArgs()) {
+ return F;
+ }
+
+ FunctionType *FuncTy = fInfo.getFunctionType(*M);
+
+ bool hasPtr = false;
+ for (FunctionType::param_iterator
+ PI = FuncTy->param_begin(),
+ PE = FuncTy->param_end();
+ PI != PE; ++PI) {
+ const Type* argTy = static_cast<const Type*>(*PI);
+ if (argTy->isPointerTy()) {
+ hasPtr = true;
+ break;
+ }
+ }
+
+ Constant *C = nullptr;
+ if (hasPtr) {
+ // Do not set extra attributes for functions with pointer arguments.
+ C = M->getOrInsertFunction(FuncName, FuncTy);
+ } else {
+ AttributeList Attr;
+ LLVMContext &Ctx = M->getContext();
+ Attr.addAttribute(Ctx, AttributeList::FunctionIndex, Attribute::ReadOnly);
+ Attr.addAttribute(Ctx, AttributeList::FunctionIndex, Attribute::NoUnwind);
+ C = M->getOrInsertFunction(FuncName, FuncTy, Attr);
+ }
+
+ return cast<Function>(C);
+}
+
+bool UnmangledFuncInfo::lookup(StringRef Name, ID &Id) {
+ auto Loc = Map.find(Name);
+ if (Loc != Map.end()) {
+ Id = toFuncId(Loc->second);
+ return true;
+ }
+ Id = AMDGPULibFunc::EI_NONE;
+ return false;
+}
+
+AMDGPULibFunc::AMDGPULibFunc(const AMDGPULibFunc &F) {
+ if (auto *MF = dyn_cast<AMDGPUMangledLibFunc>(F.Impl.get()))
+ Impl.reset(new AMDGPUMangledLibFunc(*MF));
+ else if (auto *UMF = dyn_cast<AMDGPUUnmangledLibFunc>(F.Impl.get()))
+ Impl.reset(new AMDGPUUnmangledLibFunc(*UMF));
+ else
+ Impl = std::unique_ptr<AMDGPULibFuncImpl>();
+}
+
+AMDGPULibFunc &AMDGPULibFunc::operator=(const AMDGPULibFunc &F) {
+ if (this == &F)
+ return *this;
+ new (this) AMDGPULibFunc(F);
+ return *this;
+}
+
+AMDGPULibFunc::AMDGPULibFunc(EFuncId Id, const AMDGPULibFunc &CopyFrom) {
+ assert(AMDGPULibFuncBase::isMangled(Id) && CopyFrom.isMangled() &&
+ "not supported");
+ Impl.reset(new AMDGPUMangledLibFunc(
+ Id, *cast<AMDGPUMangledLibFunc>(CopyFrom.Impl.get())));
+}
+
+AMDGPULibFunc::AMDGPULibFunc(StringRef Name, FunctionType *FT) {
+ Impl.reset(new AMDGPUUnmangledLibFunc(Name, FT));
+}
+
+void AMDGPULibFunc::initMangled() { Impl.reset(new AMDGPUMangledLibFunc()); }
+
+AMDGPULibFunc::Param *AMDGPULibFunc::getLeads() {
+ if (!Impl)
+ initMangled();
+ return cast<AMDGPUMangledLibFunc>(Impl.get())->Leads;
+}
+
+const AMDGPULibFunc::Param *AMDGPULibFunc::getLeads() const {
+ return cast<const AMDGPUMangledLibFunc>(Impl.get())->Leads;
+}
diff --git a/lib/Target/AMDGPU/AMDGPULibFunc.h b/lib/Target/AMDGPU/AMDGPULibFunc.h
new file mode 100644
index 000000000000..5405bc645714
--- /dev/null
+++ b/lib/Target/AMDGPU/AMDGPULibFunc.h
@@ -0,0 +1,459 @@
+//===-- AMDGPULibFunc.h ---------------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPULIBFUNC_H
+#define LLVM_LIB_TARGET_AMDGPU_AMDGPULIBFUNC_H
+
+#include "llvm/ADT/StringRef.h"
+
+namespace llvm {
+
+class FunctionType;
+class Function;
+class Module;
+
+class AMDGPULibFuncBase {
+public:
+ enum EFuncId {
+ EI_NONE,
+
+ // IMPORTANT: the enumerators below must be kept consecutive and in
+ // ascending order because they are used as indexes into the mangling
+ // rules table; do not use explicit value assignment.
+ //
+ // There are two types of library functions: those with mangled
+ // name and those with unmangled name. The enums for the library
+ // functions with mangled name are defined before enums for the
+ // library functions with unmangled name. The enum for the last
+ // library function with mangled name is EI_LAST_MANGLED.
+ //
+ // Library functions with mangled name.
+ EI_ABS,
+ EI_ABS_DIFF,
+ EI_ACOS,
+ EI_ACOSH,
+ EI_ACOSPI,
+ EI_ADD_SAT,
+ EI_ALL,
+ EI_ANY,
+ EI_ASIN,
+ EI_ASINH,
+ EI_ASINPI,
+ EI_ASYNC_WORK_GROUP_COPY,
+ EI_ASYNC_WORK_GROUP_STRIDED_COPY,
+ EI_ATAN,
+ EI_ATAN2,
+ EI_ATAN2PI,
+ EI_ATANH,
+ EI_ATANPI,
+ EI_ATOMIC_ADD,
+ EI_ATOMIC_AND,
+ EI_ATOMIC_CMPXCHG,
+ EI_ATOMIC_DEC,
+ EI_ATOMIC_INC,
+ EI_ATOMIC_MAX,
+ EI_ATOMIC_MIN,
+ EI_ATOMIC_OR,
+ EI_ATOMIC_SUB,
+ EI_ATOMIC_XCHG,
+ EI_ATOMIC_XOR,
+ EI_BITSELECT,
+ EI_CBRT,
+ EI_CEIL,
+ EI_CLAMP,
+ EI_CLZ,
+ EI_COMMIT_READ_PIPE,
+ EI_COMMIT_WRITE_PIPE,
+ EI_COPYSIGN,
+ EI_COS,
+ EI_COSH,
+ EI_COSPI,
+ EI_CROSS,
+ EI_CTZ,
+ EI_DEGREES,
+ EI_DISTANCE,
+ EI_DIVIDE,
+ EI_DOT,
+ EI_ERF,
+ EI_ERFC,
+ EI_EXP,
+ EI_EXP10,
+ EI_EXP2,
+ EI_EXPM1,
+ EI_FABS,
+ EI_FAST_DISTANCE,
+ EI_FAST_LENGTH,
+ EI_FAST_NORMALIZE,
+ EI_FDIM,
+ EI_FLOOR,
+ EI_FMA,
+ EI_FMAX,
+ EI_FMIN,
+ EI_FMOD,
+ EI_FRACT,
+ EI_FREXP,
+ EI_GET_IMAGE_ARRAY_SIZE,
+ EI_GET_IMAGE_CHANNEL_DATA_TYPE,
+ EI_GET_IMAGE_CHANNEL_ORDER,
+ EI_GET_IMAGE_DIM,
+ EI_GET_IMAGE_HEIGHT,
+ EI_GET_IMAGE_WIDTH,
+ EI_GET_PIPE_MAX_PACKETS,
+ EI_GET_PIPE_NUM_PACKETS,
+ EI_HADD,
+ EI_HYPOT,
+ EI_ILOGB,
+ EI_ISEQUAL,
+ EI_ISFINITE,
+ EI_ISGREATER,
+ EI_ISGREATEREQUAL,
+ EI_ISINF,
+ EI_ISLESS,
+ EI_ISLESSEQUAL,
+ EI_ISLESSGREATER,
+ EI_ISNAN,
+ EI_ISNORMAL,
+ EI_ISNOTEQUAL,
+ EI_ISORDERED,
+ EI_ISUNORDERED,
+ EI_LDEXP,
+ EI_LENGTH,
+ EI_LGAMMA,
+ EI_LGAMMA_R,
+ EI_LOG,
+ EI_LOG10,
+ EI_LOG1P,
+ EI_LOG2,
+ EI_LOGB,
+ EI_MAD,
+ EI_MAD24,
+ EI_MAD_HI,
+ EI_MAD_SAT,
+ EI_MAX,
+ EI_MAXMAG,
+ EI_MIN,
+ EI_MINMAG,
+ EI_MIX,
+ EI_MODF,
+ EI_MUL24,
+ EI_MUL_HI,
+ EI_NAN,
+ EI_NEXTAFTER,
+ EI_NORMALIZE,
+ EI_POPCOUNT,
+ EI_POW,
+ EI_POWN,
+ EI_POWR,
+ EI_PREFETCH,
+ EI_RADIANS,
+ EI_RECIP,
+ EI_REMAINDER,
+ EI_REMQUO,
+ EI_RESERVE_READ_PIPE,
+ EI_RESERVE_WRITE_PIPE,
+ EI_RHADD,
+ EI_RINT,
+ EI_ROOTN,
+ EI_ROTATE,
+ EI_ROUND,
+ EI_RSQRT,
+ EI_SELECT,
+ EI_SHUFFLE,
+ EI_SHUFFLE2,
+ EI_SIGN,
+ EI_SIGNBIT,
+ EI_SIN,
+ EI_SINCOS,
+ EI_SINH,
+ EI_SINPI,
+ EI_SMOOTHSTEP,
+ EI_SQRT,
+ EI_STEP,
+ EI_SUB_GROUP_BROADCAST,
+ EI_SUB_GROUP_COMMIT_READ_PIPE,
+ EI_SUB_GROUP_COMMIT_WRITE_PIPE,
+ EI_SUB_GROUP_REDUCE_ADD,
+ EI_SUB_GROUP_REDUCE_MAX,
+ EI_SUB_GROUP_REDUCE_MIN,
+ EI_SUB_GROUP_RESERVE_READ_PIPE,
+ EI_SUB_GROUP_RESERVE_WRITE_PIPE,
+ EI_SUB_GROUP_SCAN_EXCLUSIVE_ADD,
+ EI_SUB_GROUP_SCAN_EXCLUSIVE_MAX,
+ EI_SUB_GROUP_SCAN_EXCLUSIVE_MIN,
+ EI_SUB_GROUP_SCAN_INCLUSIVE_ADD,
+ EI_SUB_GROUP_SCAN_INCLUSIVE_MAX,
+ EI_SUB_GROUP_SCAN_INCLUSIVE_MIN,
+ EI_SUB_SAT,
+ EI_TAN,
+ EI_TANH,
+ EI_TANPI,
+ EI_TGAMMA,
+ EI_TRUNC,
+ EI_UPSAMPLE,
+ EI_VEC_STEP,
+ EI_VSTORE,
+ EI_VSTORE16,
+ EI_VSTORE2,
+ EI_VSTORE3,
+ EI_VSTORE4,
+ EI_VSTORE8,
+ EI_WORK_GROUP_COMMIT_READ_PIPE,
+ EI_WORK_GROUP_COMMIT_WRITE_PIPE,
+ EI_WORK_GROUP_REDUCE_ADD,
+ EI_WORK_GROUP_REDUCE_MAX,
+ EI_WORK_GROUP_REDUCE_MIN,
+ EI_WORK_GROUP_RESERVE_READ_PIPE,
+ EI_WORK_GROUP_RESERVE_WRITE_PIPE,
+ EI_WORK_GROUP_SCAN_EXCLUSIVE_ADD,
+ EI_WORK_GROUP_SCAN_EXCLUSIVE_MAX,
+ EI_WORK_GROUP_SCAN_EXCLUSIVE_MIN,
+ EI_WORK_GROUP_SCAN_INCLUSIVE_ADD,
+ EI_WORK_GROUP_SCAN_INCLUSIVE_MAX,
+ EI_WORK_GROUP_SCAN_INCLUSIVE_MIN,
+ EI_WRITE_IMAGEF,
+ EI_WRITE_IMAGEI,
+ EI_WRITE_IMAGEUI,
+ EI_NCOS,
+ EI_NEXP2,
+ EI_NFMA,
+ EI_NLOG2,
+ EI_NRCP,
+ EI_NRSQRT,
+ EI_NSIN,
+ EI_NSQRT,
+ EI_FTZ,
+ EI_FLDEXP,
+ EI_CLASS,
+ EI_RCBRT,
+ EI_LAST_MANGLED =
+ EI_RCBRT, /* The last library function with mangled name */
+
+ // Library functions with unmangled name.
+ EI_READ_PIPE_2,
+ EI_READ_PIPE_4,
+ EI_WRITE_PIPE_2,
+ EI_WRITE_PIPE_4,
+
+ EX_INTRINSICS_COUNT
+ };
+
+ enum ENamePrefix {
+ NOPFX,
+ NATIVE,
+ HALF
+ };
+
+ enum EType {
+ B8 = 1,
+ B16 = 2,
+ B32 = 3,
+ B64 = 4,
+ SIZE_MASK = 7,
+ FLOAT = 0x10,
+ INT = 0x20,
+ UINT = 0x30,
+ BASE_TYPE_MASK = 0x30,
+ U8 = UINT | B8,
+ U16 = UINT | B16,
+ U32 = UINT | B32,
+ U64 = UINT | B64,
+ I8 = INT | B8,
+ I16 = INT | B16,
+ I32 = INT | B32,
+ I64 = INT | B64,
+ F16 = FLOAT | B16,
+ F32 = FLOAT | B32,
+ F64 = FLOAT | B64,
+ IMG1DA = 0x80,
+ IMG1DB,
+ IMG2DA,
+ IMG1D,
+ IMG2D,
+ IMG3D,
+ SAMPLER,
+ EVENT,
+ DUMMY
+ };
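+
+ // For example, F32 == FLOAT | B32 == 0x13: the low SIZE_MASK bits give the
+ // element width and the BASE_TYPE_MASK bits give the base kind, while the
+ // image/sampler/event values starting at 0x80 stand on their own.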
+
+ enum EPtrKind {
+ BYVALUE = 0,
+ ADDR_SPACE = 0xF, // Address space takes value 0x1 ~ 0xF.
+ CONST = 0x10,
+ VOLATILE = 0x20
+ };
+
+ struct Param {
+ unsigned char ArgType;
+ unsigned char VectorSize;
+ unsigned char PtrKind;
+
+ unsigned char Reserved;
+
+ void reset() {
+ ArgType = 0;
+ VectorSize = 1;
+ PtrKind = 0;
+ }
+ Param() { reset(); }
+
+ template <typename Stream>
+ void mangleItanium(Stream& os);
+ };
+ static bool isMangled(EFuncId Id) {
+ return static_cast<unsigned>(Id) <= static_cast<unsigned>(EI_LAST_MANGLED);
+ }
+
+ static unsigned getEPtrKindFromAddrSpace(unsigned AS) {
+ assert(((AS + 1) & ~ADDR_SPACE) == 0);
+ return AS + 1;
+ }
+
+ static unsigned getAddrSpaceFromEPtrKind(unsigned Kind) {
+ Kind = Kind & ADDR_SPACE;
+ assert(Kind >= 1);
+ return Kind - 1;
+ }
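+
+ // The two helpers above are inverses: address space N is stored as pointer
+ // kind N + 1 so that the value 0 is left free for BYVALUE. For example,
+ // getEPtrKindFromAddrSpace(1) == 2 and getAddrSpaceFromEPtrKind(CONST | 2) == 1.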
+};
+
+class AMDGPULibFuncImpl : public AMDGPULibFuncBase {
+public:
+ AMDGPULibFuncImpl() {}
+ virtual ~AMDGPULibFuncImpl() {}
+
+ /// Get the unmangled name for a mangled library function, or the name of
+ /// an unmangled library function.
+ virtual std::string getName() const = 0;
+ virtual unsigned getNumArgs() const = 0;
+ EFuncId getId() const { return FuncId; }
+ ENamePrefix getPrefix() const { return FKind; }
+
+ bool isMangled() const { return AMDGPULibFuncBase::isMangled(FuncId); }
+
+ void setId(EFuncId id) { FuncId = id; }
+ virtual bool parseFuncName(StringRef &mangledName) = 0;
+
+ /// \return The mangled function name for mangled library functions
+ /// and unmangled function name for unmangled library functions.
+ virtual std::string mangle() const = 0;
+
+ void setName(StringRef N) { Name = N; }
+ void setPrefix(ENamePrefix pfx) { FKind = pfx; }
+
+ virtual FunctionType *getFunctionType(Module &M) const = 0;
+
+protected:
+ EFuncId FuncId;
+ std::string Name;
+ ENamePrefix FKind;
+};
+
+/// Wrapper class for AMDGPULibFuncImpl.
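+///
+/// A typical use, as an illustrative sketch (the module \c M, call \c CI and
+/// name \c CalleeName below are hypothetical):
+/// \code
+///   AMDGPULibFunc F;
+///   if (AMDGPULibFunc::parse(CalleeName, F))
+///     if (Function *Callee = AMDGPULibFunc::getOrInsertFunction(&M, F))
+///       CI->setCalledFunction(Callee);
+/// \endcode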
+class AMDGPULibFunc : public AMDGPULibFuncBase {
+public:
+ explicit AMDGPULibFunc() : Impl(std::unique_ptr<AMDGPULibFuncImpl>()) {}
+ AMDGPULibFunc(const AMDGPULibFunc &F);
+ /// Clone a mangled library func with the Id \p Id and argument info from \p
+ /// CopyFrom.
+ explicit AMDGPULibFunc(EFuncId Id, const AMDGPULibFunc &CopyFrom);
+ /// Construct an unmangled library function on the fly.
+ explicit AMDGPULibFunc(StringRef FName, FunctionType *FT);
+
+ AMDGPULibFunc &operator=(const AMDGPULibFunc &F);
+
+ /// Get the unmangled name for a mangled library function, or the name of
+ /// an unmangled library function.
+ std::string getName() const { return Impl->getName(); }
+ unsigned getNumArgs() const { return Impl->getNumArgs(); }
+ EFuncId getId() const { return Impl->getId(); }
+ ENamePrefix getPrefix() const { return Impl->getPrefix(); }
+ /// Get leading parameters for mangled lib functions.
+ Param *getLeads();
+ const Param *getLeads() const;
+
+ bool isMangled() const { return Impl->isMangled(); }
+ void setId(EFuncId Id) { Impl->setId(Id); }
+ bool parseFuncName(StringRef &MangledName) {
+ return Impl->parseFuncName(MangledName);
+ }
+
+ /// \return The mangled function name for mangled library functions
+ /// and unmangled function name for unmangled library functions.
+ std::string mangle() const { return Impl->mangle(); }
+
+ void setName(StringRef N) { Impl->setName(N); }
+ void setPrefix(ENamePrefix PFX) { Impl->setPrefix(PFX); }
+
+ FunctionType *getFunctionType(Module &M) const {
+ return Impl->getFunctionType(M);
+ }
+ static Function *getFunction(llvm::Module *M, const AMDGPULibFunc &fInfo);
+
+ static Function *getOrInsertFunction(llvm::Module *M,
+ const AMDGPULibFunc &fInfo);
+ static bool parse(StringRef MangledName, AMDGPULibFunc &Ptr);
+
+private:
+ /// Initialize as a mangled library function.
+ void initMangled();
+ std::unique_ptr<AMDGPULibFuncImpl> Impl;
+};
+
+class AMDGPUMangledLibFunc : public AMDGPULibFuncImpl {
+public:
+ Param Leads[2];
+
+ explicit AMDGPUMangledLibFunc();
+ explicit AMDGPUMangledLibFunc(EFuncId id,
+ const AMDGPUMangledLibFunc &copyFrom);
+
+ std::string getName() const override;
+ unsigned getNumArgs() const override;
+ FunctionType *getFunctionType(Module &M) const override;
+ static StringRef getUnmangledName(StringRef MangledName);
+
+ bool parseFuncName(StringRef &mangledName) override;
+
+ // Methods for support type inquiry through isa, cast, and dyn_cast:
+ static bool classof(const AMDGPULibFuncImpl *F) { return F->isMangled(); }
+
+ std::string mangle() const override;
+
+private:
+ std::string mangleNameItanium() const;
+
+ std::string mangleName(StringRef Name) const;
+ bool parseUnmangledName(StringRef MangledName);
+
+ template <typename Stream> void writeName(Stream &OS) const;
+};
+
+class AMDGPUUnmangledLibFunc : public AMDGPULibFuncImpl {
+ FunctionType *FuncTy;
+
+public:
+ explicit AMDGPUUnmangledLibFunc();
+ explicit AMDGPUUnmangledLibFunc(StringRef FName, FunctionType *FT) {
+ Name = FName;
+ FuncTy = FT;
+ }
+ std::string getName() const override { return Name; }
+ unsigned getNumArgs() const override;
+ FunctionType *getFunctionType(Module &M) const override { return FuncTy; }
+
+ bool parseFuncName(StringRef &Name) override;
+
+ // Methods for support type inquiry through isa, cast, and dyn_cast:
+ static bool classof(const AMDGPULibFuncImpl *F) { return !F->isMangled(); }
+
+ std::string mangle() const override { return Name; }
+
+ void setFunctionType(FunctionType *FT) { FuncTy = FT; }
+};
+}
+#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPULIBFUNC_H
diff --git a/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp b/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
index 63dd0d726d91..23fd8113932c 100644
--- a/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
+++ b/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
@@ -121,21 +121,39 @@ bool AMDGPUMCInstLower::lowerOperand(const MachineOperand &MO,
MCOp = MCOperand::createExpr(Expr);
return true;
}
+ case MachineOperand::MO_RegisterMask:
+ // Regmasks are like implicit defs.
+ return false;
}
}
void AMDGPUMCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const {
unsigned Opcode = MI->getOpcode();
+ const auto *TII = ST.getInstrInfo();
// FIXME: Should be able to handle this with emitPseudoExpansionLowering. We
// need to select it to the subtarget specific version, and there's no way to
// do that with a single pseudo source operation.
if (Opcode == AMDGPU::S_SETPC_B64_return)
Opcode = AMDGPU::S_SETPC_B64;
+ else if (Opcode == AMDGPU::SI_CALL) {
+ // SI_CALL is just S_SWAPPC_B64 with an additional operand to track the
+ // called function (which we need to remove here).
+ OutMI.setOpcode(TII->pseudoToMCOpcode(AMDGPU::S_SWAPPC_B64));
+ MCOperand Dest, Src;
+ lowerOperand(MI->getOperand(0), Dest);
+ lowerOperand(MI->getOperand(1), Src);
+ OutMI.addOperand(Dest);
+ OutMI.addOperand(Src);
+ return;
+ } else if (Opcode == AMDGPU::SI_TCRETURN) {
+ // TODO: How to use branch immediate and avoid register+add?
+ Opcode = AMDGPU::S_SETPC_B64;
+ }
- int MCOpcode = ST.getInstrInfo()->pseudoToMCOpcode(Opcode);
+ int MCOpcode = TII->pseudoToMCOpcode(Opcode);
if (MCOpcode == -1) {
- LLVMContext &C = MI->getParent()->getParent()->getFunction()->getContext();
+ LLVMContext &C = MI->getParent()->getParent()->getFunction().getContext();
C.emitError("AMDGPUMCInstLower::lower - Pseudo instruction doesn't have "
"a target-specific version: " + Twine(MI->getOpcode()));
}
@@ -187,7 +205,7 @@ void AMDGPUAsmPrinter::EmitInstruction(const MachineInstr *MI) {
StringRef Err;
if (!STI.getInstrInfo()->verifyInstruction(*MI, Err)) {
- LLVMContext &C = MI->getParent()->getParent()->getFunction()->getContext();
+ LLVMContext &C = MI->getParent()->getParent()->getFunction().getContext();
C.emitError("Illegal instruction detected: " + Err);
MI->print(errs());
}
@@ -212,7 +230,7 @@ void AMDGPUAsmPrinter::EmitInstruction(const MachineInstr *MI) {
const MCSymbolRefExpr *Expr
= MCSymbolRefExpr::create(MBB->getSymbol(), OutContext);
Expr->print(Str, MAI);
- OutStreamer->emitRawComment(" mask branch " + BBStr);
+ OutStreamer->emitRawComment(Twine(" mask branch ") + BBStr);
}
return;
diff --git a/lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp b/lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp
index 9a391d06c9ea..20918233e447 100644
--- a/lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp
+++ b/lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp
@@ -14,46 +14,55 @@
#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
-#include "llvm/Analysis/CFG.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegionInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetOpcodes.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/IR/DebugLoc.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
-#include "llvm/Target/TargetInstrInfo.h"
-#include "llvm/Target/TargetLowering.h"
-#include "llvm/Target/TargetSubtargetInfo.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include <cassert>
#include <tuple>
+#include <utility>
+
using namespace llvm;
#define DEBUG_TYPE "amdgpucfgstructurizer"
namespace {
+
class PHILinearizeDestIterator;
class PHILinearize {
friend class PHILinearizeDestIterator;
public:
- typedef std::pair<unsigned, MachineBasicBlock *> PHISourceT;
+ using PHISourceT = std::pair<unsigned, MachineBasicBlock *>;
private:
- typedef DenseSet<PHISourceT> PHISourcesT;
- typedef struct {
+ using PHISourcesT = DenseSet<PHISourceT>;
+ using PHIInfoElementT = struct {
unsigned DestReg;
DebugLoc DL;
PHISourcesT Sources;
- } PHIInfoElementT;
- typedef SmallPtrSet<PHIInfoElementT *, 2> PHIInfoT;
+ };
+ using PHIInfoT = SmallPtrSet<PHIInfoElementT *, 2>;
PHIInfoT PHIInfo;
static unsigned phiInfoElementGetDest(PHIInfoElementT *Info);
@@ -85,8 +94,8 @@ public:
void dump(MachineRegisterInfo *MRI);
void clear();
- typedef PHISourcesT::iterator source_iterator;
- typedef PHILinearizeDestIterator dest_iterator;
+ using source_iterator = PHISourcesT::iterator;
+ using dest_iterator = PHILinearizeDestIterator;
dest_iterator dests_begin();
dest_iterator dests_end();
@@ -100,6 +109,8 @@ private:
PHILinearize::PHIInfoT::iterator Iter;
public:
+ PHILinearizeDestIterator(PHILinearize::PHIInfoT::iterator I) : Iter(I) {}
+
unsigned operator*() { return PHILinearize::phiInfoElementGetDest(*Iter); }
PHILinearizeDestIterator &operator++() {
++Iter;
@@ -111,10 +122,10 @@ public:
bool operator!=(const PHILinearizeDestIterator &I) const {
return I.Iter != Iter;
}
-
- PHILinearizeDestIterator(PHILinearize::PHIInfoT::iterator I) : Iter(I) {}
};
+} // end anonymous namespace
+
unsigned PHILinearize::phiInfoElementGetDest(PHIInfoElementT *Info) {
return Info->DestReg;
}
@@ -250,21 +261,23 @@ unsigned PHILinearize::getNumSources(unsigned DestReg) {
return phiInfoElementGetSources(findPHIInfoElement(DestReg)).size();
}
-void PHILinearize::dump(MachineRegisterInfo *MRI) {
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD void PHILinearize::dump(MachineRegisterInfo *MRI) {
const TargetRegisterInfo *TRI = MRI->getTargetRegisterInfo();
dbgs() << "=PHIInfo Start=\n";
for (auto PII : this->PHIInfo) {
PHIInfoElementT &Element = *PII;
- dbgs() << "Dest: " << PrintReg(Element.DestReg, TRI)
+ dbgs() << "Dest: " << printReg(Element.DestReg, TRI)
<< " Sources: {";
for (auto &SI : Element.Sources) {
- dbgs() << PrintReg(SI.first, TRI) << "(BB#"
- << SI.second->getNumber() << "),";
+ dbgs() << printReg(SI.first, TRI) << '(' << printMBBReference(*SI.second)
+ << "),";
}
dbgs() << "}\n";
}
dbgs() << "=PHIInfo End=\n";
}
+#endif
void PHILinearize::clear() { PHIInfo = PHIInfoT(); }
@@ -280,14 +293,12 @@ PHILinearize::source_iterator PHILinearize::sources_begin(unsigned Reg) {
auto InfoElement = findPHIInfoElement(Reg);
return phiInfoElementGetSources(InfoElement).begin();
}
+
PHILinearize::source_iterator PHILinearize::sources_end(unsigned Reg) {
auto InfoElement = findPHIInfoElement(Reg);
return phiInfoElementGetSources(InfoElement).end();
}
-class RegionMRT;
-class MBBMRT;
-
static unsigned getPHINumInputs(MachineInstr &PHI) {
assert(PHI.isPHI());
return (PHI.getNumOperands() - 1) / 2;
@@ -313,6 +324,11 @@ static unsigned getPHIDestReg(MachineInstr &PHI) {
return PHI.getOperand(0).getReg();
}
+namespace {
+
+class RegionMRT;
+class MBBMRT;
+
class LinearizedRegion {
protected:
MachineBasicBlock *Entry;
@@ -347,6 +363,11 @@ protected:
RegionMRT *TopRegion = nullptr);
public:
+ LinearizedRegion();
+ LinearizedRegion(MachineBasicBlock *MBB, const MachineRegisterInfo *MRI,
+ const TargetRegisterInfo *TRI, PHILinearize &PHIInfo);
+ ~LinearizedRegion() = default;
+
void setRegionMRT(RegionMRT *Region) { RMRT = Region; }
RegionMRT *getRegionMRT() { return RMRT; }
@@ -411,13 +432,6 @@ public:
void initLiveOut(RegionMRT *Region, const MachineRegisterInfo *MRI,
const TargetRegisterInfo *TRI, PHILinearize &PHIInfo);
-
- LinearizedRegion(MachineBasicBlock *MBB, const MachineRegisterInfo *MRI,
- const TargetRegisterInfo *TRI, PHILinearize &PHIInfo);
-
- LinearizedRegion();
-
- ~LinearizedRegion();
};
class MRT {
@@ -427,6 +441,8 @@ protected:
unsigned BBSelectRegOut;
public:
+ virtual ~MRT() = default;
+
unsigned getBBSelectRegIn() { return BBSelectRegIn; }
unsigned getBBSelectRegOut() { return BBSelectRegOut; }
@@ -465,42 +481,55 @@ public:
dbgs() << " ";
}
}
-
- virtual ~MRT() {}
};
class MBBMRT : public MRT {
MachineBasicBlock *MBB;
public:
- virtual MBBMRT *getMBBMRT() { return this; }
+ MBBMRT(MachineBasicBlock *BB) : MBB(BB) {
+ setParent(nullptr);
+ setBBSelectRegOut(0);
+ setBBSelectRegIn(0);
+ }
+
+ MBBMRT *getMBBMRT() override { return this; }
MachineBasicBlock *getMBB() { return MBB; }
- virtual void dump(const TargetRegisterInfo *TRI, int depth = 0) {
+ void dump(const TargetRegisterInfo *TRI, int depth = 0) override {
dumpDepth(depth);
dbgs() << "MBB: " << getMBB()->getNumber();
- dbgs() << " In: " << PrintReg(getBBSelectRegIn(), TRI);
- dbgs() << ", Out: " << PrintReg(getBBSelectRegOut(), TRI) << "\n";
- }
-
- MBBMRT(MachineBasicBlock *BB) : MBB(BB) {
- setParent(nullptr);
- setBBSelectRegOut(0);
- setBBSelectRegIn(0);
+ dbgs() << " In: " << printReg(getBBSelectRegIn(), TRI);
+ dbgs() << ", Out: " << printReg(getBBSelectRegOut(), TRI) << "\n";
}
};
class RegionMRT : public MRT {
protected:
MachineRegion *Region;
- LinearizedRegion *LRegion;
- MachineBasicBlock *Succ;
-
+ LinearizedRegion *LRegion = nullptr;
+ MachineBasicBlock *Succ = nullptr;
SetVector<MRT *> Children;
public:
- virtual RegionMRT *getRegionMRT() { return this; }
+ RegionMRT(MachineRegion *MachineRegion) : Region(MachineRegion) {
+ setParent(nullptr);
+ setBBSelectRegOut(0);
+ setBBSelectRegIn(0);
+ }
+
+ ~RegionMRT() override {
+ if (LRegion) {
+ delete LRegion;
+ }
+
+ for (auto CI : Children) {
+ delete &(*CI);
+ }
+ }
+
+ RegionMRT *getRegionMRT() override { return this; }
void setLinearizedRegion(LinearizedRegion *LinearizeRegion) {
LRegion = LinearizeRegion;
@@ -518,11 +547,11 @@ public:
SetVector<MRT *> *getChildren() { return &Children; }
- virtual void dump(const TargetRegisterInfo *TRI, int depth = 0) {
+ void dump(const TargetRegisterInfo *TRI, int depth = 0) override {
dumpDepth(depth);
dbgs() << "Region: " << (void *)Region;
- dbgs() << " In: " << PrintReg(getBBSelectRegIn(), TRI);
- dbgs() << ", Out: " << PrintReg(getBBSelectRegOut(), TRI) << "\n";
+ dbgs() << " In: " << printReg(getBBSelectRegIn(), TRI);
+ dbgs() << ", Out: " << printReg(getBBSelectRegOut(), TRI) << "\n";
dumpDepth(depth);
if (getSucc())
@@ -581,25 +610,10 @@ public:
}
}
}
-
- RegionMRT(MachineRegion *MachineRegion)
- : Region(MachineRegion), LRegion(nullptr), Succ(nullptr) {
- setParent(nullptr);
- setBBSelectRegOut(0);
- setBBSelectRegIn(0);
- }
-
- virtual ~RegionMRT() {
- if (LRegion) {
- delete LRegion;
- }
-
- for (auto CI : Children) {
- delete &(*CI);
- }
- }
};
+} // end anonymous namespace
+
static unsigned createBBSelectReg(const SIInstrInfo *TII,
MachineRegisterInfo *MRI) {
return MRI->createVirtualRegister(TII->getPreferredSelectRegClass(32));
@@ -644,7 +658,7 @@ RegionMRT *MRT::buildMRT(MachineFunction &MF,
continue;
}
- DEBUG(dbgs() << "Visiting BB#" << MBB->getNumber() << "\n");
+ DEBUG(dbgs() << "Visiting " << printMBBReference(*MBB) << "\n");
MBBMRT *NewMBB = new MBBMRT(MBB);
MachineRegion *Region = RegionInfo->getRegionFor(MBB);
@@ -681,18 +695,18 @@ void LinearizedRegion::storeLiveOutReg(MachineBasicBlock *MBB, unsigned Reg,
const TargetRegisterInfo *TRI,
PHILinearize &PHIInfo) {
if (TRI->isVirtualRegister(Reg)) {
- DEBUG(dbgs() << "Considering Register: " << PrintReg(Reg, TRI) << "\n");
+ DEBUG(dbgs() << "Considering Register: " << printReg(Reg, TRI) << "\n");
// If this is a source register to a PHI we are chaining, it
// must be live out.
if (PHIInfo.isSource(Reg)) {
- DEBUG(dbgs() << "Add LiveOut (PHI): " << PrintReg(Reg, TRI) << "\n");
+ DEBUG(dbgs() << "Add LiveOut (PHI): " << printReg(Reg, TRI) << "\n");
addLiveOut(Reg);
} else {
// If this is live out of the MBB
for (auto &UI : MRI->use_operands(Reg)) {
if (UI.getParent()->getParent() != MBB) {
- DEBUG(dbgs() << "Add LiveOut (MBB BB#" << MBB->getNumber()
- << "): " << PrintReg(Reg, TRI) << "\n");
+ DEBUG(dbgs() << "Add LiveOut (MBB " << printMBBReference(*MBB)
+ << "): " << printReg(Reg, TRI) << "\n");
addLiveOut(Reg);
} else {
// If the use is in the same MBB we have to make sure
@@ -703,7 +717,7 @@ void LinearizedRegion::storeLiveOutReg(MachineBasicBlock *MBB, unsigned Reg,
MIE = UseInstr->getParent()->instr_end();
MII != MIE; ++MII) {
if ((&(*MII)) == DefInstr) {
- DEBUG(dbgs() << "Add LiveOut (Loop): " << PrintReg(Reg, TRI)
+ DEBUG(dbgs() << "Add LiveOut (Loop): " << printReg(Reg, TRI)
<< "\n");
addLiveOut(Reg);
}
@@ -720,11 +734,11 @@ void LinearizedRegion::storeLiveOutRegRegion(RegionMRT *Region, unsigned Reg,
const TargetRegisterInfo *TRI,
PHILinearize &PHIInfo) {
if (TRI->isVirtualRegister(Reg)) {
- DEBUG(dbgs() << "Considering Register: " << PrintReg(Reg, TRI) << "\n");
+ DEBUG(dbgs() << "Considering Register: " << printReg(Reg, TRI) << "\n");
for (auto &UI : MRI->use_operands(Reg)) {
if (!Region->contains(UI.getParent()->getParent())) {
DEBUG(dbgs() << "Add LiveOut (Region " << (void *)Region
- << "): " << PrintReg(Reg, TRI) << "\n");
+ << "): " << printReg(Reg, TRI) << "\n");
addLiveOut(Reg);
}
}
@@ -735,7 +749,8 @@ void LinearizedRegion::storeLiveOuts(MachineBasicBlock *MBB,
const MachineRegisterInfo *MRI,
const TargetRegisterInfo *TRI,
PHILinearize &PHIInfo) {
- DEBUG(dbgs() << "-Store Live Outs Begin (BB#" << MBB->getNumber() << ")-\n");
+ DEBUG(dbgs() << "-Store Live Outs Begin (" << printMBBReference(*MBB)
+ << ")-\n");
for (auto &II : *MBB) {
for (auto &RI : II.defs()) {
storeLiveOutReg(MBB, RI.getReg(), RI.getParent(), MRI, TRI, PHIInfo);
@@ -759,9 +774,9 @@ void LinearizedRegion::storeLiveOuts(MachineBasicBlock *MBB,
for (int i = 0; i < numPreds; ++i) {
if (getPHIPred(PHI, i) == MBB) {
unsigned PHIReg = getPHISourceReg(PHI, i);
- DEBUG(dbgs() << "Add LiveOut (PhiSource BB#" << MBB->getNumber()
- << " -> BB#" << (*SI)->getNumber()
- << "): " << PrintReg(PHIReg, TRI) << "\n");
+ DEBUG(dbgs() << "Add LiveOut (PhiSource " << printMBBReference(*MBB)
+ << " -> " << printMBBReference(*(*SI))
+ << "): " << printReg(PHIReg, TRI) << "\n");
addLiveOut(PHIReg);
}
}
@@ -830,7 +845,7 @@ void LinearizedRegion::storeLiveOuts(RegionMRT *Region,
if (Region->contains(getPHIPred(PHI, i))) {
unsigned PHIReg = getPHISourceReg(PHI, i);
DEBUG(dbgs() << "Add Region LiveOut (" << (void *)Region
- << "): " << PrintReg(PHIReg, TRI) << "\n");
+ << "): " << printReg(PHIReg, TRI) << "\n");
addLiveOut(PHIReg);
}
}
@@ -839,6 +854,7 @@ void LinearizedRegion::storeLiveOuts(RegionMRT *Region,
}
}
+#ifndef NDEBUG
void LinearizedRegion::print(raw_ostream &OS, const TargetRegisterInfo *TRI) {
OS << "Linearized Region {";
bool IsFirst = true;
@@ -852,13 +868,14 @@ void LinearizedRegion::print(raw_ostream &OS, const TargetRegisterInfo *TRI) {
}
OS << "} (" << Entry->getNumber() << ", "
<< (Exit == nullptr ? -1 : Exit->getNumber())
- << "): In:" << PrintReg(getBBSelectRegIn(), TRI)
- << " Out:" << PrintReg(getBBSelectRegOut(), TRI) << " {";
+ << "): In:" << printReg(getBBSelectRegIn(), TRI)
+ << " Out:" << printReg(getBBSelectRegOut(), TRI) << " {";
for (auto &LI : LiveOuts) {
- OS << PrintReg(LI, TRI) << " ";
+ OS << printReg(LI, TRI) << " ";
}
OS << "} \n";
}
+#endif
unsigned LinearizedRegion::getBBSelectRegIn() {
return getRegionMRT()->getBBSelectRegIn();
@@ -893,8 +910,8 @@ void LinearizedRegion::replaceRegister(unsigned Register, unsigned NewRegister,
assert(Register != NewRegister && "Cannot replace a reg with itself");
DEBUG(dbgs() << "Pepareing to replace register (region): "
- << PrintReg(Register, MRI->getTargetRegisterInfo()) << " with "
- << PrintReg(NewRegister, MRI->getTargetRegisterInfo()) << "\n");
+ << printReg(Register, MRI->getTargetRegisterInfo()) << " with "
+ << printReg(NewRegister, MRI->getTargetRegisterInfo()) << "\n");
// If we are replacing outside, we also need to update the LiveOuts
if (ReplaceOutside &&
@@ -930,14 +947,14 @@ void LinearizedRegion::replaceRegister(unsigned Register, unsigned NewRegister,
if (TargetRegisterInfo::isPhysicalRegister(NewRegister)) {
DEBUG(dbgs() << "Trying to substitute physical register: "
- << PrintReg(NewRegister, MRI->getTargetRegisterInfo())
+ << printReg(NewRegister, MRI->getTargetRegisterInfo())
<< "\n");
llvm_unreachable("Cannot substitute physical registers");
} else {
DEBUG(dbgs() << "Replacing register (region): "
- << PrintReg(Register, MRI->getTargetRegisterInfo())
+ << printReg(Register, MRI->getTargetRegisterInfo())
<< " with "
- << PrintReg(NewRegister, MRI->getTargetRegisterInfo())
+ << printReg(NewRegister, MRI->getTargetRegisterInfo())
<< "\n");
O.setReg(NewRegister);
}
@@ -1006,16 +1023,16 @@ void LinearizedRegion::removeFalseRegisterKills(MachineRegisterInfo *MRI) {
continue;
if (!MRI->hasOneDef(Reg)) {
DEBUG(this->getEntry()->getParent()->dump());
- DEBUG(dbgs() << PrintReg(Reg, TRI) << "\n");
+ DEBUG(dbgs() << printReg(Reg, TRI) << "\n");
}
if (MRI->def_begin(Reg) == MRI->def_end()) {
DEBUG(dbgs() << "Register "
- << PrintReg(Reg, MRI->getTargetRegisterInfo())
+ << printReg(Reg, MRI->getTargetRegisterInfo())
<< " has NO defs\n");
} else if (!MRI->hasOneDef(Reg)) {
DEBUG(dbgs() << "Register "
- << PrintReg(Reg, MRI->getTargetRegisterInfo())
+ << printReg(Reg, MRI->getTargetRegisterInfo())
<< " has multiple defs\n");
}
@@ -1025,7 +1042,7 @@ void LinearizedRegion::removeFalseRegisterKills(MachineRegisterInfo *MRI) {
bool UseIsOutsideDefMBB = Def->getParent()->getParent() != MBB;
if (UseIsOutsideDefMBB && UseOperand->isKill()) {
DEBUG(dbgs() << "Removing kill flag on register: "
- << PrintReg(Reg, TRI) << "\n");
+ << printReg(Reg, TRI) << "\n");
UseOperand->setIsKill(false);
}
}
@@ -1059,7 +1076,7 @@ LinearizedRegion::LinearizedRegion() {
Parent = nullptr;
}
-LinearizedRegion::~LinearizedRegion() {}
+namespace {
class AMDGPUMachineCFGStructurizer : public MachineFunctionPass {
private:
@@ -1070,6 +1087,7 @@ private:
unsigned BBSelectRegister;
PHILinearize PHIInfo;
DenseMap<MachineBasicBlock *, MachineBasicBlock *> FallthroughMap;
+ RegionMRT *RMRT;
void getPHIRegionIndices(RegionMRT *Region, MachineInstr &PHI,
SmallVector<unsigned, 2> &RegionIndices);
@@ -1193,15 +1211,15 @@ private:
public:
static char ID;
+ AMDGPUMachineCFGStructurizer() : MachineFunctionPass(ID) {
+ initializeAMDGPUMachineCFGStructurizerPass(*PassRegistry::getPassRegistry());
+ }
+
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<MachineRegionInfoPass>();
MachineFunctionPass::getAnalysisUsage(AU);
}
- AMDGPUMachineCFGStructurizer() : MachineFunctionPass(ID) {
- initializeAMDGPUMachineCFGStructurizerPass(*PassRegistry::getPassRegistry());
- }
-
void initFallthroughMap(MachineFunction &MF);
void createLinearizedRegion(RegionMRT *Region, unsigned SelectOut);
@@ -1210,14 +1228,14 @@ public:
MachineRegisterInfo *MRI,
const SIInstrInfo *TII);
- RegionMRT *RMRT;
void setRegionMRT(RegionMRT *RegionTree) { RMRT = RegionTree; }
RegionMRT *getRegionMRT() { return RMRT; }
bool runOnMachineFunction(MachineFunction &MF) override;
};
-}
+
+} // end anonymous namespace
char AMDGPUMachineCFGStructurizer::ID = 0;
@@ -1254,7 +1272,6 @@ void AMDGPUMachineCFGStructurizer::transformSimpleIfRegion(RegionMRT *Region) {
}
static void fixMBBTerminator(MachineBasicBlock *MBB) {
-
if (MBB->succ_size() == 1) {
auto *Succ = *(MBB->succ_begin());
for (auto &TI : MBB->terminators()) {
@@ -1433,8 +1450,7 @@ bool AMDGPUMachineCFGStructurizer::shrinkPHI(MachineInstr &PHI,
unsigned *ReplaceReg) {
DEBUG(dbgs() << "Shrink PHI: ");
DEBUG(PHI.dump());
- DEBUG(dbgs() << " to " << PrintReg(getPHIDestReg(PHI), TRI)
- << "<def> = PHI(");
+ DEBUG(dbgs() << " to " << printReg(getPHIDestReg(PHI), TRI) << " = PHI(");
bool Replaced = false;
unsigned NumInputs = getPHINumInputs(PHI);
@@ -1464,8 +1480,8 @@ bool AMDGPUMachineCFGStructurizer::shrinkPHI(MachineInstr &PHI,
if (SourceMBB) {
MIB.addReg(CombinedSourceReg);
MIB.addMBB(SourceMBB);
- DEBUG(dbgs() << PrintReg(CombinedSourceReg, TRI) << ", BB#"
- << SourceMBB->getNumber());
+ DEBUG(dbgs() << printReg(CombinedSourceReg, TRI) << ", "
+ << printMBBReference(*SourceMBB));
}
for (unsigned i = 0; i < NumInputs; ++i) {
@@ -1476,8 +1492,8 @@ bool AMDGPUMachineCFGStructurizer::shrinkPHI(MachineInstr &PHI,
MachineBasicBlock *SourcePred = getPHIPred(PHI, i);
MIB.addReg(SourceReg);
MIB.addMBB(SourcePred);
- DEBUG(dbgs() << PrintReg(SourceReg, TRI) << ", BB#"
- << SourcePred->getNumber());
+ DEBUG(dbgs() << printReg(SourceReg, TRI) << ", "
+ << printMBBReference(*SourcePred));
}
DEBUG(dbgs() << ")\n");
}
@@ -1490,8 +1506,7 @@ void AMDGPUMachineCFGStructurizer::replacePHI(
SmallVector<unsigned, 2> &PHIRegionIndices) {
DEBUG(dbgs() << "Replace PHI: ");
DEBUG(PHI.dump());
- DEBUG(dbgs() << " with " << PrintReg(getPHIDestReg(PHI), TRI)
- << "<def> = PHI(");
+ DEBUG(dbgs() << " with " << printReg(getPHIDestReg(PHI), TRI) << " = PHI(");
bool HasExternalEdge = false;
unsigned NumInputs = getPHINumInputs(PHI);
@@ -1508,8 +1523,8 @@ void AMDGPUMachineCFGStructurizer::replacePHI(
getPHIDestReg(PHI));
MIB.addReg(CombinedSourceReg);
MIB.addMBB(LastMerge);
- DEBUG(dbgs() << PrintReg(CombinedSourceReg, TRI) << ", BB#"
- << LastMerge->getNumber());
+ DEBUG(dbgs() << printReg(CombinedSourceReg, TRI) << ", "
+ << printMBBReference(*LastMerge));
for (unsigned i = 0; i < NumInputs; ++i) {
if (isPHIRegionIndex(PHIRegionIndices, i)) {
continue;
@@ -1518,8 +1533,8 @@ void AMDGPUMachineCFGStructurizer::replacePHI(
MachineBasicBlock *SourcePred = getPHIPred(PHI, i);
MIB.addReg(SourceReg);
MIB.addMBB(SourcePred);
- DEBUG(dbgs() << PrintReg(SourceReg, TRI) << ", BB#"
- << SourcePred->getNumber());
+ DEBUG(dbgs() << printReg(SourceReg, TRI) << ", "
+ << printMBBReference(*SourcePred));
}
DEBUG(dbgs() << ")\n");
} else {
@@ -1531,7 +1546,6 @@ void AMDGPUMachineCFGStructurizer::replacePHI(
void AMDGPUMachineCFGStructurizer::replaceEntryPHI(
MachineInstr &PHI, unsigned CombinedSourceReg, MachineBasicBlock *IfMBB,
SmallVector<unsigned, 2> &PHIRegionIndices) {
-
DEBUG(dbgs() << "Replace entry PHI: ");
DEBUG(PHI.dump());
DEBUG(dbgs() << " with ");
@@ -1547,18 +1561,18 @@ void AMDGPUMachineCFGStructurizer::replaceEntryPHI(
if (NumNonRegionInputs == 0) {
auto DestReg = getPHIDestReg(PHI);
replaceRegisterWith(DestReg, CombinedSourceReg);
- DEBUG(dbgs() << " register " << PrintReg(CombinedSourceReg, TRI) << "\n");
+ DEBUG(dbgs() << " register " << printReg(CombinedSourceReg, TRI) << "\n");
PHI.eraseFromParent();
} else {
- DEBUG(dbgs() << PrintReg(getPHIDestReg(PHI), TRI) << "<def> = PHI(");
+ DEBUG(dbgs() << printReg(getPHIDestReg(PHI), TRI) << " = PHI(");
MachineBasicBlock *MBB = PHI.getParent();
MachineInstrBuilder MIB =
BuildMI(*MBB, PHI, PHI.getDebugLoc(), TII->get(TargetOpcode::PHI),
getPHIDestReg(PHI));
MIB.addReg(CombinedSourceReg);
MIB.addMBB(IfMBB);
- DEBUG(dbgs() << PrintReg(CombinedSourceReg, TRI) << ", BB#"
- << IfMBB->getNumber());
+ DEBUG(dbgs() << printReg(CombinedSourceReg, TRI) << ", "
+ << printMBBReference(*IfMBB));
unsigned NumInputs = getPHINumInputs(PHI);
for (unsigned i = 0; i < NumInputs; ++i) {
if (isPHIRegionIndex(PHIRegionIndices, i)) {
@@ -1568,8 +1582,8 @@ void AMDGPUMachineCFGStructurizer::replaceEntryPHI(
MachineBasicBlock *SourcePred = getPHIPred(PHI, i);
MIB.addReg(SourceReg);
MIB.addMBB(SourcePred);
- DEBUG(dbgs() << PrintReg(SourceReg, TRI) << ", BB#"
- << SourcePred->getNumber());
+ DEBUG(dbgs() << printReg(SourceReg, TRI) << ", "
+ << printMBBReference(*SourcePred));
}
DEBUG(dbgs() << ")\n");
PHI.eraseFromParent();
@@ -1593,7 +1607,7 @@ void AMDGPUMachineCFGStructurizer::replaceLiveOutRegs(
}
}
- DEBUG(dbgs() << "Register " << PrintReg(Reg, TRI) << " is "
+ DEBUG(dbgs() << "Register " << printReg(Reg, TRI) << " is "
<< (IsDead ? "dead" : "alive") << " after PHI replace\n");
if (IsDead) {
LRegion->removeLiveOut(Reg);
@@ -1734,11 +1748,11 @@ void AMDGPUMachineCFGStructurizer::insertMergePHI(MachineBasicBlock *IfBB,
if (MergeBB->succ_begin() == MergeBB->succ_end()) {
return;
}
- DEBUG(dbgs() << "Merge PHI (BB#" << MergeBB->getNumber()
- << "): " << PrintReg(DestRegister, TRI) << "<def> = PHI("
- << PrintReg(IfSourceRegister, TRI) << ", BB#"
- << IfBB->getNumber() << PrintReg(CodeSourceRegister, TRI)
- << ", BB#" << CodeBB->getNumber() << ")\n");
+ DEBUG(dbgs() << "Merge PHI (" << printMBBReference(*MergeBB)
+ << "): " << printReg(DestRegister, TRI) << " = PHI("
+ << printReg(IfSourceRegister, TRI) << ", "
+ << printMBBReference(*IfBB) << printReg(CodeSourceRegister, TRI)
+ << ", " << printMBBReference(*CodeBB) << ")\n");
const DebugLoc &DL = MergeBB->findDebugLoc(MergeBB->begin());
MachineInstrBuilder MIB = BuildMI(*MergeBB, MergeBB->instr_begin(), DL,
TII->get(TargetOpcode::PHI), DestRegister);
@@ -1796,8 +1810,8 @@ static void removeExternalCFGEdges(MachineBasicBlock *StartMBB,
for (auto SI : Succs) {
std::pair<MachineBasicBlock *, MachineBasicBlock *> Edge = SI;
- DEBUG(dbgs() << "Removing edge: BB#" << Edge.first->getNumber() << " -> BB#"
- << Edge.second->getNumber() << "\n");
+ DEBUG(dbgs() << "Removing edge: " << printMBBReference(*Edge.first)
+ << " -> " << printMBBReference(*Edge.second) << "\n");
Edge.first->removeSuccessor(Edge.second);
}
}
@@ -1835,8 +1849,8 @@ MachineBasicBlock *AMDGPUMachineCFGStructurizer::createIfBlock(
if (!CodeBBEnd->isSuccessor(MergeBB))
CodeBBEnd->addSuccessor(MergeBB);
- DEBUG(dbgs() << "Moved MBB#" << CodeBBStart->getNumber() << " through MBB#"
- << CodeBBEnd->getNumber() << "\n");
+ DEBUG(dbgs() << "Moved " << printMBBReference(*CodeBBStart) << " through "
+ << printMBBReference(*CodeBBEnd) << "\n");
// If we have a single predecessor we can find a reasonable debug location
MachineBasicBlock *SinglePred =
@@ -1921,10 +1935,10 @@ void AMDGPUMachineCFGStructurizer::rewriteCodeBBTerminator(MachineBasicBlock *Co
MachineInstr *AMDGPUMachineCFGStructurizer::getDefInstr(unsigned Reg) {
if (MRI->def_begin(Reg) == MRI->def_end()) {
- DEBUG(dbgs() << "Register " << PrintReg(Reg, MRI->getTargetRegisterInfo())
+ DEBUG(dbgs() << "Register " << printReg(Reg, MRI->getTargetRegisterInfo())
<< " has NO defs\n");
} else if (!MRI->hasOneDef(Reg)) {
- DEBUG(dbgs() << "Register " << PrintReg(Reg, MRI->getTargetRegisterInfo())
+ DEBUG(dbgs() << "Register " << printReg(Reg, MRI->getTargetRegisterInfo())
<< " has multiple defs\n");
DEBUG(dbgs() << "DEFS BEGIN:\n");
for (auto DI = MRI->def_begin(Reg), DE = MRI->def_end(); DI != DE; ++DI) {
@@ -2008,7 +2022,7 @@ void AMDGPUMachineCFGStructurizer::rewriteLiveOutRegs(MachineBasicBlock *IfBB,
}
for (auto LI : OldLiveOuts) {
- DEBUG(dbgs() << "LiveOut: " << PrintReg(LI, TRI));
+ DEBUG(dbgs() << "LiveOut: " << printReg(LI, TRI));
if (!containsDef(CodeBB, InnerRegion, LI) ||
(!IsSingleBB && (getDefInstr(LI)->getParent() == LRegion->getExit()))) {
// If the register simly lives through the CodeBB, we don't have
@@ -2034,7 +2048,7 @@ void AMDGPUMachineCFGStructurizer::rewriteLiveOutRegs(MachineBasicBlock *IfBB,
unsigned IfSourceReg = MRI->createVirtualRegister(RegClass);
// Create initializer, this value is never used, but is needed
// to satisfy SSA.
- DEBUG(dbgs() << "Initializer for reg: " << PrintReg(Reg) << "\n");
+ DEBUG(dbgs() << "Initializer for reg: " << printReg(Reg) << "\n");
TII->materializeImmediate(*IfBB, IfBB->getFirstTerminator(), DebugLoc(),
IfSourceReg, 0);
@@ -2049,7 +2063,7 @@ void AMDGPUMachineCFGStructurizer::rewriteLiveOutRegs(MachineBasicBlock *IfBB,
// is a source block for a definition.
SmallVector<unsigned, 4> Sources;
if (PHIInfo.findSourcesFromMBB(CodeBB, Sources)) {
- DEBUG(dbgs() << "Inserting PHI Live Out from BB#" << CodeBB->getNumber()
+ DEBUG(dbgs() << "Inserting PHI Live Out from " << printMBBReference(*CodeBB)
<< "\n");
for (auto SI : Sources) {
unsigned DestReg;
@@ -2131,7 +2145,7 @@ void AMDGPUMachineCFGStructurizer::createEntryPHI(LinearizedRegion *CurrentRegio
const DebugLoc &DL = Entry->findDebugLoc(Entry->begin());
MachineInstrBuilder MIB = BuildMI(*Entry, Entry->instr_begin(), DL,
TII->get(TargetOpcode::PHI), DestReg);
- DEBUG(dbgs() << "Entry PHI " << PrintReg(DestReg, TRI) << "<def> = PHI(");
+ DEBUG(dbgs() << "Entry PHI " << printReg(DestReg, TRI) << " = PHI(");
unsigned CurrentBackedgeReg = 0;
@@ -2156,17 +2170,18 @@ void AMDGPUMachineCFGStructurizer::createEntryPHI(LinearizedRegion *CurrentRegio
BackedgePHI.addMBB((*SRI).second);
CurrentBackedgeReg = NewBackedgeReg;
DEBUG(dbgs() << "Inserting backedge PHI: "
- << PrintReg(NewBackedgeReg, TRI) << "<def> = PHI("
- << PrintReg(CurrentBackedgeReg, TRI) << ", BB#"
- << getPHIPred(*PHIDefInstr, 0)->getNumber() << ", "
- << PrintReg(getPHISourceReg(*PHIDefInstr, 1), TRI)
- << ", BB#" << (*SRI).second->getNumber());
+ << printReg(NewBackedgeReg, TRI) << " = PHI("
+ << printReg(CurrentBackedgeReg, TRI) << ", "
+ << printMBBReference(*getPHIPred(*PHIDefInstr, 0))
+ << ", "
+ << printReg(getPHISourceReg(*PHIDefInstr, 1), TRI)
+ << ", " << printMBBReference(*(*SRI).second));
}
} else {
MIB.addReg(SourceReg);
MIB.addMBB((*SRI).second);
- DEBUG(dbgs() << PrintReg(SourceReg, TRI) << ", BB#"
- << (*SRI).second->getNumber() << ", ");
+ DEBUG(dbgs() << printReg(SourceReg, TRI) << ", "
+ << printMBBReference(*(*SRI).second) << ", ");
}
}
@@ -2174,8 +2189,8 @@ void AMDGPUMachineCFGStructurizer::createEntryPHI(LinearizedRegion *CurrentRegio
if (CurrentBackedgeReg != 0) {
MIB.addReg(CurrentBackedgeReg);
MIB.addMBB(Exit);
- DEBUG(dbgs() << PrintReg(CurrentBackedgeReg, TRI) << ", BB#"
- << Exit->getNumber() << ")\n");
+ DEBUG(dbgs() << printReg(CurrentBackedgeReg, TRI) << ", "
+ << printMBBReference(*Exit) << ")\n");
} else {
DEBUG(dbgs() << ")\n");
}
@@ -2205,7 +2220,7 @@ void AMDGPUMachineCFGStructurizer::replaceRegisterWith(unsigned Register,
++I;
if (TargetRegisterInfo::isPhysicalRegister(NewRegister)) {
DEBUG(dbgs() << "Trying to substitute physical register: "
- << PrintReg(NewRegister, MRI->getTargetRegisterInfo())
+ << printReg(NewRegister, MRI->getTargetRegisterInfo())
<< "\n");
llvm_unreachable("Cannot substitute physical registers");
// We don't handle physical registers, but if we need to
@@ -2213,9 +2228,9 @@ void AMDGPUMachineCFGStructurizer::replaceRegisterWith(unsigned Register,
// O.substPhysReg(NewRegister, *TRI);
} else {
DEBUG(dbgs() << "Replacing register: "
- << PrintReg(Register, MRI->getTargetRegisterInfo())
+ << printReg(Register, MRI->getTargetRegisterInfo())
<< " with "
- << PrintReg(NewRegister, MRI->getTargetRegisterInfo())
+ << printReg(NewRegister, MRI->getTargetRegisterInfo())
<< "\n");
O.setReg(NewRegister);
}
@@ -2233,11 +2248,11 @@ void AMDGPUMachineCFGStructurizer::resolvePHIInfos(MachineBasicBlock *FunctionEn
for (auto DRI = PHIInfo.dests_begin(), DE = PHIInfo.dests_end(); DRI != DE;
++DRI) {
unsigned DestReg = *DRI;
- DEBUG(dbgs() << "DestReg: " << PrintReg(DestReg, TRI) << "\n");
+ DEBUG(dbgs() << "DestReg: " << printReg(DestReg, TRI) << "\n");
auto SRI = PHIInfo.sources_begin(DestReg);
unsigned SourceReg = (*SRI).first;
- DEBUG(dbgs() << "DestReg: " << PrintReg(DestReg, TRI)
- << " SourceReg: " << PrintReg(SourceReg, TRI) << "\n");
+ DEBUG(dbgs() << "DestReg: " << printReg(DestReg, TRI)
+ << " SourceReg: " << printReg(SourceReg, TRI) << "\n");
assert(PHIInfo.sources_end(DestReg) == ++SRI &&
"More than one phi source in entry node");
@@ -2424,15 +2439,15 @@ void AMDGPUMachineCFGStructurizer::splitLoopPHI(MachineInstr &PHI,
MachineInstrBuilder MIB =
BuildMI(*EntrySucc, EntrySucc->instr_begin(), PHI.getDebugLoc(),
TII->get(TargetOpcode::PHI), NewDestReg);
- DEBUG(dbgs() << "Split Entry PHI " << PrintReg(NewDestReg, TRI)
- << "<def> = PHI(");
+ DEBUG(dbgs() << "Split Entry PHI " << printReg(NewDestReg, TRI) << " = PHI(");
MIB.addReg(PHISource);
MIB.addMBB(Entry);
- DEBUG(dbgs() << PrintReg(PHISource, TRI) << ", BB#" << Entry->getNumber());
+ DEBUG(dbgs() << printReg(PHISource, TRI) << ", "
+ << printMBBReference(*Entry));
MIB.addReg(RegionSourceReg);
MIB.addMBB(RegionSourceMBB);
- DEBUG(dbgs() << " ," << PrintReg(RegionSourceReg, TRI) << ", BB#"
- << RegionSourceMBB->getNumber() << ")\n");
+ DEBUG(dbgs() << " ," << printReg(RegionSourceReg, TRI) << ", "
+ << printMBBReference(*RegionSourceMBB) << ")\n");
}
void AMDGPUMachineCFGStructurizer::splitLoopPHIs(MachineBasicBlock *Entry,
@@ -2487,7 +2502,6 @@ AMDGPUMachineCFGStructurizer::splitExit(LinearizedRegion *LRegion) {
return NewExit;
}
-
static MachineBasicBlock *split(MachineBasicBlock::iterator I) {
// Create the fall-through block.
MachineBasicBlock *MBB = (*I).getParent();
@@ -2514,9 +2528,9 @@ AMDGPUMachineCFGStructurizer::splitEntry(LinearizedRegion *LRegion) {
MachineBasicBlock *EntrySucc = split(Entry->getFirstNonPHI());
MachineBasicBlock *Exit = LRegion->getExit();
- DEBUG(dbgs() << "Split BB#" << Entry->getNumber() << " to BB#"
- << Entry->getNumber() << " -> BB#" << EntrySucc->getNumber()
- << "\n");
+ DEBUG(dbgs() << "Split " << printMBBReference(*Entry) << " to "
+ << printMBBReference(*Entry) << " -> "
+ << printMBBReference(*EntrySucc) << "\n");
LRegion->addMBB(EntrySucc);
// Make the backedge go to Entry Succ
@@ -2655,9 +2669,9 @@ bool AMDGPUMachineCFGStructurizer::structurizeComplexRegion(RegionMRT *Region) {
BBSelectRegOut = Child->getBBSelectRegOut();
BBSelectRegIn = Child->getBBSelectRegIn();
- DEBUG(dbgs() << "BBSelectRegIn: " << PrintReg(BBSelectRegIn, TRI)
+ DEBUG(dbgs() << "BBSelectRegIn: " << printReg(BBSelectRegIn, TRI)
<< "\n");
- DEBUG(dbgs() << "BBSelectRegOut: " << PrintReg(BBSelectRegOut, TRI)
+ DEBUG(dbgs() << "BBSelectRegOut: " << printReg(BBSelectRegOut, TRI)
<< "\n");
MachineBasicBlock *IfEnd = CurrentMerge;
@@ -2679,9 +2693,9 @@ bool AMDGPUMachineCFGStructurizer::structurizeComplexRegion(RegionMRT *Region) {
BBSelectRegOut = Child->getBBSelectRegOut();
BBSelectRegIn = Child->getBBSelectRegIn();
- DEBUG(dbgs() << "BBSelectRegIn: " << PrintReg(BBSelectRegIn, TRI)
+ DEBUG(dbgs() << "BBSelectRegIn: " << printReg(BBSelectRegIn, TRI)
<< "\n");
- DEBUG(dbgs() << "BBSelectRegOut: " << PrintReg(BBSelectRegOut, TRI)
+ DEBUG(dbgs() << "BBSelectRegOut: " << printReg(BBSelectRegOut, TRI)
<< "\n");
MachineBasicBlock *IfEnd = CurrentMerge;
@@ -2786,7 +2800,7 @@ void AMDGPUMachineCFGStructurizer::createLinearizedRegion(RegionMRT *Region,
LinearizedRegion *LRegion = new LinearizedRegion();
if (SelectOut) {
LRegion->addLiveOut(SelectOut);
- DEBUG(dbgs() << "Add LiveOut (BBSelect): " << PrintReg(SelectOut, TRI)
+ DEBUG(dbgs() << "Add LiveOut (BBSelect): " << printReg(SelectOut, TRI)
<< "\n");
}
LRegion->setRegionMRT(Region);
@@ -2841,16 +2855,6 @@ static void checkRegOnlyPHIInputs(MachineFunction &MF) {
}
}
-
-INITIALIZE_PASS_BEGIN(AMDGPUMachineCFGStructurizer, "amdgpu-machine-cfg-structurizer",
- "AMDGPU Machine CFG Structurizer", false, false)
-INITIALIZE_PASS_DEPENDENCY(MachineRegionInfoPass)
-INITIALIZE_PASS_END(AMDGPUMachineCFGStructurizer, "amdgpu-machine-cfg-structurizer",
- "AMDGPU Machine CFG Structurizer", false, false)
-
-char AMDGPUMachineCFGStructurizerID = AMDGPUMachineCFGStructurizer::ID;
-
-
bool AMDGPUMachineCFGStructurizer::runOnMachineFunction(MachineFunction &MF) {
const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
const SIInstrInfo *TII = ST.getInstrInfo();
@@ -2876,6 +2880,14 @@ bool AMDGPUMachineCFGStructurizer::runOnMachineFunction(MachineFunction &MF) {
return result;
}
+char AMDGPUMachineCFGStructurizerID = AMDGPUMachineCFGStructurizer::ID;
+
+INITIALIZE_PASS_BEGIN(AMDGPUMachineCFGStructurizer, "amdgpu-machine-cfg-structurizer",
+ "AMDGPU Machine CFG Structurizer", false, false)
+INITIALIZE_PASS_DEPENDENCY(MachineRegionInfoPass)
+INITIALIZE_PASS_END(AMDGPUMachineCFGStructurizer, "amdgpu-machine-cfg-structurizer",
+ "AMDGPU Machine CFG Structurizer", false, false)
+
FunctionPass *llvm::createAMDGPUMachineCFGStructurizerPass() {
return new AMDGPUMachineCFGStructurizer();
}
diff --git a/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp b/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
index 9fb7f5f88927..b7c8c1213537 100644
--- a/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
+++ b/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
@@ -19,7 +19,7 @@ AMDGPUMachineFunction::AMDGPUMachineFunction(const MachineFunction &MF) :
MaxKernArgAlign(0),
LDSSize(0),
ABIArgOffset(0),
- IsEntryFunction(AMDGPU::isEntryFunctionCC(MF.getFunction()->getCallingConv())),
+ IsEntryFunction(AMDGPU::isEntryFunctionCC(MF.getFunction().getCallingConv())),
NoSignedZerosFPMath(MF.getTarget().Options.NoSignedZerosFPMath) {
// FIXME: Should initialize KernArgSize based on ExplicitKernelArgOffset,
// except reserved size is not correctly aligned.
diff --git a/lib/Target/AMDGPU/AMDGPUMachineModuleInfo.cpp b/lib/Target/AMDGPU/AMDGPUMachineModuleInfo.cpp
new file mode 100644
index 000000000000..3164140abe29
--- /dev/null
+++ b/lib/Target/AMDGPU/AMDGPUMachineModuleInfo.cpp
@@ -0,0 +1,29 @@
+//===--- AMDGPUMachineModuleInfo.cpp ----------------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief AMDGPU Machine Module Info.
+///
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPUMachineModuleInfo.h"
+#include "llvm/IR/Module.h"
+
+namespace llvm {
+
+AMDGPUMachineModuleInfo::AMDGPUMachineModuleInfo(const MachineModuleInfo &MMI)
+ : MachineModuleInfoELF(MMI) {
+ LLVMContext &CTX = MMI.getModule()->getContext();
+ AgentSSID = CTX.getOrInsertSyncScopeID("agent");
+ WorkgroupSSID = CTX.getOrInsertSyncScopeID("workgroup");
+ WavefrontSSID = CTX.getOrInsertSyncScopeID("wavefront");
+}
+
+} // end namespace llvm
diff --git a/lib/Target/AMDGPU/AMDGPUMachineModuleInfo.h b/lib/Target/AMDGPU/AMDGPUMachineModuleInfo.h
new file mode 100644
index 000000000000..1a728c6bd04a
--- /dev/null
+++ b/lib/Target/AMDGPU/AMDGPUMachineModuleInfo.h
@@ -0,0 +1,97 @@
+//===--- AMDGPUMachineModuleInfo.h ------------------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief AMDGPU Machine Module Info.
+///
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUMACHINEMODULEINFO_H
+#define LLVM_LIB_TARGET_AMDGPU_AMDGPUMACHINEMODULEINFO_H
+
+#include "llvm/ADT/None.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineModuleInfoImpls.h"
+#include "llvm/IR/LLVMContext.h"
+
+namespace llvm {
+
+class AMDGPUMachineModuleInfo final : public MachineModuleInfoELF {
+private:
+
+ // All supported memory/synchronization scopes can be found here:
+ // http://llvm.org/docs/AMDGPUUsage.html#memory-scopes
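+ //
+ // These IDs name the scopes used by atomic instructions and fences in the
+ // IR, for example (illustrative IR, not part of this patch):
+ //   fence syncscope("agent") seq_cst
+ //   %old = atomicrmw add i32* %p, i32 1 syncscope("workgroup") monotonic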
+
+ /// \brief Agent synchronization scope ID.
+ SyncScope::ID AgentSSID;
+ /// \brief Workgroup synchronization scope ID.
+ SyncScope::ID WorkgroupSSID;
+ /// \brief Wavefront synchronization scope ID.
+ SyncScope::ID WavefrontSSID;
+
+ /// \brief In the AMDGPU target, synchronization scopes are inclusive: a
+ /// larger synchronization scope is inclusive of a smaller synchronization
+ /// scope.
+ ///
+ /// \returns \p SSID's inclusion ordering, or "None" if \p SSID is not
+ /// supported by the AMDGPU target.
+ Optional<uint8_t> getSyncScopeInclusionOrdering(SyncScope::ID SSID) const {
+ if (SSID == SyncScope::SingleThread)
+ return 0;
+ else if (SSID == getWavefrontSSID())
+ return 1;
+ else if (SSID == getWorkgroupSSID())
+ return 2;
+ else if (SSID == getAgentSSID())
+ return 3;
+ else if (SSID == SyncScope::System)
+ return 4;
+
+ return None;
+ }
+
+public:
+ AMDGPUMachineModuleInfo(const MachineModuleInfo &MMI);
+
+ /// \returns Agent synchronization scope ID.
+ SyncScope::ID getAgentSSID() const {
+ return AgentSSID;
+ }
+ /// \returns Workgroup synchronization scope ID.
+ SyncScope::ID getWorkgroupSSID() const {
+ return WorkgroupSSID;
+ }
+ /// \returns Wavefront synchronization scope ID.
+ SyncScope::ID getWavefrontSSID() const {
+ return WavefrontSSID;
+ }
+
+ /// \brief In the AMDGPU target, synchronization scopes are inclusive: a
+ /// larger synchronization scope is inclusive of a smaller synchronization
+ /// scope.
+ ///
+ /// \returns True if synchronization scope \p A is larger than or equal to
+ /// synchronization scope \p B, false if synchronization scope \p A is smaller
+ /// than synchronization scope \p B, or "None" if either synchronization scope
+ /// \p A or \p B is not supported by the AMDGPU target.
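+ ///
+ /// For example, the agent scope includes the workgroup scope, so passing
+ /// getAgentSSID() and getWorkgroupSSID() as \p A and \p B yields true,
+ /// while swapping the two arguments yields false.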
+ Optional<bool> isSyncScopeInclusion(SyncScope::ID A, SyncScope::ID B) const {
+ const auto &AIO = getSyncScopeInclusionOrdering(A);
+ const auto &BIO = getSyncScopeInclusionOrdering(B);
+ if (!AIO || !BIO)
+ return None;
+
+ return AIO.getValue() >= BIO.getValue();
+ }
+};
+
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUMACHINEMODULEINFO_H
diff --git a/lib/Target/AMDGPU/AMDGPUOpenCLEnqueuedBlockLowering.cpp b/lib/Target/AMDGPU/AMDGPUOpenCLEnqueuedBlockLowering.cpp
new file mode 100644
index 000000000000..bb65636f15af
--- /dev/null
+++ b/lib/Target/AMDGPU/AMDGPUOpenCLEnqueuedBlockLowering.cpp
@@ -0,0 +1,135 @@
+//===- AMDGPUOpenCLEnqueuedBlockLowering.cpp - Lower enqueued block -------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// \file
+// \brief This post-linking pass replaces the function pointer of an enqueued
+// block kernel with a global variable (runtime handle) and adds the
+// "runtime-handle" attribute to the enqueued block kernel.
+//
+// In LLVM CodeGen the runtime-handle metadata will be translated to
+// RuntimeHandle metadata in the code object. The runtime allocates a global
+// buffer for each kernel with RuntimeHandle metadata and saves the kernel
+// address required for the AQL packet into the buffer. The __enqueue_kernel
+// function in the device library knows that the invoke function pointer in
+// the block literal is actually a runtime handle, loads the kernel address
+// from it, and puts it into the AQL packet for dispatching.
+//
+// This cannot be done in the FE since the FE cannot create a unique global
+// variable with external linkage across LLVM modules. A global variable with
+// internal linkage does not work since optimization passes would replace
+// loads of the global variable with its initialization value.
+//
+// This pass also identifies kernels that directly or indirectly enqueue
+// kernels and adds the "calls-enqueue-kernel" function attribute to them,
+// which is used to determine whether to emit runtime metadata for the
+// enqueue-related hidden kernel arguments.
+//
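+// As an illustrative sketch (the kernel name here is made up), a block
+// literal whose invoke pointer is a cast of @__enqueue_blk_kernel is
+// rewritten so that the cast refers to a new external global
+// @__enqueue_blk_kernel_runtime_handle, and the kernel itself receives
+// "runtime-handle"="__enqueue_blk_kernel_runtime_handle".
+//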
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/User.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+#define DEBUG_TYPE "amdgpu-lower-enqueued-block"
+
+using namespace llvm;
+
+namespace {
+
+/// \brief Lower enqueued blocks.
+class AMDGPUOpenCLEnqueuedBlockLowering : public ModulePass {
+public:
+ static char ID;
+
+ explicit AMDGPUOpenCLEnqueuedBlockLowering() : ModulePass(ID) {}
+
+private:
+ bool runOnModule(Module &M) override;
+};
+
+} // end anonymous namespace
+
+char AMDGPUOpenCLEnqueuedBlockLowering::ID = 0;
+
+char &llvm::AMDGPUOpenCLEnqueuedBlockLoweringID =
+ AMDGPUOpenCLEnqueuedBlockLowering::ID;
+
+INITIALIZE_PASS(AMDGPUOpenCLEnqueuedBlockLowering, DEBUG_TYPE,
+ "Lower OpenCL enqueued blocks", false, false)
+
+ModulePass* llvm::createAMDGPUOpenCLEnqueuedBlockLoweringPass() {
+ return new AMDGPUOpenCLEnqueuedBlockLowering();
+}
+
+/// Collect direct or indirect callers of \p F and save them
+/// to \p Callers.
+static void collectCallers(Function *F, DenseSet<Function *> &Callers) {
+ for (auto U : F->users()) {
+ if (auto *CI = dyn_cast<CallInst>(&*U)) {
+ auto *Caller = CI->getParent()->getParent();
+ if (Callers.count(Caller))
+ continue;
+ Callers.insert(Caller);
+ collectCallers(Caller, Callers);
+ }
+ }
+}
+
+bool AMDGPUOpenCLEnqueuedBlockLowering::runOnModule(Module &M) {
+ DenseSet<Function *> Callers;
+ auto &C = M.getContext();
+ bool Changed = false;
+ for (auto &F : M.functions()) {
+ if (F.hasFnAttribute("enqueued-block")) {
+ if (!F.hasOneUse() || !F.user_begin()->hasOneUse() ||
+ !isa<ConstantExpr>(*F.user_begin()) ||
+ !isa<ConstantExpr>(*F.user_begin()->user_begin())) {
+ continue;
+ }
+ auto *BitCast = cast<ConstantExpr>(*F.user_begin());
+ auto *AddrCast = cast<ConstantExpr>(*BitCast->user_begin());
+ auto RuntimeHandle = (F.getName() + "_runtime_handle").str();
+ auto *GV = new GlobalVariable(
+ M, Type::getInt8Ty(C)->getPointerTo(AMDGPUAS::GLOBAL_ADDRESS),
+ /*IsConstant=*/true, GlobalValue::ExternalLinkage,
+ /*Initializer=*/nullptr, RuntimeHandle, /*InsertBefore=*/nullptr,
+ GlobalValue::NotThreadLocal, AMDGPUAS::GLOBAL_ADDRESS,
+ /*IsExternallyInitialized=*/true);
+ DEBUG(dbgs() << "runtime handle created: " << *GV << '\n');
+ auto *NewPtr = ConstantExpr::getPointerCast(GV, AddrCast->getType());
+ AddrCast->replaceAllUsesWith(NewPtr);
+ F.addFnAttr("runtime-handle", RuntimeHandle);
+ F.setLinkage(GlobalValue::ExternalLinkage);
+
+ // Collect direct or indirect callers of enqueue_kernel.
+ for (auto U : NewPtr->users()) {
+ if (auto *I = dyn_cast<Instruction>(&*U)) {
+ auto *F = I->getParent()->getParent();
+ Callers.insert(F);
+ collectCallers(F, Callers);
+ }
+ }
+ Changed = true;
+ }
+ }
+
+ for (auto F : Callers) {
+ if (F->getCallingConv() != CallingConv::AMDGPU_KERNEL)
+ continue;
+ F->addFnAttr("calls-enqueue-kernel");
+ }
+ return Changed;
+}
diff --git a/lib/Target/AMDGPU/AMDGPUOpenCLImageTypeLoweringPass.cpp b/lib/Target/AMDGPU/AMDGPUOpenCLImageTypeLoweringPass.cpp
index 410bd52d9c21..cd71f19760b9 100644
--- a/lib/Target/AMDGPU/AMDGPUOpenCLImageTypeLoweringPass.cpp
+++ b/lib/Target/AMDGPU/AMDGPUOpenCLImageTypeLoweringPass.cpp
@@ -1,4 +1,4 @@
-//===-- AMDGPUOpenCLImageTypeLoweringPass.cpp -----------------------------===//
+//===- AMDGPUOpenCLImageTypeLoweringPass.cpp ------------------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -22,40 +22,57 @@
/// Resource IDs of read-only images, write-only images and samplers are
/// defined to be their index among the kernel arguments of the same
/// type and access qualifier.
+//
//===----------------------------------------------------------------------===//
#include "AMDGPU.h"
-#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
-#include "llvm/Analysis/Passes.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/IR/Argument.h"
+#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Function.h"
+#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Use.h"
+#include "llvm/IR/User.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/ErrorHandling.h"
#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Transforms/Utils/ValueMapper.h"
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <tuple>
using namespace llvm;
-namespace {
-
-StringRef GetImageSizeFunc = "llvm.OpenCL.image.get.size";
-StringRef GetImageFormatFunc = "llvm.OpenCL.image.get.format";
-StringRef GetImageResourceIDFunc = "llvm.OpenCL.image.get.resource.id";
-StringRef GetSamplerResourceIDFunc = "llvm.OpenCL.sampler.get.resource.id";
+static StringRef GetImageSizeFunc = "llvm.OpenCL.image.get.size";
+static StringRef GetImageFormatFunc = "llvm.OpenCL.image.get.format";
+static StringRef GetImageResourceIDFunc = "llvm.OpenCL.image.get.resource.id";
+static StringRef GetSamplerResourceIDFunc =
+ "llvm.OpenCL.sampler.get.resource.id";
-StringRef ImageSizeArgMDType = "__llvm_image_size";
-StringRef ImageFormatArgMDType = "__llvm_image_format";
+static StringRef ImageSizeArgMDType = "__llvm_image_size";
+static StringRef ImageFormatArgMDType = "__llvm_image_format";
-StringRef KernelsMDNodeName = "opencl.kernels";
-StringRef KernelArgMDNodeNames[] = {
+static StringRef KernelsMDNodeName = "opencl.kernels";
+static StringRef KernelArgMDNodeNames[] = {
"kernel_arg_addr_space",
"kernel_arg_access_qual",
"kernel_arg_type",
"kernel_arg_base_type",
"kernel_arg_type_qual"};
-const unsigned NumKernelArgMDNodes = 5;
+static const unsigned NumKernelArgMDNodes = 5;
+
+namespace {
-typedef SmallVector<Metadata *, 8> MDVector;
+using MDVector = SmallVector<Metadata *, 8>;
struct KernelArgMD {
MDVector ArgVector[NumKernelArgMDNodes];
};
@@ -303,7 +320,7 @@ class AMDGPUOpenCLImageTypeLoweringPass : public ModulePass {
CloneFunctionInto(NewF, F, VMap, /*ModuleLevelChanges=*/false, Returns);
// Build new MDNode.
- SmallVector<llvm::Metadata *, 6> KernelMDArgs;
+ SmallVector<Metadata *, 6> KernelMDArgs;
KernelMDArgs.push_back(ConstantAsMetadata::get(NewF));
for (unsigned i = 0; i < NumKernelArgMDNodes; ++i)
KernelMDArgs.push_back(MDNode::get(*Context, NewArgMDs.ArgVector[i]));
@@ -346,7 +363,7 @@ class AMDGPUOpenCLImageTypeLoweringPass : public ModulePass {
return Modified;
}
- public:
+public:
AMDGPUOpenCLImageTypeLoweringPass() : ModulePass(ID) {}
bool runOnModule(Module &M) override {
@@ -363,10 +380,10 @@ class AMDGPUOpenCLImageTypeLoweringPass : public ModulePass {
}
};
-char AMDGPUOpenCLImageTypeLoweringPass::ID = 0;
-
} // end anonymous namespace
+char AMDGPUOpenCLImageTypeLoweringPass::ID = 0;
+
ModulePass *llvm::createAMDGPUOpenCLImageTypeLoweringPass() {
return new AMDGPUOpenCLImageTypeLoweringPass();
}
diff --git a/lib/Target/AMDGPU/AMDGPUPTNote.h b/lib/Target/AMDGPU/AMDGPUPTNote.h
index 71b9ab699b96..b50a2eb8e9e7 100644
--- a/lib/Target/AMDGPU/AMDGPUPTNote.h
+++ b/lib/Target/AMDGPU/AMDGPUPTNote.h
@@ -25,18 +25,22 @@ const char SectionName[] = ".note";
const char NoteName[] = "AMD";
-// TODO: Move this enum to include/llvm/Support so it can be used in tools?
+// TODO: Remove this file once we drop code object v2.
enum NoteType{
+ NT_AMDGPU_HSA_RESERVED_0 = 0,
NT_AMDGPU_HSA_CODE_OBJECT_VERSION = 1,
NT_AMDGPU_HSA_HSAIL = 2,
NT_AMDGPU_HSA_ISA = 3,
NT_AMDGPU_HSA_PRODUCER = 4,
NT_AMDGPU_HSA_PRODUCER_OPTIONS = 5,
NT_AMDGPU_HSA_EXTENSION = 6,
- NT_AMDGPU_HSA_CODE_OBJECT_METADATA = 10,
+ NT_AMDGPU_HSA_RESERVED_7 = 7,
+ NT_AMDGPU_HSA_RESERVED_8 = 8,
+ NT_AMDGPU_HSA_RESERVED_9 = 9,
NT_AMDGPU_HSA_HLDEBUG_DEBUG = 101,
NT_AMDGPU_HSA_HLDEBUG_TARGET = 102
};
+
}
}
diff --git a/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
index 625c9b77e2de..41876ed45c8c 100644
--- a/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ b/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -285,9 +285,9 @@ Value *AMDGPUPromoteAlloca::getWorkitemID(IRBuilder<> &Builder, unsigned N) {
return CI;
}
-static VectorType *arrayTypeToVecType(Type *ArrayTy) {
- return VectorType::get(ArrayTy->getArrayElementType(),
- ArrayTy->getArrayNumElements());
+static VectorType *arrayTypeToVecType(ArrayType *ArrayTy) {
+ return VectorType::get(ArrayTy->getElementType(),
+ ArrayTy->getNumElements());
}
static Value *
@@ -346,10 +346,9 @@ static bool tryPromoteAllocaToVector(AllocaInst *Alloca, AMDGPUAS AS) {
// FIXME: We also reject alloca's of the form [ 2 x [ 2 x i32 ]] or equivalent. Potentially these
// could also be promoted but we don't currently handle this case
if (!AllocaTy ||
- AllocaTy->getElementType()->isVectorTy() ||
- AllocaTy->getElementType()->isArrayTy() ||
AllocaTy->getNumElements() > 4 ||
- AllocaTy->getNumElements() < 2) {
+ AllocaTy->getNumElements() < 2 ||
+ !VectorType::isValidElementType(AllocaTy->getElementType())) {
DEBUG(dbgs() << " Cannot convert type to vector\n");
return false;
}
diff --git a/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index 623b2c88ab8f..1ed02fae085a 100644
--- a/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -17,9 +17,9 @@
#include "SIRegisterInfo.h"
#include "llvm/CodeGen/GlobalISel/RegisterBank.h"
#include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/IR/Constants.h"
-#include "llvm/Target/TargetRegisterInfo.h"
-#include "llvm/Target/TargetSubtargetInfo.h"
#define GET_TARGET_REGBANK_IMPL
#include "AMDGPUGenRegisterBank.inc"
@@ -29,10 +29,6 @@
using namespace llvm;
-#ifndef LLVM_BUILD_GLOBAL_ISEL
-#error "You shouldn't build this"
-#endif
-
AMDGPURegisterBankInfo::AMDGPURegisterBankInfo(const TargetRegisterInfo &TRI)
: AMDGPUGenRegisterBankInfo(),
TRI(static_cast<const SIRegisterInfo*>(&TRI)) {
diff --git a/lib/Target/AMDGPU/AMDGPURegisterInfo.cpp b/lib/Target/AMDGPU/AMDGPURegisterInfo.cpp
index ff58aa5741a1..5e4d33aaa691 100644
--- a/lib/Target/AMDGPU/AMDGPURegisterInfo.cpp
+++ b/lib/Target/AMDGPU/AMDGPURegisterInfo.cpp
@@ -43,10 +43,11 @@ unsigned AMDGPURegisterInfo::getSubRegFromChannel(unsigned Channel) const {
// Forced to be here by one .inc
const MCPhysReg *SIRegisterInfo::getCalleeSavedRegs(
const MachineFunction *MF) const {
- CallingConv::ID CC = MF->getFunction()->getCallingConv();
+ CallingConv::ID CC = MF->getFunction().getCallingConv();
switch (CC) {
case CallingConv::C:
case CallingConv::Fast:
+ case CallingConv::Cold:
return CSR_AMDGPU_HighRegs_SaveList;
default: {
// Dummy to not crash RegisterClassInfo.
@@ -56,11 +57,17 @@ const MCPhysReg *SIRegisterInfo::getCalleeSavedRegs(
}
}
+const MCPhysReg *
+SIRegisterInfo::getCalleeSavedRegsViaCopy(const MachineFunction *MF) const {
+ return nullptr;
+}
+
const uint32_t *SIRegisterInfo::getCallPreservedMask(const MachineFunction &MF,
CallingConv::ID CC) const {
switch (CC) {
case CallingConv::C:
case CallingConv::Fast:
+ case CallingConv::Cold:
return CSR_AMDGPU_HighRegs_RegMask;
default:
return nullptr;
diff --git a/lib/Target/AMDGPU/AMDGPURegisterInfo.td b/lib/Target/AMDGPU/AMDGPURegisterInfo.td
index ba0490abee8c..3bbcba826f63 100644
--- a/lib/Target/AMDGPU/AMDGPURegisterInfo.td
+++ b/lib/Target/AMDGPU/AMDGPURegisterInfo.td
@@ -17,8 +17,6 @@ foreach Index = 0-15 in {
def sub#Index : SubRegIndex<32, !shl(Index, 5)>;
}
-def INDIRECT_BASE_ADDR : Register <"INDIRECT_BASE_ADDR">;
-
}
include "R600RegisterInfo.td"
diff --git a/lib/Target/AMDGPU/AMDGPURewriteOutArguments.cpp b/lib/Target/AMDGPU/AMDGPURewriteOutArguments.cpp
new file mode 100644
index 000000000000..83e56a9ab495
--- /dev/null
+++ b/lib/Target/AMDGPU/AMDGPURewriteOutArguments.cpp
@@ -0,0 +1,483 @@
+//===- AMDGPURewriteOutArgumentsPass.cpp - Create struct returns ----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file This pass attempts to replace out argument usage with a return of a
+/// struct.
+///
+/// We can support returning a lot of values directly in registers, but
+/// idiomatic C code frequently uses a pointer argument to return a second value
+/// rather than returning a struct by value. GPU stack access is also quite
+/// painful, so we want to avoid that if possible. Passing a stack object
+/// pointer to a function also requires an additional address expansion code
+/// sequence to convert the pointer to be relative to the kernel's scratch wave
+/// offset register since the callee doesn't know what stack frame the incoming
+/// pointer is relative to.
+///
+/// The goal is to try rewriting code that looks like this:
+///
+/// int foo(int a, int b, int* out) {
+/// *out = bar();
+/// return a + b;
+/// }
+///
+/// into something like this:
+///
+/// std::pair<int, int> foo(int a, int b) {
+/// return std::make_pair(a + b, bar());
+/// }
+///
+/// Typically the incoming pointer is a simple alloca for a temporary variable
+/// to use with the API, which, if replaced with a struct return, will be
+/// easily SROA'd out when the stub function we create is inlined.
+///
+/// This pass introduces the struct return, but leaves the unused pointer
+/// arguments and introduces a new stub function calling the struct returning
+/// body. DeadArgumentElimination should be run after this to clean these up.
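+///
+/// Roughly (illustrative pseudocode, with invented names), the original
+/// function is turned into a stub of the form:
+///
+///   int foo(int a, int b, int* out) {
+///     auto r = foo.body(a, b, /* out argument passed as undef */);
+///     *out = r.second;
+///     return r.first;
+///   }
+///
+/// where foo.body holds the original code and returns the struct, and the stub
+/// is marked alwaysinline so the later cleanup passes can remove it.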
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "Utils/AMDGPUBaseInfo.h"
+#include "llvm/Analysis/MemoryDependenceAnalysis.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/MemoryLocation.h"
+#include "llvm/IR/Argument.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Use.h"
+#include "llvm/IR/User.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include <cassert>
+#include <utility>
+
+#define DEBUG_TYPE "amdgpu-rewrite-out-arguments"
+
+using namespace llvm;
+
+static cl::opt<bool> AnyAddressSpace(
+ "amdgpu-any-address-space-out-arguments",
+ cl::desc("Replace pointer out arguments with "
+ "struct returns for non-private address space"),
+ cl::Hidden,
+ cl::init(false));
+
+static cl::opt<unsigned> MaxNumRetRegs(
+ "amdgpu-max-return-arg-num-regs",
+ cl::desc("Approximately limit number of return registers for replacing out arguments"),
+ cl::Hidden,
+ cl::init(16));
+
+STATISTIC(NumOutArgumentsReplaced,
+ "Number out arguments moved to struct return values");
+STATISTIC(NumOutArgumentFunctionsReplaced,
+ "Number of functions with out arguments moved to struct return values");
+
+namespace {
+
+class AMDGPURewriteOutArguments : public FunctionPass {
+private:
+ const DataLayout *DL = nullptr;
+ MemoryDependenceResults *MDA = nullptr;
+
+ bool checkArgumentUses(Value &Arg) const;
+ bool isOutArgumentCandidate(Argument &Arg) const;
+
+#ifndef NDEBUG
+ bool isVec3ToVec4Shuffle(Type *Ty0, Type* Ty1) const;
+#endif
+
+public:
+ static char ID;
+
+ AMDGPURewriteOutArguments() : FunctionPass(ID) {}
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<MemoryDependenceWrapperPass>();
+ FunctionPass::getAnalysisUsage(AU);
+ }
+
+ bool doInitialization(Module &M) override;
+ bool runOnFunction(Function &F) override;
+};
+
+} // end anonymous namespace
+
+INITIALIZE_PASS_BEGIN(AMDGPURewriteOutArguments, DEBUG_TYPE,
+ "AMDGPU Rewrite Out Arguments", false, false)
+INITIALIZE_PASS_DEPENDENCY(MemoryDependenceWrapperPass)
+INITIALIZE_PASS_END(AMDGPURewriteOutArguments, DEBUG_TYPE,
+ "AMDGPU Rewrite Out Arguments", false, false)
+
+char AMDGPURewriteOutArguments::ID = 0;
+
+bool AMDGPURewriteOutArguments::checkArgumentUses(Value &Arg) const {
+ const int MaxUses = 10;
+ int UseCount = 0;
+
+ for (Use &U : Arg.uses()) {
+ StoreInst *SI = dyn_cast<StoreInst>(U.getUser());
+ if (UseCount > MaxUses)
+ return false;
+
+ if (!SI) {
+ auto *BCI = dyn_cast<BitCastInst>(U.getUser());
+ if (!BCI || !BCI->hasOneUse())
+ return false;
+
+ // We don't handle multiple stores currently, so stores to aggregate
+ // pointers aren't worth the trouble since they are canonically split up.
+ Type *DestEltTy = BCI->getType()->getPointerElementType();
+ if (DestEltTy->isAggregateType())
+ return false;
+
+ // We could handle these if we had a convenient way to bitcast between
+ // them.
+ Type *SrcEltTy = Arg.getType()->getPointerElementType();
+ if (SrcEltTy->isArrayTy())
+ return false;
+
+      // Special case: handle structs with single members. It is useful to
+      // handle some casts between structs and non-structs, but we can't
+      // bitcast directly between them. Blender uses some casts that look like
+      // { <3 x float> }* to <4 x float>*.
+ if ((SrcEltTy->isStructTy() && (SrcEltTy->getNumContainedTypes() != 1)))
+ return false;
+
+ // Clang emits OpenCL 3-vector type accesses with a bitcast to the
+ // equivalent 4-element vector and accesses that, and we're looking for
+ // this pointer cast.
+ if (DL->getTypeAllocSize(SrcEltTy) != DL->getTypeAllocSize(DestEltTy))
+ return false;
+
+ return checkArgumentUses(*BCI);
+ }
+
+ if (!SI->isSimple() ||
+ U.getOperandNo() != StoreInst::getPointerOperandIndex())
+ return false;
+
+ ++UseCount;
+ }
+
+ // Skip unused arguments.
+ return UseCount > 0;
+}
+
+bool AMDGPURewriteOutArguments::isOutArgumentCandidate(Argument &Arg) const {
+ const unsigned MaxOutArgSizeBytes = 4 * MaxNumRetRegs;
+ PointerType *ArgTy = dyn_cast<PointerType>(Arg.getType());
+
+ // TODO: It might be useful for any out arguments, not just privates.
+ if (!ArgTy || (ArgTy->getAddressSpace() != DL->getAllocaAddrSpace() &&
+ !AnyAddressSpace) ||
+ Arg.hasByValAttr() || Arg.hasStructRetAttr() ||
+ DL->getTypeStoreSize(ArgTy->getPointerElementType()) > MaxOutArgSizeBytes) {
+ return false;
+ }
+
+ return checkArgumentUses(Arg);
+}
+
+bool AMDGPURewriteOutArguments::doInitialization(Module &M) {
+ DL = &M.getDataLayout();
+ return false;
+}
+
+#ifndef NDEBUG
+bool AMDGPURewriteOutArguments::isVec3ToVec4Shuffle(Type *Ty0, Type* Ty1) const {
+ VectorType *VT0 = dyn_cast<VectorType>(Ty0);
+ VectorType *VT1 = dyn_cast<VectorType>(Ty1);
+ if (!VT0 || !VT1)
+ return false;
+
+ if (VT0->getNumElements() != 3 ||
+ VT1->getNumElements() != 4)
+ return false;
+
+ return DL->getTypeSizeInBits(VT0->getElementType()) ==
+ DL->getTypeSizeInBits(VT1->getElementType());
+}
+#endif
+
+bool AMDGPURewriteOutArguments::runOnFunction(Function &F) {
+ if (skipFunction(F))
+ return false;
+
+ // TODO: Could probably handle variadic functions.
+ if (F.isVarArg() || F.hasStructRetAttr() ||
+ AMDGPU::isEntryFunctionCC(F.getCallingConv()))
+ return false;
+
+ MDA = &getAnalysis<MemoryDependenceWrapperPass>().getMemDep();
+
+ unsigned ReturnNumRegs = 0;
+ SmallSet<int, 4> OutArgIndexes;
+ SmallVector<Type *, 4> ReturnTypes;
+ Type *RetTy = F.getReturnType();
+ if (!RetTy->isVoidTy()) {
+ ReturnNumRegs = DL->getTypeStoreSize(RetTy) / 4;
+
+ if (ReturnNumRegs >= MaxNumRetRegs)
+ return false;
+
+ ReturnTypes.push_back(RetTy);
+ }
+
+ SmallVector<Argument *, 4> OutArgs;
+ for (Argument &Arg : F.args()) {
+ if (isOutArgumentCandidate(Arg)) {
+ DEBUG(dbgs() << "Found possible out argument " << Arg
+ << " in function " << F.getName() << '\n');
+ OutArgs.push_back(&Arg);
+ }
+ }
+
+ if (OutArgs.empty())
+ return false;
+
+ using ReplacementVec = SmallVector<std::pair<Argument *, Value *>, 4>;
+
+ DenseMap<ReturnInst *, ReplacementVec> Replacements;
+
+ SmallVector<ReturnInst *, 4> Returns;
+ for (BasicBlock &BB : F) {
+ if (ReturnInst *RI = dyn_cast<ReturnInst>(&BB.back()))
+ Returns.push_back(RI);
+ }
+
+ if (Returns.empty())
+ return false;
+
+ bool Changing;
+
+ do {
+ Changing = false;
+
+ // Keep retrying if we are able to successfully eliminate an argument. This
+ // helps with cases with multiple arguments which may alias, such as in a
+    // sincos implementation. If we have 2 stores to arguments, on the first
+ // attempt the MDA query will succeed for the second store but not the
+    // first. On the second iteration we've removed that clobbering out argument
+ // (by effectively moving it into another function) and will find the second
+ // argument is OK to move.
+ for (Argument *OutArg : OutArgs) {
+ bool ThisReplaceable = true;
+ SmallVector<std::pair<ReturnInst *, StoreInst *>, 4> ReplaceableStores;
+
+ Type *ArgTy = OutArg->getType()->getPointerElementType();
+
+ // Skip this argument if converting it will push us over the register
+ // count to return limit.
+
+ // TODO: This is an approximation. When legalized this could be more. We
+ // can ask TLI for exactly how many.
+ unsigned ArgNumRegs = DL->getTypeStoreSize(ArgTy) / 4;
+ if (ArgNumRegs + ReturnNumRegs > MaxNumRetRegs)
+ continue;
+
+ // An argument is convertible only if all exit blocks are able to replace
+ // it.
+ for (ReturnInst *RI : Returns) {
+ BasicBlock *BB = RI->getParent();
+
+ MemDepResult Q = MDA->getPointerDependencyFrom(MemoryLocation(OutArg),
+ true, BB->end(), BB, RI);
+ StoreInst *SI = nullptr;
+ if (Q.isDef())
+ SI = dyn_cast<StoreInst>(Q.getInst());
+
+ if (SI) {
+ DEBUG(dbgs() << "Found out argument store: " << *SI << '\n');
+ ReplaceableStores.emplace_back(RI, SI);
+ } else {
+ ThisReplaceable = false;
+ break;
+ }
+ }
+
+ if (!ThisReplaceable)
+ continue; // Try the next argument candidate.
+
+ for (std::pair<ReturnInst *, StoreInst *> Store : ReplaceableStores) {
+ Value *ReplVal = Store.second->getValueOperand();
+
+ auto &ValVec = Replacements[Store.first];
+ if (llvm::find_if(ValVec,
+ [OutArg](const std::pair<Argument *, Value *> &Entry) {
+ return Entry.first == OutArg;}) != ValVec.end()) {
+ DEBUG(dbgs() << "Saw multiple out arg stores" << *OutArg << '\n');
+ // It is possible to see stores to the same argument multiple times,
+ // but we expect these would have been optimized out already.
+ ThisReplaceable = false;
+ break;
+ }
+
+ ValVec.emplace_back(OutArg, ReplVal);
+ Store.second->eraseFromParent();
+ }
+
+ if (ThisReplaceable) {
+ ReturnTypes.push_back(ArgTy);
+ OutArgIndexes.insert(OutArg->getArgNo());
+ ++NumOutArgumentsReplaced;
+ Changing = true;
+ }
+ }
+ } while (Changing);
+
+ if (Replacements.empty())
+ return false;
+
+ LLVMContext &Ctx = F.getParent()->getContext();
+ StructType *NewRetTy = StructType::create(Ctx, ReturnTypes, F.getName());
+
+ FunctionType *NewFuncTy = FunctionType::get(NewRetTy,
+ F.getFunctionType()->params(),
+ F.isVarArg());
+
+ DEBUG(dbgs() << "Computed new return type: " << *NewRetTy << '\n');
+
+ Function *NewFunc = Function::Create(NewFuncTy, Function::PrivateLinkage,
+ F.getName() + ".body");
+ F.getParent()->getFunctionList().insert(F.getIterator(), NewFunc);
+ NewFunc->copyAttributesFrom(&F);
+ NewFunc->setComdat(F.getComdat());
+
+ // We want to preserve the function and param attributes, but need to strip
+ // off any return attributes, e.g. zeroext doesn't make sense with a struct.
+ NewFunc->stealArgumentListFrom(F);
+
+ AttrBuilder RetAttrs;
+ RetAttrs.addAttribute(Attribute::SExt);
+ RetAttrs.addAttribute(Attribute::ZExt);
+ RetAttrs.addAttribute(Attribute::NoAlias);
+ NewFunc->removeAttributes(AttributeList::ReturnIndex, RetAttrs);
+ // TODO: How to preserve metadata?
+
+ // Move the body of the function into the new rewritten function, and replace
+ // this function with a stub.
+ NewFunc->getBasicBlockList().splice(NewFunc->begin(), F.getBasicBlockList());
+
+ for (std::pair<ReturnInst *, ReplacementVec> &Replacement : Replacements) {
+ ReturnInst *RI = Replacement.first;
+ IRBuilder<> B(RI);
+ B.SetCurrentDebugLocation(RI->getDebugLoc());
+
+ int RetIdx = 0;
+ Value *NewRetVal = UndefValue::get(NewRetTy);
+
+ Value *RetVal = RI->getReturnValue();
+ if (RetVal)
+ NewRetVal = B.CreateInsertValue(NewRetVal, RetVal, RetIdx++);
+
+ for (std::pair<Argument *, Value *> ReturnPoint : Replacement.second) {
+ Argument *Arg = ReturnPoint.first;
+ Value *Val = ReturnPoint.second;
+ Type *EltTy = Arg->getType()->getPointerElementType();
+ if (Val->getType() != EltTy) {
+ Type *EffectiveEltTy = EltTy;
+ if (StructType *CT = dyn_cast<StructType>(EltTy)) {
+ assert(CT->getNumContainedTypes() == 1);
+ EffectiveEltTy = CT->getContainedType(0);
+ }
+
+ if (DL->getTypeSizeInBits(EffectiveEltTy) !=
+ DL->getTypeSizeInBits(Val->getType())) {
+ assert(isVec3ToVec4Shuffle(EffectiveEltTy, Val->getType()));
+ Val = B.CreateShuffleVector(Val, UndefValue::get(Val->getType()),
+ { 0, 1, 2 });
+ }
+
+ Val = B.CreateBitCast(Val, EffectiveEltTy);
+
+ // Re-create single element composite.
+ if (EltTy != EffectiveEltTy)
+ Val = B.CreateInsertValue(UndefValue::get(EltTy), Val, 0);
+ }
+
+ NewRetVal = B.CreateInsertValue(NewRetVal, Val, RetIdx++);
+ }
+
+ if (RetVal)
+ RI->setOperand(0, NewRetVal);
+ else {
+ B.CreateRet(NewRetVal);
+ RI->eraseFromParent();
+ }
+ }
+
+ SmallVector<Value *, 16> StubCallArgs;
+ for (Argument &Arg : F.args()) {
+ if (OutArgIndexes.count(Arg.getArgNo())) {
+ // It's easier to preserve the type of the argument list. We rely on
+ // DeadArgumentElimination to take care of these.
+ StubCallArgs.push_back(UndefValue::get(Arg.getType()));
+ } else {
+ StubCallArgs.push_back(&Arg);
+ }
+ }
+
+ BasicBlock *StubBB = BasicBlock::Create(Ctx, "", &F);
+ IRBuilder<> B(StubBB);
+ CallInst *StubCall = B.CreateCall(NewFunc, StubCallArgs);
+
+ int RetIdx = RetTy->isVoidTy() ? 0 : 1;
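+  // Slot 0 of the struct return holds the original return value when the
+  // function was non-void; subsequent slots hold the values that used to be
+  // stored through the converted out arguments.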
+ for (Argument &Arg : F.args()) {
+ if (!OutArgIndexes.count(Arg.getArgNo()))
+ continue;
+
+ PointerType *ArgType = cast<PointerType>(Arg.getType());
+
+ auto *EltTy = ArgType->getElementType();
+ unsigned Align = Arg.getParamAlignment();
+ if (Align == 0)
+ Align = DL->getABITypeAlignment(EltTy);
+
+ Value *Val = B.CreateExtractValue(StubCall, RetIdx++);
+ Type *PtrTy = Val->getType()->getPointerTo(ArgType->getAddressSpace());
+
+ // We can peek through bitcasts, so the type may not match.
+ Value *PtrVal = B.CreateBitCast(&Arg, PtrTy);
+
+ B.CreateAlignedStore(Val, PtrVal, Align);
+ }
+
+ if (!RetTy->isVoidTy()) {
+ B.CreateRet(B.CreateExtractValue(StubCall, 0));
+ } else {
+ B.CreateRetVoid();
+ }
+
+ // The function is now a stub we want to inline.
+ F.addFnAttr(Attribute::AlwaysInline);
+
+ ++NumOutArgumentFunctionsReplaced;
+ return true;
+}
+
+FunctionPass *llvm::createAMDGPURewriteOutArgumentsPass() {
+ return new AMDGPURewriteOutArguments();
+}
diff --git a/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
index 779617629010..80feaa44766f 100644
--- a/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
+++ b/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -15,17 +15,15 @@
#include "AMDGPUSubtarget.h"
#include "AMDGPU.h"
#include "AMDGPUTargetMachine.h"
-#ifdef LLVM_BUILD_GLOBAL_ISEL
#include "AMDGPUCallLowering.h"
#include "AMDGPUInstructionSelector.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPURegisterBankInfo.h"
-#endif
#include "SIMachineFunctionInfo.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/IR/MDBuilder.h"
-#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
#include <algorithm>
using namespace llvm;
@@ -50,14 +48,27 @@ AMDGPUSubtarget::initializeSubtargetDependencies(const Triple &TT,
// for SI has the unhelpful behavior that it unsets everything else if you
// disable it.
- SmallString<256> FullFS("+promote-alloca,+fp64-fp16-denormals,+dx10-clamp,+load-store-opt,");
+ SmallString<256> FullFS("+promote-alloca,+dx10-clamp,+load-store-opt,");
+
if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA.
- FullFS += "+flat-for-global,+unaligned-buffer-access,+trap-handler,";
+ FullFS += "+flat-address-space,+flat-for-global,+unaligned-buffer-access,+trap-handler,";
+
+  // FIXME: I don't think Evergreen has any useful support for denormals, but
+  // this should be checked. Should we issue a warning somewhere if someone
+  // tries to enable these?
+ if (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {
+ FullFS += "+fp64-fp16-denormals,";
+ } else {
+ FullFS += "-fp32-denormals,";
+ }
FullFS += FS;
ParseSubtargetFeatures(GPU, FullFS);
+ // We don't support FP64 for EG/NI atm.
+ assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS));
+
// Unless +-flat-for-global is specified, turn on FlatForGlobal for all OS-es
// on VI and newer hardware to avoid assertion failures due to missing ADDR64
// variants of MUBUF instructions.
@@ -65,45 +76,24 @@ AMDGPUSubtarget::initializeSubtargetDependencies(const Triple &TT,
FlatForGlobal = true;
}
- // FIXME: I don't think think Evergreen has any useful support for
- // denormals, but should be checked. Should we issue a warning somewhere
- // if someone tries to enable these?
- if (getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
- FP64FP16Denormals = false;
- FP32Denormals = false;
- }
-
// Set defaults if needed.
if (MaxPrivateElementSize == 0)
MaxPrivateElementSize = 4;
- return *this;
-}
+ if (LDSBankCount == 0)
+ LDSBankCount = 32;
-#ifdef LLVM_BUILD_GLOBAL_ISEL
-namespace {
+ if (TT.getArch() == Triple::amdgcn) {
+ if (LocalMemorySize == 0)
+ LocalMemorySize = 32768;
-struct SIGISelActualAccessor : public GISelAccessor {
- std::unique_ptr<AMDGPUCallLowering> CallLoweringInfo;
- std::unique_ptr<InstructionSelector> InstSelector;
- std::unique_ptr<LegalizerInfo> Legalizer;
- std::unique_ptr<RegisterBankInfo> RegBankInfo;
- const AMDGPUCallLowering *getCallLowering() const override {
- return CallLoweringInfo.get();
+ // Do something sensible for unspecified target.
+ if (!HasMovrel && !HasVGPRIndexMode)
+ HasMovrel = true;
}
- const InstructionSelector *getInstructionSelector() const override {
- return InstSelector.get();
- }
- const LegalizerInfo *getLegalizerInfo() const override {
- return Legalizer.get();
- }
- const RegisterBankInfo *getRegBankInfo() const override {
- return RegBankInfo.get();
- }
-};
-} // end anonymous namespace
-#endif
+ return *this;
+}
AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
const TargetMachine &TM)
@@ -111,7 +101,7 @@ AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
TargetTriple(TT),
Gen(TT.getArch() == Triple::amdgcn ? SOUTHERN_ISLANDS : R600),
IsaVersion(ISAVersion0_0_0),
- WavefrontSize(64),
+ WavefrontSize(0),
LocalMemorySize(0),
LDSBankCount(0),
MaxPrivateElementSize(0),
@@ -125,6 +115,7 @@ AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
DX10Clamp(false),
FlatForGlobal(false),
AutoWaitcntBeforeBarrier(false),
+ CodeObjectV3(false),
UnalignedScratchAccess(false),
UnalignedBufferAccess(false),
@@ -135,6 +126,7 @@ AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
DebuggerReserveRegs(false),
DebuggerEmitPrologue(false),
+ EnableHugePrivateBuffer(false),
EnableVGPRSpilling(false),
EnablePromoteAlloca(false),
EnableLoadStoreOpt(false),
@@ -143,15 +135,17 @@ AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
DumpCode(false),
FP64(false),
+ FMA(false),
IsGCN(false),
- GCN1Encoding(false),
GCN3Encoding(false),
CIInsts(false),
GFX9Insts(false),
SGPRInitBug(false),
HasSMemRealTime(false),
Has16BitInsts(false),
+ HasIntClamp(false),
HasVOP3PInsts(false),
+ HasMadMixInsts(false),
HasMovrel(false),
HasVGPRIndexMode(false),
HasScalarStores(false),
@@ -167,6 +161,7 @@ AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
FlatInstOffsets(false),
FlatGlobalInsts(false),
FlatScratchInsts(false),
+ AddNoCarryInsts(false),
R600ALUInst(false),
CaymanISA(false),
@@ -203,14 +198,31 @@ unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
return NumWaves;
}
+std::pair<unsigned, unsigned>
+AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
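+  // For example, with the usual 64-wide wavefront this yields (128, 256) for
+  // compute kernels, (1, 64) for graphics shaders, and (1, 1024) otherwise.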
+ switch (CC) {
+ case CallingConv::AMDGPU_CS:
+ case CallingConv::AMDGPU_KERNEL:
+ case CallingConv::SPIR_KERNEL:
+ return std::make_pair(getWavefrontSize() * 2, getWavefrontSize() * 4);
+ case CallingConv::AMDGPU_VS:
+ case CallingConv::AMDGPU_LS:
+ case CallingConv::AMDGPU_HS:
+ case CallingConv::AMDGPU_ES:
+ case CallingConv::AMDGPU_GS:
+ case CallingConv::AMDGPU_PS:
+ return std::make_pair(1, getWavefrontSize());
+ default:
+ return std::make_pair(1, 16 * getWavefrontSize());
+ }
+}
+
std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
const Function &F) const {
+ // FIXME: 1024 if function.
// Default minimum/maximum flat work group sizes.
std::pair<unsigned, unsigned> Default =
- AMDGPU::isCompute(F.getCallingConv()) ?
- std::pair<unsigned, unsigned>(getWavefrontSize() * 2,
- getWavefrontSize() * 4) :
- std::pair<unsigned, unsigned>(1, getWavefrontSize());
+ getDefaultFlatWorkGroupSize(F.getCallingConv());
// TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa
// starts using "amdgpu-flat-work-group-size" attribute.
@@ -357,18 +369,12 @@ SISubtarget::SISubtarget(const Triple &TT, StringRef GPU, StringRef FS,
: AMDGPUSubtarget(TT, GPU, FS, TM), InstrInfo(*this),
FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
TLInfo(TM, *this) {
-#ifndef LLVM_BUILD_GLOBAL_ISEL
- GISelAccessor *GISel = new GISelAccessor();
-#else
- SIGISelActualAccessor *GISel = new SIGISelActualAccessor();
- GISel->CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
- GISel->Legalizer.reset(new AMDGPULegalizerInfo());
-
- GISel->RegBankInfo.reset(new AMDGPURegisterBankInfo(*getRegisterInfo()));
- GISel->InstSelector.reset(new AMDGPUInstructionSelector(
- *this, *static_cast<AMDGPURegisterBankInfo *>(GISel->RegBankInfo.get())));
-#endif
- setGISelAccessor(*GISel);
+ CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
+ Legalizer.reset(new AMDGPULegalizerInfo());
+
+ RegBankInfo.reset(new AMDGPURegisterBankInfo(*getRegisterInfo()));
+ InstSelector.reset(new AMDGPUInstructionSelector(
+ *this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get())));
}
void SISubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
@@ -462,7 +468,7 @@ unsigned SISubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
}
unsigned SISubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
- const Function &F = *MF.getFunction();
+ const Function &F = MF.getFunction();
const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
// Compute maximum number of SGPRs function can use using default/requested
@@ -512,7 +518,7 @@ unsigned SISubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
}
unsigned SISubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
- const Function &F = *MF.getFunction();
+ const Function &F = MF.getFunction();
const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
// Compute maximum number of VGPRs function can use using default/requested
@@ -544,3 +550,59 @@ unsigned SISubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
return MaxNumVGPRs - getReservedNumVGPRs(MF);
}
+
+namespace {
+struct MemOpClusterMutation : ScheduleDAGMutation {
+ const SIInstrInfo *TII;
+
+ MemOpClusterMutation(const SIInstrInfo *tii) : TII(tii) {}
+
+ void apply(ScheduleDAGInstrs *DAGInstrs) override {
+ ScheduleDAGMI *DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);
+
+ SUnit *SUa = nullptr;
+    // Search for two consecutive memory operations and link them to prevent
+    // the scheduler from moving them apart. In DAG pre-processing, SUnits are
+    // in the original order of the instructions before scheduling.
+ for (SUnit &SU : DAG->SUnits) {
+ MachineInstr &MI2 = *SU.getInstr();
+ if (!MI2.mayLoad() && !MI2.mayStore()) {
+ SUa = nullptr;
+ continue;
+ }
+ if (!SUa) {
+ SUa = &SU;
+ continue;
+ }
+
+ MachineInstr &MI1 = *SUa->getInstr();
+ if ((TII->isVMEM(MI1) && TII->isVMEM(MI2)) ||
+ (TII->isFLAT(MI1) && TII->isFLAT(MI2)) ||
+ (TII->isSMRD(MI1) && TII->isSMRD(MI2)) ||
+ (TII->isDS(MI1) && TII->isDS(MI2))) {
+ SU.addPredBarrier(SUa);
+
+ for (const SDep &SI : SU.Preds) {
+ if (SI.getSUnit() != SUa)
+ SUa->addPred(SDep(SI.getSUnit(), SDep::Artificial));
+ }
+
+ if (&SU != &DAG->ExitSU) {
+ for (const SDep &SI : SUa->Succs) {
+ if (SI.getSUnit() != &SU)
+ SI.getSUnit()->addPred(SDep(&SU, SDep::Artificial));
+ }
+ }
+ }
+
+ SUa = &SU;
+ }
+ }
+};
+} // namespace
+
+void SISubtarget::getPostRAMutations(
+ std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
+ Mutations.push_back(llvm::make_unique<MemOpClusterMutation>(&InstrInfo));
+}
diff --git a/lib/Target/AMDGPU/AMDGPUSubtarget.h b/lib/Target/AMDGPU/AMDGPUSubtarget.h
index d4b6a5fe8020..cf4a691d4b58 100644
--- a/lib/Target/AMDGPU/AMDGPUSubtarget.h
+++ b/lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -16,6 +16,7 @@
#define LLVM_LIB_TARGET_AMDGPU_AMDGPUSUBTARGET_H
#include "AMDGPU.h"
+#include "AMDGPUCallLowering.h"
#include "R600FrameLowering.h"
#include "R600ISelLowering.h"
#include "R600InstrInfo.h"
@@ -25,7 +26,9 @@
#include "SIMachineFunctionInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/Triple.h"
-#include "llvm/CodeGen/GlobalISel/GISelAccessor.h"
+#include "llvm/CodeGen/GlobalISel/InstructionSelector.h"
+#include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
+#include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/SelectionDAGTargetInfo.h"
#include "llvm/MC/MCInstrItineraries.h"
@@ -63,16 +66,14 @@ public:
ISAVersion7_0_1,
ISAVersion7_0_2,
ISAVersion7_0_3,
+ ISAVersion7_0_4,
ISAVersion8_0_0,
ISAVersion8_0_1,
ISAVersion8_0_2,
ISAVersion8_0_3,
- ISAVersion8_0_4,
ISAVersion8_1_0,
ISAVersion9_0_0,
- ISAVersion9_0_1,
- ISAVersion9_0_2,
- ISAVersion9_0_3
+ ISAVersion9_0_2
};
enum TrapHandlerAbi {
@@ -116,6 +117,7 @@ protected:
bool DX10Clamp;
bool FlatForGlobal;
bool AutoWaitcntBeforeBarrier;
+ bool CodeObjectV3;
bool UnalignedScratchAccess;
bool UnalignedBufferAccess;
bool HasApertureRegs;
@@ -126,6 +128,7 @@ protected:
bool DebuggerEmitPrologue;
// Used as options.
+ bool EnableHugePrivateBuffer;
bool EnableVGPRSpilling;
bool EnablePromoteAlloca;
bool EnableLoadStoreOpt;
@@ -135,15 +138,17 @@ protected:
// Subtarget statically properties set by tablegen
bool FP64;
+ bool FMA;
bool IsGCN;
- bool GCN1Encoding;
bool GCN3Encoding;
bool CIInsts;
bool GFX9Insts;
bool SGPRInitBug;
bool HasSMemRealTime;
bool Has16BitInsts;
+ bool HasIntClamp;
bool HasVOP3PInsts;
+ bool HasMadMixInsts;
bool HasMovrel;
bool HasVGPRIndexMode;
bool HasScalarStores;
@@ -159,6 +164,7 @@ protected:
bool FlatInstOffsets;
bool FlatGlobalInsts;
bool FlatScratchInsts;
+ bool AddNoCarryInsts;
bool R600ALUInst;
bool CaymanISA;
bool CFALUBug;
@@ -210,6 +216,10 @@ public:
TargetTriple.getEnvironmentName() == "amdgizcl";
}
+ bool isAmdPalOS() const {
+ return TargetTriple.getOS() == Triple::AMDPAL;
+ }
+
Generation getGeneration() const {
return Gen;
}
@@ -218,6 +228,10 @@ public:
return WavefrontSize;
}
+ unsigned getWavefrontSizeLog2() const {
+ return Log2_32(WavefrontSize);
+ }
+
int getLocalMemorySize() const {
return LocalMemorySize;
}
@@ -238,11 +252,15 @@ public:
return Has16BitInsts;
}
+ bool hasIntClamp() const {
+ return HasIntClamp;
+ }
+
bool hasVOP3PInsts() const {
return HasVOP3PInsts;
}
- bool hasHWFP64() const {
+ bool hasFP64() const {
return FP64;
}
@@ -305,6 +323,18 @@ public:
return getGeneration() >= GFX9;
}
+ bool hasMadMixInsts() const {
+ return HasMadMixInsts;
+ }
+
+ bool hasSBufferLoadStoreAtomicDwordxN() const {
+ // Only use the "x1" variants on GFX9 or don't use the buffer variants.
+ // For x2 and higher variants, if the accessed region spans 2 VM pages and
+ // the second page is unmapped, the hw hangs.
+ // TODO: There is one future GFX9 chip that doesn't have this bug.
+ return getGeneration() != GFX9;
+ }
+
bool hasCARRY() const {
return (getGeneration() >= EVERGREEN);
}
@@ -317,10 +347,18 @@ public:
return CaymanISA;
}
+ bool hasFMA() const {
+ return FMA;
+ }
+
TrapHandlerAbi getTrapHandlerAbi() const {
return isAmdHsaOS() ? TrapHandlerAbiHsa : TrapHandlerAbiNone;
}
+ bool enableHugePrivateBuffer() const {
+ return EnableHugePrivateBuffer;
+ }
+
bool isPromoteAllocaEnabled() const {
return EnablePromoteAlloca;
}
@@ -344,7 +382,7 @@ public:
unsigned getOccupancyWithLocalMemSize(const MachineFunction &MF) const {
const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
- return getOccupancyWithLocalMemSize(MFI->getLDSSize(), *MF.getFunction());
+ return getOccupancyWithLocalMemSize(MFI->getLDSSize(), MF.getFunction());
}
bool hasFP16Denormals() const {
@@ -372,17 +410,27 @@ public:
}
bool enableIEEEBit(const MachineFunction &MF) const {
- return AMDGPU::isCompute(MF.getFunction()->getCallingConv());
+ return AMDGPU::isCompute(MF.getFunction().getCallingConv());
}
bool useFlatForGlobal() const {
return FlatForGlobal;
}
+ /// \returns If MUBUF instructions always perform range checking, even for
+ /// buffer resources used for private memory access.
+ bool privateMemoryResourceIsRangeChecked() const {
+ return getGeneration() < AMDGPUSubtarget::GFX9;
+ }
+
bool hasAutoWaitcntBeforeBarrier() const {
return AutoWaitcntBeforeBarrier;
}
+ bool hasCodeObjectV3() const {
+ return CodeObjectV3;
+ }
+
bool hasUnalignedBufferAccess() const {
return UnalignedBufferAccess;
}
@@ -419,19 +467,37 @@ public:
return FlatScratchInsts;
}
+ bool hasD16LoadStore() const {
+ return getGeneration() >= GFX9;
+ }
+
+  /// Return true if most LDS instructions have an m0 use that requires m0 to
+  /// be initialized.
+ bool ldsRequiresM0Init() const {
+ return getGeneration() < GFX9;
+ }
+
+ bool hasAddNoCarry() const {
+ return AddNoCarryInsts;
+ }
+
bool isMesaKernel(const MachineFunction &MF) const {
- return isMesa3DOS() && !AMDGPU::isShader(MF.getFunction()->getCallingConv());
+ return isMesa3DOS() && !AMDGPU::isShader(MF.getFunction().getCallingConv());
}
// Covers VS/PS/CS graphics shaders
bool isMesaGfxShader(const MachineFunction &MF) const {
- return isMesa3DOS() && AMDGPU::isShader(MF.getFunction()->getCallingConv());
+ return isMesa3DOS() && AMDGPU::isShader(MF.getFunction().getCallingConv());
}
bool isAmdCodeObjectV2(const MachineFunction &MF) const {
return isAmdHsaOS() || isMesaKernel(MF);
}
+ bool hasMad64_32() const {
+ return getGeneration() >= SEA_ISLANDS;
+ }
+
bool hasFminFmaxLegacy() const {
return getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;
}
@@ -558,6 +624,9 @@ public:
FlatWorkGroupSize);
}
+  /// \returns Default range of flat work group sizes for a calling convention.
+ std::pair<unsigned, unsigned> getDefaultFlatWorkGroupSize(CallingConv::ID CC) const;
+
/// \returns Subtarget's default pair of minimum/maximum flat work group sizes
/// for function \p F, or minimum/maximum flat work group sizes explicitly
/// requested using "amdgpu-flat-work-group-size" attribute attached to
@@ -626,7 +695,12 @@ private:
SIInstrInfo InstrInfo;
SIFrameLowering FrameLowering;
SITargetLowering TLInfo;
- std::unique_ptr<GISelAccessor> GISel;
+
+ /// GlobalISel related APIs.
+ std::unique_ptr<AMDGPUCallLowering> CallLoweringInfo;
+ std::unique_ptr<InstructionSelector> InstSelector;
+ std::unique_ptr<LegalizerInfo> Legalizer;
+ std::unique_ptr<RegisterBankInfo> RegBankInfo;
public:
SISubtarget(const Triple &TT, StringRef CPU, StringRef FS,
@@ -645,33 +719,25 @@ public:
}
const CallLowering *getCallLowering() const override {
- assert(GISel && "Access to GlobalISel APIs not set");
- return GISel->getCallLowering();
+ return CallLoweringInfo.get();
}
const InstructionSelector *getInstructionSelector() const override {
- assert(GISel && "Access to GlobalISel APIs not set");
- return GISel->getInstructionSelector();
+ return InstSelector.get();
}
const LegalizerInfo *getLegalizerInfo() const override {
- assert(GISel && "Access to GlobalISel APIs not set");
- return GISel->getLegalizerInfo();
+ return Legalizer.get();
}
const RegisterBankInfo *getRegBankInfo() const override {
- assert(GISel && "Access to GlobalISel APIs not set");
- return GISel->getRegBankInfo();
+ return RegBankInfo.get();
}
const SIRegisterInfo *getRegisterInfo() const override {
return &InstrInfo.getRegisterInfo();
}
- void setGISelAccessor(GISelAccessor &GISel) {
- this->GISel.reset(&GISel);
- }
-
// XXX - Why is this here if it isn't in the default pass set?
bool enableEarlyIfConversion() const override {
return true;
@@ -755,11 +821,16 @@ public:
return getGeneration() >= AMDGPUSubtarget::GFX9;
}
- bool hasReadM0Hazard() const {
+ bool hasReadM0MovRelInterpHazard() const {
return getGeneration() >= AMDGPUSubtarget::GFX9;
}
- unsigned getKernArgSegmentSize(const MachineFunction &MF, unsigned ExplictArgBytes) const;
+ bool hasReadM0SendMsgHazard() const {
+ return getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS;
+ }
+
+ unsigned getKernArgSegmentSize(const MachineFunction &MF,
+ unsigned ExplictArgBytes) const;
/// Return the maximum number of waves per SIMD for kernels using \p SGPRs SGPRs
unsigned getOccupancyWithNumSGPRs(unsigned SGPRs) const;
@@ -865,6 +936,10 @@ public:
/// subtarget's specifications, or does not meet number of waves per execution
/// unit requirement.
unsigned getMaxNumVGPRs(const MachineFunction &MF) const;
+
+ void getPostRAMutations(
+ std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations)
+ const override;
};
} // end namespace llvm
diff --git a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index dc868f010d85..6984f4e71613 100644
--- a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -31,6 +31,7 @@
#include "llvm/CodeGen/GlobalISel/Legalizer.h"
#include "llvm/CodeGen/GlobalISel/RegBankSelect.h"
#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetLoweringObjectFile.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/Function.h"
@@ -39,7 +40,6 @@
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/TargetRegistry.h"
-#include "llvm/Target/TargetLoweringObjectFile.h"
#include "llvm/Transforms/IPO.h"
#include "llvm/Transforms/IPO/AlwaysInliner.h"
#include "llvm/Transforms/IPO/PassManagerBuilder.h"
@@ -117,10 +117,23 @@ static cl::opt<bool> EnableSIInsertWaitcntsPass(
cl::init(true));
// Option to run late CFG structurizer
-static cl::opt<bool> LateCFGStructurize(
+static cl::opt<bool, true> LateCFGStructurize(
"amdgpu-late-structurize",
cl::desc("Enable late CFG structurization"),
- cl::init(false),
+ cl::location(AMDGPUTargetMachine::EnableLateStructurizeCFG),
+ cl::Hidden);
+
+static cl::opt<bool> EnableAMDGPUFunctionCalls(
+ "amdgpu-function-calls",
+ cl::Hidden,
+ cl::desc("Enable AMDGPU function call support"),
+ cl::init(false));
+
+// Enable lib calls simplifications
+static cl::opt<bool> EnableLibCallSimplify(
+ "amdgpu-simplify-libcall",
+ cl::desc("Enable mdgpu library simplifications"),
+ cl::init(true),
cl::Hidden);
extern "C" void LLVMInitializeAMDGPUTarget() {
@@ -129,20 +142,29 @@ extern "C" void LLVMInitializeAMDGPUTarget() {
RegisterTargetMachine<GCNTargetMachine> Y(getTheGCNTarget());
PassRegistry *PR = PassRegistry::getPassRegistry();
+ initializeR600ClauseMergePassPass(*PR);
+ initializeR600ControlFlowFinalizerPass(*PR);
+ initializeR600PacketizerPass(*PR);
+ initializeR600ExpandSpecialInstrsPassPass(*PR);
+ initializeR600VectorRegMergerPass(*PR);
+ initializeAMDGPUDAGToDAGISelPass(*PR);
initializeSILowerI1CopiesPass(*PR);
initializeSIFixSGPRCopiesPass(*PR);
initializeSIFixVGPRCopiesPass(*PR);
initializeSIFoldOperandsPass(*PR);
initializeSIPeepholeSDWAPass(*PR);
initializeSIShrinkInstructionsPass(*PR);
- initializeSIFixControlFlowLiveIntervalsPass(*PR);
+ initializeSIOptimizeExecMaskingPreRAPass(*PR);
initializeSILoadStoreOptimizerPass(*PR);
initializeAMDGPUAlwaysInlinePass(*PR);
initializeAMDGPUAnnotateKernelFeaturesPass(*PR);
initializeAMDGPUAnnotateUniformValuesPass(*PR);
+ initializeAMDGPUArgumentUsageInfoPass(*PR);
initializeAMDGPULowerIntrinsicsPass(*PR);
+ initializeAMDGPUOpenCLEnqueuedBlockLoweringPass(*PR);
initializeAMDGPUPromoteAllocaPass(*PR);
initializeAMDGPUCodeGenPreparePass(*PR);
+ initializeAMDGPURewriteOutArgumentsPass(*PR);
initializeAMDGPUUnifyMetadataPass(*PR);
initializeSIAnnotateControlFlowPass(*PR);
initializeSIInsertWaitsPass(*PR);
@@ -150,10 +172,15 @@ extern "C" void LLVMInitializeAMDGPUTarget() {
initializeSIWholeQuadModePass(*PR);
initializeSILowerControlFlowPass(*PR);
initializeSIInsertSkipsPass(*PR);
+ initializeSIMemoryLegalizerPass(*PR);
initializeSIDebuggerInsertNopsPass(*PR);
initializeSIOptimizeExecMaskingPass(*PR);
+ initializeSIFixWWMLivenessPass(*PR);
initializeAMDGPUUnifyDivergentExitNodesPass(*PR);
initializeAMDGPUAAWrapperPassPass(*PR);
+ initializeAMDGPUUseNativeCallsPass(*PR);
+ initializeAMDGPUSimplifyLibCallsPass(*PR);
+ initializeAMDGPUInlinerPass(*PR);
}
static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
@@ -192,6 +219,16 @@ static ScheduleDAGInstrs *createMinRegScheduler(MachineSchedContext *C) {
GCNIterativeScheduler::SCHEDULE_MINREGFORCED);
}
+static ScheduleDAGInstrs *
+createIterativeILPMachineScheduler(MachineSchedContext *C) {
+ auto DAG = new GCNIterativeScheduler(C,
+ GCNIterativeScheduler::SCHEDULE_ILP);
+ DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
+ DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
+ DAG->addMutation(createAMDGPUMacroFusionDAGMutation());
+ return DAG;
+}
+
static MachineSchedRegistry
R600SchedRegistry("r600", "Run R600's custom scheduler",
createR600MachineScheduler);
@@ -215,9 +252,18 @@ GCNMinRegSchedRegistry("gcn-minreg",
"Run GCN iterative scheduler for minimal register usage (experimental)",
createMinRegScheduler);
+static MachineSchedRegistry
+GCNILPSchedRegistry("gcn-ilp",
+ "Run GCN iterative scheduler for ILP scheduling (experimental)",
+ createIterativeILPMachineScheduler);
+
static StringRef computeDataLayout(const Triple &TT) {
if (TT.getArch() == Triple::r600) {
// 32-bit pointers.
+ if (TT.getEnvironmentName() == "amdgiz" ||
+ TT.getEnvironmentName() == "amdgizcl")
+ return "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128"
+ "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-A5";
return "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128"
"-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64";
}
@@ -239,9 +285,8 @@ static StringRef getGPUOrDefault(const Triple &TT, StringRef GPU) {
if (!GPU.empty())
return GPU;
- // HSA only supports CI+, so change the default GPU to a CI for HSA.
if (TT.getArch() == Triple::amdgcn)
- return (TT.getOS() == Triple::AMDHSA) ? "kaveri" : "tahiti";
+ return "generic";
return "r600";
}
@@ -252,21 +297,30 @@ static Reloc::Model getEffectiveRelocModel(Optional<Reloc::Model> RM) {
return Reloc::PIC_;
}
+static CodeModel::Model getEffectiveCodeModel(Optional<CodeModel::Model> CM) {
+ if (CM)
+ return *CM;
+ return CodeModel::Small;
+}
+
AMDGPUTargetMachine::AMDGPUTargetMachine(const Target &T, const Triple &TT,
StringRef CPU, StringRef FS,
TargetOptions Options,
Optional<Reloc::Model> RM,
- CodeModel::Model CM,
+ Optional<CodeModel::Model> CM,
CodeGenOpt::Level OptLevel)
- : LLVMTargetMachine(T, computeDataLayout(TT), TT, getGPUOrDefault(TT, CPU),
- FS, Options, getEffectiveRelocModel(RM), CM, OptLevel),
- TLOF(createTLOF(getTargetTriple())) {
+ : LLVMTargetMachine(T, computeDataLayout(TT), TT, getGPUOrDefault(TT, CPU),
+ FS, Options, getEffectiveRelocModel(RM),
+ getEffectiveCodeModel(CM), OptLevel),
+ TLOF(createTLOF(getTargetTriple())) {
AS = AMDGPU::getAMDGPUAS(TT);
initAsmInfo();
}
AMDGPUTargetMachine::~AMDGPUTargetMachine() = default;
+bool AMDGPUTargetMachine::EnableLateStructurizeCFG = false;
+
StringRef AMDGPUTargetMachine::getGPUName(const Function &F) const {
Attribute GPUAttr = F.getFnAttribute("target-cpu");
return GPUAttr.hasAttribute(Attribute::None) ?
@@ -288,15 +342,38 @@ static ImmutablePass *createAMDGPUExternalAAWrapperPass() {
});
}
+/// Predicate for Internalize pass.
+static bool mustPreserveGV(const GlobalValue &GV) {
+ if (const Function *F = dyn_cast<Function>(&GV))
+ return F->isDeclaration() || AMDGPU::isEntryFunctionCC(F->getCallingConv());
+
+ return !GV.use_empty();
+}
+
void AMDGPUTargetMachine::adjustPassManager(PassManagerBuilder &Builder) {
Builder.DivergentTarget = true;
- bool Internalize = InternalizeSymbols &&
- (getOptLevel() > CodeGenOpt::None) &&
- (getTargetTriple().getArch() == Triple::amdgcn);
- bool EarlyInline = EarlyInlineAll &&
- (getOptLevel() > CodeGenOpt::None);
- bool AMDGPUAA = EnableAMDGPUAliasAnalysis && getOptLevel() > CodeGenOpt::None;
+ bool EnableOpt = getOptLevel() > CodeGenOpt::None;
+ bool Internalize = InternalizeSymbols;
+ bool EarlyInline = EarlyInlineAll && EnableOpt && !EnableAMDGPUFunctionCalls;
+ bool AMDGPUAA = EnableAMDGPUAliasAnalysis && EnableOpt;
+ bool LibCallSimplify = EnableLibCallSimplify && EnableOpt;
+
+ if (EnableAMDGPUFunctionCalls) {
+ delete Builder.Inliner;
+ Builder.Inliner = createAMDGPUFunctionInliningPass();
+ }
+
+ if (Internalize) {
+ // If we're generating code, we always have the whole program available. The
+ // relocations expected for externally visible functions aren't supported,
+ // so make sure every non-entry function is hidden.
+ Builder.addExtension(
+ PassManagerBuilder::EP_EnabledOnOptLevel0,
+ [](const PassManagerBuilder &, legacy::PassManagerBase &PM) {
+ PM.add(createInternalizePass(mustPreserveGV));
+ });
+ }
Builder.addExtension(
PassManagerBuilder::EP_ModuleOptimizerEarly,
@@ -308,38 +385,25 @@ void AMDGPUTargetMachine::adjustPassManager(PassManagerBuilder &Builder) {
}
PM.add(createAMDGPUUnifyMetadataPass());
if (Internalize) {
- PM.add(createInternalizePass([=](const GlobalValue &GV) -> bool {
- if (const Function *F = dyn_cast<Function>(&GV)) {
- if (F->isDeclaration())
- return true;
- switch (F->getCallingConv()) {
- default:
- return false;
- case CallingConv::AMDGPU_VS:
- case CallingConv::AMDGPU_HS:
- case CallingConv::AMDGPU_GS:
- case CallingConv::AMDGPU_PS:
- case CallingConv::AMDGPU_CS:
- case CallingConv::AMDGPU_KERNEL:
- case CallingConv::SPIR_KERNEL:
- return true;
- }
- }
- return !GV.use_empty();
- }));
+ PM.add(createInternalizePass(mustPreserveGV));
PM.add(createGlobalDCEPass());
}
if (EarlyInline)
PM.add(createAMDGPUAlwaysInlinePass(false));
});
+ const auto &Opt = Options;
Builder.addExtension(
PassManagerBuilder::EP_EarlyAsPossible,
- [AMDGPUAA](const PassManagerBuilder &, legacy::PassManagerBase &PM) {
+ [AMDGPUAA, LibCallSimplify, &Opt](const PassManagerBuilder &,
+ legacy::PassManagerBase &PM) {
if (AMDGPUAA) {
PM.add(createAMDGPUAAWrapperPass());
PM.add(createAMDGPUExternalAAWrapperPass());
}
+ PM.add(llvm::createAMDGPUUseNativeCallsPass());
+ if (LibCallSimplify)
+ PM.add(llvm::createAMDGPUSimplifyLibCallsPass(Opt));
});
Builder.addExtension(
@@ -359,8 +423,9 @@ R600TargetMachine::R600TargetMachine(const Target &T, const Triple &TT,
StringRef CPU, StringRef FS,
TargetOptions Options,
Optional<Reloc::Model> RM,
- CodeModel::Model CM, CodeGenOpt::Level OL)
- : AMDGPUTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) {
+ Optional<CodeModel::Model> CM,
+ CodeGenOpt::Level OL, bool JIT)
+ : AMDGPUTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) {
setRequiresStructuredCFG(true);
}
@@ -392,8 +457,9 @@ GCNTargetMachine::GCNTargetMachine(const Target &T, const Triple &TT,
StringRef CPU, StringRef FS,
TargetOptions Options,
Optional<Reloc::Model> RM,
- CodeModel::Model CM, CodeGenOpt::Level OL)
- : AMDGPUTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) {}
+ Optional<CodeModel::Model> CM,
+ CodeGenOpt::Level OL, bool JIT)
+ : AMDGPUTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) {}
const SISubtarget *GCNTargetMachine::getSubtargetImpl(const Function &F) const {
StringRef GPU = getGPUName(F);
@@ -464,6 +530,7 @@ public:
}
bool addPreISel() override;
+ bool addInstSelector() override;
void addPreRegAlloc() override;
void addPreSched2() override;
void addPreEmitPass() override;
@@ -472,7 +539,12 @@ public:
class GCNPassConfig final : public AMDGPUPassConfig {
public:
GCNPassConfig(LLVMTargetMachine &TM, PassManagerBase &PM)
- : AMDGPUPassConfig(TM, PM) {}
+ : AMDGPUPassConfig(TM, PM) {
+ // It is necessary to know the register usage of the entire call graph. We
+ // allow calls without EnableAMDGPUFunctionCalls if they are marked
+ // noinline, so this is always required.
+ setRequiresCodeGenSCCOrder(true);
+ }
GCNTargetMachine &getGCNTargetMachine() const {
return getTM<GCNTargetMachine>();
@@ -485,12 +557,10 @@ public:
void addMachineSSAOptimization() override;
bool addILPOpts() override;
bool addInstSelector() override;
-#ifdef LLVM_BUILD_GLOBAL_ISEL
bool addIRTranslator() override;
bool addLegalizeMachineIR() override;
bool addRegBankSelect() override;
bool addGlobalInstructionSelect() override;
-#endif
void addFastRegAlloc(FunctionPass *RegAllocPass) override;
void addOptimizedRegAlloc(FunctionPass *RegAllocPass) override;
void addPreRegAlloc() override;
@@ -540,15 +610,18 @@ void AMDGPUPassConfig::addIRPasses() {
addPass(createAMDGPULowerIntrinsicsPass());
- // Function calls are not supported, so make sure we inline everything.
- addPass(createAMDGPUAlwaysInlinePass());
- addPass(createAlwaysInlinerLegacyPass());
- // We need to add the barrier noop pass, otherwise adding the function
- // inlining pass will cause all of the PassConfigs passes to be run
- // one function at a time, which means if we have a nodule with two
- // functions, then we will generate code for the first function
- // without ever running any passes on the second.
- addPass(createBarrierNoopPass());
+ if (TM.getTargetTriple().getArch() == Triple::r600 ||
+ !EnableAMDGPUFunctionCalls) {
+ // Function calls are not supported, so make sure we inline everything.
+ addPass(createAMDGPUAlwaysInlinePass());
+ addPass(createAlwaysInlinerLegacyPass());
+ // We need to add the barrier noop pass, otherwise adding the function
+ // inlining pass will cause all of the PassConfigs passes to be run
+      // one function at a time, which means if we have a module with two
+ // functions, then we will generate code for the first function
+ // without ever running any passes on the second.
+ addPass(createBarrierNoopPass());
+ }
if (TM.getTargetTriple().getArch() == Triple::amdgcn) {
// TODO: May want to move later or split into an early and late one.
@@ -559,6 +632,9 @@ void AMDGPUPassConfig::addIRPasses() {
// Handle uses of OpenCL image2d_t, image3d_t and sampler_t arguments.
addPass(createAMDGPUOpenCLImageTypeLoweringPass());
+ // Replace OpenCL enqueued block function pointers with global variables.
+ addPass(createAMDGPUOpenCLEnqueuedBlockLoweringPass());
+
if (TM.getOptLevel() > CodeGenOpt::None) {
addPass(createInferAddressSpacesPass());
addPass(createAMDGPUPromoteAlloca());
@@ -609,7 +685,7 @@ bool AMDGPUPassConfig::addPreISel() {
}
bool AMDGPUPassConfig::addInstSelector() {
- addPass(createAMDGPUISelDag(getAMDGPUTargetMachine(), getOptLevel()));
+ addPass(createAMDGPUISelDag(&getAMDGPUTargetMachine(), getOptLevel()));
return false;
}
@@ -630,6 +706,11 @@ bool R600PassConfig::addPreISel() {
return false;
}
+bool R600PassConfig::addInstSelector() {
+ addPass(createR600ISelDag(&getAMDGPUTargetMachine(), getOptLevel()));
+ return false;
+}
+
void R600PassConfig::addPreRegAlloc() {
addPass(createR600VectorRegMerger());
}
@@ -725,7 +806,6 @@ bool GCNPassConfig::addInstSelector() {
return false;
}
-#ifdef LLVM_BUILD_GLOBAL_ISEL
bool GCNPassConfig::addIRTranslator() {
addPass(new IRTranslator());
return false;
@@ -746,8 +826,6 @@ bool GCNPassConfig::addGlobalInstructionSelect() {
return false;
}
-#endif
-
void GCNPassConfig::addPreRegAlloc() {
if (LateCFGStructurize) {
addPass(createAMDGPUMachineCFGStructurizerPass());
@@ -764,19 +842,25 @@ void GCNPassConfig::addFastRegAlloc(FunctionPass *RegAllocPass) {
// SI_ELSE will introduce a copy of the tied operand source after the else.
insertPass(&PHIEliminationID, &SILowerControlFlowID, false);
+ // This must be run after SILowerControlFlow, since it needs to use the
+ // machine-level CFG, but before register allocation.
+ insertPass(&SILowerControlFlowID, &SIFixWWMLivenessID, false);
+
TargetPassConfig::addFastRegAlloc(RegAllocPass);
}
void GCNPassConfig::addOptimizedRegAlloc(FunctionPass *RegAllocPass) {
- // This needs to be run directly before register allocation because earlier
- // passes might recompute live intervals.
- insertPass(&MachineSchedulerID, &SIFixControlFlowLiveIntervalsID);
+ insertPass(&MachineSchedulerID, &SIOptimizeExecMaskingPreRAID);
// This must be run immediately after phi elimination and before
// TwoAddressInstructions, otherwise the processing of the tied operand of
// SI_ELSE will introduce a copy of the tied operand source after the else.
insertPass(&PHIEliminationID, &SILowerControlFlowID, false);
+ // This must be run after SILowerControlFlow, since it needs to use the
+ // machine-level CFG, but before register allocation.
+ insertPass(&SILowerControlFlowID, &SIFixWWMLivenessID, false);
+
TargetPassConfig::addOptimizedRegAlloc(RegAllocPass);
}
@@ -806,6 +890,7 @@ void GCNPassConfig::addPreEmitPass() {
addPass(createSIInsertWaitsPass());
addPass(createSIShrinkInstructionsPass());
addPass(&SIInsertSkipsPassID);
+ addPass(createSIMemoryLegalizerPass());
addPass(createSIDebuggerInsertNopsPass());
addPass(&BranchRelaxationPassID);
}
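
For reference, a minimal sketch of how a client would drive the updated constructors above, now that the code model is optional and a JIT flag is threaded through the registry; the exact createTargetMachine parameter order is assumed from the LLVM API of this period.

// Hedged sketch, not part of the patch: creating a GCN target machine via the
// registry with the new Optional<CodeModel::Model> argument and JIT flag.
#include "llvm/ADT/Optional.h"
#include "llvm/Support/CodeGen.h"
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"

static llvm::TargetMachine *createGCNMachine(const llvm::Target &T) {
  llvm::TargetOptions Options;
  return T.createTargetMachine("amdgcn-amd-amdhsa", "gfx900", "", Options,
                               llvm::Reloc::PIC_,
                               llvm::None,               // code model now optional
                               llvm::CodeGenOpt::Default,
                               /*JIT=*/false);           // new trailing flag
}
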
diff --git a/lib/Target/AMDGPU/AMDGPUTargetMachine.h b/lib/Target/AMDGPU/AMDGPUTargetMachine.h
index a3c7c1982d0a..5043e31f6f5b 100644
--- a/lib/Target/AMDGPU/AMDGPUTargetMachine.h
+++ b/lib/Target/AMDGPU/AMDGPUTargetMachine.h
@@ -41,9 +41,11 @@ protected:
StringRef getFeatureString(const Function &F) const;
public:
+ static bool EnableLateStructurizeCFG;
+
AMDGPUTargetMachine(const Target &T, const Triple &TT, StringRef CPU,
StringRef FS, TargetOptions Options,
- Optional<Reloc::Model> RM, CodeModel::Model CM,
+ Optional<Reloc::Model> RM, Optional<CodeModel::Model> CM,
CodeGenOpt::Level OL);
~AMDGPUTargetMachine() override;
@@ -82,8 +84,8 @@ private:
public:
R600TargetMachine(const Target &T, const Triple &TT, StringRef CPU,
StringRef FS, TargetOptions Options,
- Optional<Reloc::Model> RM, CodeModel::Model CM,
- CodeGenOpt::Level OL);
+ Optional<Reloc::Model> RM, Optional<CodeModel::Model> CM,
+ CodeGenOpt::Level OL, bool JIT);
TargetPassConfig *createPassConfig(PassManagerBase &PM) override;
@@ -105,12 +107,16 @@ private:
public:
GCNTargetMachine(const Target &T, const Triple &TT, StringRef CPU,
StringRef FS, TargetOptions Options,
- Optional<Reloc::Model> RM, CodeModel::Model CM,
- CodeGenOpt::Level OL);
+ Optional<Reloc::Model> RM, Optional<CodeModel::Model> CM,
+ CodeGenOpt::Level OL, bool JIT);
TargetPassConfig *createPassConfig(PassManagerBase &PM) override;
const SISubtarget *getSubtargetImpl(const Function &) const override;
+
+ bool useIPRA() const override {
+ return true;
+ }
};
} // end namespace llvm
diff --git a/lib/Target/AMDGPU/AMDGPUTargetObjectFile.cpp b/lib/Target/AMDGPU/AMDGPUTargetObjectFile.cpp
index 6c1885e67fcb..e2f718bd3c34 100644
--- a/lib/Target/AMDGPU/AMDGPUTargetObjectFile.cpp
+++ b/lib/Target/AMDGPU/AMDGPUTargetObjectFile.cpp
@@ -23,8 +23,7 @@ using namespace llvm;
MCSection *AMDGPUTargetObjectFile::SelectSectionForGlobal(
const GlobalObject *GO, SectionKind Kind, const TargetMachine &TM) const {
- auto AS = static_cast<const AMDGPUTargetMachine*>(&TM)->getAMDGPUAS();
- if (Kind.isReadOnly() && AMDGPU::isReadOnlySegment(GO, AS) &&
+ if (Kind.isReadOnly() && AMDGPU::isReadOnlySegment(GO) &&
AMDGPU::shouldEmitConstantsToTextSection(TM.getTargetTriple()))
return TextSection;
diff --git a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index 89a03902dc69..77c2d4b956c6 100644
--- a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -1,4 +1,4 @@
-//===-- AMDGPUTargetTransformInfo.cpp - AMDGPU specific TTI pass ---------===//
+//===- AMDGPUTargetTransformInfo.cpp - AMDGPU specific TTI pass -----------===//
//
// The LLVM Compiler Infrastructure
//
@@ -16,15 +16,40 @@
//===----------------------------------------------------------------------===//
#include "AMDGPUTargetTransformInfo.h"
+#include "AMDGPUSubtarget.h"
+#include "llvm/ADT/STLExtras.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
-#include "llvm/CodeGen/BasicTTIImpl.h"
-#include "llvm/IR/Intrinsics.h"
+#include "llvm/CodeGen/ISDOpcodes.h"
+#include "llvm/CodeGen/MachineValueType.h"
+#include "llvm/CodeGen/ValueTypes.h"
+#include "llvm/IR/Argument.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/CallingConv.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Module.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Value.h"
+#include "llvm/MC/SubtargetFeature.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
-#include "llvm/Target/CostTable.h"
-#include "llvm/Target/TargetLowering.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetMachine.h"
+#include <algorithm>
+#include <cassert>
+#include <limits>
+#include <utility>
+
using namespace llvm;
#define DEBUG_TYPE "AMDGPUtti"
@@ -54,7 +79,7 @@ static bool dependsOnLocalPhi(const Loop *L, const Value *Cond,
if (!L->contains(I))
continue;
if (const PHINode *PHI = dyn_cast<PHINode>(V)) {
- if (none_of(L->getSubLoops(), [PHI](const Loop* SubLoop) {
+ if (llvm::none_of(L->getSubLoops(), [PHI](const Loop* SubLoop) {
return SubLoop->contains(PHI); }))
return true;
} else if (Depth < 10 && dependsOnLocalPhi(L, V, Depth+1))
@@ -66,7 +91,7 @@ static bool dependsOnLocalPhi(const Loop *L, const Value *Cond,
void AMDGPUTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
TTI::UnrollingPreferences &UP) {
UP.Threshold = 300; // Twice the default.
- UP.MaxCount = UINT_MAX;
+ UP.MaxCount = std::numeric_limits<unsigned>::max();
UP.Partial = true;
// TODO: Do we want runtime unrolling?
@@ -81,12 +106,11 @@ void AMDGPUTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
const DataLayout &DL = BB->getModule()->getDataLayout();
unsigned LocalGEPsSeen = 0;
- if (any_of(L->getSubLoops(), [BB](const Loop* SubLoop) {
+ if (llvm::any_of(L->getSubLoops(), [BB](const Loop* SubLoop) {
return SubLoop->contains(BB); }))
continue; // Block belongs to an inner loop.
for (const Instruction &I : *BB) {
-
// Unroll a loop which contains an "if" statement whose condition is
// defined by a PHI belonging to the loop. This may help to eliminate
// the if region and potentially even the PHI itself, saving on both divergence
@@ -153,7 +177,7 @@ void AMDGPUTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
if (!Inst || L->isLoopInvariant(Op))
continue;
- if (any_of(L->getSubLoops(), [Inst](const Loop* SubLoop) {
+ if (llvm::any_of(L->getSubLoops(), [Inst](const Loop* SubLoop) {
return SubLoop->contains(Inst); }))
continue;
HasLoopDef = true;
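
The heuristic in the hunks above raises the unroll threshold for loops whose "if" conditions are defined by loop-local PHIs; a small, self-contained sketch of the kind of loop it targets (plain C++, purely illustrative):

// Fully unrolling this loop lets the compiler fold the divergent "if" and the
// PHI for `i` into straight-line code, which is what the bumped threshold is
// trying to enable.
static int sumOddElements(const int *in) {
  int acc = 0;
  for (int i = 0; i < 8; ++i) {  // `i` becomes a PHI local to the loop
    if (i & 1)                   // branch condition depends on that PHI
      acc += in[i];
  }
  return acc;
}
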
@@ -264,11 +288,36 @@ unsigned AMDGPUTTIImpl::getMaxInterleaveFactor(unsigned VF) {
return 8;
}
+bool AMDGPUTTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
+ MemIntrinsicInfo &Info) const {
+ switch (Inst->getIntrinsicID()) {
+ case Intrinsic::amdgcn_atomic_inc:
+ case Intrinsic::amdgcn_atomic_dec: {
+ auto *Ordering = dyn_cast<ConstantInt>(Inst->getArgOperand(2));
+ auto *Volatile = dyn_cast<ConstantInt>(Inst->getArgOperand(4));
+ if (!Ordering || !Volatile)
+ return false; // Invalid.
+
+ unsigned OrderingVal = Ordering->getZExtValue();
+ if (OrderingVal > static_cast<unsigned>(AtomicOrdering::SequentiallyConsistent))
+ return false;
+
+ Info.PtrVal = Inst->getArgOperand(0);
+ Info.Ordering = static_cast<AtomicOrdering>(OrderingVal);
+ Info.ReadMem = true;
+ Info.WriteMem = true;
+ Info.IsVolatile = !Volatile->isNullValue();
+ return true;
+ }
+ default:
+ return false;
+ }
+}
+
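
A hedged sketch of how an IR-level pass consumes the new hook above, assuming a TargetTransformInfo reference and an IntrinsicInst for llvm.amdgcn.atomic.inc are already in scope:

// Not part of the patch: querying the target for a memory description of one
// of its intrinsics through the TTI interface.
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/IntrinsicInst.h"

static bool describesAtomicRMW(const llvm::TargetTransformInfo &TTI,
                               llvm::IntrinsicInst *II) {
  llvm::MemIntrinsicInfo Info;
  if (!TTI.getTgtMemIntrinsic(II, Info))
    return false;
  // For amdgcn_atomic_inc/dec the implementation above reports a read and a
  // write of Info.PtrVal, with ordering and volatility taken from the
  // intrinsic's constant operands.
  return Info.ReadMem && Info.WriteMem;
}
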
int AMDGPUTTIImpl::getArithmeticInstrCost(
unsigned Opcode, Type *Ty, TTI::OperandValueKind Opd1Info,
TTI::OperandValueKind Opd2Info, TTI::OperandValueProperties Opd1PropInfo,
TTI::OperandValueProperties Opd2PropInfo, ArrayRef<const Value *> Args ) {
-
EVT OrigTy = TLI->getValueType(DL, Ty);
if (!OrigTy.isSimple()) {
return BaseT::getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info,
@@ -289,25 +338,23 @@ int AMDGPUTTIImpl::getArithmeticInstrCost(
switch (ISD) {
case ISD::SHL:
case ISD::SRL:
- case ISD::SRA: {
+ case ISD::SRA:
if (SLT == MVT::i64)
return get64BitInstrCost() * LT.first * NElts;
// i32
return getFullRateInstrCost() * LT.first * NElts;
- }
case ISD::ADD:
case ISD::SUB:
case ISD::AND:
case ISD::OR:
- case ISD::XOR: {
+ case ISD::XOR:
if (SLT == MVT::i64){
// and, or and xor are typically split into 2 VALU instructions.
return 2 * getFullRateInstrCost() * LT.first * NElts;
}
return LT.first * NElts * getFullRateInstrCost();
- }
case ISD::MUL: {
const int QuarterRateCost = getQuarterRateInstrCost();
if (SLT == MVT::i64) {
@@ -327,14 +374,12 @@ int AMDGPUTTIImpl::getArithmeticInstrCost(
if (SLT == MVT::f32 || SLT == MVT::f16)
return LT.first * NElts * getFullRateInstrCost();
break;
-
case ISD::FDIV:
case ISD::FREM:
// FIXME: frem should be handled separately. The fdiv in it is most of it,
// but the current lowering is also not entirely correct.
if (SLT == MVT::f64) {
int Cost = 4 * get64BitInstrCost() + 7 * getQuarterRateInstrCost();
-
// Add cost of workaround.
if (ST->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS)
Cost += 3 * getFullRateInstrCost();
@@ -342,13 +387,34 @@ int AMDGPUTTIImpl::getArithmeticInstrCost(
return LT.first * Cost * NElts;
}
- // Assuming no fp32 denormals lowering.
+ if (!Args.empty() && match(Args[0], PatternMatch::m_FPOne())) {
+ // TODO: This is more complicated, unsafe flags etc.
+ if ((SLT == MVT::f32 && !ST->hasFP32Denormals()) ||
+ (SLT == MVT::f16 && ST->has16BitInsts())) {
+ return LT.first * getQuarterRateInstrCost() * NElts;
+ }
+ }
+
+ if (SLT == MVT::f16 && ST->has16BitInsts()) {
+ // 2 x v_cvt_f32_f16
+ // f32 rcp
+ // f32 fmul
+ // v_cvt_f16_f32
+ // f16 div_fixup
+ int Cost = 4 * getFullRateInstrCost() + 2 * getQuarterRateInstrCost();
+ return LT.first * Cost * NElts;
+ }
+
if (SLT == MVT::f32 || SLT == MVT::f16) {
- assert(!ST->hasFP32Denormals() && "will change when supported");
int Cost = 7 * getFullRateInstrCost() + 1 * getQuarterRateInstrCost();
+
+ if (!ST->hasFP32Denormals()) {
+ // FP mode switches.
+ Cost += 2 * getFullRateInstrCost();
+ }
+
return LT.first * NElts * Cost;
}
-
break;
default:
break;
@@ -451,7 +517,9 @@ static bool isArgPassedInSGPR(const Argument *A) {
case CallingConv::SPIR_KERNEL:
return true;
case CallingConv::AMDGPU_VS:
+ case CallingConv::AMDGPU_LS:
case CallingConv::AMDGPU_HS:
+ case CallingConv::AMDGPU_ES:
case CallingConv::AMDGPU_GS:
case CallingConv::AMDGPU_PS:
case CallingConv::AMDGPU_CS:
@@ -465,11 +533,9 @@ static bool isArgPassedInSGPR(const Argument *A) {
}
}
-///
/// \returns true if the result of the value could potentially be
/// different across workitems in a wavefront.
bool AMDGPUTTIImpl::isSourceOfDivergence(const Value *V) const {
-
if (const Argument *A = dyn_cast<Argument>(V))
return !isArgPassedInSGPR(A);
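
A consumer-side sketch for the divergence query above: the divergence analysis seeds its worklist with values the target reports as sources of divergence and propagates along uses. The analysis class and accessor names are assumptions of this sketch, not taken from the patch.

// Not part of the patch: deciding whether a conditional branch is divergent.
#include "llvm/Analysis/DivergenceAnalysis.h"
#include "llvm/IR/Instructions.h"

static bool isDivergentBranch(const llvm::BranchInst *BI,
                              const llvm::DivergenceAnalysis &DA) {
  return BI->isConditional() && DA.isDivergent(BI->getCondition());
}
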
@@ -534,3 +600,16 @@ unsigned AMDGPUTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Inde
return BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
}
+
+bool AMDGPUTTIImpl::areInlineCompatible(const Function *Caller,
+ const Function *Callee) const {
+ const TargetMachine &TM = getTLI()->getTargetMachine();
+ const FeatureBitset &CallerBits =
+ TM.getSubtargetImpl(*Caller)->getFeatureBits();
+ const FeatureBitset &CalleeBits =
+ TM.getSubtargetImpl(*Callee)->getFeatureBits();
+
+ FeatureBitset RealCallerBits = CallerBits & ~InlineFeatureIgnoreList;
+ FeatureBitset RealCalleeBits = CalleeBits & ~InlineFeatureIgnoreList;
+ return ((RealCallerBits & RealCalleeBits) == RealCalleeBits);
+}
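
The subset test above can be illustrated with plain std::bitset values; this sketch uses made-up bit positions and is not LLVM code:

#include <bitset>
#include <cassert>

int main() {
  std::bitset<8> CallerBits("00001011"); // hypothetical caller features: 0, 1, 3
  std::bitset<8> CalleeBits("00000010"); // hypothetical callee features: 1
  std::bitset<8> IgnoreList("00001000"); // feature 3 is ignored for inlining

  // Mask out ignored features, then require the callee's remaining features
  // to be a subset of the caller's.
  auto RealCaller = CallerBits & ~IgnoreList;
  auto RealCallee = CalleeBits & ~IgnoreList;
  assert((RealCaller & RealCallee) == RealCallee); // inline-compatible
  return 0;
}
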
diff --git a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
index 9a320bdfcc3d..8899d2c6da8a 100644
--- a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
+++ b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
@@ -1,4 +1,4 @@
-//===-- AMDGPUTargetTransformInfo.h - AMDGPU specific TTI -------*- C++ -*-===//
+//===- AMDGPUTargetTransformInfo.h - AMDGPU specific TTI --------*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
@@ -6,38 +6,76 @@
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
+//
/// \file
/// This file provides a TargetTransformInfo::Concept conforming object specific to the
/// AMDGPU target machine. It uses the target's detailed information to
/// provide more precise answers to certain TTI queries, while letting the
/// target independent and default TTI implementations handle the rest.
-///
+//
//===----------------------------------------------------------------------===//
#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUTARGETTRANSFORMINFO_H
#define LLVM_LIB_TARGET_AMDGPU_AMDGPUTARGETTRANSFORMINFO_H
#include "AMDGPU.h"
+#include "AMDGPUSubtarget.h"
#include "AMDGPUTargetMachine.h"
+#include "Utils/AMDGPUBaseInfo.h"
+#include "llvm/ADT/ArrayRef.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/BasicTTIImpl.h"
+#include "llvm/IR/Function.h"
+#include "llvm/MC/SubtargetFeature.h"
+#include "llvm/Support/MathExtras.h"
+#include <cassert>
namespace llvm {
+
class AMDGPUTargetLowering;
+class Loop;
+class ScalarEvolution;
+class Type;
+class Value;
class AMDGPUTTIImpl final : public BasicTTIImplBase<AMDGPUTTIImpl> {
- typedef BasicTTIImplBase<AMDGPUTTIImpl> BaseT;
- typedef TargetTransformInfo TTI;
+ using BaseT = BasicTTIImplBase<AMDGPUTTIImpl>;
+ using TTI = TargetTransformInfo;
+
friend BaseT;
const AMDGPUSubtarget *ST;
const AMDGPUTargetLowering *TLI;
bool IsGraphicsShader;
+ const FeatureBitset InlineFeatureIgnoreList = {
+ // Codegen control options which don't matter.
+ AMDGPU::FeatureEnableLoadStoreOpt,
+ AMDGPU::FeatureEnableSIScheduler,
+ AMDGPU::FeatureEnableUnsafeDSOffsetFolding,
+ AMDGPU::FeatureFlatForGlobal,
+ AMDGPU::FeaturePromoteAlloca,
+ AMDGPU::FeatureUnalignedBufferAccess,
+ AMDGPU::FeatureUnalignedScratchAccess,
+
+ AMDGPU::FeatureAutoWaitcntBeforeBarrier,
+ AMDGPU::FeatureDebuggerEmitPrologue,
+ AMDGPU::FeatureDebuggerInsertNops,
+ AMDGPU::FeatureDebuggerReserveRegs,
+
+ // Property of the kernel/environment which can't actually differ.
+ AMDGPU::FeatureSGPRInitBug,
+ AMDGPU::FeatureXNACK,
+ AMDGPU::FeatureTrapHandler,
+
+ // Perf-tuning features
+ AMDGPU::FeatureFastFMAF32,
+ AMDGPU::HalfRate64Ops
+ };
+
const AMDGPUSubtarget *getST() const { return ST; }
const AMDGPUTargetLowering *getTLI() const { return TLI; }
-
static inline int getFullRateInstrCost() {
return TargetTransformInfo::TCC_Basic;
}
@@ -78,7 +116,7 @@ public:
unsigned getHardwareNumberOfRegisters(bool Vector) const;
unsigned getNumberOfRegisters(bool Vector) const;
- unsigned getRegisterBitWidth(bool Vector) const ;
+ unsigned getRegisterBitWidth(bool Vector) const;
unsigned getMinVectorRegisterBitWidth() const;
unsigned getLoadStoreVecRegBitWidth(unsigned AddrSpace) const;
@@ -94,6 +132,8 @@ public:
unsigned getMaxInterleaveFactor(unsigned VF);
+ bool getTgtMemIntrinsic(IntrinsicInst *Inst, MemIntrinsicInfo &Info) const;
+
int getArithmeticInstrCost(
unsigned Opcode, Type *Ty,
TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue,
@@ -121,8 +161,13 @@ public:
unsigned getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
Type *SubTp);
+
+ bool areInlineCompatible(const Function *Caller,
+ const Function *Callee) const;
+
+ unsigned getInliningThresholdMultiplier() { return 9; }
};
} // end namespace llvm
-#endif
+#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUTARGETTRANSFORMINFO_H
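
A short sketch of where the multiplier declared above takes effect; the surrounding inline-cost machinery is reduced to the scaling step, and the base threshold value is only an example:

// Not part of the patch: scaling the inline threshold by the target multiplier.
#include "llvm/Analysis/TargetTransformInfo.h"

static int scaledInlineThreshold(int BaseThreshold,
                                 const llvm::TargetTransformInfo &TTI) {
  // With the AMDGPU multiplier of 9, a base threshold of 225 becomes 2025,
  // biasing strongly toward inlining since calls are expensive on the GPU.
  return BaseThreshold * static_cast<int>(TTI.getInliningThresholdMultiplier());
}
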
diff --git a/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp b/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp
index 309913f87fb6..6107f3a7dd18 100644
--- a/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp
+++ b/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp
@@ -21,18 +21,26 @@
//===----------------------------------------------------------------------===//
#include "AMDGPU.h"
-#include "llvm/ADT/DepthFirstIterator.h"
-#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/DivergenceAnalysis.h"
#include "llvm/Analysis/PostDominators.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
+#include "llvm/IR/Constants.h"
#include "llvm/IR/Function.h"
+#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Type.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/Local.h"
+
using namespace llvm;
#define DEBUG_TYPE "amdgpu-unify-divergent-exit-nodes"
@@ -42,6 +50,7 @@ namespace {
class AMDGPUUnifyDivergentExitNodes : public FunctionPass {
public:
static char ID; // Pass identification, replacement for typeid
+
AMDGPUUnifyDivergentExitNodes() : FunctionPass(ID) {
initializeAMDGPUUnifyDivergentExitNodesPass(*PassRegistry::getPassRegistry());
}
@@ -51,9 +60,12 @@ public:
bool runOnFunction(Function &F) override;
};
-}
+} // end anonymous namespace
char AMDGPUUnifyDivergentExitNodes::ID = 0;
+
+char &llvm::AMDGPUUnifyDivergentExitNodesID = AMDGPUUnifyDivergentExitNodes::ID;
+
INITIALIZE_PASS_BEGIN(AMDGPUUnifyDivergentExitNodes, DEBUG_TYPE,
"Unify divergent function exit nodes", false, false)
INITIALIZE_PASS_DEPENDENCY(PostDominatorTreeWrapperPass)
@@ -61,8 +73,6 @@ INITIALIZE_PASS_DEPENDENCY(DivergenceAnalysis)
INITIALIZE_PASS_END(AMDGPUUnifyDivergentExitNodes, DEBUG_TYPE,
"Unify divergent function exit nodes", false, false)
-char &llvm::AMDGPUUnifyDivergentExitNodesID = AMDGPUUnifyDivergentExitNodes::ID;
-
void AMDGPUUnifyDivergentExitNodes::getAnalysisUsage(AnalysisUsage &AU) const{
// TODO: Preserve dominator tree.
AU.addRequired<PostDominatorTreeWrapperPass>();
@@ -113,7 +123,6 @@ static BasicBlock *unifyReturnBlockSet(Function &F,
// Otherwise, we need to insert a new basic block into the function, add a PHI
// nodes (if the function returns values), and convert all of the return
// instructions into unconditional branches.
- //
BasicBlock *NewRetBlock = BasicBlock::Create(F.getContext(), Name, &F);
PHINode *PN = nullptr;
@@ -129,7 +138,6 @@ static BasicBlock *unifyReturnBlockSet(Function &F,
// Loop over all of the blocks, replacing the return instruction with an
// unconditional branch.
- //
for (BasicBlock *BB : ReturningBlocks) {
// Add an incoming element to the PHI node for every return instruction that
// is merging into this new block...
@@ -142,7 +150,7 @@ static BasicBlock *unifyReturnBlockSet(Function &F,
for (BasicBlock *BB : ReturningBlocks) {
// Cleanup possible branch to unconditional branch to the return.
- SimplifyCFG(BB, TTI, 2);
+ simplifyCFG(BB, TTI, {2});
}
return NewRetBlock;
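
The simplifyCFG call above now passes its bonus-instruction threshold through a SimplifyCFGOptions bundle rather than a bare integer; a minimal sketch of the updated call shape (the exact option layout is an assumption):

// Not part of the patch: cleaning up a return block with the new interface.
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/Transforms/Utils/Local.h"

static bool cleanupReturnBlock(llvm::BasicBlock *BB,
                               const llvm::TargetTransformInfo &TTI) {
  return llvm::simplifyCFG(BB, TTI, {2}); // bonus-inst threshold of 2
}
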
@@ -157,7 +165,6 @@ bool AMDGPUUnifyDivergentExitNodes::runOnFunction(Function &F) {
// Loop over all of the blocks in a function, tracking all of the blocks that
// return.
- //
SmallVector<BasicBlock *, 4> ReturningBlocks;
SmallVector<BasicBlock *, 4> UnreachableBlocks;
diff --git a/lib/Target/AMDGPU/AMDGPUUnifyMetadata.cpp b/lib/Target/AMDGPU/AMDGPUUnifyMetadata.cpp
index 3a0c3ede08f4..b78568e89cfb 100644
--- a/lib/Target/AMDGPU/AMDGPUUnifyMetadata.cpp
+++ b/lib/Target/AMDGPU/AMDGPUUnifyMetadata.cpp
@@ -1,4 +1,4 @@
-//===-- AMDGPUUnifyMetadata.cpp - Unify OpenCL metadata -------------------===//
+//===- AMDGPUUnifyMetadata.cpp - Unify OpenCL metadata --------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -16,7 +16,7 @@
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/IR/Constants.h"
-#include "llvm/IR/Function.h"
+#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"
#include "llvm/Pass.h"
#include <algorithm>
@@ -41,10 +41,11 @@ namespace {
class AMDGPUUnifyMetadata : public ModulePass {
public:
static char ID;
- explicit AMDGPUUnifyMetadata() : ModulePass(ID) {};
+
+ explicit AMDGPUUnifyMetadata() : ModulePass(ID) {}
private:
- virtual bool runOnModule(Module &M);
+ bool runOnModule(Module &M) override;
/// \brief Unify version metadata.
/// \return true if changes are made.
diff --git a/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp b/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp
index 1a393845a822..0a0e43123ae0 100644
--- a/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp
+++ b/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp
@@ -1,11 +1,10 @@
-//===-- AMDILCFGStructurizer.cpp - CFG Structurizer -----------------------===//
+//===- AMDILCFGStructurizer.cpp - CFG Structurizer ------------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
-/// \file
//==-----------------------------------------------------------------------===//
#include "AMDGPU.h"
@@ -67,7 +66,7 @@ STATISTIC(numClonedInstr, "CFGStructurizer cloned instructions");
namespace llvm {
- void initializeAMDGPUCFGStructurizerPass(PassRegistry&);
+void initializeAMDGPUCFGStructurizerPass(PassRegistry &);
} // end namespace llvm
@@ -121,9 +120,9 @@ public:
class AMDGPUCFGStructurizer : public MachineFunctionPass {
public:
- typedef SmallVector<MachineBasicBlock *, 32> MBBVector;
- typedef std::map<MachineBasicBlock *, BlockInformation *> MBBInfoMap;
- typedef std::map<MachineLoop *, MachineBasicBlock *> LoopLandInfoMap;
+ using MBBVector = SmallVector<MachineBasicBlock *, 32>;
+ using MBBInfoMap = std::map<MachineBasicBlock *, BlockInformation *>;
+ using LoopLandInfoMap = std::map<MachineLoop *, MachineBasicBlock *>;
enum PathToKind {
Not_SinglePath = 0,
@@ -234,6 +233,7 @@ protected:
void insertCondBranchBefore(MachineBasicBlock *MBB,
MachineBasicBlock::iterator I, int NewOpcode,
int RegNum, const DebugLoc &DL);
+
static int getBranchNzeroOpcode(int OldOpcode);
static int getBranchZeroOpcode(int OldOpcode);
static int getContinueNzeroOpcode(int OldOpcode);
@@ -246,21 +246,25 @@ protected:
static bool isUncondBranch(MachineInstr *MI);
static DebugLoc getLastDebugLocInBB(MachineBasicBlock *MBB);
static MachineInstr *getNormalBlockBranchInstr(MachineBasicBlock *MBB);
+
/// The correct naming for this is getPossibleLoopendBlockBranchInstr.
///
/// BB with backward-edge could have move instructions after the branch
/// instruction. Such move instruction "belong to" the loop backward-edge.
MachineInstr *getLoopendBlockBranchInstr(MachineBasicBlock *MBB);
+
static MachineInstr *getReturnInstr(MachineBasicBlock *MBB);
static bool isReturnBlock(MachineBasicBlock *MBB);
static void cloneSuccessorList(MachineBasicBlock *DstMBB,
- MachineBasicBlock *SrcMBB) ;
+ MachineBasicBlock *SrcMBB);
static MachineBasicBlock *clone(MachineBasicBlock *MBB);
+
/// MachineBasicBlock::ReplaceUsesOfBlockWith doesn't serve the purpose
/// because the AMDGPU instruction is not recognized as a terminator; fix this
/// and retire this routine.
void replaceInstrUseOfBlockWith(MachineBasicBlock *SrcMBB,
MachineBasicBlock *OldMBB, MachineBasicBlock *NewBlk);
+
static void wrapup(MachineBasicBlock *MBB);
int patternMatch(MachineBasicBlock *MBB);
@@ -299,6 +303,7 @@ protected:
MachineBasicBlock *LandMBB);
void settleLoopcontBlock(MachineBasicBlock *ContingMBB,
MachineBasicBlock *ContMBB);
+
/// normalizeInfiniteLoopExit change
/// B1:
/// uncond_br LoopHeader
@@ -309,6 +314,7 @@ protected:
/// and return the newly added dummy exit block
MachineBasicBlock *normalizeInfiniteLoopExit(MachineLoop *LoopRep);
void removeUnconditionalBranch(MachineBasicBlock *MBB);
+
/// Remove duplicate branch instructions in a block.
/// For instance
/// B0:
@@ -318,6 +324,7 @@ protected:
/// B0:
/// cond_br X B1 B2
void removeRedundantConditionalBranch(MachineBasicBlock *MBB);
+
void addDummyExitBlock(SmallVectorImpl<MachineBasicBlock *> &RetMBB);
void removeSuccessor(MachineBasicBlock *MBB);
MachineBasicBlock *cloneBlockForPredecessor(MachineBasicBlock *MBB,
@@ -335,10 +342,10 @@ private:
SmallVector<MachineBasicBlock *, DEFAULT_VEC_SLOTS> OrderedBlks;
};
-char AMDGPUCFGStructurizer::ID = 0;
-
} // end anonymous namespace
+char AMDGPUCFGStructurizer::ID = 0;
+
int AMDGPUCFGStructurizer::getSCCNum(MachineBasicBlock *MBB) const {
MBBInfoMap::const_iterator It = BlockInfoMap.find(MBB);
if (It == BlockInfoMap.end())
@@ -535,7 +542,7 @@ int AMDGPUCFGStructurizer::getContinueNzeroOpcode(int OldOpcode) {
case AMDGPU::JUMP_COND:
case AMDGPU::JUMP: return AMDGPU::CONTINUE_LOGICALNZ_i32;
default: llvm_unreachable("internal error");
- };
+ }
return -1;
}
@@ -1168,6 +1175,7 @@ int AMDGPUCFGStructurizer::handleJumpintoIfImp(MachineBasicBlock *HeadMBB,
return Num;
}
+#ifndef NDEBUG
void AMDGPUCFGStructurizer::showImproveSimpleJumpintoIf(
MachineBasicBlock *HeadMBB, MachineBasicBlock *TrueMBB,
MachineBasicBlock *FalseMBB, MachineBasicBlock *LandMBB, bool Detail) {
@@ -1209,6 +1217,7 @@ void AMDGPUCFGStructurizer::showImproveSimpleJumpintoIf(
dbgs() << "\n";
}
+#endif
int AMDGPUCFGStructurizer::improveSimpleJumpintoIf(MachineBasicBlock *HeadMBB,
MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB,
@@ -1595,7 +1604,7 @@ void AMDGPUCFGStructurizer::migrateInstruction(MachineBasicBlock *SrcMBB,
MachineInstr *BranchMI = getNormalBlockBranchInstr(SrcMBB);
if (!BranchMI) {
DEBUG(
- dbgs() << "migrateInstruction don't see branch instr\n" ;
+ dbgs() << "migrateInstruction don't see branch instr\n";
);
SpliceEnd = SrcMBB->end();
} else {
@@ -1632,7 +1641,7 @@ AMDGPUCFGStructurizer::normalizeInfiniteLoopExit(MachineLoop* LoopRep) {
FuncRep->push_back(DummyExitBlk); //insert to function
SHOWNEWBLK(DummyExitBlk, "DummyExitBlock to normalize infiniteLoop: ");
DEBUG(dbgs() << "Old branch instr: " << *BranchMI << "\n";);
- LLVMContext &Ctx = LoopHeader->getParent()->getFunction()->getContext();
+ LLVMContext &Ctx = LoopHeader->getParent()->getFunction().getContext();
Ctx.emitError("Extra register needed to handle CFG");
return nullptr;
}
diff --git a/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
index b37c274102bc..2acd7f78faea 100644
--- a/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ b/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -1,4 +1,4 @@
-//===-- AMDGPUAsmParser.cpp - Parse SI asm to MCInst instructions ---------===//
+//===- AMDGPUAsmParser.cpp - Parse SI asm to MCInst instructions ----------===//
//
// The LLVM Compiler Infrastructure
//
@@ -7,6 +7,7 @@
//
//===----------------------------------------------------------------------===//
+#include "AMDGPU.h"
#include "AMDKernelCodeT.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "MCTargetDesc/AMDGPUTargetStreamer.h"
@@ -40,7 +41,9 @@
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/AMDGPUMetadata.h"
#include "llvm/Support/Casting.h"
+#include "llvm/Support/Compiler.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/SMLoc.h"
@@ -83,7 +86,7 @@ public:
AMDGPUOperand(KindTy Kind_, const AMDGPUAsmParser *AsmParser_)
: MCParsedAsmOperand(), Kind(Kind_), AsmParser(AsmParser_) {}
- typedef std::unique_ptr<AMDGPUOperand> Ptr;
+ using Ptr = std::unique_ptr<AMDGPUOperand>;
struct Modifiers {
bool Abs = false;
@@ -129,6 +132,7 @@ public:
ImmTyIdxen,
ImmTyAddr64,
ImmTyOffset,
+ ImmTyInstOffset,
ImmTyOffset0,
ImmTyOffset1,
ImmTyGLC,
@@ -164,7 +168,8 @@ public:
ImmTyOpSelHi,
ImmTyNegLo,
ImmTyNegHi,
- ImmTySwizzle
+ ImmTySwizzle,
+ ImmTyHigh
};
struct TokOp {
@@ -290,8 +295,8 @@ public:
bool isOffset0() const { return isImmTy(ImmTyOffset0) && isUInt<16>(getImm()); }
bool isOffset1() const { return isImmTy(ImmTyOffset1) && isUInt<8>(getImm()); }
- bool isOffsetU12() const { return isImmTy(ImmTyOffset) && isUInt<12>(getImm()); }
- bool isOffsetS13() const { return isImmTy(ImmTyOffset) && isInt<13>(getImm()); }
+ bool isOffsetU12() const { return (isImmTy(ImmTyOffset) || isImmTy(ImmTyInstOffset)) && isUInt<12>(getImm()); }
+ bool isOffsetS13() const { return (isImmTy(ImmTyOffset) || isImmTy(ImmTyInstOffset)) && isInt<13>(getImm()); }
bool isGDS() const { return isImmTy(ImmTyGDS); }
bool isGLC() const { return isImmTy(ImmTyGLC); }
bool isSLC() const { return isImmTy(ImmTySLC); }
@@ -312,6 +317,7 @@ public:
bool isOpSelHi() const { return isImmTy(ImmTyOpSelHi); }
bool isNegLo() const { return isImmTy(ImmTyNegLo); }
bool isNegHi() const { return isImmTy(ImmTyNegHi); }
+ bool isHigh() const { return isImmTy(ImmTyHigh); }
bool isMod() const {
return isClampSI() || isOModSI();
@@ -637,6 +643,7 @@ public:
case ImmTyIdxen: OS << "Idxen"; break;
case ImmTyAddr64: OS << "Addr64"; break;
case ImmTyOffset: OS << "Offset"; break;
+ case ImmTyInstOffset: OS << "InstOffset"; break;
case ImmTyOffset0: OS << "Offset0"; break;
case ImmTyOffset1: OS << "Offset1"; break;
case ImmTyGLC: OS << "GLC"; break;
@@ -673,6 +680,7 @@ public:
case ImmTyNegLo: OS << "NegLo"; break;
case ImmTyNegHi: OS << "NegHi"; break;
case ImmTySwizzle: OS << "Swizzle"; break;
+ case ImmTyHigh: OS << "High"; break;
}
}
@@ -801,7 +809,6 @@ public:
};
class AMDGPUAsmParser : public MCTargetAsmParser {
- const MCInstrInfo &MII;
MCAsmParser &Parser;
unsigned ForcedEncodingSize = 0;
@@ -822,11 +829,15 @@ private:
bool ParseDirectiveMajorMinor(uint32_t &Major, uint32_t &Minor);
bool ParseDirectiveHSACodeObjectVersion();
bool ParseDirectiveHSACodeObjectISA();
- bool ParseDirectiveCodeObjectMetadata();
bool ParseAMDKernelCodeTValue(StringRef ID, amd_kernel_code_t &Header);
bool ParseDirectiveAMDKernelCodeT();
bool subtargetHasRegister(const MCRegisterInfo &MRI, unsigned RegNo) const;
bool ParseDirectiveAMDGPUHsaKernel();
+
+ bool ParseDirectiveISAVersion();
+ bool ParseDirectiveHSAMetadata();
+ bool ParseDirectivePALMetadata();
+
bool AddNextRegisterToList(unsigned& Reg, unsigned& RegWidth,
RegisterKind RegKind, unsigned Reg1,
unsigned RegNum);
@@ -843,12 +854,12 @@ public:
Match_PreferE32 = FIRST_TARGET_MATCH_RESULT_TY
};
- typedef std::map<AMDGPUOperand::ImmTy, unsigned> OptionalImmIndexMap;
+ using OptionalImmIndexMap = std::map<AMDGPUOperand::ImmTy, unsigned>;
AMDGPUAsmParser(const MCSubtargetInfo &STI, MCAsmParser &_Parser,
const MCInstrInfo &MII,
const MCTargetOptions &Options)
- : MCTargetAsmParser(Options, STI), MII(MII), Parser(_Parser) {
+ : MCTargetAsmParser(Options, STI, MII), Parser(_Parser) {
MCAsmParserExtension::Initialize(Parser);
if (getFeatureBits().none()) {
@@ -905,6 +916,10 @@ public:
return !isVI();
}
+ bool hasIntClamp() const {
+ return getFeatureBits()[AMDGPU::FeatureIntClamp];
+ }
+
AMDGPUTargetStreamer &getTargetStreamer() {
MCTargetStreamer &TS = *getParser().getStreamer().getTargetStreamer();
return static_cast<AMDGPUTargetStreamer &>(TS);
@@ -991,8 +1006,9 @@ public:
private:
struct OperandInfoTy {
int64_t Id;
- bool IsSymbolic;
- OperandInfoTy(int64_t Id_) : Id(Id_), IsSymbolic(false) { }
+ bool IsSymbolic = false;
+
+ OperandInfoTy(int64_t Id_) : Id(Id_) {}
};
bool parseSendMsgConstruct(OperandInfoTy &Msg, OperandInfoTy &Operation, int64_t &StreamId);
@@ -1004,6 +1020,7 @@ private:
bool validateInstruction(const MCInst &Inst, const SMLoc &IDLoc);
bool validateConstantBusLimitations(const MCInst &Inst);
bool validateEarlyClobberLimitations(const MCInst &Inst);
+ bool validateIntClampSupported(const MCInst &Inst);
bool usesConstantBus(const MCInst &Inst, unsigned OpIdx);
bool isInlineConstant(const MCInst &Inst, unsigned OpIdx) const;
unsigned findImplicitSGPRReadInVOP(const MCInst &Inst) const;
@@ -1060,9 +1077,12 @@ public:
void cvtVOP3(MCInst &Inst, const OperandVector &Operands,
OptionalImmIndexMap &OptionalIdx);
+ void cvtVOP3OpSel(MCInst &Inst, const OperandVector &Operands);
void cvtVOP3(MCInst &Inst, const OperandVector &Operands);
void cvtVOP3P(MCInst &Inst, const OperandVector &Operands);
+ void cvtVOP3Interp(MCInst &Inst, const OperandVector &Operands);
+
void cvtMIMG(MCInst &Inst, const OperandVector &Operands,
bool IsAtomic = false);
void cvtMIMGAtomic(MCInst &Inst, const OperandVector &Operands);
@@ -1279,7 +1299,6 @@ uint64_t AMDGPUOperand::applyInputFPModifiers(uint64_t Val, unsigned Size) const
}
void AMDGPUOperand::addImmOperands(MCInst &Inst, unsigned N, bool ApplyModifiers) const {
-
if (AMDGPU::isSISrcOperand(AsmParser->getMII()->get(Inst.getOpcode()),
Inst.getNumOperands())) {
addLiteralImmOperand(Inst, Imm.Val,
@@ -1311,7 +1330,7 @@ void AMDGPUOperand::addLiteralImmOperand(MCInst &Inst, int64_t Val, bool ApplyMo
case AMDGPU::OPERAND_REG_IMM_INT64:
case AMDGPU::OPERAND_REG_IMM_FP64:
case AMDGPU::OPERAND_REG_INLINE_C_INT64:
- case AMDGPU::OPERAND_REG_INLINE_C_FP64: {
+ case AMDGPU::OPERAND_REG_INLINE_C_FP64:
if (AMDGPU::isInlinableLiteral64(Literal.getZExtValue(),
AsmParser->hasInv2PiInlineImm())) {
Inst.addOperand(MCOperand::createImm(Literal.getZExtValue()));
@@ -1335,7 +1354,7 @@ void AMDGPUOperand::addLiteralImmOperand(MCInst &Inst, int64_t Val, bool ApplyMo
// unclear how we should encode them. This case should be checked earlier
// in predicate methods (isLiteralImm())
llvm_unreachable("fp literal in 64-bit integer instruction.");
- }
+
case AMDGPU::OPERAND_REG_IMM_INT32:
case AMDGPU::OPERAND_REG_IMM_FP32:
case AMDGPU::OPERAND_REG_INLINE_C_INT32:
@@ -1377,7 +1396,7 @@ void AMDGPUOperand::addLiteralImmOperand(MCInst &Inst, int64_t Val, bool ApplyMo
case AMDGPU::OPERAND_REG_IMM_INT32:
case AMDGPU::OPERAND_REG_IMM_FP32:
case AMDGPU::OPERAND_REG_INLINE_C_INT32:
- case AMDGPU::OPERAND_REG_INLINE_C_FP32: {
+ case AMDGPU::OPERAND_REG_INLINE_C_FP32:
if (isInt<32>(Val) &&
AMDGPU::isInlinableLiteral32(static_cast<int32_t>(Val),
AsmParser->hasInv2PiInlineImm())) {
@@ -1387,11 +1406,11 @@ void AMDGPUOperand::addLiteralImmOperand(MCInst &Inst, int64_t Val, bool ApplyMo
Inst.addOperand(MCOperand::createImm(Val & 0xffffffff));
return;
- }
+
case AMDGPU::OPERAND_REG_IMM_INT64:
case AMDGPU::OPERAND_REG_IMM_FP64:
case AMDGPU::OPERAND_REG_INLINE_C_INT64:
- case AMDGPU::OPERAND_REG_INLINE_C_FP64: {
+ case AMDGPU::OPERAND_REG_INLINE_C_FP64:
if (AMDGPU::isInlinableLiteral64(Val, AsmParser->hasInv2PiInlineImm())) {
Inst.addOperand(MCOperand::createImm(Val));
return;
@@ -1399,11 +1418,11 @@ void AMDGPUOperand::addLiteralImmOperand(MCInst &Inst, int64_t Val, bool ApplyMo
Inst.addOperand(MCOperand::createImm(Lo_32(Val)));
return;
- }
+
case AMDGPU::OPERAND_REG_IMM_INT16:
case AMDGPU::OPERAND_REG_IMM_FP16:
case AMDGPU::OPERAND_REG_INLINE_C_INT16:
- case AMDGPU::OPERAND_REG_INLINE_C_FP16: {
+ case AMDGPU::OPERAND_REG_INLINE_C_FP16:
if (isInt<16>(Val) &&
AMDGPU::isInlinableLiteral16(static_cast<int16_t>(Val),
AsmParser->hasInv2PiInlineImm())) {
@@ -1413,7 +1432,7 @@ void AMDGPUOperand::addLiteralImmOperand(MCInst &Inst, int64_t Val, bool ApplyMo
Inst.addOperand(MCOperand::createImm(Val & 0xffff));
return;
- }
+
case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:
case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: {
auto LiteralVal = static_cast<uint16_t>(Literal.getLoBits(16).getZExtValue());
@@ -1711,7 +1730,6 @@ AMDGPUAsmParser::parseAbsoluteExpr(int64_t &Val, bool AbsMod) {
if (AbsMod && getLexer().peekTok().is(AsmToken::Pipe) &&
(getLexer().getKind() == AsmToken::Integer ||
getLexer().getKind() == AsmToken::Real)) {
-
// This is a workaround for handling operands like these:
// |1.0|
// |-1|
@@ -2111,7 +2129,6 @@ bool AMDGPUAsmParser::validateConstantBusLimitations(const MCInst &Inst) {
SIInstrFlags::VOP1 | SIInstrFlags::VOP2 |
SIInstrFlags::VOP3 | SIInstrFlags::VOP3P |
SIInstrFlags::SDWA)) {
-
// Check special imm operands (used by madmk, etc)
if (AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::imm) != -1) {
++ConstantBusUseCount;
@@ -2156,7 +2173,6 @@ bool AMDGPUAsmParser::validateConstantBusLimitations(const MCInst &Inst) {
}
bool AMDGPUAsmParser::validateEarlyClobberLimitations(const MCInst &Inst) {
-
const unsigned Opcode = Inst.getOpcode();
const MCInstrDesc &Desc = MII.get(Opcode);
@@ -2193,6 +2209,20 @@ bool AMDGPUAsmParser::validateEarlyClobberLimitations(const MCInst &Inst) {
return true;
}
+bool AMDGPUAsmParser::validateIntClampSupported(const MCInst &Inst) {
+
+ const unsigned Opc = Inst.getOpcode();
+ const MCInstrDesc &Desc = MII.get(Opc);
+
+ if ((Desc.TSFlags & SIInstrFlags::IntClamp) != 0 && !hasIntClamp()) {
+ int ClampIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::clamp);
+ assert(ClampIdx != -1);
+ return Inst.getOperand(ClampIdx).getImm() == 0;
+ }
+
+ return true;
+}
+
bool AMDGPUAsmParser::validateInstruction(const MCInst &Inst,
const SMLoc &IDLoc) {
if (!validateConstantBusLimitations(Inst)) {
@@ -2205,6 +2235,11 @@ bool AMDGPUAsmParser::validateInstruction(const MCInst &Inst,
"destination must be different than all sources");
return false;
}
+ if (!validateIntClampSupported(Inst)) {
+ Error(IDLoc,
+ "integer clamping is not supported on this GPU");
+ return false;
+ }
return true;
}
@@ -2365,49 +2400,6 @@ bool AMDGPUAsmParser::ParseDirectiveHSACodeObjectISA() {
return false;
}
-bool AMDGPUAsmParser::ParseDirectiveCodeObjectMetadata() {
- std::string YamlString;
- raw_string_ostream YamlStream(YamlString);
-
- getLexer().setSkipSpace(false);
-
- bool FoundEnd = false;
- while (!getLexer().is(AsmToken::Eof)) {
- while (getLexer().is(AsmToken::Space)) {
- YamlStream << getLexer().getTok().getString();
- Lex();
- }
-
- if (getLexer().is(AsmToken::Identifier)) {
- StringRef ID = getLexer().getTok().getIdentifier();
- if (ID == AMDGPU::CodeObject::MetadataAssemblerDirectiveEnd) {
- Lex();
- FoundEnd = true;
- break;
- }
- }
-
- YamlStream << Parser.parseStringToEndOfStatement()
- << getContext().getAsmInfo()->getSeparatorString();
-
- Parser.eatToEndOfStatement();
- }
-
- getLexer().setSkipSpace(true);
-
- if (getLexer().is(AsmToken::Eof) && !FoundEnd) {
- return TokError(
- "expected directive .end_amdgpu_code_object_metadata not found");
- }
-
- YamlStream.flush();
-
- if (!getTargetStreamer().EmitCodeObjectMetadata(YamlString))
- return Error(getParser().getTok().getLoc(), "invalid code object metadata");
-
- return false;
-}
-
bool AMDGPUAsmParser::ParseAMDKernelCodeTValue(StringRef ID,
amd_kernel_code_t &Header) {
SmallString<40> ErrStr;
@@ -2460,6 +2452,103 @@ bool AMDGPUAsmParser::ParseDirectiveAMDGPUHsaKernel() {
return false;
}
+bool AMDGPUAsmParser::ParseDirectiveISAVersion() {
+ if (getSTI().getTargetTriple().getArch() != Triple::amdgcn) {
+ return Error(getParser().getTok().getLoc(),
+ ".amd_amdgpu_isa directive is not available on non-amdgcn "
+ "architectures");
+ }
+
+ auto ISAVersionStringFromASM = getLexer().getTok().getStringContents();
+
+ std::string ISAVersionStringFromSTI;
+ raw_string_ostream ISAVersionStreamFromSTI(ISAVersionStringFromSTI);
+ IsaInfo::streamIsaVersion(&getSTI(), ISAVersionStreamFromSTI);
+
+ if (ISAVersionStringFromASM != ISAVersionStreamFromSTI.str()) {
+ return Error(getParser().getTok().getLoc(),
+ ".amd_amdgpu_isa directive does not match triple and/or mcpu "
+ "arguments specified through the command line");
+ }
+
+ getTargetStreamer().EmitISAVersion(ISAVersionStreamFromSTI.str());
+ Lex();
+
+ return false;
+}
+
+bool AMDGPUAsmParser::ParseDirectiveHSAMetadata() {
+ if (getSTI().getTargetTriple().getOS() != Triple::AMDHSA) {
+ return Error(getParser().getTok().getLoc(),
+ (Twine(HSAMD::AssemblerDirectiveBegin) + Twine(" directive is "
+ "not available on non-amdhsa OSes")).str());
+ }
+
+ std::string HSAMetadataString;
+ raw_string_ostream YamlStream(HSAMetadataString);
+
+ getLexer().setSkipSpace(false);
+
+ bool FoundEnd = false;
+ while (!getLexer().is(AsmToken::Eof)) {
+ while (getLexer().is(AsmToken::Space)) {
+ YamlStream << getLexer().getTok().getString();
+ Lex();
+ }
+
+ if (getLexer().is(AsmToken::Identifier)) {
+ StringRef ID = getLexer().getTok().getIdentifier();
+ if (ID == AMDGPU::HSAMD::AssemblerDirectiveEnd) {
+ Lex();
+ FoundEnd = true;
+ break;
+ }
+ }
+
+ YamlStream << Parser.parseStringToEndOfStatement()
+ << getContext().getAsmInfo()->getSeparatorString();
+
+ Parser.eatToEndOfStatement();
+ }
+
+ getLexer().setSkipSpace(true);
+
+ if (getLexer().is(AsmToken::Eof) && !FoundEnd) {
+ return TokError(Twine("expected directive ") +
+ Twine(HSAMD::AssemblerDirectiveEnd) + Twine(" not found"));
+ }
+
+ YamlStream.flush();
+
+ if (!getTargetStreamer().EmitHSAMetadata(HSAMetadataString))
+ return Error(getParser().getTok().getLoc(), "invalid HSA metadata");
+
+ return false;
+}
+
+bool AMDGPUAsmParser::ParseDirectivePALMetadata() {
+ if (getSTI().getTargetTriple().getOS() != Triple::AMDPAL) {
+ return Error(getParser().getTok().getLoc(),
+ (Twine(PALMD::AssemblerDirective) + Twine(" directive is "
+ "not available on non-amdpal OSes")).str());
+ }
+
+ PALMD::Metadata PALMetadata;
+ for (;;) {
+ uint32_t Value;
+ if (ParseAsAbsoluteExpression(Value)) {
+ return TokError(Twine("invalid value in ") +
+ Twine(PALMD::AssemblerDirective));
+ }
+ PALMetadata.push_back(Value);
+ if (getLexer().isNot(AsmToken::Comma))
+ break;
+ Lex();
+ }
+ getTargetStreamer().EmitPALMetadata(PALMetadata);
+ return false;
+}
+
bool AMDGPUAsmParser::ParseDirective(AsmToken DirectiveID) {
StringRef IDVal = DirectiveID.getString();
@@ -2469,20 +2558,45 @@ bool AMDGPUAsmParser::ParseDirective(AsmToken DirectiveID) {
if (IDVal == ".hsa_code_object_isa")
return ParseDirectiveHSACodeObjectISA();
- if (IDVal == AMDGPU::CodeObject::MetadataAssemblerDirectiveBegin)
- return ParseDirectiveCodeObjectMetadata();
-
if (IDVal == ".amd_kernel_code_t")
return ParseDirectiveAMDKernelCodeT();
if (IDVal == ".amdgpu_hsa_kernel")
return ParseDirectiveAMDGPUHsaKernel();
+ if (IDVal == ".amd_amdgpu_isa")
+ return ParseDirectiveISAVersion();
+
+ if (IDVal == AMDGPU::HSAMD::AssemblerDirectiveBegin)
+ return ParseDirectiveHSAMetadata();
+
+ if (IDVal == PALMD::AssemblerDirective)
+ return ParseDirectivePALMetadata();
+
return true;
}
bool AMDGPUAsmParser::subtargetHasRegister(const MCRegisterInfo &MRI,
unsigned RegNo) const {
+
+ for (MCRegAliasIterator R(AMDGPU::TTMP12_TTMP13_TTMP14_TTMP15, &MRI, true);
+ R.isValid(); ++R) {
+ if (*R == RegNo)
+ return isGFX9();
+ }
+
+ switch (RegNo) {
+ case AMDGPU::TBA:
+ case AMDGPU::TBA_LO:
+ case AMDGPU::TBA_HI:
+ case AMDGPU::TMA:
+ case AMDGPU::TMA_LO:
+ case AMDGPU::TMA_HI:
+ return !isGFX9();
+ default:
+ break;
+ }
+
if (isCI())
return true;
@@ -2529,24 +2643,22 @@ AMDGPUAsmParser::parseOperand(OperandVector &Operands, StringRef Mnemonic) {
if (ResTy == MatchOperand_Success)
return ResTy;
- if (getLexer().getKind() == AsmToken::Identifier) {
- // If this identifier is a symbol, we want to create an expression for it.
- // It is a little difficult to distinguish between a symbol name, and
- // an instruction flag like 'gds'. In order to do this, we parse
- // all tokens as expressions and then treate the symbol name as the token
- // string when we want to interpret the operand as a token.
- const auto &Tok = Parser.getTok();
- SMLoc S = Tok.getLoc();
- const MCExpr *Expr = nullptr;
- if (!Parser.parseExpression(Expr)) {
- Operands.push_back(AMDGPUOperand::CreateExpr(this, Expr, S));
- return MatchOperand_Success;
- }
+ const auto &Tok = Parser.getTok();
+ SMLoc S = Tok.getLoc();
+
+ const MCExpr *Expr = nullptr;
+ if (!Parser.parseExpression(Expr)) {
+ Operands.push_back(AMDGPUOperand::CreateExpr(this, Expr, S));
+ return MatchOperand_Success;
+ }
- Operands.push_back(AMDGPUOperand::CreateToken(this, Tok.getString(), Tok.getLoc()));
+ // Possibly this is an instruction flag like 'gds'.
+ if (Tok.getKind() == AsmToken::Identifier) {
+ Operands.push_back(AMDGPUOperand::CreateToken(this, Tok.getString(), S));
Parser.Lex();
return MatchOperand_Success;
}
+
return MatchOperand_NoMatch;
}
@@ -2688,7 +2800,7 @@ OperandMatchResultTy AMDGPUAsmParser::parseOperandArrayWithPrefix(
// FIXME: How to verify the number of elements matches the number of src
// operands?
- for (int I = 0; I < 3; ++I) {
+ for (int I = 0; I < 4; ++I) {
if (I != 0) {
if (getLexer().is(AsmToken::RBrac))
break;
@@ -4016,11 +4128,13 @@ static const OptionalOperand AMDGPUOptionalOperandTable[] = {
{"offset1", AMDGPUOperand::ImmTyOffset1, false, nullptr},
{"gds", AMDGPUOperand::ImmTyGDS, true, nullptr},
{"offset", AMDGPUOperand::ImmTyOffset, false, nullptr},
+ {"inst_offset", AMDGPUOperand::ImmTyInstOffset, false, nullptr},
{"dfmt", AMDGPUOperand::ImmTyDFMT, false, nullptr},
{"nfmt", AMDGPUOperand::ImmTyNFMT, false, nullptr},
{"glc", AMDGPUOperand::ImmTyGLC, true, nullptr},
{"slc", AMDGPUOperand::ImmTySLC, true, nullptr},
{"tfe", AMDGPUOperand::ImmTyTFE, true, nullptr},
+ {"high", AMDGPUOperand::ImmTyHigh, true, nullptr},
{"clamp", AMDGPUOperand::ImmTyClampSI, true, nullptr},
{"omod", AMDGPUOperand::ImmTyOModSI, false, ConvertOmodMul},
{"unorm", AMDGPUOperand::ImmTyUNorm, true, nullptr},
@@ -4088,6 +4202,30 @@ OperandMatchResultTy AMDGPUAsmParser::parseOModOperand(OperandVector &Operands)
return MatchOperand_NoMatch;
}
+void AMDGPUAsmParser::cvtVOP3OpSel(MCInst &Inst, const OperandVector &Operands) {
+ cvtVOP3P(Inst, Operands);
+
+ int Opc = Inst.getOpcode();
+
+ int SrcNum;
+ const int Ops[] = { AMDGPU::OpName::src0,
+ AMDGPU::OpName::src1,
+ AMDGPU::OpName::src2 };
+ for (SrcNum = 0;
+ SrcNum < 3 && AMDGPU::getNamedOperandIdx(Opc, Ops[SrcNum]) != -1;
+ ++SrcNum);
+ assert(SrcNum > 0);
+
+ int OpSelIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::op_sel);
+ unsigned OpSel = Inst.getOperand(OpSelIdx).getImm();
+
+ if ((OpSel & (1 << SrcNum)) != 0) {
+ int ModIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0_modifiers);
+ uint32_t ModVal = Inst.getOperand(ModIdx).getImm();
+ Inst.getOperand(ModIdx).setImm(ModVal | SISrcMods::DST_OP_SEL);
+ }
+}
+
static bool isRegOrImmWithInputMods(const MCInstrDesc &Desc, unsigned OpNum) {
// 1. This operand is input modifiers
return Desc.OpInfo[OpNum].OperandType == AMDGPU::OPERAND_INPUT_MODS
@@ -4099,6 +4237,45 @@ static bool isRegOrImmWithInputMods(const MCInstrDesc &Desc, unsigned OpNum) {
&& Desc.getOperandConstraint(OpNum + 1, MCOI::OperandConstraint::TIED_TO) == -1;
}
+void AMDGPUAsmParser::cvtVOP3Interp(MCInst &Inst, const OperandVector &Operands)
+{
+ OptionalImmIndexMap OptionalIdx;
+ unsigned Opc = Inst.getOpcode();
+
+ unsigned I = 1;
+ const MCInstrDesc &Desc = MII.get(Inst.getOpcode());
+ for (unsigned J = 0; J < Desc.getNumDefs(); ++J) {
+ ((AMDGPUOperand &)*Operands[I++]).addRegOperands(Inst, 1);
+ }
+
+ for (unsigned E = Operands.size(); I != E; ++I) {
+ AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[I]);
+ if (isRegOrImmWithInputMods(Desc, Inst.getNumOperands())) {
+ Op.addRegOrImmWithFPInputModsOperands(Inst, 2);
+ } else if (Op.isInterpSlot() ||
+ Op.isInterpAttr() ||
+ Op.isAttrChan()) {
+ Inst.addOperand(MCOperand::createImm(Op.Imm.Val));
+ } else if (Op.isImmModifier()) {
+ OptionalIdx[Op.getImmTy()] = I;
+ } else {
+ llvm_unreachable("unhandled operand type");
+ }
+ }
+
+ if (AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::high) != -1) {
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyHigh);
+ }
+
+ if (AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::clamp) != -1) {
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyClampSI);
+ }
+
+ if (AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::omod) != -1) {
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyOModSI);
+ }
+}
+
void AMDGPUAsmParser::cvtVOP3(MCInst &Inst, const OperandVector &Operands,
OptionalImmIndexMap &OptionalIdx) {
unsigned Opc = Inst.getOpcode();
@@ -4162,20 +4339,36 @@ void AMDGPUAsmParser::cvtVOP3(MCInst &Inst, const OperandVector &Operands) {
cvtVOP3(Inst, Operands, OptionalIdx);
}
-void AMDGPUAsmParser::cvtVOP3P(MCInst &Inst, const OperandVector &Operands) {
+void AMDGPUAsmParser::cvtVOP3P(MCInst &Inst,
+ const OperandVector &Operands) {
OptionalImmIndexMap OptIdx;
+ const int Opc = Inst.getOpcode();
+ const MCInstrDesc &Desc = MII.get(Opc);
+
+ const bool IsPacked = (Desc.TSFlags & SIInstrFlags::IsPacked) != 0;
cvtVOP3(Inst, Operands, OptIdx);
+ if (AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst_in) != -1) {
+ assert(!IsPacked);
+ Inst.addOperand(Inst.getOperand(0));
+ }
+
// FIXME: This is messy. Parse the modifiers as if it was a normal VOP3
// instruction, and then figure out where to actually put the modifiers
- int Opc = Inst.getOpcode();
addOptionalImmOperand(Inst, Operands, OptIdx, AMDGPUOperand::ImmTyOpSel);
- addOptionalImmOperand(Inst, Operands, OptIdx, AMDGPUOperand::ImmTyOpSelHi, -1);
+
+ int OpSelHiIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::op_sel_hi);
+ if (OpSelHiIdx != -1) {
+ int DefaultVal = IsPacked ? -1 : 0;
+ addOptionalImmOperand(Inst, Operands, OptIdx, AMDGPUOperand::ImmTyOpSelHi,
+ DefaultVal);
+ }
int NegLoIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::neg_lo);
if (NegLoIdx != -1) {
+ assert(IsPacked);
addOptionalImmOperand(Inst, Operands, OptIdx, AMDGPUOperand::ImmTyNegLo);
addOptionalImmOperand(Inst, Operands, OptIdx, AMDGPUOperand::ImmTyNegHi);
}
@@ -4188,13 +4381,16 @@ void AMDGPUAsmParser::cvtVOP3P(MCInst &Inst, const OperandVector &Operands) {
AMDGPU::OpName::src2_modifiers };
int OpSelIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::op_sel);
- int OpSelHiIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::op_sel_hi);
unsigned OpSel = Inst.getOperand(OpSelIdx).getImm();
- unsigned OpSelHi = Inst.getOperand(OpSelHiIdx).getImm();
+ unsigned OpSelHi = 0;
unsigned NegLo = 0;
unsigned NegHi = 0;
+ if (OpSelHiIdx != -1) {
+ OpSelHi = Inst.getOperand(OpSelHiIdx).getImm();
+ }
+
if (NegLoIdx != -1) {
int NegHiIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::neg_hi);
NegLo = Inst.getOperand(NegLoIdx).getImm();
@@ -4323,7 +4519,6 @@ AMDGPUAsmParser::parseDPPCtrl(OperandVector &Operands) {
if (getLexer().isNot(AsmToken::RBrac))
return MatchOperand_ParseFail;
Parser.Lex();
-
} else {
// sel:%d
Parser.Lex();
@@ -4383,6 +4578,11 @@ void AMDGPUAsmParser::cvtDPP(MCInst &Inst, const OperandVector &Operands) {
((AMDGPUOperand &)*Operands[I++]).addRegOperands(Inst, 1);
}
+ // All DPP instructions with at least one source operand have a fake "old"
+ // source at the beginning that's tied to the dst operand. Handle it here.
+ if (Desc.getNumOperands() >= 2)
+ Inst.addOperand(Inst.getOperand(0));
+
for (unsigned E = Operands.size(); I != E; ++I) {
AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[I]);
// Add the register arguments
@@ -4405,16 +4605,6 @@ void AMDGPUAsmParser::cvtDPP(MCInst &Inst, const OperandVector &Operands) {
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDppRowMask, 0xf);
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDppBankMask, 0xf);
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDppBoundCtrl);
-
- // special case v_mac_{f16, f32}:
- // it has src2 register operand that is tied to dst operand
- if (Inst.getOpcode() == AMDGPU::V_MAC_F32_dpp ||
- Inst.getOpcode() == AMDGPU::V_MAC_F16_dpp) {
- auto it = Inst.begin();
- std::advance(
- it, AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::src2));
- Inst.insert(it, Inst.getOperand(0)); // src2 = dst
- }
}
//===----------------------------------------------------------------------===//
@@ -4503,6 +4693,7 @@ void AMDGPUAsmParser::cvtSdwaVOPC(MCInst &Inst, const OperandVector &Operands) {
void AMDGPUAsmParser::cvtSDWA(MCInst &Inst, const OperandVector &Operands,
uint64_t BasicInstType, bool skipVcc) {
using namespace llvm::AMDGPU::SDWA;
+
OptionalImmIndexMap OptionalIdx;
bool skippedVcc = false;
diff --git a/lib/Target/AMDGPU/BUFInstructions.td b/lib/Target/AMDGPU/BUFInstructions.td
index 2e96c14eaa32..2230457b3a9b 100644
--- a/lib/Target/AMDGPU/BUFInstructions.td
+++ b/lib/Target/AMDGPU/BUFInstructions.td
@@ -11,8 +11,8 @@ def MUBUFAddr32 : ComplexPattern<i64, 9, "SelectMUBUFAddr32">;
def MUBUFAddr64 : ComplexPattern<i64, 7, "SelectMUBUFAddr64">;
def MUBUFAddr64Atomic : ComplexPattern<i64, 5, "SelectMUBUFAddr64">;
-def MUBUFScratchOffen : ComplexPattern<i64, 4, "SelectMUBUFScratchOffen", [], [SDNPWantRoot]>;
-def MUBUFScratchOffset : ComplexPattern<i64, 3, "SelectMUBUFScratchOffset", [], [SDNPWantRoot], 20>;
+def MUBUFScratchOffen : ComplexPattern<i64, 4, "SelectMUBUFScratchOffen", [], [SDNPWantParent]>;
+def MUBUFScratchOffset : ComplexPattern<i64, 3, "SelectMUBUFScratchOffset", [], [SDNPWantParent], 20>;
def MUBUFOffset : ComplexPattern<i64, 6, "SelectMUBUFOffset">;
def MUBUFOffsetNoGLC : ComplexPattern<i64, 3, "SelectMUBUFOffset">;
@@ -425,45 +425,51 @@ class MUBUF_SetupAddr<int addrKind> {
class MUBUF_Load_Pseudo <string opName,
int addrKind,
RegisterClass vdataClass,
+ bit HasTiedDest = 0,
list<dag> pattern=[],
// Workaround bug bz30254
int addrKindCopy = addrKind>
: MUBUF_Pseudo<opName,
(outs vdataClass:$vdata),
- getMUBUFIns<addrKindCopy>.ret,
+ !con(getMUBUFIns<addrKindCopy>.ret, !if(HasTiedDest, (ins vdataClass:$vdata_in), (ins))),
" $vdata, " # getMUBUFAsmOps<addrKindCopy>.ret # "$glc$slc$tfe",
pattern>,
MUBUF_SetupAddr<addrKindCopy> {
let PseudoInstr = opName # "_" # getAddrName<addrKindCopy>.ret;
+ let Constraints = !if(HasTiedDest, "$vdata = $vdata_in", "");
let mayLoad = 1;
let mayStore = 0;
+ let maybeAtomic = 1;
}
// FIXME: tfe can't be an operand because it requires a separate
// opcode because it needs an N+1 register class dest register.
multiclass MUBUF_Pseudo_Loads<string opName, RegisterClass vdataClass,
ValueType load_vt = i32,
- SDPatternOperator ld = null_frag> {
+ SDPatternOperator ld = null_frag,
+ bit TiedDest = 0> {
def _OFFSET : MUBUF_Load_Pseudo <opName, BUFAddrKind.Offset, vdataClass,
+ TiedDest,
[(set load_vt:$vdata,
(ld (MUBUFOffset v4i32:$srsrc, i32:$soffset, i16:$offset, i1:$glc, i1:$slc, i1:$tfe)))]>,
MUBUFAddr64Table<0>;
def _ADDR64 : MUBUF_Load_Pseudo <opName, BUFAddrKind.Addr64, vdataClass,
+ TiedDest,
[(set load_vt:$vdata,
(ld (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, i16:$offset, i1:$glc, i1:$slc, i1:$tfe)))]>,
MUBUFAddr64Table<1>;
- def _OFFEN : MUBUF_Load_Pseudo <opName, BUFAddrKind.OffEn, vdataClass>;
- def _IDXEN : MUBUF_Load_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass>;
- def _BOTHEN : MUBUF_Load_Pseudo <opName, BUFAddrKind.BothEn, vdataClass>;
+ def _OFFEN : MUBUF_Load_Pseudo <opName, BUFAddrKind.OffEn, vdataClass, TiedDest>;
+ def _IDXEN : MUBUF_Load_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass, TiedDest>;
+ def _BOTHEN : MUBUF_Load_Pseudo <opName, BUFAddrKind.BothEn, vdataClass, TiedDest>;
let DisableWQM = 1 in {
- def _OFFSET_exact : MUBUF_Load_Pseudo <opName, BUFAddrKind.Offset, vdataClass>;
- def _OFFEN_exact : MUBUF_Load_Pseudo <opName, BUFAddrKind.OffEn, vdataClass>;
- def _IDXEN_exact : MUBUF_Load_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass>;
- def _BOTHEN_exact : MUBUF_Load_Pseudo <opName, BUFAddrKind.BothEn, vdataClass>;
+ def _OFFSET_exact : MUBUF_Load_Pseudo <opName, BUFAddrKind.Offset, vdataClass, TiedDest>;
+ def _OFFEN_exact : MUBUF_Load_Pseudo <opName, BUFAddrKind.OffEn, vdataClass, TiedDest>;
+ def _IDXEN_exact : MUBUF_Load_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass, TiedDest>;
+ def _BOTHEN_exact : MUBUF_Load_Pseudo <opName, BUFAddrKind.BothEn, vdataClass, TiedDest>;
}
}
@@ -483,6 +489,7 @@ class MUBUF_Store_Pseudo <string opName,
let PseudoInstr = opName # "_" # getAddrName<addrKindCopy>.ret;
let mayLoad = 0;
let mayStore = 1;
+ let maybeAtomic = 1;
}
multiclass MUBUF_Pseudo_Stores<string opName, RegisterClass vdataClass,
@@ -566,6 +573,7 @@ class MUBUF_Atomic_Pseudo<string opName,
let DisableWQM = 1;
let has_glc = 0;
let has_tfe = 0;
+ let maybeAtomic = 1;
}
class MUBUF_AtomicNoRet_Pseudo<string opName, int addrKind,
@@ -617,21 +625,21 @@ multiclass MUBUF_Pseudo_Atomics <string opName,
def _IDXEN : MUBUF_AtomicNoRet_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass>;
def _BOTHEN : MUBUF_AtomicNoRet_Pseudo <opName, BUFAddrKind.BothEn, vdataClass>;
- def _RTN_OFFSET : MUBUF_AtomicRet_Pseudo <opName, BUFAddrKind.Offset, vdataClass,
+ def _OFFSET_RTN : MUBUF_AtomicRet_Pseudo <opName, BUFAddrKind.Offset, vdataClass,
[(set vdataType:$vdata,
(atomic (MUBUFOffsetAtomic v4i32:$srsrc, i32:$soffset, i16:$offset, i1:$slc),
vdataType:$vdata_in))]>,
MUBUFAddr64Table <0, "_RTN">;
- def _RTN_ADDR64 : MUBUF_AtomicRet_Pseudo <opName, BUFAddrKind.Addr64, vdataClass,
+ def _ADDR64_RTN : MUBUF_AtomicRet_Pseudo <opName, BUFAddrKind.Addr64, vdataClass,
[(set vdataType:$vdata,
(atomic (MUBUFAddr64Atomic v4i32:$srsrc, i64:$vaddr, i32:$soffset, i16:$offset, i1:$slc),
vdataType:$vdata_in))]>,
MUBUFAddr64Table <1, "_RTN">;
- def _RTN_OFFEN : MUBUF_AtomicRet_Pseudo <opName, BUFAddrKind.OffEn, vdataClass>;
- def _RTN_IDXEN : MUBUF_AtomicRet_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass>;
- def _RTN_BOTHEN : MUBUF_AtomicRet_Pseudo <opName, BUFAddrKind.BothEn, vdataClass>;
+ def _OFFEN_RTN : MUBUF_AtomicRet_Pseudo <opName, BUFAddrKind.OffEn, vdataClass>;
+ def _IDXEN_RTN : MUBUF_AtomicRet_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass>;
+ def _BOTHEN_RTN : MUBUF_AtomicRet_Pseudo <opName, BUFAddrKind.BothEn, vdataClass>;
}
@@ -639,8 +647,6 @@ multiclass MUBUF_Pseudo_Atomics <string opName,
// MUBUF Instructions
//===----------------------------------------------------------------------===//
-let SubtargetPredicate = isGCN in {
-
defm BUFFER_LOAD_FORMAT_X : MUBUF_Pseudo_Loads <
"buffer_load_format_x", VGPR_32
>;
@@ -696,16 +702,16 @@ defm BUFFER_STORE_SHORT : MUBUF_Pseudo_Stores <
"buffer_store_short", VGPR_32, i32, truncstorei16_global
>;
defm BUFFER_STORE_DWORD : MUBUF_Pseudo_Stores <
- "buffer_store_dword", VGPR_32, i32, global_store
+ "buffer_store_dword", VGPR_32, i32, store_global
>;
defm BUFFER_STORE_DWORDX2 : MUBUF_Pseudo_Stores <
- "buffer_store_dwordx2", VReg_64, v2i32, global_store
+ "buffer_store_dwordx2", VReg_64, v2i32, store_global
>;
defm BUFFER_STORE_DWORDX3 : MUBUF_Pseudo_Stores <
- "buffer_store_dwordx3", VReg_96, untyped, global_store
+ "buffer_store_dwordx3", VReg_96, untyped, store_global
>;
defm BUFFER_STORE_DWORDX4 : MUBUF_Pseudo_Stores <
- "buffer_store_dwordx4", VReg_128, v4i32, global_store
+ "buffer_store_dwordx4", VReg_128, v4i32, store_global
>;
defm BUFFER_ATOMIC_SWAP : MUBUF_Pseudo_Atomics <
"buffer_atomic_swap", VGPR_32, i32, atomic_swap_global
@@ -802,6 +808,42 @@ def BUFFER_WBINVL1_SC : MUBUF_Invalidate <"buffer_wbinvl1_sc",
int_amdgcn_buffer_wbinvl1_sc>;
}
+let SubtargetPredicate = HasD16LoadStore in {
+
+defm BUFFER_LOAD_UBYTE_D16 : MUBUF_Pseudo_Loads <
+ "buffer_load_ubyte_d16", VGPR_32, i32, null_frag, 1
+>;
+
+defm BUFFER_LOAD_UBYTE_D16_HI : MUBUF_Pseudo_Loads <
+ "buffer_load_ubyte_d16_hi", VGPR_32, i32, null_frag, 1
+>;
+
+defm BUFFER_LOAD_SBYTE_D16 : MUBUF_Pseudo_Loads <
+ "buffer_load_sbyte_d16", VGPR_32, i32, null_frag, 1
+>;
+
+defm BUFFER_LOAD_SBYTE_D16_HI : MUBUF_Pseudo_Loads <
+ "buffer_load_sbyte_d16_hi", VGPR_32, i32, null_frag, 1
+>;
+
+defm BUFFER_LOAD_SHORT_D16 : MUBUF_Pseudo_Loads <
+ "buffer_load_short_d16", VGPR_32, i32, null_frag, 1
+>;
+
+defm BUFFER_LOAD_SHORT_D16_HI : MUBUF_Pseudo_Loads <
+ "buffer_load_short_d16_hi", VGPR_32, i32, null_frag, 1
+>;
+
+defm BUFFER_STORE_BYTE_D16_HI : MUBUF_Pseudo_Stores <
+ "buffer_store_byte_d16_hi", VGPR_32, i32
+>;
+
+defm BUFFER_STORE_SHORT_D16_HI : MUBUF_Pseudo_Stores <
+ "buffer_store_short_d16_hi", VGPR_32, i32
+>;
+
+} // End HasD16LoadStore
+
def BUFFER_WBINVL1 : MUBUF_Invalidate <"buffer_wbinvl1",
int_amdgcn_buffer_wbinvl1>;
@@ -818,8 +860,6 @@ defm TBUFFER_STORE_FORMAT_XY : MTBUF_Pseudo_Stores <"tbuffer_store_format_xy",
defm TBUFFER_STORE_FORMAT_XYZ : MTBUF_Pseudo_Stores <"tbuffer_store_format_xyz", VReg_128>;
defm TBUFFER_STORE_FORMAT_XYZW : MTBUF_Pseudo_Stores <"tbuffer_store_format_xyzw", VReg_128>;
-} // End let SubtargetPredicate = isGCN
-
let SubtargetPredicate = isCIVI in {
//===----------------------------------------------------------------------===//
@@ -838,22 +878,13 @@ def BUFFER_WBINVL1_VOL : MUBUF_Invalidate <"buffer_wbinvl1_vol",
// MUBUF Patterns
//===----------------------------------------------------------------------===//
-let Predicates = [isGCN] in {
-
-// Offset in an 32-bit VGPR
-def : Pat <
- (SIload_constant v4i32:$sbase, i32:$voff),
- (BUFFER_LOAD_DWORD_OFFEN $voff, $sbase, (i32 0), 0, 0, 0, 0)
->;
-
-
//===----------------------------------------------------------------------===//
// buffer_load/store_format patterns
//===----------------------------------------------------------------------===//
multiclass MUBUF_LoadIntrinsicPat<SDPatternOperator name, ValueType vt,
string opcode> {
- def : Pat<
+ def : GCNPat<
(vt (name v4i32:$rsrc, 0,
(MUBUFIntrinsicOffset i32:$soffset, i16:$offset),
imm:$glc, imm:$slc)),
@@ -861,7 +892,7 @@ multiclass MUBUF_LoadIntrinsicPat<SDPatternOperator name, ValueType vt,
(as_i1imm $glc), (as_i1imm $slc), 0)
>;
- def : Pat<
+ def : GCNPat<
(vt (name v4i32:$rsrc, i32:$vindex,
(MUBUFIntrinsicOffset i32:$soffset, i16:$offset),
imm:$glc, imm:$slc)),
@@ -869,7 +900,7 @@ multiclass MUBUF_LoadIntrinsicPat<SDPatternOperator name, ValueType vt,
(as_i1imm $glc), (as_i1imm $slc), 0)
>;
- def : Pat<
+ def : GCNPat<
(vt (name v4i32:$rsrc, 0,
(MUBUFIntrinsicVOffset i32:$soffset, i16:$offset, i32:$voffset),
imm:$glc, imm:$slc)),
@@ -877,7 +908,7 @@ multiclass MUBUF_LoadIntrinsicPat<SDPatternOperator name, ValueType vt,
(as_i1imm $glc), (as_i1imm $slc), 0)
>;
- def : Pat<
+ def : GCNPat<
(vt (name v4i32:$rsrc, i32:$vindex,
(MUBUFIntrinsicVOffset i32:$soffset, i16:$offset, i32:$voffset),
imm:$glc, imm:$slc)),
@@ -897,7 +928,7 @@ defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v4f32, "BUFFER_LOAD_DWORDX4">;
multiclass MUBUF_StoreIntrinsicPat<SDPatternOperator name, ValueType vt,
string opcode> {
- def : Pat<
+ def : GCNPat<
(name vt:$vdata, v4i32:$rsrc, 0,
(MUBUFIntrinsicOffset i32:$soffset, i16:$offset),
imm:$glc, imm:$slc),
@@ -905,7 +936,7 @@ multiclass MUBUF_StoreIntrinsicPat<SDPatternOperator name, ValueType vt,
(as_i1imm $glc), (as_i1imm $slc), 0)
>;
- def : Pat<
+ def : GCNPat<
(name vt:$vdata, v4i32:$rsrc, i32:$vindex,
(MUBUFIntrinsicOffset i32:$soffset, i16:$offset),
imm:$glc, imm:$slc),
@@ -914,7 +945,7 @@ multiclass MUBUF_StoreIntrinsicPat<SDPatternOperator name, ValueType vt,
(as_i1imm $slc), 0)
>;
- def : Pat<
+ def : GCNPat<
(name vt:$vdata, v4i32:$rsrc, 0,
(MUBUFIntrinsicVOffset i32:$soffset, i16:$offset, i32:$voffset),
imm:$glc, imm:$slc),
@@ -923,7 +954,7 @@ multiclass MUBUF_StoreIntrinsicPat<SDPatternOperator name, ValueType vt,
(as_i1imm $slc), 0)
>;
- def : Pat<
+ def : GCNPat<
(name vt:$vdata, v4i32:$rsrc, i32:$vindex,
(MUBUFIntrinsicVOffset i32:$soffset, i16:$offset, i32:$voffset),
imm:$glc, imm:$slc),
@@ -935,107 +966,107 @@ multiclass MUBUF_StoreIntrinsicPat<SDPatternOperator name, ValueType vt,
>;
}
-defm : MUBUF_StoreIntrinsicPat<int_amdgcn_buffer_store_format, f32, "BUFFER_STORE_FORMAT_X">;
-defm : MUBUF_StoreIntrinsicPat<int_amdgcn_buffer_store_format, v2f32, "BUFFER_STORE_FORMAT_XY">;
-defm : MUBUF_StoreIntrinsicPat<int_amdgcn_buffer_store_format, v4f32, "BUFFER_STORE_FORMAT_XYZW">;
-defm : MUBUF_StoreIntrinsicPat<int_amdgcn_buffer_store, f32, "BUFFER_STORE_DWORD">;
-defm : MUBUF_StoreIntrinsicPat<int_amdgcn_buffer_store, v2f32, "BUFFER_STORE_DWORDX2">;
-defm : MUBUF_StoreIntrinsicPat<int_amdgcn_buffer_store, v4f32, "BUFFER_STORE_DWORDX4">;
+defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format, f32, "BUFFER_STORE_FORMAT_X">;
+defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format, v2f32, "BUFFER_STORE_FORMAT_XY">;
+defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format, v4f32, "BUFFER_STORE_FORMAT_XYZW">;
+defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, f32, "BUFFER_STORE_DWORD">;
+defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v2f32, "BUFFER_STORE_DWORDX2">;
+defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v4f32, "BUFFER_STORE_DWORDX4">;
//===----------------------------------------------------------------------===//
// buffer_atomic patterns
//===----------------------------------------------------------------------===//
multiclass BufferAtomicPatterns<SDPatternOperator name, string opcode> {
- def : Pat<
+ def : GCNPat<
(name i32:$vdata_in, v4i32:$rsrc, 0,
(MUBUFIntrinsicOffset i32:$soffset, i16:$offset),
imm:$slc),
- (!cast<MUBUF_Pseudo>(opcode # _RTN_OFFSET) $vdata_in, $rsrc, $soffset,
+ (!cast<MUBUF_Pseudo>(opcode # _OFFSET_RTN) $vdata_in, $rsrc, $soffset,
(as_i16imm $offset), (as_i1imm $slc))
>;
- def : Pat<
+ def : GCNPat<
(name i32:$vdata_in, v4i32:$rsrc, i32:$vindex,
(MUBUFIntrinsicOffset i32:$soffset, i16:$offset),
imm:$slc),
- (!cast<MUBUF_Pseudo>(opcode # _RTN_IDXEN) $vdata_in, $vindex, $rsrc, $soffset,
+ (!cast<MUBUF_Pseudo>(opcode # _IDXEN_RTN) $vdata_in, $vindex, $rsrc, $soffset,
(as_i16imm $offset), (as_i1imm $slc))
>;
- def : Pat<
+ def : GCNPat<
(name i32:$vdata_in, v4i32:$rsrc, 0,
(MUBUFIntrinsicVOffset i32:$soffset, i16:$offset, i32:$voffset),
imm:$slc),
- (!cast<MUBUF_Pseudo>(opcode # _RTN_OFFEN) $vdata_in, $voffset, $rsrc, $soffset,
+ (!cast<MUBUF_Pseudo>(opcode # _OFFEN_RTN) $vdata_in, $voffset, $rsrc, $soffset,
(as_i16imm $offset), (as_i1imm $slc))
>;
- def : Pat<
+ def : GCNPat<
(name i32:$vdata_in, v4i32:$rsrc, i32:$vindex,
(MUBUFIntrinsicVOffset i32:$soffset, i16:$offset, i32:$voffset),
imm:$slc),
- (!cast<MUBUF_Pseudo>(opcode # _RTN_BOTHEN)
+ (!cast<MUBUF_Pseudo>(opcode # _BOTHEN_RTN)
$vdata_in,
(REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1),
$rsrc, $soffset, (as_i16imm $offset), (as_i1imm $slc))
>;
}
-defm : BufferAtomicPatterns<int_amdgcn_buffer_atomic_swap, "BUFFER_ATOMIC_SWAP">;
-defm : BufferAtomicPatterns<int_amdgcn_buffer_atomic_add, "BUFFER_ATOMIC_ADD">;
-defm : BufferAtomicPatterns<int_amdgcn_buffer_atomic_sub, "BUFFER_ATOMIC_SUB">;
-defm : BufferAtomicPatterns<int_amdgcn_buffer_atomic_smin, "BUFFER_ATOMIC_SMIN">;
-defm : BufferAtomicPatterns<int_amdgcn_buffer_atomic_umin, "BUFFER_ATOMIC_UMIN">;
-defm : BufferAtomicPatterns<int_amdgcn_buffer_atomic_smax, "BUFFER_ATOMIC_SMAX">;
-defm : BufferAtomicPatterns<int_amdgcn_buffer_atomic_umax, "BUFFER_ATOMIC_UMAX">;
-defm : BufferAtomicPatterns<int_amdgcn_buffer_atomic_and, "BUFFER_ATOMIC_AND">;
-defm : BufferAtomicPatterns<int_amdgcn_buffer_atomic_or, "BUFFER_ATOMIC_OR">;
-defm : BufferAtomicPatterns<int_amdgcn_buffer_atomic_xor, "BUFFER_ATOMIC_XOR">;
-
-def : Pat<
- (int_amdgcn_buffer_atomic_cmpswap
+defm : BufferAtomicPatterns<SIbuffer_atomic_swap, "BUFFER_ATOMIC_SWAP">;
+defm : BufferAtomicPatterns<SIbuffer_atomic_add, "BUFFER_ATOMIC_ADD">;
+defm : BufferAtomicPatterns<SIbuffer_atomic_sub, "BUFFER_ATOMIC_SUB">;
+defm : BufferAtomicPatterns<SIbuffer_atomic_smin, "BUFFER_ATOMIC_SMIN">;
+defm : BufferAtomicPatterns<SIbuffer_atomic_umin, "BUFFER_ATOMIC_UMIN">;
+defm : BufferAtomicPatterns<SIbuffer_atomic_smax, "BUFFER_ATOMIC_SMAX">;
+defm : BufferAtomicPatterns<SIbuffer_atomic_umax, "BUFFER_ATOMIC_UMAX">;
+defm : BufferAtomicPatterns<SIbuffer_atomic_and, "BUFFER_ATOMIC_AND">;
+defm : BufferAtomicPatterns<SIbuffer_atomic_or, "BUFFER_ATOMIC_OR">;
+defm : BufferAtomicPatterns<SIbuffer_atomic_xor, "BUFFER_ATOMIC_XOR">;
+
+def : GCNPat<
+ (SIbuffer_atomic_cmpswap
i32:$data, i32:$cmp, v4i32:$rsrc, 0,
(MUBUFIntrinsicOffset i32:$soffset, i16:$offset),
imm:$slc),
(EXTRACT_SUBREG
- (BUFFER_ATOMIC_CMPSWAP_RTN_OFFSET
+ (BUFFER_ATOMIC_CMPSWAP_OFFSET_RTN
(REG_SEQUENCE VReg_64, $data, sub0, $cmp, sub1),
$rsrc, $soffset, (as_i16imm $offset), (as_i1imm $slc)),
sub0)
>;
-def : Pat<
- (int_amdgcn_buffer_atomic_cmpswap
+def : GCNPat<
+ (SIbuffer_atomic_cmpswap
i32:$data, i32:$cmp, v4i32:$rsrc, i32:$vindex,
(MUBUFIntrinsicOffset i32:$soffset, i16:$offset),
imm:$slc),
(EXTRACT_SUBREG
- (BUFFER_ATOMIC_CMPSWAP_RTN_IDXEN
+ (BUFFER_ATOMIC_CMPSWAP_IDXEN_RTN
(REG_SEQUENCE VReg_64, $data, sub0, $cmp, sub1),
$vindex, $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $slc)),
sub0)
>;
-def : Pat<
- (int_amdgcn_buffer_atomic_cmpswap
+def : GCNPat<
+ (SIbuffer_atomic_cmpswap
i32:$data, i32:$cmp, v4i32:$rsrc, 0,
(MUBUFIntrinsicVOffset i32:$soffset, i16:$offset, i32:$voffset),
imm:$slc),
(EXTRACT_SUBREG
- (BUFFER_ATOMIC_CMPSWAP_RTN_OFFEN
+ (BUFFER_ATOMIC_CMPSWAP_OFFEN_RTN
(REG_SEQUENCE VReg_64, $data, sub0, $cmp, sub1),
$voffset, $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $slc)),
sub0)
>;
-def : Pat<
- (int_amdgcn_buffer_atomic_cmpswap
+def : GCNPat<
+ (SIbuffer_atomic_cmpswap
i32:$data, i32:$cmp, v4i32:$rsrc, i32:$vindex,
(MUBUFIntrinsicVOffset i32:$soffset, i16:$offset, i32:$voffset),
imm:$slc),
(EXTRACT_SUBREG
- (BUFFER_ATOMIC_CMPSWAP_RTN_BOTHEN
+ (BUFFER_ATOMIC_CMPSWAP_BOTHEN_RTN
(REG_SEQUENCE VReg_64, $data, sub0, $cmp, sub1),
(REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1),
$rsrc, $soffset, (as_i16imm $offset), (as_i1imm $slc)),
@@ -1044,7 +1075,7 @@ def : Pat<
class MUBUFLoad_PatternADDR64 <MUBUF_Pseudo Instr_ADDR64, ValueType vt,
- PatFrag constant_ld> : Pat <
+ PatFrag constant_ld> : GCNPat <
(vt (constant_ld (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset,
i16:$offset, i1:$glc, i1:$slc, i1:$tfe))),
(Instr_ADDR64 $vaddr, $srsrc, $soffset, $offset, $glc, $slc, $tfe)
@@ -1052,19 +1083,19 @@ class MUBUFLoad_PatternADDR64 <MUBUF_Pseudo Instr_ADDR64, ValueType vt,
multiclass MUBUFLoad_Atomic_Pattern <MUBUF_Pseudo Instr_ADDR64, MUBUF_Pseudo Instr_OFFSET,
ValueType vt, PatFrag atomic_ld> {
- def : Pat <
+ def : GCNPat <
(vt (atomic_ld (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset,
i16:$offset, i1:$slc))),
- (Instr_ADDR64 $vaddr, $srsrc, $soffset, $offset, 1, $slc, 0)
+ (Instr_ADDR64 $vaddr, $srsrc, $soffset, $offset, 0, $slc, 0)
>;
- def : Pat <
+ def : GCNPat <
(vt (atomic_ld (MUBUFOffsetNoGLC v4i32:$rsrc, i32:$soffset, i16:$offset))),
- (Instr_OFFSET $rsrc, $soffset, (as_i16imm $offset), 1, 0, 0)
+ (Instr_OFFSET $rsrc, $soffset, (as_i16imm $offset), 0, 0, 0)
>;
}
-let Predicates = [isSICI] in {
+let SubtargetPredicate = isSICI in {
def : MUBUFLoad_PatternADDR64 <BUFFER_LOAD_SBYTE_ADDR64, i32, sextloadi8_constant>;
def : MUBUFLoad_PatternADDR64 <BUFFER_LOAD_UBYTE_ADDR64, i32, az_extloadi8_constant>;
def : MUBUFLoad_PatternADDR64 <BUFFER_LOAD_SSHORT_ADDR64, i32, sextloadi16_constant>;
@@ -1072,52 +1103,123 @@ def : MUBUFLoad_PatternADDR64 <BUFFER_LOAD_USHORT_ADDR64, i32, az_extloadi16_con
defm : MUBUFLoad_Atomic_Pattern <BUFFER_LOAD_DWORD_ADDR64, BUFFER_LOAD_DWORD_OFFSET, i32, mubuf_load_atomic>;
defm : MUBUFLoad_Atomic_Pattern <BUFFER_LOAD_DWORDX2_ADDR64, BUFFER_LOAD_DWORDX2_OFFSET, i64, mubuf_load_atomic>;
-} // End Predicates = [isSICI]
+} // End SubtargetPredicate = isSICI
multiclass MUBUFLoad_Pattern <MUBUF_Pseudo Instr_OFFSET, ValueType vt,
PatFrag ld> {
- def : Pat <
+ def : GCNPat <
(vt (ld (MUBUFOffset v4i32:$srsrc, i32:$soffset,
i16:$offset, i1:$glc, i1:$slc, i1:$tfe))),
(Instr_OFFSET $srsrc, $soffset, $offset, $glc, $slc, $tfe)
>;
}
-let Predicates = [Has16BitInsts] in {
+let OtherPredicates = [Has16BitInsts] in {
defm : MUBUFLoad_Pattern <BUFFER_LOAD_SBYTE_OFFSET, i16, sextloadi8_constant>;
defm : MUBUFLoad_Pattern <BUFFER_LOAD_UBYTE_OFFSET, i16, az_extloadi8_constant>;
defm : MUBUFLoad_Pattern <BUFFER_LOAD_SBYTE_OFFSET, i16, mubuf_sextloadi8>;
defm : MUBUFLoad_Pattern <BUFFER_LOAD_UBYTE_OFFSET, i16, mubuf_az_extloadi8>;
-} // End Predicates = [Has16BitInsts]
+defm : MUBUFLoad_Pattern <BUFFER_LOAD_USHORT_OFFSET, i16, mubuf_load>;
+
+} // End OtherPredicates = [Has16BitInsts]
multiclass MUBUFScratchLoadPat <MUBUF_Pseudo InstrOffen,
MUBUF_Pseudo InstrOffset,
ValueType vt, PatFrag ld> {
- def : Pat <
+ def : GCNPat <
(vt (ld (MUBUFScratchOffen v4i32:$srsrc, i32:$vaddr,
i32:$soffset, u16imm:$offset))),
(InstrOffen $vaddr, $srsrc, $soffset, $offset, 0, 0, 0)
>;
- def : Pat <
+ def : GCNPat <
(vt (ld (MUBUFScratchOffset v4i32:$srsrc, i32:$soffset, u16imm:$offset))),
(InstrOffset $srsrc, $soffset, $offset, 0, 0, 0)
>;
}
+// XXX - Is it possible to have a complex pattern in a PatFrag?
+multiclass MUBUFScratchLoadPat_Hi16 <MUBUF_Pseudo InstrOffen,
+ MUBUF_Pseudo InstrOffset,
+ ValueType vt, PatFrag ld> {
+ def : GCNPat <
+ (build_vector vt:$lo, (vt (ld (MUBUFScratchOffen v4i32:$srsrc, i32:$vaddr,
+ i32:$soffset, u16imm:$offset)))),
+ (v2i16 (InstrOffen $vaddr, $srsrc, $soffset, $offset, 0, 0, 0, $lo))
+ >;
+
+ def : GCNPat <
+ (build_vector f16:$lo, (f16 (bitconvert (vt (ld (MUBUFScratchOffen v4i32:$srsrc, i32:$vaddr,
+ i32:$soffset, u16imm:$offset)))))),
+ (v2f16 (InstrOffen $vaddr, $srsrc, $soffset, $offset, 0, 0, 0, $lo))
+ >;
+
+
+ def : GCNPat <
+ (build_vector vt:$lo, (vt (ld (MUBUFScratchOffset v4i32:$srsrc, i32:$soffset, u16imm:$offset)))),
+ (v2i16 (InstrOffset $srsrc, $soffset, $offset, 0, 0, 0, $lo))
+ >;
+
+ def : GCNPat <
+ (build_vector f16:$lo, (f16 (bitconvert (vt (ld (MUBUFScratchOffset v4i32:$srsrc, i32:$soffset, u16imm:$offset)))))),
+ (v2f16 (InstrOffset $srsrc, $soffset, $offset, 0, 0, 0, $lo))
+ >;
+}
+
+multiclass MUBUFScratchLoadPat_Lo16 <MUBUF_Pseudo InstrOffen,
+ MUBUF_Pseudo InstrOffset,
+ ValueType vt, PatFrag ld> {
+ def : GCNPat <
+ (build_vector (vt (ld (MUBUFScratchOffen v4i32:$srsrc, i32:$vaddr,
+ i32:$soffset, u16imm:$offset))),
+ (vt (Hi16Elt vt:$hi))),
+ (v2i16 (InstrOffen $vaddr, $srsrc, $soffset, $offset, 0, 0, 0, $hi))
+ >;
+
+ def : GCNPat <
+ (build_vector (f16 (bitconvert (vt (ld (MUBUFScratchOffen v4i32:$srsrc, i32:$vaddr,
+ i32:$soffset, u16imm:$offset))))),
+ (f16 (Hi16Elt f16:$hi))),
+ (v2f16 (InstrOffen $vaddr, $srsrc, $soffset, $offset, 0, 0, 0, $hi))
+ >;
+
+ def : GCNPat <
+ (build_vector (vt (ld (MUBUFScratchOffset v4i32:$srsrc, i32:$soffset, u16imm:$offset))),
+ (vt (Hi16Elt vt:$hi))),
+ (v2i16 (InstrOffset $srsrc, $soffset, $offset, 0, 0, 0, $hi))
+ >;
+
+ def : GCNPat <
+ (build_vector (f16 (bitconvert (vt (ld (MUBUFScratchOffset v4i32:$srsrc, i32:$soffset, u16imm:$offset))))),
+ (f16 (Hi16Elt f16:$hi))),
+ (v2f16 (InstrOffset $srsrc, $soffset, $offset, 0, 0, 0, $hi))
+ >;
+}
+
defm : MUBUFScratchLoadPat <BUFFER_LOAD_SBYTE_OFFEN, BUFFER_LOAD_SBYTE_OFFSET, i32, sextloadi8_private>;
-defm : MUBUFScratchLoadPat <BUFFER_LOAD_UBYTE_OFFEN, BUFFER_LOAD_UBYTE_OFFSET, i32, extloadi8_private>;
+defm : MUBUFScratchLoadPat <BUFFER_LOAD_UBYTE_OFFEN, BUFFER_LOAD_UBYTE_OFFSET, i32, az_extloadi8_private>;
defm : MUBUFScratchLoadPat <BUFFER_LOAD_SBYTE_OFFEN, BUFFER_LOAD_SBYTE_OFFSET, i16, sextloadi8_private>;
-defm : MUBUFScratchLoadPat <BUFFER_LOAD_UBYTE_OFFEN, BUFFER_LOAD_UBYTE_OFFSET, i16, extloadi8_private>;
+defm : MUBUFScratchLoadPat <BUFFER_LOAD_UBYTE_OFFEN, BUFFER_LOAD_UBYTE_OFFSET, i16, az_extloadi8_private>;
defm : MUBUFScratchLoadPat <BUFFER_LOAD_SSHORT_OFFEN, BUFFER_LOAD_SSHORT_OFFSET, i32, sextloadi16_private>;
-defm : MUBUFScratchLoadPat <BUFFER_LOAD_USHORT_OFFEN, BUFFER_LOAD_USHORT_OFFSET, i32, extloadi16_private>;
+defm : MUBUFScratchLoadPat <BUFFER_LOAD_USHORT_OFFEN, BUFFER_LOAD_USHORT_OFFSET, i32, az_extloadi16_private>;
+defm : MUBUFScratchLoadPat <BUFFER_LOAD_USHORT_OFFEN, BUFFER_LOAD_USHORT_OFFSET, i16, load_private>;
defm : MUBUFScratchLoadPat <BUFFER_LOAD_DWORD_OFFEN, BUFFER_LOAD_DWORD_OFFSET, i32, load_private>;
defm : MUBUFScratchLoadPat <BUFFER_LOAD_DWORDX2_OFFEN, BUFFER_LOAD_DWORDX2_OFFSET, v2i32, load_private>;
defm : MUBUFScratchLoadPat <BUFFER_LOAD_DWORDX4_OFFEN, BUFFER_LOAD_DWORDX4_OFFSET, v4i32, load_private>;
+let OtherPredicates = [HasD16LoadStore] in {
+defm : MUBUFScratchLoadPat_Hi16<BUFFER_LOAD_SHORT_D16_HI_OFFEN, BUFFER_LOAD_SHORT_D16_HI_OFFSET, i16, load_private>;
+defm : MUBUFScratchLoadPat_Hi16<BUFFER_LOAD_UBYTE_D16_HI_OFFEN, BUFFER_LOAD_UBYTE_D16_HI_OFFSET, i16, az_extloadi8_private>;
+defm : MUBUFScratchLoadPat_Hi16<BUFFER_LOAD_SBYTE_D16_HI_OFFEN, BUFFER_LOAD_SBYTE_D16_HI_OFFSET, i16, sextloadi8_private>;
+
+defm : MUBUFScratchLoadPat_Lo16<BUFFER_LOAD_SHORT_D16_OFFEN, BUFFER_LOAD_SHORT_D16_OFFSET, i16, load_private>;
+defm : MUBUFScratchLoadPat_Lo16<BUFFER_LOAD_UBYTE_D16_OFFEN, BUFFER_LOAD_UBYTE_D16_OFFSET, i16, az_extloadi8_private>;
+defm : MUBUFScratchLoadPat_Lo16<BUFFER_LOAD_SBYTE_D16_OFFEN, BUFFER_LOAD_SBYTE_D16_OFFSET, i16, sextloadi8_private>;
+}
+
// BUFFER_LOAD_DWORD*, addr64=0
multiclass MUBUF_Load_Dword <ValueType vt,
MUBUF_Pseudo offset,
@@ -1125,7 +1227,7 @@ multiclass MUBUF_Load_Dword <ValueType vt,
MUBUF_Pseudo idxen,
MUBUF_Pseudo bothen> {
- def : Pat <
+ def : GCNPat <
(vt (int_SI_buffer_load_dword v4i32:$rsrc, (i32 imm), i32:$soffset,
imm:$offset, 0, 0, imm:$glc, imm:$slc,
imm:$tfe)),
@@ -1133,7 +1235,7 @@ multiclass MUBUF_Load_Dword <ValueType vt,
(as_i1imm $slc), (as_i1imm $tfe))
>;
- def : Pat <
+ def : GCNPat <
(vt (int_SI_buffer_load_dword v4i32:$rsrc, i32:$vaddr, i32:$soffset,
imm:$offset, 1, 0, imm:$glc, imm:$slc,
imm:$tfe)),
@@ -1141,7 +1243,7 @@ multiclass MUBUF_Load_Dword <ValueType vt,
(as_i1imm $tfe))
>;
- def : Pat <
+ def : GCNPat <
(vt (int_SI_buffer_load_dword v4i32:$rsrc, i32:$vaddr, i32:$soffset,
imm:$offset, 0, 1, imm:$glc, imm:$slc,
imm:$tfe)),
@@ -1149,7 +1251,7 @@ multiclass MUBUF_Load_Dword <ValueType vt,
(as_i1imm $slc), (as_i1imm $tfe))
>;
- def : Pat <
+ def : GCNPat <
(vt (int_SI_buffer_load_dword v4i32:$rsrc, v2i32:$vaddr, i32:$soffset,
imm:$offset, 1, 1, imm:$glc, imm:$slc,
imm:$tfe)),
@@ -1168,27 +1270,27 @@ defm : MUBUF_Load_Dword <v4i32, BUFFER_LOAD_DWORDX4_OFFSET, BUFFER_LOAD_DWORDX4_
multiclass MUBUFStore_Atomic_Pattern <MUBUF_Pseudo Instr_ADDR64, MUBUF_Pseudo Instr_OFFSET,
ValueType vt, PatFrag atomic_st> {
// Store follows atomic op convention so address is first
- def : Pat <
+ def : GCNPat <
(atomic_st (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset,
i16:$offset, i1:$slc), vt:$val),
- (Instr_ADDR64 $val, $vaddr, $srsrc, $soffset, $offset, 1, $slc, 0)
+ (Instr_ADDR64 $val, $vaddr, $srsrc, $soffset, $offset, 0, $slc, 0)
>;
- def : Pat <
+ def : GCNPat <
(atomic_st (MUBUFOffsetNoGLC v4i32:$rsrc, i32:$soffset, i16:$offset), vt:$val),
- (Instr_OFFSET $val, $rsrc, $soffset, (as_i16imm $offset), 1, 0, 0)
+ (Instr_OFFSET $val, $rsrc, $soffset, (as_i16imm $offset), 0, 0, 0)
>;
}
-let Predicates = [isSICI] in {
-defm : MUBUFStore_Atomic_Pattern <BUFFER_STORE_DWORD_ADDR64, BUFFER_STORE_DWORD_OFFSET, i32, global_store_atomic>;
-defm : MUBUFStore_Atomic_Pattern <BUFFER_STORE_DWORDX2_ADDR64, BUFFER_STORE_DWORDX2_OFFSET, i64, global_store_atomic>;
-} // End Predicates = [isSICI]
+let SubtargetPredicate = isSICI in {
+defm : MUBUFStore_Atomic_Pattern <BUFFER_STORE_DWORD_ADDR64, BUFFER_STORE_DWORD_OFFSET, i32, store_atomic_global>;
+defm : MUBUFStore_Atomic_Pattern <BUFFER_STORE_DWORDX2_ADDR64, BUFFER_STORE_DWORDX2_OFFSET, i64, store_atomic_global>;
+} // End SubtargetPredicate = isSICI
multiclass MUBUFStore_Pattern <MUBUF_Pseudo Instr_OFFSET, ValueType vt,
PatFrag st> {
- def : Pat <
+ def : GCNPat <
(st vt:$vdata, (MUBUFOffset v4i32:$srsrc, i32:$soffset,
i16:$offset, i1:$glc, i1:$slc, i1:$tfe)),
(Instr_OFFSET $vdata, $srsrc, $soffset, $offset, $glc, $slc, $tfe)
@@ -1196,18 +1298,18 @@ multiclass MUBUFStore_Pattern <MUBUF_Pseudo Instr_OFFSET, ValueType vt,
}
defm : MUBUFStore_Pattern <BUFFER_STORE_BYTE_OFFSET, i16, truncstorei8_global>;
-defm : MUBUFStore_Pattern <BUFFER_STORE_SHORT_OFFSET, i16, global_store>;
+defm : MUBUFStore_Pattern <BUFFER_STORE_SHORT_OFFSET, i16, store_global>;
multiclass MUBUFScratchStorePat <MUBUF_Pseudo InstrOffen,
MUBUF_Pseudo InstrOffset,
ValueType vt, PatFrag st> {
- def : Pat <
+ def : GCNPat <
(st vt:$value, (MUBUFScratchOffen v4i32:$srsrc, i32:$vaddr,
i32:$soffset, u16imm:$offset)),
(InstrOffen $value, $vaddr, $srsrc, $soffset, $offset, 0, 0, 0)
>;
- def : Pat <
+ def : GCNPat <
(st vt:$value, (MUBUFScratchOffset v4i32:$srsrc, i32:$soffset,
u16imm:$offset)),
(InstrOffset $value, $srsrc, $soffset, $offset, 0, 0, 0)
@@ -1222,6 +1324,16 @@ defm : MUBUFScratchStorePat <BUFFER_STORE_DWORD_OFFEN, BUFFER_STORE_DWORD_OFFSET
defm : MUBUFScratchStorePat <BUFFER_STORE_DWORDX2_OFFEN, BUFFER_STORE_DWORDX2_OFFSET, v2i32, store_private>;
defm : MUBUFScratchStorePat <BUFFER_STORE_DWORDX4_OFFEN, BUFFER_STORE_DWORDX4_OFFSET, v4i32, store_private>;
+
+let OtherPredicates = [HasD16LoadStore] in {
+ // Hiding the extract high pattern in the PatFrag seems to not
+ // automatically increase the complexity.
+let AddedComplexity = 1 in {
+defm : MUBUFScratchStorePat <BUFFER_STORE_SHORT_D16_HI_OFFEN, BUFFER_STORE_SHORT_D16_HI_OFFSET, i32, store_hi16_private>;
+defm : MUBUFScratchStorePat <BUFFER_STORE_BYTE_D16_HI_OFFEN, BUFFER_STORE_BYTE_D16_HI_OFFSET, i32, truncstorei8_hi16_private>;
+}
+}
+
//===----------------------------------------------------------------------===//
// MTBUF Patterns
//===----------------------------------------------------------------------===//
@@ -1232,28 +1344,28 @@ defm : MUBUFScratchStorePat <BUFFER_STORE_DWORDX4_OFFEN, BUFFER_STORE_DWORDX4_OF
multiclass MTBUF_LoadIntrinsicPat<SDPatternOperator name, ValueType vt,
string opcode> {
- def : Pat<
+ def : GCNPat<
(vt (name v4i32:$rsrc, 0, 0, i32:$soffset, imm:$offset,
imm:$dfmt, imm:$nfmt, imm:$glc, imm:$slc)),
(!cast<MTBUF_Pseudo>(opcode # _OFFSET) $rsrc, $soffset, (as_i16imm $offset),
(as_i8imm $dfmt), (as_i8imm $nfmt), (as_i1imm $glc), (as_i1imm $slc), 0)
>;
- def : Pat<
+ def : GCNPat<
(vt (name v4i32:$rsrc, i32:$vindex, 0, i32:$soffset, imm:$offset,
imm:$dfmt, imm:$nfmt, imm:$glc, imm:$slc)),
(!cast<MTBUF_Pseudo>(opcode # _IDXEN) $vindex, $rsrc, $soffset, (as_i16imm $offset),
(as_i8imm $dfmt), (as_i8imm $nfmt), (as_i1imm $glc), (as_i1imm $slc), 0)
>;
- def : Pat<
+ def : GCNPat<
(vt (name v4i32:$rsrc, 0, i32:$voffset, i32:$soffset, imm:$offset,
imm:$dfmt, imm:$nfmt, imm:$glc, imm:$slc)),
(!cast<MTBUF_Pseudo>(opcode # _OFFEN) $voffset, $rsrc, $soffset, (as_i16imm $offset),
(as_i8imm $dfmt), (as_i8imm $nfmt), (as_i1imm $glc), (as_i1imm $slc), 0)
>;
- def : Pat<
+ def : GCNPat<
(vt (name v4i32:$rsrc, i32:$vindex, i32:$voffset, i32:$soffset, imm:$offset,
imm:$dfmt, imm:$nfmt, imm:$glc, imm:$slc)),
(!cast<MTBUF_Pseudo>(opcode # _BOTHEN)
@@ -1272,7 +1384,7 @@ defm : MTBUF_LoadIntrinsicPat<SItbuffer_load, v4f32, "TBUFFER_LOAD_FORMAT_XYZW">
multiclass MTBUF_StoreIntrinsicPat<SDPatternOperator name, ValueType vt,
string opcode> {
- def : Pat<
+ def : GCNPat<
(name vt:$vdata, v4i32:$rsrc, 0, 0, i32:$soffset, imm:$offset,
imm:$dfmt, imm:$nfmt, imm:$glc, imm:$slc),
(!cast<MTBUF_Pseudo>(opcode # _OFFSET_exact) $vdata, $rsrc, $soffset,
@@ -1281,7 +1393,7 @@ multiclass MTBUF_StoreIntrinsicPat<SDPatternOperator name, ValueType vt,
(as_i1imm $slc), 0)
>;
- def : Pat<
+ def : GCNPat<
(name vt:$vdata, v4i32:$rsrc, i32:$vindex, 0, i32:$soffset, imm:$offset,
imm:$dfmt, imm:$nfmt, imm:$glc, imm:$slc),
(!cast<MTBUF_Pseudo>(opcode # _IDXEN_exact) $vdata, $vindex, $rsrc, $soffset,
@@ -1290,7 +1402,7 @@ multiclass MTBUF_StoreIntrinsicPat<SDPatternOperator name, ValueType vt,
(as_i1imm $slc), 0)
>;
- def : Pat<
+ def : GCNPat<
(name vt:$vdata, v4i32:$rsrc, 0, i32:$voffset, i32:$soffset, imm:$offset,
imm:$dfmt, imm:$nfmt, imm:$glc, imm:$slc),
(!cast<MTBUF_Pseudo>(opcode # _OFFEN_exact) $vdata, $voffset, $rsrc, $soffset,
@@ -1299,7 +1411,7 @@ multiclass MTBUF_StoreIntrinsicPat<SDPatternOperator name, ValueType vt,
(as_i1imm $slc), 0)
>;
- def : Pat<
+ def : GCNPat<
(name vt:$vdata, v4i32:$rsrc, i32:$vindex, i32:$voffset, i32:$soffset,
imm:$offset, imm:$dfmt, imm:$nfmt, imm:$glc, imm:$slc),
(!cast<MTBUF_Pseudo>(opcode # _BOTHEN_exact)
@@ -1319,8 +1431,6 @@ defm : MTBUF_StoreIntrinsicPat<SItbuffer_store, v2f32, "TBUFFER_STORE_FORMAT_XY"
defm : MTBUF_StoreIntrinsicPat<SItbuffer_store_x3, v4f32, "TBUFFER_STORE_FORMAT_XYZ">;
defm : MTBUF_StoreIntrinsicPat<SItbuffer_store, v4f32, "TBUFFER_STORE_FORMAT_XYZW">;
-} // End let Predicates = [isGCN]
-
//===----------------------------------------------------------------------===//
// Target instructions, move to the appropriate target TD file
//===----------------------------------------------------------------------===//
@@ -1361,11 +1471,11 @@ multiclass MUBUF_Real_AllAddr_si<bits<7> op> {
}
multiclass MUBUF_Real_Atomic_si<bits<7> op> : MUBUF_Real_AllAddr_si<op> {
- def _RTN_OFFSET_si : MUBUF_Real_si <op, !cast<MUBUF_Pseudo>(NAME#"_RTN_OFFSET")>;
- def _RTN_ADDR64_si : MUBUF_Real_si <op, !cast<MUBUF_Pseudo>(NAME#"_RTN_ADDR64")>;
- def _RTN_OFFEN_si : MUBUF_Real_si <op, !cast<MUBUF_Pseudo>(NAME#"_RTN_OFFEN")>;
- def _RTN_IDXEN_si : MUBUF_Real_si <op, !cast<MUBUF_Pseudo>(NAME#"_RTN_IDXEN")>;
- def _RTN_BOTHEN_si : MUBUF_Real_si <op, !cast<MUBUF_Pseudo>(NAME#"_RTN_BOTHEN")>;
+ def _OFFSET_RTN_si : MUBUF_Real_si <op, !cast<MUBUF_Pseudo>(NAME#"_OFFSET_RTN")>;
+ def _ADDR64_RTN_si : MUBUF_Real_si <op, !cast<MUBUF_Pseudo>(NAME#"_ADDR64_RTN")>;
+ def _OFFEN_RTN_si : MUBUF_Real_si <op, !cast<MUBUF_Pseudo>(NAME#"_OFFEN_RTN")>;
+ def _IDXEN_RTN_si : MUBUF_Real_si <op, !cast<MUBUF_Pseudo>(NAME#"_IDXEN_RTN")>;
+ def _BOTHEN_RTN_si : MUBUF_Real_si <op, !cast<MUBUF_Pseudo>(NAME#"_BOTHEN_RTN")>;
}
defm BUFFER_LOAD_FORMAT_X : MUBUF_Real_AllAddr_si <0x00>;
@@ -1520,10 +1630,10 @@ multiclass MUBUF_Real_AllAddr_vi<bits<7> op> {
multiclass MUBUF_Real_Atomic_vi<bits<7> op> :
MUBUF_Real_AllAddr_vi<op> {
- def _RTN_OFFSET_vi : MUBUF_Real_vi <op, !cast<MUBUF_Pseudo>(NAME#"_RTN_OFFSET")>;
- def _RTN_OFFEN_vi : MUBUF_Real_vi <op, !cast<MUBUF_Pseudo>(NAME#"_RTN_OFFEN")>;
- def _RTN_IDXEN_vi : MUBUF_Real_vi <op, !cast<MUBUF_Pseudo>(NAME#"_RTN_IDXEN")>;
- def _RTN_BOTHEN_vi : MUBUF_Real_vi <op, !cast<MUBUF_Pseudo>(NAME#"_RTN_BOTHEN")>;
+ def _OFFSET_RTN_vi : MUBUF_Real_vi <op, !cast<MUBUF_Pseudo>(NAME#"_OFFSET_RTN")>;
+ def _OFFEN_RTN_vi : MUBUF_Real_vi <op, !cast<MUBUF_Pseudo>(NAME#"_OFFEN_RTN")>;
+ def _IDXEN_RTN_vi : MUBUF_Real_vi <op, !cast<MUBUF_Pseudo>(NAME#"_IDXEN_RTN")>;
+ def _BOTHEN_RTN_vi : MUBUF_Real_vi <op, !cast<MUBUF_Pseudo>(NAME#"_BOTHEN_RTN")>;
}
defm BUFFER_LOAD_FORMAT_X : MUBUF_Real_AllAddr_vi <0x00>;
@@ -1543,12 +1653,21 @@ defm BUFFER_LOAD_DWORDX2 : MUBUF_Real_AllAddr_vi <0x15>;
defm BUFFER_LOAD_DWORDX3 : MUBUF_Real_AllAddr_vi <0x16>;
defm BUFFER_LOAD_DWORDX4 : MUBUF_Real_AllAddr_vi <0x17>;
defm BUFFER_STORE_BYTE : MUBUF_Real_AllAddr_vi <0x18>;
+defm BUFFER_STORE_BYTE_D16_HI : MUBUF_Real_AllAddr_vi <0x19>;
defm BUFFER_STORE_SHORT : MUBUF_Real_AllAddr_vi <0x1a>;
+defm BUFFER_STORE_SHORT_D16_HI : MUBUF_Real_AllAddr_vi <0x1b>;
defm BUFFER_STORE_DWORD : MUBUF_Real_AllAddr_vi <0x1c>;
defm BUFFER_STORE_DWORDX2 : MUBUF_Real_AllAddr_vi <0x1d>;
defm BUFFER_STORE_DWORDX3 : MUBUF_Real_AllAddr_vi <0x1e>;
defm BUFFER_STORE_DWORDX4 : MUBUF_Real_AllAddr_vi <0x1f>;
+defm BUFFER_LOAD_UBYTE_D16 : MUBUF_Real_AllAddr_vi <0x20>;
+defm BUFFER_LOAD_UBYTE_D16_HI : MUBUF_Real_AllAddr_vi <0x21>;
+defm BUFFER_LOAD_SBYTE_D16 : MUBUF_Real_AllAddr_vi <0x22>;
+defm BUFFER_LOAD_SBYTE_D16_HI : MUBUF_Real_AllAddr_vi <0x23>;
+defm BUFFER_LOAD_SHORT_D16 : MUBUF_Real_AllAddr_vi <0x24>;
+defm BUFFER_LOAD_SHORT_D16_HI : MUBUF_Real_AllAddr_vi <0x25>;
+
defm BUFFER_ATOMIC_SWAP : MUBUF_Real_Atomic_vi <0x40>;
defm BUFFER_ATOMIC_CMPSWAP : MUBUF_Real_Atomic_vi <0x41>;
defm BUFFER_ATOMIC_ADD : MUBUF_Real_Atomic_vi <0x42>;
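
For context on the tied-destination mechanism the new D16 loads above rely on: a bit parameter conditionally appends an extra input operand with !con/!if and ties it to the result through Constraints, so the half of the destination register that the load does not write is preserved. Below is a minimal TableGen sketch of that technique only, assuming a simplified pseudo derived from InstSI; ExampleLoad_D16 and its operand names are hypothetical and not part of this patch.

// Minimal sketch of the conditional tied-destination technique used by
// MUBUF_Load_Pseudo above; class and operand names here are hypothetical.
class ExampleLoad_D16<string opName, RegisterClass rc, bit HasTiedDest = 0>
  : InstSI<(outs rc:$vdata),
           !con((ins VGPR_32:$vaddr),
                !if(HasTiedDest, (ins rc:$vdata_in), (ins))),
           opName#" $vdata, $vaddr"> {
  // Tying $vdata_in to $vdata makes the register allocator reuse the
  // destination register, so the untouched 16-bit half survives the load.
  let Constraints = !if(HasTiedDest, "$vdata = $vdata_in", "");
}
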
diff --git a/lib/Target/AMDGPU/CIInstructions.td b/lib/Target/AMDGPU/CIInstructions.td
deleted file mode 100644
index 26a483a8abf6..000000000000
--- a/lib/Target/AMDGPU/CIInstructions.td
+++ /dev/null
@@ -1,15 +0,0 @@
-//===-- CIInstructions.td - CI Instruction Defintions ---------------------===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-// Instruction definitions for CI and newer.
-//===----------------------------------------------------------------------===//
-// Remaining instructions:
-// S_CBRANCH_CDBGUSER
-// S_CBRANCH_CDBGSYS
-// S_CBRANCH_CDBGSYS_OR_USER
-// S_CBRANCH_CDBGSYS_AND_USER
\ No newline at end of file
diff --git a/lib/Target/AMDGPU/CMakeLists.txt b/lib/Target/AMDGPU/CMakeLists.txt
index 971208c5db84..3a8503030414 100644
--- a/lib/Target/AMDGPU/CMakeLists.txt
+++ b/lib/Target/AMDGPU/CMakeLists.txt
@@ -12,57 +12,52 @@ tablegen(LLVM AMDGPUGenAsmWriter.inc -gen-asm-writer)
tablegen(LLVM AMDGPUGenAsmMatcher.inc -gen-asm-matcher)
tablegen(LLVM AMDGPUGenDisassemblerTables.inc -gen-disassembler)
tablegen(LLVM AMDGPUGenMCPseudoLowering.inc -gen-pseudo-lowering)
-if(LLVM_BUILD_GLOBAL_ISEL)
- tablegen(LLVM AMDGPUGenRegisterBank.inc -gen-register-bank)
-endif()
+tablegen(LLVM AMDGPUGenRegisterBank.inc -gen-register-bank)
add_public_tablegen_target(AMDGPUCommonTableGen)
-# List of all GlobalISel files.
-set(GLOBAL_ISEL_FILES
- AMDGPUCallLowering.cpp
- AMDGPUInstructionSelector.cpp
- AMDGPULegalizerInfo.cpp
- AMDGPURegisterBankInfo.cpp
- )
-
-# Add GlobalISel files to the dependencies if the user wants to build it.
-if(LLVM_BUILD_GLOBAL_ISEL)
- set(GLOBAL_ISEL_BUILD_FILES ${GLOBAL_ISEL_FILES})
-else()
- set(GLOBAL_ISEL_BUILD_FILES"")
- set(LLVM_OPTIONAL_SOURCES LLVMGlobalISel ${GLOBAL_ISEL_FILES})
-endif()
-
-
add_llvm_target(AMDGPUCodeGen
- AMDILCFGStructurizer.cpp
AMDGPUAliasAnalysis.cpp
AMDGPUAlwaysInlinePass.cpp
AMDGPUAnnotateKernelFeatures.cpp
AMDGPUAnnotateUniformValues.cpp
+ AMDGPUArgumentUsageInfo.cpp
AMDGPUAsmPrinter.cpp
+ AMDGPUCallLowering.cpp
AMDGPUCodeGenPrepare.cpp
AMDGPUFrameLowering.cpp
- AMDGPUTargetObjectFile.cpp
+ AMDGPUInstrInfo.cpp
+ AMDGPUInstructionSelector.cpp
AMDGPUIntrinsicInfo.cpp
AMDGPUISelDAGToDAG.cpp
+ AMDGPUISelLowering.cpp
+ AMDGPULegalizerInfo.cpp
+ AMDGPULibCalls.cpp
+ AMDGPULibFunc.cpp
AMDGPULowerIntrinsics.cpp
- AMDGPUMacroFusion.cpp
- AMDGPUMCInstLower.cpp
AMDGPUMachineCFGStructurizer.cpp
AMDGPUMachineFunction.cpp
- AMDGPUUnifyMetadata.cpp
+ AMDGPUMachineModuleInfo.cpp
+ AMDGPUMacroFusion.cpp
+ AMDGPUMCInstLower.cpp
+ AMDGPUOpenCLEnqueuedBlockLowering.cpp
AMDGPUOpenCLImageTypeLoweringPass.cpp
- AMDGPUSubtarget.cpp
- AMDGPUTargetMachine.cpp
- AMDGPUTargetTransformInfo.cpp
- AMDGPUISelLowering.cpp
- AMDGPUInstrInfo.cpp
AMDGPUPromoteAlloca.cpp
AMDGPURegAsmNames.inc.cpp
+ AMDGPURegisterBankInfo.cpp
AMDGPURegisterInfo.cpp
+ AMDGPURewriteOutArguments.cpp
+ AMDGPUSubtarget.cpp
+ AMDGPUTargetMachine.cpp
+ AMDGPUTargetObjectFile.cpp
+ AMDGPUTargetTransformInfo.cpp
AMDGPUUnifyDivergentExitNodes.cpp
+ AMDGPUUnifyMetadata.cpp
+ AMDGPUInline.cpp
+ AMDILCFGStructurizer.cpp
GCNHazardRecognizer.cpp
+ GCNIterativeScheduler.cpp
+ GCNMinRegStrategy.cpp
+ GCNRegPressure.cpp
GCNSchedStrategy.cpp
R600ClauseMergePass.cpp
R600ControlFlowFinalizer.cpp
@@ -78,14 +73,14 @@ add_llvm_target(AMDGPUCodeGen
R600RegisterInfo.cpp
SIAnnotateControlFlow.cpp
SIDebuggerInsertNops.cpp
- SIFixControlFlowLiveIntervals.cpp
SIFixSGPRCopies.cpp
SIFixVGPRCopies.cpp
+ SIFixWWMLiveness.cpp
SIFoldOperands.cpp
SIFrameLowering.cpp
SIInsertSkips.cpp
- SIInsertWaits.cpp
SIInsertWaitcnts.cpp
+ SIInsertWaits.cpp
SIInstrInfo.cpp
SIISelLowering.cpp
SILoadStoreOptimizer.cpp
@@ -93,15 +88,14 @@ add_llvm_target(AMDGPUCodeGen
SILowerI1Copies.cpp
SIMachineFunctionInfo.cpp
SIMachineScheduler.cpp
+ SIMemoryLegalizer.cpp
SIOptimizeExecMasking.cpp
+ SIOptimizeExecMaskingPreRA.cpp
SIPeepholeSDWA.cpp
SIRegisterInfo.cpp
SIShrinkInstructions.cpp
SIWholeQuadMode.cpp
- GCNIterativeScheduler.cpp
- GCNMinRegStrategy.cpp
- GCNRegPressure.cpp
- ${GLOBAL_ISEL_BUILD_FILES}
+ GCNILPSched.cpp
)
add_subdirectory(AsmParser)
diff --git a/lib/Target/AMDGPU/CaymanInstructions.td b/lib/Target/AMDGPU/CaymanInstructions.td
index 6b8e85a73c73..ae40c6387982 100644
--- a/lib/Target/AMDGPU/CaymanInstructions.td
+++ b/lib/Target/AMDGPU/CaymanInstructions.td
@@ -18,7 +18,7 @@ def isCayman : Predicate<"Subtarget->hasCaymanISA()">;
// Cayman Instructions
//===----------------------------------------------------------------------===//
-let Predicates = [isCayman] in {
+let SubtargetPredicate = isCayman in {
def MULADD_INT24_cm : R600_3OP <0x08, "MULADD_INT24",
[(set i32:$dst, (AMDGPUmad_i24 i32:$src0, i32:$src1, i32:$src2))], VecALU
@@ -57,26 +57,27 @@ defm DIV_cm : DIV_Common<RECIP_IEEE_cm>;
// RECIP_UINT emulation for Cayman
// The multiplication scales from [0,1] to the unsigned integer range
-def : Pat <
+def : R600Pat <
(AMDGPUurecip i32:$src0),
(FLT_TO_UINT_eg (MUL_IEEE (RECIP_IEEE_cm (UINT_TO_FLT_eg $src0)),
(MOV_IMM_I32 CONST.FP_UINT_MAX_PLUS_1)))
>;
- def CF_END_CM : CF_CLAUSE_EG<32, (ins), "CF_END"> {
+def CF_END_CM : CF_CLAUSE_EG<32, (ins), "CF_END"> {
let ADDR = 0;
let POP_COUNT = 0;
let COUNT = 0;
}
-def : Pat<(fsqrt f32:$src), (MUL R600_Reg32:$src, (RECIPSQRT_CLAMPED_cm $src))>;
+
+def : R600Pat<(fsqrt f32:$src), (MUL R600_Reg32:$src, (RECIPSQRT_CLAMPED_cm $src))>;
class RAT_STORE_DWORD <RegisterClass rc, ValueType vt, bits<4> mask> :
CF_MEM_RAT_CACHELESS <0x14, 0, mask,
(ins rc:$rw_gpr, R600_TReg32_X:$index_gpr),
"STORE_DWORD $rw_gpr, $index_gpr",
- [(global_store vt:$rw_gpr, i32:$index_gpr)]> {
+ [(store_global vt:$rw_gpr, i32:$index_gpr)]> {
let eop = 0; // This bit is not used on Cayman.
}
@@ -143,8 +144,8 @@ def VTX_READ_32_cm
// to be caused by ALU instructions in the next instruction group that wrote
// to the $src_gpr registers of the VTX_READ.
// e.g.
- // %T3_X<def> = VTX_READ_PARAM_32_eg %T2_X<kill>, 24
- // %T2_X<def> = MOV %ZERO
+ // %t3_x = VTX_READ_PARAM_32_eg killed %t2_x, 24
+ // %t2_x = MOV %zero
// Adding this constraint prevents this from happening.
let Constraints = "$src_gpr.ptr = $dst_gpr";
}
@@ -179,44 +180,43 @@ def VTX_READ_128_cm
//===----------------------------------------------------------------------===//
// VTX Read from parameter memory space
//===----------------------------------------------------------------------===//
-def : Pat<(i32:$dst_gpr (vtx_id3_az_extloadi8 ADDRVTX_READ:$src_gpr)),
+def : R600Pat<(i32:$dst_gpr (vtx_id3_az_extloadi8 ADDRVTX_READ:$src_gpr)),
(VTX_READ_8_cm MEMxi:$src_gpr, 3)>;
-def : Pat<(i32:$dst_gpr (vtx_id3_az_extloadi16 ADDRVTX_READ:$src_gpr)),
+def : R600Pat<(i32:$dst_gpr (vtx_id3_az_extloadi16 ADDRVTX_READ:$src_gpr)),
(VTX_READ_16_cm MEMxi:$src_gpr, 3)>;
-def : Pat<(i32:$dst_gpr (vtx_id3_load ADDRVTX_READ:$src_gpr)),
+def : R600Pat<(i32:$dst_gpr (vtx_id3_load ADDRVTX_READ:$src_gpr)),
(VTX_READ_32_cm MEMxi:$src_gpr, 3)>;
-def : Pat<(v2i32:$dst_gpr (vtx_id3_load ADDRVTX_READ:$src_gpr)),
+def : R600Pat<(v2i32:$dst_gpr (vtx_id3_load ADDRVTX_READ:$src_gpr)),
(VTX_READ_64_cm MEMxi:$src_gpr, 3)>;
-def : Pat<(v4i32:$dst_gpr (vtx_id3_load ADDRVTX_READ:$src_gpr)),
+def : R600Pat<(v4i32:$dst_gpr (vtx_id3_load ADDRVTX_READ:$src_gpr)),
(VTX_READ_128_cm MEMxi:$src_gpr, 3)>;
//===----------------------------------------------------------------------===//
// VTX Read from constant memory space
//===----------------------------------------------------------------------===//
-def : Pat<(i32:$dst_gpr (vtx_id2_az_extloadi8 ADDRVTX_READ:$src_gpr)),
+def : R600Pat<(i32:$dst_gpr (vtx_id2_az_extloadi8 ADDRVTX_READ:$src_gpr)),
(VTX_READ_8_cm MEMxi:$src_gpr, 2)>;
-def : Pat<(i32:$dst_gpr (vtx_id2_az_extloadi16 ADDRVTX_READ:$src_gpr)),
+def : R600Pat<(i32:$dst_gpr (vtx_id2_az_extloadi16 ADDRVTX_READ:$src_gpr)),
(VTX_READ_16_cm MEMxi:$src_gpr, 2)>;
-def : Pat<(i32:$dst_gpr (vtx_id2_load ADDRVTX_READ:$src_gpr)),
+def : R600Pat<(i32:$dst_gpr (vtx_id2_load ADDRVTX_READ:$src_gpr)),
(VTX_READ_32_cm MEMxi:$src_gpr, 2)>;
-def : Pat<(v2i32:$dst_gpr (vtx_id2_load ADDRVTX_READ:$src_gpr)),
+def : R600Pat<(v2i32:$dst_gpr (vtx_id2_load ADDRVTX_READ:$src_gpr)),
(VTX_READ_64_cm MEMxi:$src_gpr, 2)>;
-def : Pat<(v4i32:$dst_gpr (vtx_id2_load ADDRVTX_READ:$src_gpr)),
+def : R600Pat<(v4i32:$dst_gpr (vtx_id2_load ADDRVTX_READ:$src_gpr)),
(VTX_READ_128_cm MEMxi:$src_gpr, 2)>;
//===----------------------------------------------------------------------===//
// VTX Read from global memory space
//===----------------------------------------------------------------------===//
-def : Pat<(i32:$dst_gpr (vtx_id1_az_extloadi8 ADDRVTX_READ:$src_gpr)),
+def : R600Pat<(i32:$dst_gpr (vtx_id1_az_extloadi8 ADDRVTX_READ:$src_gpr)),
(VTX_READ_8_cm MEMxi:$src_gpr, 1)>;
-def : Pat<(i32:$dst_gpr (vtx_id1_az_extloadi16 ADDRVTX_READ:$src_gpr)),
+def : R600Pat<(i32:$dst_gpr (vtx_id1_az_extloadi16 ADDRVTX_READ:$src_gpr)),
(VTX_READ_16_cm MEMxi:$src_gpr, 1)>;
-def : Pat<(i32:$dst_gpr (vtx_id1_load ADDRVTX_READ:$src_gpr)),
+def : R600Pat<(i32:$dst_gpr (vtx_id1_load ADDRVTX_READ:$src_gpr)),
(VTX_READ_32_cm MEMxi:$src_gpr, 1)>;
-def : Pat<(v2i32:$dst_gpr (vtx_id1_load ADDRVTX_READ:$src_gpr)),
+def : R600Pat<(v2i32:$dst_gpr (vtx_id1_load ADDRVTX_READ:$src_gpr)),
(VTX_READ_64_cm MEMxi:$src_gpr, 1)>;
-def : Pat<(v4i32:$dst_gpr (vtx_id1_load ADDRVTX_READ:$src_gpr)),
+def : R600Pat<(v4i32:$dst_gpr (vtx_id1_load ADDRVTX_READ:$src_gpr)),
(VTX_READ_128_cm MEMxi:$src_gpr, 1)>;
-} // End isCayman
-
+} // End let SubtargetPredicate = isCayman
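
The blanket Pat to R600Pat conversion here (like the GCNPat conversion in BUFInstructions.td above) works because those pattern classes carry their subtarget predicate themselves, so the per-file let Predicates = [...] wrappers can be dropped. A rough sketch of the idea follows, using a hypothetical class name; the in-tree R600Pat/GCNPat definitions live in other .td files and may differ in detail.

// Sketch only: a Pat subclass that attaches a predicate to every pattern
// defined through it. PredicatedPat is a hypothetical name.
class PredicatedPat<dag pattern, dag result, Predicate pred>
  : Pat<pattern, result> {
  let Predicates = [pred];
}

// Hypothetical usage, mirroring the fsqrt pattern above:
// def : PredicatedPat<(fsqrt f32:$src),
//                     (MUL R600_Reg32:$src, (RECIPSQRT_CLAMPED_cm $src)),
//                     isCayman>;
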
diff --git a/lib/Target/AMDGPU/DSInstructions.td b/lib/Target/AMDGPU/DSInstructions.td
index fc516c3b39c2..f898fd7948cc 100644
--- a/lib/Target/AMDGPU/DSInstructions.td
+++ b/lib/Target/AMDGPU/DSInstructions.td
@@ -17,7 +17,6 @@ class DS_Pseudo <string opName, dag outs, dag ins, string asmOps, list<dag> patt
let DS = 1;
let Size = 8;
let UseNamedOperandTable = 1;
- let Uses = [M0, EXEC];
// Most instruction load and store data, so set this as the default.
let mayLoad = 1;
@@ -47,6 +46,10 @@ class DS_Pseudo <string opName, dag outs, dag ins, string asmOps, list<dag> patt
bits<1> has_gds = 1;
bits<1> gdsValue = 0; // if has_gds == 0 set gds to this value
+
+ bits<1> has_m0_read = 1;
+
+ let Uses = !if(has_m0_read, [M0, EXEC], [EXEC]);
}
class DS_Real <DS_Pseudo ds> :
@@ -81,23 +84,41 @@ class DS_1A1D_NORET<string opName, RegisterClass rc = VGPR_32>
: DS_Pseudo<opName,
(outs),
(ins VGPR_32:$addr, rc:$data0, offset:$offset, gds:$gds),
- "$addr, $data0$offset$gds">,
- AtomicNoRet<opName, 0> {
+ "$addr, $data0$offset$gds"> {
let has_data1 = 0;
let has_vdst = 0;
}
+multiclass DS_1A1D_NORET_mc<string opName, RegisterClass rc = VGPR_32> {
+ def "" : DS_1A1D_NORET<opName, rc>,
+ AtomicNoRet<opName, 0>;
+
+ let has_m0_read = 0 in {
+ def _gfx9 : DS_1A1D_NORET<opName, rc>,
+ AtomicNoRet<opName#"_gfx9", 0>;
+ }
+}
+
class DS_1A2D_NORET<string opName, RegisterClass rc = VGPR_32>
: DS_Pseudo<opName,
(outs),
(ins VGPR_32:$addr, rc:$data0, rc:$data1, offset:$offset, gds:$gds),
- "$addr, $data0, $data1"#"$offset"#"$gds">,
- AtomicNoRet<opName, 0> {
+ "$addr, $data0, $data1"#"$offset"#"$gds"> {
let has_vdst = 0;
}
+multiclass DS_1A2D_NORET_mc<string opName, RegisterClass rc = VGPR_32> {
+ def "" : DS_1A2D_NORET<opName, rc>,
+ AtomicNoRet<opName, 0>;
+
+ let has_m0_read = 0 in {
+ def _gfx9 : DS_1A2D_NORET<opName, rc>,
+ AtomicNoRet<opName#"_gfx9", 0>;
+ }
+}
+
class DS_1A2D_Off8_NORET <string opName, RegisterClass rc = VGPR_32>
: DS_Pseudo<opName,
(outs),
@@ -110,6 +131,14 @@ class DS_1A2D_Off8_NORET <string opName, RegisterClass rc = VGPR_32>
let AsmMatchConverter = "cvtDSOffset01";
}
+multiclass DS_1A2D_Off8_NORET_mc <string opName, RegisterClass rc = VGPR_32> {
+ def "" : DS_1A2D_Off8_NORET<opName, rc>;
+
+ let has_m0_read = 0 in {
+ def _gfx9 : DS_1A2D_Off8_NORET<opName, rc>;
+ }
+}
+
class DS_1A1D_RET <string opName, RegisterClass rc = VGPR_32>
: DS_Pseudo<opName,
(outs rc:$vdst),
@@ -120,6 +149,18 @@ class DS_1A1D_RET <string opName, RegisterClass rc = VGPR_32>
let has_data1 = 0;
}
+multiclass DS_1A1D_RET_mc <string opName, RegisterClass rc = VGPR_32,
+ string NoRetOp = ""> {
+ def "" : DS_1A1D_RET<opName, rc>,
+ AtomicNoRet<NoRetOp, !if(!eq(NoRetOp, ""), 0, 1)>;
+
+ let has_m0_read = 0 in {
+ def _gfx9 : DS_1A1D_RET<opName, rc>,
+ AtomicNoRet<!if(!eq(NoRetOp, ""), "", NoRetOp#"_gfx9"),
+ !if(!eq(NoRetOp, ""), 0, 1)>;
+ }
+}
+
class DS_1A2D_RET<string opName,
RegisterClass rc = VGPR_32,
RegisterClass src = rc>
@@ -131,6 +172,19 @@ class DS_1A2D_RET<string opName,
let hasPostISelHook = 1;
}
+multiclass DS_1A2D_RET_mc<string opName,
+ RegisterClass rc = VGPR_32,
+ string NoRetOp = "",
+ RegisterClass src = rc> {
+ def "" : DS_1A2D_RET<opName, rc, src>,
+ AtomicNoRet<NoRetOp, !if(!eq(NoRetOp, ""), 0, 1)>;
+
+ let has_m0_read = 0 in {
+ def _gfx9 : DS_1A2D_RET<opName, rc, src>,
+ AtomicNoRet<NoRetOp#"_gfx9", !if(!eq(NoRetOp, ""), 0, 1)>;
+ }
+}
+
class DS_1A2D_Off8_RET<string opName,
RegisterClass rc = VGPR_32,
RegisterClass src = rc>
@@ -145,16 +199,41 @@ class DS_1A2D_Off8_RET<string opName,
let hasPostISelHook = 1;
}
-class DS_1A_RET<string opName, RegisterClass rc = VGPR_32, Operand ofs = offset>
+multiclass DS_1A2D_Off8_RET_mc<string opName,
+ RegisterClass rc = VGPR_32,
+ RegisterClass src = rc> {
+ def "" : DS_1A2D_Off8_RET<opName, rc, src>;
+
+ let has_m0_read = 0 in {
+ def _gfx9 : DS_1A2D_Off8_RET<opName, rc, src>;
+ }
+}
+
+
+class DS_1A_RET<string opName, RegisterClass rc = VGPR_32, bit HasTiedOutput = 0, Operand ofs = offset>
: DS_Pseudo<opName,
(outs rc:$vdst),
- (ins VGPR_32:$addr, ofs:$offset, gds:$gds),
+ !if(HasTiedOutput,
+ (ins VGPR_32:$addr, ofs:$offset, gds:$gds, rc:$vdst_in),
+ (ins VGPR_32:$addr, ofs:$offset, gds:$gds)),
"$vdst, $addr$offset$gds"> {
-
+ let Constraints = !if(HasTiedOutput, "$vdst = $vdst_in", "");
+ let DisableEncoding = !if(HasTiedOutput, "$vdst_in", "");
let has_data0 = 0;
let has_data1 = 0;
}
+multiclass DS_1A_RET_mc<string opName, RegisterClass rc = VGPR_32, bit HasTiedOutput = 0, Operand ofs = offset> {
+ def "" : DS_1A_RET<opName, rc, HasTiedOutput, ofs>;
+
+ let has_m0_read = 0 in {
+ def _gfx9 : DS_1A_RET<opName, rc, HasTiedOutput, ofs>;
+ }
+}
+
+class DS_1A_RET_Tied<string opName, RegisterClass rc = VGPR_32> :
+ DS_1A_RET<opName, rc, 1>;
+
class DS_1A_Off8_RET <string opName, RegisterClass rc = VGPR_32>
: DS_Pseudo<opName,
(outs rc:$vdst),
@@ -167,6 +246,14 @@ class DS_1A_Off8_RET <string opName, RegisterClass rc = VGPR_32>
let AsmMatchConverter = "cvtDSOffset01";
}
+multiclass DS_1A_Off8_RET_mc <string opName, RegisterClass rc = VGPR_32> {
+ def "" : DS_1A_Off8_RET<opName, rc>;
+
+ let has_m0_read = 0 in {
+ def _gfx9 : DS_1A_Off8_RET<opName, rc>;
+ }
+}
+
class DS_1A_RET_GDS <string opName> : DS_Pseudo<opName,
(outs VGPR_32:$vdst),
(ins VGPR_32:$addr, offset:$offset),
@@ -205,6 +292,15 @@ class DS_1A <string opName> : DS_Pseudo<opName,
let has_data1 = 0;
}
+multiclass DS_1A_mc <string opName> {
+ def "" : DS_1A<opName>;
+
+ let has_m0_read = 0 in {
+ def _gfx9 : DS_1A<opName>;
+ }
+}
+
+
class DS_GWS <string opName, dag ins, string asmOps>
: DS_Pseudo<opName, (outs), ins, asmOps> {
@@ -263,142 +359,115 @@ class DS_1A1D_PERMUTE <string opName, SDPatternOperator node = null_frag>
let has_gds = 0;
}
-def DS_ADD_U32 : DS_1A1D_NORET<"ds_add_u32">;
-def DS_SUB_U32 : DS_1A1D_NORET<"ds_sub_u32">;
-def DS_RSUB_U32 : DS_1A1D_NORET<"ds_rsub_u32">;
-def DS_INC_U32 : DS_1A1D_NORET<"ds_inc_u32">;
-def DS_DEC_U32 : DS_1A1D_NORET<"ds_dec_u32">;
-def DS_MIN_I32 : DS_1A1D_NORET<"ds_min_i32">;
-def DS_MAX_I32 : DS_1A1D_NORET<"ds_max_i32">;
-def DS_MIN_U32 : DS_1A1D_NORET<"ds_min_u32">;
-def DS_MAX_U32 : DS_1A1D_NORET<"ds_max_u32">;
-def DS_AND_B32 : DS_1A1D_NORET<"ds_and_b32">;
-def DS_OR_B32 : DS_1A1D_NORET<"ds_or_b32">;
-def DS_XOR_B32 : DS_1A1D_NORET<"ds_xor_b32">;
-def DS_ADD_F32 : DS_1A1D_NORET<"ds_add_f32">;
-def DS_MIN_F32 : DS_1A1D_NORET<"ds_min_f32">;
-def DS_MAX_F32 : DS_1A1D_NORET<"ds_max_f32">;
+defm DS_ADD_U32 : DS_1A1D_NORET_mc<"ds_add_u32">;
+defm DS_SUB_U32 : DS_1A1D_NORET_mc<"ds_sub_u32">;
+defm DS_RSUB_U32 : DS_1A1D_NORET_mc<"ds_rsub_u32">;
+defm DS_INC_U32 : DS_1A1D_NORET_mc<"ds_inc_u32">;
+defm DS_DEC_U32 : DS_1A1D_NORET_mc<"ds_dec_u32">;
+defm DS_MIN_I32 : DS_1A1D_NORET_mc<"ds_min_i32">;
+defm DS_MAX_I32 : DS_1A1D_NORET_mc<"ds_max_i32">;
+defm DS_MIN_U32 : DS_1A1D_NORET_mc<"ds_min_u32">;
+defm DS_MAX_U32 : DS_1A1D_NORET_mc<"ds_max_u32">;
+defm DS_AND_B32 : DS_1A1D_NORET_mc<"ds_and_b32">;
+defm DS_OR_B32 : DS_1A1D_NORET_mc<"ds_or_b32">;
+defm DS_XOR_B32 : DS_1A1D_NORET_mc<"ds_xor_b32">;
+defm DS_ADD_F32 : DS_1A1D_NORET_mc<"ds_add_f32">;
+defm DS_MIN_F32 : DS_1A1D_NORET_mc<"ds_min_f32">;
+defm DS_MAX_F32 : DS_1A1D_NORET_mc<"ds_max_f32">;
let mayLoad = 0 in {
-def DS_WRITE_B8 : DS_1A1D_NORET<"ds_write_b8">;
-def DS_WRITE_B16 : DS_1A1D_NORET<"ds_write_b16">;
-def DS_WRITE_B32 : DS_1A1D_NORET<"ds_write_b32">;
-def DS_WRITE2_B32 : DS_1A2D_Off8_NORET<"ds_write2_b32">;
-def DS_WRITE2ST64_B32 : DS_1A2D_Off8_NORET<"ds_write2st64_b32">;
-}
-
-def DS_MSKOR_B32 : DS_1A2D_NORET<"ds_mskor_b32">;
-def DS_CMPST_B32 : DS_1A2D_NORET<"ds_cmpst_b32">;
-def DS_CMPST_F32 : DS_1A2D_NORET<"ds_cmpst_f32">;
-
-def DS_ADD_U64 : DS_1A1D_NORET<"ds_add_u64", VReg_64>;
-def DS_SUB_U64 : DS_1A1D_NORET<"ds_sub_u64", VReg_64>;
-def DS_RSUB_U64 : DS_1A1D_NORET<"ds_rsub_u64", VReg_64>;
-def DS_INC_U64 : DS_1A1D_NORET<"ds_inc_u64", VReg_64>;
-def DS_DEC_U64 : DS_1A1D_NORET<"ds_dec_u64", VReg_64>;
-def DS_MIN_I64 : DS_1A1D_NORET<"ds_min_i64", VReg_64>;
-def DS_MAX_I64 : DS_1A1D_NORET<"ds_max_i64", VReg_64>;
-def DS_MIN_U64 : DS_1A1D_NORET<"ds_min_u64", VReg_64>;
-def DS_MAX_U64 : DS_1A1D_NORET<"ds_max_u64", VReg_64>;
-def DS_AND_B64 : DS_1A1D_NORET<"ds_and_b64", VReg_64>;
-def DS_OR_B64 : DS_1A1D_NORET<"ds_or_b64", VReg_64>;
-def DS_XOR_B64 : DS_1A1D_NORET<"ds_xor_b64", VReg_64>;
-def DS_MSKOR_B64 : DS_1A2D_NORET<"ds_mskor_b64", VReg_64>;
+defm DS_WRITE_B8 : DS_1A1D_NORET_mc<"ds_write_b8">;
+defm DS_WRITE_B16 : DS_1A1D_NORET_mc<"ds_write_b16">;
+defm DS_WRITE_B32 : DS_1A1D_NORET_mc<"ds_write_b32">;
+defm DS_WRITE2_B32 : DS_1A2D_Off8_NORET_mc<"ds_write2_b32">;
+defm DS_WRITE2ST64_B32: DS_1A2D_Off8_NORET_mc<"ds_write2st64_b32">;
+
+
+let has_m0_read = 0 in {
+
+let SubtargetPredicate = HasD16LoadStore in {
+def DS_WRITE_B8_D16_HI : DS_1A1D_NORET<"ds_write_b8_d16_hi">;
+def DS_WRITE_B16_D16_HI : DS_1A1D_NORET<"ds_write_b16_d16_hi">;
+}
+
+let SubtargetPredicate = HasDSAddTid in {
+def DS_WRITE_ADDTID_B32 : DS_1A1D_NORET<"ds_write_addtid_b32">;
+}
+
+} // End has_m0_read = 0
+} // End mayLoad = 0
+
+defm DS_MSKOR_B32 : DS_1A2D_NORET_mc<"ds_mskor_b32">;
+defm DS_CMPST_B32 : DS_1A2D_NORET_mc<"ds_cmpst_b32">;
+defm DS_CMPST_F32 : DS_1A2D_NORET_mc<"ds_cmpst_f32">;
+
+defm DS_ADD_U64 : DS_1A1D_NORET_mc<"ds_add_u64", VReg_64>;
+defm DS_SUB_U64 : DS_1A1D_NORET_mc<"ds_sub_u64", VReg_64>;
+defm DS_RSUB_U64 : DS_1A1D_NORET_mc<"ds_rsub_u64", VReg_64>;
+defm DS_INC_U64 : DS_1A1D_NORET_mc<"ds_inc_u64", VReg_64>;
+defm DS_DEC_U64 : DS_1A1D_NORET_mc<"ds_dec_u64", VReg_64>;
+defm DS_MIN_I64 : DS_1A1D_NORET_mc<"ds_min_i64", VReg_64>;
+defm DS_MAX_I64 : DS_1A1D_NORET_mc<"ds_max_i64", VReg_64>;
+defm DS_MIN_U64 : DS_1A1D_NORET_mc<"ds_min_u64", VReg_64>;
+defm DS_MAX_U64 : DS_1A1D_NORET_mc<"ds_max_u64", VReg_64>;
+defm DS_AND_B64 : DS_1A1D_NORET_mc<"ds_and_b64", VReg_64>;
+defm DS_OR_B64 : DS_1A1D_NORET_mc<"ds_or_b64", VReg_64>;
+defm DS_XOR_B64 : DS_1A1D_NORET_mc<"ds_xor_b64", VReg_64>;
+defm DS_MSKOR_B64 : DS_1A2D_NORET_mc<"ds_mskor_b64", VReg_64>;
let mayLoad = 0 in {
-def DS_WRITE_B64 : DS_1A1D_NORET<"ds_write_b64", VReg_64>;
-def DS_WRITE2_B64 : DS_1A2D_Off8_NORET<"ds_write2_b64", VReg_64>;
-def DS_WRITE2ST64_B64 : DS_1A2D_Off8_NORET<"ds_write2st64_b64", VReg_64>;
-}
-def DS_CMPST_B64 : DS_1A2D_NORET<"ds_cmpst_b64", VReg_64>;
-def DS_CMPST_F64 : DS_1A2D_NORET<"ds_cmpst_f64", VReg_64>;
-def DS_MIN_F64 : DS_1A1D_NORET<"ds_min_f64", VReg_64>;
-def DS_MAX_F64 : DS_1A1D_NORET<"ds_max_f64", VReg_64>;
-
-def DS_ADD_RTN_U32 : DS_1A1D_RET<"ds_add_rtn_u32">,
- AtomicNoRet<"ds_add_u32", 1>;
-def DS_ADD_RTN_F32 : DS_1A1D_RET<"ds_add_rtn_f32">,
- AtomicNoRet<"ds_add_f32", 1>;
-def DS_SUB_RTN_U32 : DS_1A1D_RET<"ds_sub_rtn_u32">,
- AtomicNoRet<"ds_sub_u32", 1>;
-def DS_RSUB_RTN_U32 : DS_1A1D_RET<"ds_rsub_rtn_u32">,
- AtomicNoRet<"ds_rsub_u32", 1>;
-def DS_INC_RTN_U32 : DS_1A1D_RET<"ds_inc_rtn_u32">,
- AtomicNoRet<"ds_inc_u32", 1>;
-def DS_DEC_RTN_U32 : DS_1A1D_RET<"ds_dec_rtn_u32">,
- AtomicNoRet<"ds_dec_u32", 1>;
-def DS_MIN_RTN_I32 : DS_1A1D_RET<"ds_min_rtn_i32">,
- AtomicNoRet<"ds_min_i32", 1>;
-def DS_MAX_RTN_I32 : DS_1A1D_RET<"ds_max_rtn_i32">,
- AtomicNoRet<"ds_max_i32", 1>;
-def DS_MIN_RTN_U32 : DS_1A1D_RET<"ds_min_rtn_u32">,
- AtomicNoRet<"ds_min_u32", 1>;
-def DS_MAX_RTN_U32 : DS_1A1D_RET<"ds_max_rtn_u32">,
- AtomicNoRet<"ds_max_u32", 1>;
-def DS_AND_RTN_B32 : DS_1A1D_RET<"ds_and_rtn_b32">,
- AtomicNoRet<"ds_and_b32", 1>;
-def DS_OR_RTN_B32 : DS_1A1D_RET<"ds_or_rtn_b32">,
- AtomicNoRet<"ds_or_b32", 1>;
-def DS_XOR_RTN_B32 : DS_1A1D_RET<"ds_xor_rtn_b32">,
- AtomicNoRet<"ds_xor_b32", 1>;
-def DS_MSKOR_RTN_B32 : DS_1A2D_RET<"ds_mskor_rtn_b32">,
- AtomicNoRet<"ds_mskor_b32", 1>;
-def DS_CMPST_RTN_B32 : DS_1A2D_RET <"ds_cmpst_rtn_b32">,
- AtomicNoRet<"ds_cmpst_b32", 1>;
-def DS_CMPST_RTN_F32 : DS_1A2D_RET <"ds_cmpst_rtn_f32">,
- AtomicNoRet<"ds_cmpst_f32", 1>;
-def DS_MIN_RTN_F32 : DS_1A1D_RET <"ds_min_rtn_f32">,
- AtomicNoRet<"ds_min_f32", 1>;
-def DS_MAX_RTN_F32 : DS_1A1D_RET <"ds_max_rtn_f32">,
- AtomicNoRet<"ds_max_f32", 1>;
-
-def DS_WRXCHG_RTN_B32 : DS_1A1D_RET<"ds_wrxchg_rtn_b32">,
- AtomicNoRet<"", 1>;
-def DS_WRXCHG2_RTN_B32 : DS_1A2D_Off8_RET<"ds_wrxchg2_rtn_b32", VReg_64, VGPR_32>,
- AtomicNoRet<"", 1>;
-def DS_WRXCHG2ST64_RTN_B32 : DS_1A2D_Off8_RET<"ds_wrxchg2st64_rtn_b32", VReg_64, VGPR_32>,
- AtomicNoRet<"", 1>;
-
-def DS_ADD_RTN_U64 : DS_1A1D_RET<"ds_add_rtn_u64", VReg_64>,
- AtomicNoRet<"ds_add_u64", 1>;
-def DS_SUB_RTN_U64 : DS_1A1D_RET<"ds_sub_rtn_u64", VReg_64>,
- AtomicNoRet<"ds_sub_u64", 1>;
-def DS_RSUB_RTN_U64 : DS_1A1D_RET<"ds_rsub_rtn_u64", VReg_64>,
- AtomicNoRet<"ds_rsub_u64", 1>;
-def DS_INC_RTN_U64 : DS_1A1D_RET<"ds_inc_rtn_u64", VReg_64>,
- AtomicNoRet<"ds_inc_u64", 1>;
-def DS_DEC_RTN_U64 : DS_1A1D_RET<"ds_dec_rtn_u64", VReg_64>,
- AtomicNoRet<"ds_dec_u64", 1>;
-def DS_MIN_RTN_I64 : DS_1A1D_RET<"ds_min_rtn_i64", VReg_64>,
- AtomicNoRet<"ds_min_i64", 1>;
-def DS_MAX_RTN_I64 : DS_1A1D_RET<"ds_max_rtn_i64", VReg_64>,
- AtomicNoRet<"ds_max_i64", 1>;
-def DS_MIN_RTN_U64 : DS_1A1D_RET<"ds_min_rtn_u64", VReg_64>,
- AtomicNoRet<"ds_min_u64", 1>;
-def DS_MAX_RTN_U64 : DS_1A1D_RET<"ds_max_rtn_u64", VReg_64>,
- AtomicNoRet<"ds_max_u64", 1>;
-def DS_AND_RTN_B64 : DS_1A1D_RET<"ds_and_rtn_b64", VReg_64>,
- AtomicNoRet<"ds_and_b64", 1>;
-def DS_OR_RTN_B64 : DS_1A1D_RET<"ds_or_rtn_b64", VReg_64>,
- AtomicNoRet<"ds_or_b64", 1>;
-def DS_XOR_RTN_B64 : DS_1A1D_RET<"ds_xor_rtn_b64", VReg_64>,
- AtomicNoRet<"ds_xor_b64", 1>;
-def DS_MSKOR_RTN_B64 : DS_1A2D_RET<"ds_mskor_rtn_b64", VReg_64>,
- AtomicNoRet<"ds_mskor_b64", 1>;
-def DS_CMPST_RTN_B64 : DS_1A2D_RET<"ds_cmpst_rtn_b64", VReg_64>,
- AtomicNoRet<"ds_cmpst_b64", 1>;
-def DS_CMPST_RTN_F64 : DS_1A2D_RET<"ds_cmpst_rtn_f64", VReg_64>,
- AtomicNoRet<"ds_cmpst_f64", 1>;
-def DS_MIN_RTN_F64 : DS_1A1D_RET<"ds_min_rtn_f64", VReg_64>,
- AtomicNoRet<"ds_min_f64", 1>;
-def DS_MAX_RTN_F64 : DS_1A1D_RET<"ds_max_rtn_f64", VReg_64>,
- AtomicNoRet<"ds_max_f64", 1>;
-
-def DS_WRXCHG_RTN_B64 : DS_1A1D_RET<"ds_wrxchg_rtn_b64", VReg_64>,
- AtomicNoRet<"", 1>;
-def DS_WRXCHG2_RTN_B64 : DS_1A2D_Off8_RET<"ds_wrxchg2_rtn_b64", VReg_128, VReg_64>,
- AtomicNoRet<"", 1>;
-def DS_WRXCHG2ST64_RTN_B64 : DS_1A2D_Off8_RET<"ds_wrxchg2st64_rtn_b64", VReg_128, VReg_64>,
- AtomicNoRet<"", 1>;
+defm DS_WRITE_B64 : DS_1A1D_NORET_mc<"ds_write_b64", VReg_64>;
+defm DS_WRITE2_B64 : DS_1A2D_Off8_NORET_mc<"ds_write2_b64", VReg_64>;
+defm DS_WRITE2ST64_B64: DS_1A2D_Off8_NORET_mc<"ds_write2st64_b64", VReg_64>;
+}
+defm DS_CMPST_B64 : DS_1A2D_NORET_mc<"ds_cmpst_b64", VReg_64>;
+defm DS_CMPST_F64 : DS_1A2D_NORET_mc<"ds_cmpst_f64", VReg_64>;
+defm DS_MIN_F64 : DS_1A1D_NORET_mc<"ds_min_f64", VReg_64>;
+defm DS_MAX_F64 : DS_1A1D_NORET_mc<"ds_max_f64", VReg_64>;
+
+defm DS_ADD_RTN_U32 : DS_1A1D_RET_mc<"ds_add_rtn_u32", VGPR_32, "ds_add_u32">;
+defm DS_ADD_RTN_F32 : DS_1A1D_RET_mc<"ds_add_rtn_f32", VGPR_32, "ds_add_f32">;
+defm DS_SUB_RTN_U32 : DS_1A1D_RET_mc<"ds_sub_rtn_u32", VGPR_32, "ds_sub_u32">;
+defm DS_RSUB_RTN_U32 : DS_1A1D_RET_mc<"ds_rsub_rtn_u32", VGPR_32, "ds_rsub_u32">;
+defm DS_INC_RTN_U32 : DS_1A1D_RET_mc<"ds_inc_rtn_u32", VGPR_32, "ds_inc_u32">;
+defm DS_DEC_RTN_U32 : DS_1A1D_RET_mc<"ds_dec_rtn_u32", VGPR_32, "ds_dec_u32">;
+defm DS_MIN_RTN_I32 : DS_1A1D_RET_mc<"ds_min_rtn_i32", VGPR_32, "ds_min_i32">;
+defm DS_MAX_RTN_I32 : DS_1A1D_RET_mc<"ds_max_rtn_i32", VGPR_32, "ds_max_i32">;
+defm DS_MIN_RTN_U32 : DS_1A1D_RET_mc<"ds_min_rtn_u32", VGPR_32, "ds_min_u32">;
+defm DS_MAX_RTN_U32 : DS_1A1D_RET_mc<"ds_max_rtn_u32", VGPR_32, "ds_max_u32">;
+defm DS_AND_RTN_B32 : DS_1A1D_RET_mc<"ds_and_rtn_b32", VGPR_32, "ds_and_b32">;
+defm DS_OR_RTN_B32 : DS_1A1D_RET_mc<"ds_or_rtn_b32", VGPR_32, "ds_or_b32">;
+defm DS_XOR_RTN_B32 : DS_1A1D_RET_mc<"ds_xor_rtn_b32", VGPR_32, "ds_xor_b32">;
+defm DS_MSKOR_RTN_B32 : DS_1A2D_RET_mc<"ds_mskor_rtn_b32", VGPR_32, "ds_mskor_b32">;
+defm DS_CMPST_RTN_B32 : DS_1A2D_RET_mc<"ds_cmpst_rtn_b32", VGPR_32, "ds_cmpst_b32">;
+defm DS_CMPST_RTN_F32 : DS_1A2D_RET_mc<"ds_cmpst_rtn_f32", VGPR_32, "ds_cmpst_f32">;
+defm DS_MIN_RTN_F32 : DS_1A1D_RET_mc <"ds_min_rtn_f32", VGPR_32, "ds_min_f32">;
+defm DS_MAX_RTN_F32 : DS_1A1D_RET_mc<"ds_max_rtn_f32", VGPR_32, "ds_max_f32">;
+
+defm DS_WRXCHG_RTN_B32 : DS_1A1D_RET_mc<"ds_wrxchg_rtn_b32">;
+defm DS_WRXCHG2_RTN_B32 : DS_1A2D_Off8_RET_mc<"ds_wrxchg2_rtn_b32", VReg_64, VGPR_32>;
+defm DS_WRXCHG2ST64_RTN_B32 : DS_1A2D_Off8_RET_mc<"ds_wrxchg2st64_rtn_b32", VReg_64, VGPR_32>;
+
+defm DS_ADD_RTN_U64 : DS_1A1D_RET_mc<"ds_add_rtn_u64", VReg_64, "ds_add_u64">;
+defm DS_SUB_RTN_U64 : DS_1A1D_RET_mc<"ds_sub_rtn_u64", VReg_64, "ds_sub_u64">;
+defm DS_RSUB_RTN_U64 : DS_1A1D_RET_mc<"ds_rsub_rtn_u64", VReg_64, "ds_rsub_u64">;
+defm DS_INC_RTN_U64 : DS_1A1D_RET_mc<"ds_inc_rtn_u64", VReg_64, "ds_inc_u64">;
+defm DS_DEC_RTN_U64 : DS_1A1D_RET_mc<"ds_dec_rtn_u64", VReg_64, "ds_dec_u64">;
+defm DS_MIN_RTN_I64 : DS_1A1D_RET_mc<"ds_min_rtn_i64", VReg_64, "ds_min_i64">;
+defm DS_MAX_RTN_I64 : DS_1A1D_RET_mc<"ds_max_rtn_i64", VReg_64, "ds_max_i64">;
+defm DS_MIN_RTN_U64 : DS_1A1D_RET_mc<"ds_min_rtn_u64", VReg_64, "ds_min_u64">;
+defm DS_MAX_RTN_U64 : DS_1A1D_RET_mc<"ds_max_rtn_u64", VReg_64, "ds_max_u64">;
+defm DS_AND_RTN_B64 : DS_1A1D_RET_mc<"ds_and_rtn_b64", VReg_64, "ds_and_b64">;
+defm DS_OR_RTN_B64 : DS_1A1D_RET_mc<"ds_or_rtn_b64", VReg_64, "ds_or_b64">;
+defm DS_XOR_RTN_B64 : DS_1A1D_RET_mc<"ds_xor_rtn_b64", VReg_64, "ds_xor_b64">;
+defm DS_MSKOR_RTN_B64 : DS_1A2D_RET_mc<"ds_mskor_rtn_b64", VReg_64, "ds_mskor_b64">;
+defm DS_CMPST_RTN_B64 : DS_1A2D_RET_mc<"ds_cmpst_rtn_b64", VReg_64, "ds_cmpst_b64">;
+defm DS_CMPST_RTN_F64 : DS_1A2D_RET_mc<"ds_cmpst_rtn_f64", VReg_64, "ds_cmpst_f64">;
+defm DS_MIN_RTN_F64 : DS_1A1D_RET_mc<"ds_min_rtn_f64", VReg_64, "ds_min_f64">;
+defm DS_MAX_RTN_F64 : DS_1A1D_RET_mc<"ds_max_rtn_f64", VReg_64, "ds_max_f64">;
+
+defm DS_WRXCHG_RTN_B64 : DS_1A1D_RET_mc<"ds_wrxchg_rtn_b64", VReg_64>;
+defm DS_WRXCHG2_RTN_B64 : DS_1A2D_Off8_RET_mc<"ds_wrxchg2_rtn_b64", VReg_128, VReg_64>;
+defm DS_WRXCHG2ST64_RTN_B64 : DS_1A2D_Off8_RET_mc<"ds_wrxchg2st64_rtn_b64", VReg_128, VReg_64>;
def DS_GWS_INIT : DS_GWS_1D<"ds_gws_init">;
def DS_GWS_SEMA_V : DS_GWS_0D<"ds_gws_sema_v">;
@@ -440,22 +509,37 @@ def DS_WRITE_SRC2_B32 : DS_1A<"ds_write_src2_b32">;
def DS_WRITE_SRC2_B64 : DS_1A<"ds_write_src2_b64">;
let Uses = [EXEC], mayLoad = 0, mayStore = 0, isConvergent = 1 in {
-def DS_SWIZZLE_B32 : DS_1A_RET <"ds_swizzle_b32", VGPR_32, SwizzleImm>;
+def DS_SWIZZLE_B32 : DS_1A_RET <"ds_swizzle_b32", VGPR_32, 0, SwizzleImm>;
}
let mayStore = 0 in {
-def DS_READ_I8 : DS_1A_RET<"ds_read_i8">;
-def DS_READ_U8 : DS_1A_RET<"ds_read_u8">;
-def DS_READ_I16 : DS_1A_RET<"ds_read_i16">;
-def DS_READ_U16 : DS_1A_RET<"ds_read_u16">;
-def DS_READ_B32 : DS_1A_RET<"ds_read_b32">;
-def DS_READ_B64 : DS_1A_RET<"ds_read_b64", VReg_64>;
-
-def DS_READ2_B32 : DS_1A_Off8_RET<"ds_read2_b32", VReg_64>;
-def DS_READ2ST64_B32 : DS_1A_Off8_RET<"ds_read2st64_b32", VReg_64>;
+defm DS_READ_I8 : DS_1A_RET_mc<"ds_read_i8">;
+defm DS_READ_U8 : DS_1A_RET_mc<"ds_read_u8">;
+defm DS_READ_I16 : DS_1A_RET_mc<"ds_read_i16">;
+defm DS_READ_U16 : DS_1A_RET_mc<"ds_read_u16">;
+defm DS_READ_B32 : DS_1A_RET_mc<"ds_read_b32">;
+defm DS_READ_B64 : DS_1A_RET_mc<"ds_read_b64", VReg_64>;
+
+defm DS_READ2_B32 : DS_1A_Off8_RET_mc<"ds_read2_b32", VReg_64>;
+defm DS_READ2ST64_B32: DS_1A_Off8_RET_mc<"ds_read2st64_b32", VReg_64>;
+
+defm DS_READ2_B64 : DS_1A_Off8_RET_mc<"ds_read2_b64", VReg_128>;
+defm DS_READ2ST64_B64: DS_1A_Off8_RET_mc<"ds_read2st64_b64", VReg_128>;
+
+let has_m0_read = 0 in {
+let SubtargetPredicate = HasD16LoadStore in {
+def DS_READ_U8_D16 : DS_1A_RET_Tied<"ds_read_u8_d16">;
+def DS_READ_U8_D16_HI : DS_1A_RET_Tied<"ds_read_u8_d16_hi">;
+def DS_READ_I8_D16 : DS_1A_RET_Tied<"ds_read_i8_d16">;
+def DS_READ_I8_D16_HI : DS_1A_RET_Tied<"ds_read_i8_d16_hi">;
+def DS_READ_U16_D16 : DS_1A_RET_Tied<"ds_read_u16_d16">;
+def DS_READ_U16_D16_HI : DS_1A_RET_Tied<"ds_read_u16_d16_hi">;
+}
-def DS_READ2_B64 : DS_1A_Off8_RET<"ds_read2_b64", VReg_128>;
-def DS_READ2ST64_B64 : DS_1A_Off8_RET<"ds_read2st64_b64", VReg_128>;
+let SubtargetPredicate = HasDSAddTid in {
+def DS_READ_ADDTID_B32 : DS_1A_RET<"ds_read_addtid_b32">;
+}
+} // End has_m0_read = 0
}
def DS_CONSUME : DS_0A_RET<"ds_consume">;
@@ -468,21 +552,19 @@ def DS_ORDERED_COUNT : DS_1A_RET_GDS<"ds_ordered_count">;
let SubtargetPredicate = isCIVI in {
-def DS_WRAP_RTN_B32 : DS_1A2D_RET<"ds_wrap_rtn_b32">, AtomicNoRet<"", 1>;
-
-def DS_CONDXCHG32_RTN_B64 : DS_1A1D_RET<"ds_condxchg32_rtn_b64", VReg_64>,
- AtomicNoRet<"", 1>;
+defm DS_WRAP_RTN_B32 : DS_1A2D_RET_mc<"ds_wrap_rtn_b32", VGPR_32>;
+defm DS_CONDXCHG32_RTN_B64 : DS_1A1D_RET_mc<"ds_condxchg32_rtn_b64", VReg_64>;
def DS_GWS_SEMA_RELEASE_ALL : DS_GWS_0D<"ds_gws_sema_release_all">;
let mayStore = 0 in {
-def DS_READ_B96 : DS_1A_RET<"ds_read_b96", VReg_96>;
-def DS_READ_B128: DS_1A_RET<"ds_read_b128", VReg_128>;
+defm DS_READ_B96 : DS_1A_RET_mc<"ds_read_b96", VReg_96>;
+defm DS_READ_B128: DS_1A_RET_mc<"ds_read_b128", VReg_128>;
} // End mayStore = 0
let mayLoad = 0 in {
-def DS_WRITE_B96 : DS_1A1D_NORET<"ds_write_b96", VReg_96>;
-def DS_WRITE_B128 : DS_1A1D_NORET<"ds_write_b128", VReg_128>;
+defm DS_WRITE_B96 : DS_1A1D_NORET_mc<"ds_write_b96", VReg_96>;
+defm DS_WRITE_B128 : DS_1A1D_NORET_mc<"ds_write_b128", VReg_128>;
} // End mayLoad = 0
def DS_NOP : DS_VOID<"ds_nop">;
@@ -508,107 +590,201 @@ def DS_BPERMUTE_B32 : DS_1A1D_PERMUTE <"ds_bpermute_b32",
// DS Patterns
//===----------------------------------------------------------------------===//
-let Predicates = [isGCN] in {
-
-def : Pat <
+def : GCNPat <
(int_amdgcn_ds_swizzle i32:$src, imm:$offset16),
(DS_SWIZZLE_B32 $src, (as_i16imm $offset16), (i1 0))
>;
-class DSReadPat <DS_Pseudo inst, ValueType vt, PatFrag frag> : Pat <
+class DSReadPat <DS_Pseudo inst, ValueType vt, PatFrag frag> : GCNPat <
(vt (frag (DS1Addr1Offset i32:$ptr, i32:$offset))),
(inst $ptr, (as_i16imm $offset), (i1 0))
>;
-def : DSReadPat <DS_READ_I8, i32, si_sextload_local_i8>;
-def : DSReadPat <DS_READ_U8, i32, si_az_extload_local_i8>;
-def : DSReadPat <DS_READ_I8, i16, si_sextload_local_i8>;
-def : DSReadPat <DS_READ_U8, i16, si_az_extload_local_i8>;
-def : DSReadPat <DS_READ_I16, i32, si_sextload_local_i16>;
-def : DSReadPat <DS_READ_I16, i32, si_sextload_local_i16>;
-def : DSReadPat <DS_READ_U16, i32, si_az_extload_local_i16>;
-def : DSReadPat <DS_READ_U16, i16, si_load_local>;
-def : DSReadPat <DS_READ_B32, i32, si_load_local>;
+// FIXME: Passing the name of the PatFrag as a string is a workaround. Why doesn't
+// !cast<PatFrag>(frag.NAME#"_m0") work!?
+multiclass DSReadPat_mc<DS_Pseudo inst, ValueType vt, string frag> {
+
+ let OtherPredicates = [LDSRequiresM0Init] in {
+ def : DSReadPat<inst, vt, !cast<PatFrag>(frag#"_m0")>;
+ }
+
+ let OtherPredicates = [NotLDSRequiresM0Init] in {
+ def : DSReadPat<!cast<DS_Pseudo>(inst.NAME#"_gfx9"), vt, !cast<PatFrag>(frag)>;
+ }
+}
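
The _mc read multiclass emits the same DSReadPat twice under mutually exclusive predicates: LDSRequiresM0Init targets keep the original pseudo and the "_m0"-suffixed PatFrag, while NotLDSRequiresM0Init targets use the "_gfx9" pseudo and the plain fragment. A minimal standalone C++ sketch of that selection (helper and names invented for illustration, not part of the patch):

    #include <iostream>
    #include <string>

    // Hypothetical mirror of the predicate split above: subtargets that still
    // require M0 to be initialized for LDS access keep the original pseudo,
    // everything else selects the "_gfx9" variant.
    std::string selectDSPseudo(const std::string &Base, bool LDSRequiresM0Init) {
      return LDSRequiresM0Init ? Base : Base + "_gfx9";
    }

    int main() {
      std::cout << selectDSPseudo("DS_READ_B32", true) << '\n';   // DS_READ_B32
      std::cout << selectDSPseudo("DS_READ_B32", false) << '\n';  // DS_READ_B32_gfx9
    }
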
+
+
+multiclass DSReadPat_Hi16 <DS_Pseudo inst, PatFrag frag, ValueType vt = i16> {
+ def : GCNPat <
+ (build_vector vt:$lo, (vt (frag (DS1Addr1Offset i32:$ptr, i32:$offset)))),
+ (v2i16 (inst $ptr, (as_i16imm $offset), (i1 0), $lo))
+ >;
+
+ def : GCNPat <
+ (build_vector f16:$lo, (f16 (bitconvert (vt (frag (DS1Addr1Offset i32:$ptr, i32:$offset)))))),
+ (v2f16 (inst $ptr, (as_i16imm $offset), (i1 0), $lo))
+ >;
+}
+
+multiclass DSReadPat_Lo16 <DS_Pseudo inst, PatFrag frag, ValueType vt = i16> {
+ def : GCNPat <
+ (build_vector (vt (frag (DS1Addr1Offset i32:$ptr, i32:$offset))), (vt (Hi16Elt vt:$hi))),
+ (v2i16 (inst $ptr, (as_i16imm $offset), 0, $hi))
+ >;
+
+ def : GCNPat <
+ (build_vector (f16 (bitconvert (vt (frag (DS1Addr1Offset i32:$ptr, i32:$offset))))), (f16 (Hi16Elt f16:$hi))),
+ (v2f16 (inst $ptr, (as_i16imm $offset), 0, $hi))
+ >;
+}
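
The Hi16/Lo16 read patterns match a build_vector where one 16-bit lane comes from the LDS load and the other lane is the tied pass-through operand, i.e. the D16 loads overwrite only one half of the 32-bit destination. A standalone C++ sketch of that merge (illustrative only, not part of the patch):

    #include <cstdint>
    #include <cstdio>

    // A D16 load writes one 16-bit half of the 32-bit destination and leaves
    // the other half (the tied input operand) untouched.
    uint32_t mergeD16(uint32_t prev, uint16_t loaded, bool hi) {
      return hi ? ((uint32_t(loaded) << 16) | (prev & 0xffffu))
                : ((prev & 0xffff0000u) | loaded);
    }

    int main() {
      std::printf("%08x\n", mergeD16(0xaaaabbbbu, 0x1234, /*hi=*/true));   // 1234bbbb
      std::printf("%08x\n", mergeD16(0xaaaabbbbu, 0x1234, /*hi=*/false));  // aaaa1234
    }
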
+
+defm : DSReadPat_mc <DS_READ_I8, i32, "sextloadi8_local">;
+defm : DSReadPat_mc <DS_READ_U8, i32, "az_extloadi8_local">;
+defm : DSReadPat_mc <DS_READ_I8, i16, "sextloadi8_local">;
+defm : DSReadPat_mc <DS_READ_U8, i16, "az_extloadi8_local">;
+defm : DSReadPat_mc <DS_READ_I16, i32, "sextloadi16_local">;
+defm : DSReadPat_mc <DS_READ_I16, i32, "sextloadi16_local">;
+defm : DSReadPat_mc <DS_READ_U16, i32, "az_extloadi16_local">;
+defm : DSReadPat_mc <DS_READ_U16, i16, "load_local">;
+defm : DSReadPat_mc <DS_READ_B32, i32, "load_local">;
let AddedComplexity = 100 in {
-def : DSReadPat <DS_READ_B64, v2i32, si_load_local_align8>;
+defm : DSReadPat_mc <DS_READ_B64, v2i32, "load_align8_local">;
} // End AddedComplexity = 100
-def : Pat <
- (v2i32 (si_load_local (DS64Bit4ByteAligned i32:$ptr, i8:$offset0,
- i8:$offset1))),
- (DS_READ2_B32 $ptr, $offset0, $offset1, (i1 0))
->;
+let OtherPredicates = [HasD16LoadStore] in {
+let AddedComplexity = 100 in {
+defm : DSReadPat_Hi16<DS_READ_U16_D16_HI, load_local>;
+defm : DSReadPat_Hi16<DS_READ_U8_D16_HI, az_extloadi8_local>;
+defm : DSReadPat_Hi16<DS_READ_I8_D16_HI, sextloadi8_local>;
+
+defm : DSReadPat_Lo16<DS_READ_U16_D16, load_local>;
+defm : DSReadPat_Lo16<DS_READ_U8_D16, az_extloadi8_local>;
+defm : DSReadPat_Lo16<DS_READ_I8_D16, sextloadi8_local>;
+
+}
+}
-class DSWritePat <DS_Pseudo inst, ValueType vt, PatFrag frag> : Pat <
+class DSWritePat <DS_Pseudo inst, ValueType vt, PatFrag frag> : GCNPat <
(frag vt:$value, (DS1Addr1Offset i32:$ptr, i32:$offset)),
(inst $ptr, $value, (as_i16imm $offset), (i1 0))
>;
-def : DSWritePat <DS_WRITE_B8, i32, si_truncstore_local_i8>;
-def : DSWritePat <DS_WRITE_B16, i32, si_truncstore_local_i16>;
-def : DSWritePat <DS_WRITE_B8, i16, si_truncstore_local_i8>;
-def : DSWritePat <DS_WRITE_B16, i16, si_store_local>;
-def : DSWritePat <DS_WRITE_B32, i32, si_store_local>;
+multiclass DSWritePat_mc <DS_Pseudo inst, ValueType vt, string frag> {
+ let OtherPredicates = [LDSRequiresM0Init] in {
+ def : DSWritePat<inst, vt, !cast<PatFrag>(frag#"_m0")>;
+ }
-let AddedComplexity = 100 in {
+ let OtherPredicates = [NotLDSRequiresM0Init] in {
+ def : DSWritePat<!cast<DS_Pseudo>(inst.NAME#"_gfx9"), vt, !cast<PatFrag>(frag)>;
+ }
+}
-def : DSWritePat <DS_WRITE_B64, v2i32, si_store_local_align8>;
-} // End AddedComplexity = 100
+defm : DSWritePat_mc <DS_WRITE_B8, i32, "truncstorei8_local">;
+defm : DSWritePat_mc <DS_WRITE_B16, i32, "truncstorei16_local">;
+defm : DSWritePat_mc <DS_WRITE_B8, i16, "truncstorei8_local">;
+defm : DSWritePat_mc <DS_WRITE_B16, i16, "store_local">;
+defm : DSWritePat_mc <DS_WRITE_B32, i32, "store_local">;
+
+let OtherPredicates = [HasD16LoadStore] in {
+def : DSWritePat <DS_WRITE_B16_D16_HI, i32, store_local_hi16>;
+def : DSWritePat <DS_WRITE_B8_D16_HI, i32, truncstorei8_local_hi16>;
+}
-def : Pat <
- (si_store_local v2i32:$value, (DS64Bit4ByteAligned i32:$ptr, i8:$offset0,
- i8:$offset1)),
- (DS_WRITE2_B32 $ptr, (i32 (EXTRACT_SUBREG $value, sub0)),
- (i32 (EXTRACT_SUBREG $value, sub1)), $offset0, $offset1,
- (i1 0))
+
+class DS64Bit4ByteAlignedReadPat<DS_Pseudo inst, PatFrag frag> : GCNPat <
+ (v2i32 (frag (DS64Bit4ByteAligned i32:$ptr, i8:$offset0, i8:$offset1))),
+ (inst $ptr, $offset0, $offset1, (i1 0))
+>;
+
+class DS64Bit4ByteAlignedWritePat<DS_Pseudo inst, PatFrag frag> : GCNPat<
+ (frag v2i32:$value, (DS64Bit4ByteAligned i32:$ptr, i8:$offset0, i8:$offset1)),
+ (inst $ptr, (i32 (EXTRACT_SUBREG $value, sub0)),
+ (i32 (EXTRACT_SUBREG $value, sub1)), $offset0, $offset1,
+ (i1 0))
>;
-class DSAtomicRetPat<DS_Pseudo inst, ValueType vt, PatFrag frag> : Pat <
+let OtherPredicates = [LDSRequiresM0Init] in {
+def : DS64Bit4ByteAlignedReadPat<DS_READ2_B32, load_local_m0>;
+def : DS64Bit4ByteAlignedWritePat<DS_WRITE2_B32, store_local_m0>;
+}
+
+let OtherPredicates = [NotLDSRequiresM0Init] in {
+def : DS64Bit4ByteAlignedReadPat<DS_READ2_B32_gfx9, load_local>;
+def : DS64Bit4ByteAlignedWritePat<DS_WRITE2_B32_gfx9, store_local>;
+}
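
These patterns let a 64-bit LDS access with only 4-byte alignment be selected as a ds_read2_b32/ds_write2_b32 pair, with the write pattern pulling sub0/sub1 out of the v2i32 value. A behavioral C++ sketch of the write side (offset0/offset1 are treated as dword indices, which is my reading of the b32 form and stated here as an assumption):

    #include <cstdint>
    #include <cstdio>

    // Model of ds_write2_b32: store the two 32-bit halves of a 64-bit value
    // at two independently addressed dword slots. Offsets are modeled as
    // dword indices (assumption for illustration).
    void write2_b32(uint32_t *lds, unsigned base, uint8_t off0, uint8_t off1,
                    uint64_t value) {
      lds[base + off0] = uint32_t(value);        // sub0: low dword
      lds[base + off1] = uint32_t(value >> 32);  // sub1: high dword
    }

    int main() {
      uint32_t lds[8] = {};
      write2_b32(lds, 0, 1, 2, 0x1122334455667788ull);
      std::printf("%08x %08x\n", lds[1], lds[2]);  // 55667788 11223344
    }
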
+
+
+let AddedComplexity = 100 in {
+
+defm : DSWritePat_mc <DS_WRITE_B64, v2i32, "store_align8_local">;
+} // End AddedComplexity = 100
+class DSAtomicRetPat<DS_Pseudo inst, ValueType vt, PatFrag frag> : GCNPat <
(frag (DS1Addr1Offset i32:$ptr, i32:$offset), vt:$value),
(inst $ptr, $value, (as_i16imm $offset), (i1 0))
>;
-class DSAtomicCmpXChg<DS_Pseudo inst, ValueType vt, PatFrag frag> : Pat <
+multiclass DSAtomicRetPat_mc<DS_Pseudo inst, ValueType vt, string frag> {
+ let OtherPredicates = [LDSRequiresM0Init] in {
+ def : DSAtomicRetPat<inst, vt, !cast<PatFrag>(frag#"_m0")>;
+ }
+
+ let OtherPredicates = [NotLDSRequiresM0Init] in {
+ def : DSAtomicRetPat<!cast<DS_Pseudo>(inst.NAME#"_gfx9"), vt, !cast<PatFrag>(frag)>;
+ }
+}
+
+
+
+class DSAtomicCmpXChg<DS_Pseudo inst, ValueType vt, PatFrag frag> : GCNPat <
(frag (DS1Addr1Offset i32:$ptr, i32:$offset), vt:$cmp, vt:$swap),
(inst $ptr, $cmp, $swap, (as_i16imm $offset), (i1 0))
>;
+multiclass DSAtomicCmpXChg_mc<DS_Pseudo inst, ValueType vt, string frag> {
+ let OtherPredicates = [LDSRequiresM0Init] in {
+ def : DSAtomicCmpXChg<inst, vt, !cast<PatFrag>(frag#"_m0")>;
+ }
+
+ let OtherPredicates = [NotLDSRequiresM0Init] in {
+ def : DSAtomicCmpXChg<!cast<DS_Pseudo>(inst.NAME#"_gfx9"), vt, !cast<PatFrag>(frag)>;
+ }
+}
+
+
// 32-bit atomics.
-def : DSAtomicRetPat<DS_WRXCHG_RTN_B32, i32, si_atomic_swap_local>;
-def : DSAtomicRetPat<DS_ADD_RTN_U32, i32, si_atomic_load_add_local>;
-def : DSAtomicRetPat<DS_SUB_RTN_U32, i32, si_atomic_load_sub_local>;
-def : DSAtomicRetPat<DS_INC_RTN_U32, i32, si_atomic_inc_local>;
-def : DSAtomicRetPat<DS_DEC_RTN_U32, i32, si_atomic_dec_local>;
-def : DSAtomicRetPat<DS_AND_RTN_B32, i32, si_atomic_load_and_local>;
-def : DSAtomicRetPat<DS_OR_RTN_B32, i32, si_atomic_load_or_local>;
-def : DSAtomicRetPat<DS_XOR_RTN_B32, i32, si_atomic_load_xor_local>;
-def : DSAtomicRetPat<DS_MIN_RTN_I32, i32, si_atomic_load_min_local>;
-def : DSAtomicRetPat<DS_MAX_RTN_I32, i32, si_atomic_load_max_local>;
-def : DSAtomicRetPat<DS_MIN_RTN_U32, i32, si_atomic_load_umin_local>;
-def : DSAtomicRetPat<DS_MAX_RTN_U32, i32, si_atomic_load_umax_local>;
-def : DSAtomicCmpXChg<DS_CMPST_RTN_B32, i32, si_atomic_cmp_swap_32_local>;
+defm : DSAtomicRetPat_mc<DS_WRXCHG_RTN_B32, i32, "atomic_swap_local">;
+defm : DSAtomicRetPat_mc<DS_ADD_RTN_U32, i32, "atomic_load_add_local">;
+defm : DSAtomicRetPat_mc<DS_SUB_RTN_U32, i32, "atomic_load_sub_local">;
+defm : DSAtomicRetPat_mc<DS_INC_RTN_U32, i32, "atomic_inc_local">;
+defm : DSAtomicRetPat_mc<DS_DEC_RTN_U32, i32, "atomic_dec_local">;
+defm : DSAtomicRetPat_mc<DS_AND_RTN_B32, i32, "atomic_load_and_local">;
+defm : DSAtomicRetPat_mc<DS_OR_RTN_B32, i32, "atomic_load_or_local">;
+defm : DSAtomicRetPat_mc<DS_XOR_RTN_B32, i32, "atomic_load_xor_local">;
+defm : DSAtomicRetPat_mc<DS_MIN_RTN_I32, i32, "atomic_load_min_local">;
+defm : DSAtomicRetPat_mc<DS_MAX_RTN_I32, i32, "atomic_load_max_local">;
+defm : DSAtomicRetPat_mc<DS_MIN_RTN_U32, i32, "atomic_load_umin_local">;
+defm : DSAtomicRetPat_mc<DS_MAX_RTN_U32, i32, "atomic_load_umax_local">;
+defm : DSAtomicCmpXChg_mc<DS_CMPST_RTN_B32, i32, "atomic_cmp_swap_local">;
// 64-bit atomics.
-def : DSAtomicRetPat<DS_WRXCHG_RTN_B64, i64, si_atomic_swap_local>;
-def : DSAtomicRetPat<DS_ADD_RTN_U64, i64, si_atomic_load_add_local>;
-def : DSAtomicRetPat<DS_SUB_RTN_U64, i64, si_atomic_load_sub_local>;
-def : DSAtomicRetPat<DS_INC_RTN_U64, i64, si_atomic_inc_local>;
-def : DSAtomicRetPat<DS_DEC_RTN_U64, i64, si_atomic_dec_local>;
-def : DSAtomicRetPat<DS_AND_RTN_B64, i64, si_atomic_load_and_local>;
-def : DSAtomicRetPat<DS_OR_RTN_B64, i64, si_atomic_load_or_local>;
-def : DSAtomicRetPat<DS_XOR_RTN_B64, i64, si_atomic_load_xor_local>;
-def : DSAtomicRetPat<DS_MIN_RTN_I64, i64, si_atomic_load_min_local>;
-def : DSAtomicRetPat<DS_MAX_RTN_I64, i64, si_atomic_load_max_local>;
-def : DSAtomicRetPat<DS_MIN_RTN_U64, i64, si_atomic_load_umin_local>;
-def : DSAtomicRetPat<DS_MAX_RTN_U64, i64, si_atomic_load_umax_local>;
-
-def : DSAtomicCmpXChg<DS_CMPST_RTN_B64, i64, si_atomic_cmp_swap_64_local>;
-
-} // let Predicates = [isGCN]
+defm : DSAtomicRetPat_mc<DS_WRXCHG_RTN_B64, i64, "atomic_swap_local">;
+defm : DSAtomicRetPat_mc<DS_ADD_RTN_U64, i64, "atomic_load_add_local">;
+defm : DSAtomicRetPat_mc<DS_SUB_RTN_U64, i64, "atomic_load_sub_local">;
+defm : DSAtomicRetPat_mc<DS_INC_RTN_U64, i64, "atomic_inc_local">;
+defm : DSAtomicRetPat_mc<DS_DEC_RTN_U64, i64, "atomic_dec_local">;
+defm : DSAtomicRetPat_mc<DS_AND_RTN_B64, i64, "atomic_load_and_local">;
+defm : DSAtomicRetPat_mc<DS_OR_RTN_B64, i64, "atomic_load_or_local">;
+defm : DSAtomicRetPat_mc<DS_XOR_RTN_B64, i64, "atomic_load_xor_local">;
+defm : DSAtomicRetPat_mc<DS_MIN_RTN_I64, i64, "atomic_load_min_local">;
+defm : DSAtomicRetPat_mc<DS_MAX_RTN_I64, i64, "atomic_load_max_local">;
+defm : DSAtomicRetPat_mc<DS_MIN_RTN_U64, i64, "atomic_load_umin_local">;
+defm : DSAtomicRetPat_mc<DS_MAX_RTN_U64, i64, "atomic_load_umax_local">;
+
+defm : DSAtomicCmpXChg_mc<DS_CMPST_RTN_B64, i64, "atomic_cmp_swap_local">;
//===----------------------------------------------------------------------===//
// Real instructions
@@ -834,6 +1010,7 @@ def DS_GWS_SEMA_V_vi : DS_Real_vi<0x9a, DS_GWS_SEMA_V>;
def DS_GWS_SEMA_BR_vi : DS_Real_vi<0x9b, DS_GWS_SEMA_BR>;
def DS_GWS_SEMA_P_vi : DS_Real_vi<0x9c, DS_GWS_SEMA_P>;
def DS_GWS_BARRIER_vi : DS_Real_vi<0x9d, DS_GWS_BARRIER>;
+def DS_WRITE_ADDTID_B32_vi : DS_Real_vi<0x1d, DS_WRITE_ADDTID_B32>;
def DS_WRITE_B8_vi : DS_Real_vi<0x1e, DS_WRITE_B8>;
def DS_WRITE_B16_vi : DS_Real_vi<0x1f, DS_WRITE_B16>;
def DS_ADD_RTN_U32_vi : DS_Real_vi<0x20, DS_ADD_RTN_U32>;
@@ -865,6 +1042,7 @@ def DS_READ_I8_vi : DS_Real_vi<0x39, DS_READ_I8>;
def DS_READ_U8_vi : DS_Real_vi<0x3a, DS_READ_U8>;
def DS_READ_I16_vi : DS_Real_vi<0x3b, DS_READ_I16>;
def DS_READ_U16_vi : DS_Real_vi<0x3c, DS_READ_U16>;
+def DS_READ_ADDTID_B32_vi : DS_Real_vi<0xb6, DS_READ_ADDTID_B32>;
def DS_CONSUME_vi : DS_Real_vi<0xbd, DS_CONSUME>;
def DS_APPEND_vi : DS_Real_vi<0xbe, DS_APPEND>;
def DS_ORDERED_COUNT_vi : DS_Real_vi<0xbf, DS_ORDERED_COUNT>;
@@ -893,6 +1071,16 @@ def DS_CMPST_F64_vi : DS_Real_vi<0x51, DS_CMPST_F64>;
def DS_MIN_F64_vi : DS_Real_vi<0x52, DS_MIN_F64>;
def DS_MAX_F64_vi : DS_Real_vi<0x53, DS_MAX_F64>;
+def DS_WRITE_B8_D16_HI_vi : DS_Real_vi<0x54, DS_WRITE_B8_D16_HI>;
+def DS_WRITE_B16_D16_HI_vi : DS_Real_vi<0x55, DS_WRITE_B16_D16_HI>;
+
+def DS_READ_U8_D16_vi : DS_Real_vi<0x56, DS_READ_U8_D16>;
+def DS_READ_U8_D16_HI_vi : DS_Real_vi<0x57, DS_READ_U8_D16_HI>;
+def DS_READ_I8_D16_vi : DS_Real_vi<0x58, DS_READ_I8_D16>;
+def DS_READ_I8_D16_HI_vi : DS_Real_vi<0x59, DS_READ_I8_D16_HI>;
+def DS_READ_U16_D16_vi : DS_Real_vi<0x5a, DS_READ_U16_D16>;
+def DS_READ_U16_D16_HI_vi : DS_Real_vi<0x5b, DS_READ_U16_D16_HI>;
+
def DS_ADD_RTN_U64_vi : DS_Real_vi<0x60, DS_ADD_RTN_U64>;
def DS_SUB_RTN_U64_vi : DS_Real_vi<0x61, DS_SUB_RTN_U64>;
def DS_RSUB_RTN_U64_vi : DS_Real_vi<0x62, DS_RSUB_RTN_U64>;
diff --git a/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
index 966c6fec20c6..4a3f2c975179 100644
--- a/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
+++ b/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
@@ -1,4 +1,4 @@
-//===-- AMDGPUDisassembler.cpp - Disassembler for AMDGPU ISA --------------===//
+//===- AMDGPUDisassembler.cpp - Disassembler for AMDGPU ISA ---------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -17,29 +17,40 @@
// ToDo: What to do with instruction suffixes (v_mov_b32 vs v_mov_b32_e32)?
-#include "AMDGPUDisassembler.h"
+#include "Disassembler/AMDGPUDisassembler.h"
#include "AMDGPU.h"
#include "AMDGPURegisterInfo.h"
-#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIDefines.h"
#include "Utils/AMDGPUBaseInfo.h"
-
+#include "llvm-c/Disassembler.h"
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/Twine.h"
#include "llvm/BinaryFormat/ELF.h"
#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCDisassembler/MCDisassembler.h"
+#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCFixedLenDisassembler.h"
#include "llvm/MC/MCInst.h"
-#include "llvm/MC/MCInstrDesc.h"
#include "llvm/MC/MCSubtargetInfo.h"
-#include "llvm/Support/Debug.h"
#include "llvm/Support/Endian.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <iterator>
+#include <tuple>
+#include <vector>
using namespace llvm;
#define DEBUG_TYPE "amdgpu-disassembler"
-typedef llvm::MCDisassembler::DecodeStatus DecodeStatus;
-
+using DecodeStatus = llvm::MCDisassembler::DecodeStatus;
inline static MCDisassembler::DecodeStatus
addOperand(MCInst &Inst, const MCOperand& Opnd) {
@@ -95,13 +106,13 @@ DECODE_OPERAND_REG(VReg_128)
DECODE_OPERAND_REG(SReg_32)
DECODE_OPERAND_REG(SReg_32_XM0_XEXEC)
+DECODE_OPERAND_REG(SReg_32_XEXEC_HI)
DECODE_OPERAND_REG(SReg_64)
DECODE_OPERAND_REG(SReg_64_XEXEC)
DECODE_OPERAND_REG(SReg_128)
DECODE_OPERAND_REG(SReg_256)
DECODE_OPERAND_REG(SReg_512)
-
static DecodeStatus decodeOperand_VSrc16(MCInst &Inst,
unsigned Imm,
uint64_t Addr,
@@ -201,12 +212,18 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
Res = tryDecodeInst(DecoderTableAMDGPU32, MI, DW, Address);
if (Res) break;
+ Res = tryDecodeInst(DecoderTableGFX932, MI, DW, Address);
+ if (Res) break;
+
if (Bytes.size() < 4) break;
const uint64_t QW = ((uint64_t)eatBytes<uint32_t>(Bytes) << 32) | DW;
Res = tryDecodeInst(DecoderTableVI64, MI, QW, Address);
if (Res) break;
Res = tryDecodeInst(DecoderTableAMDGPU64, MI, QW, Address);
+ if (Res) break;
+
+ Res = tryDecodeInst(DecoderTableGFX964, MI, QW, Address);
} while (false);
if (Res && (MI.getOpcode() == AMDGPU::V_MAC_F32_e64_vi ||
@@ -217,6 +234,10 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
AMDGPU::OpName::src2_modifiers);
}
+ if (Res && (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::MIMG)) {
+ Res = convertMIMGInst(MI);
+ }
+
if (Res && IsSDWA)
Res = convertSDWAInst(MI);
@@ -233,7 +254,7 @@ DecodeStatus AMDGPUDisassembler::convertSDWAInst(MCInst &MI) const {
int SDst = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::sdst);
if (SDst != -1) {
// VOPC - insert VCC register as sdst
- insertNamedMCOperand(MI, MCOperand::createReg(AMDGPU::VCC),
+ insertNamedMCOperand(MI, createRegOperand(AMDGPU::VCC),
AMDGPU::OpName::sdst);
} else {
// VOP1/2 - insert omod if present in instruction
@@ -243,6 +264,42 @@ DecodeStatus AMDGPUDisassembler::convertSDWAInst(MCInst &MI) const {
return MCDisassembler::Success;
}
+DecodeStatus AMDGPUDisassembler::convertMIMGInst(MCInst &MI) const {
+ int VDataIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
+ AMDGPU::OpName::vdata);
+
+ int DMaskIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
+ AMDGPU::OpName::dmask);
+ unsigned DMask = MI.getOperand(DMaskIdx).getImm() & 0xf;
+ if (DMask == 0)
+ return MCDisassembler::Success;
+
+ unsigned ChannelCount = countPopulation(DMask);
+ if (ChannelCount == 1)
+ return MCDisassembler::Success;
+
+ int NewOpcode = AMDGPU::getMaskedMIMGOp(*MCII, MI.getOpcode(), ChannelCount);
+ assert(NewOpcode != -1 && "could not find matching mimg channel instruction");
+ auto RCID = MCII->get(NewOpcode).OpInfo[VDataIdx].RegClass;
+
+ // Widen the register to the correct number of enabled channels.
+ unsigned Vdata0 = MI.getOperand(VDataIdx).getReg();
+ auto NewVdata = MRI.getMatchingSuperReg(Vdata0, AMDGPU::sub0,
+ &MRI.getRegClass(RCID));
+ if (NewVdata == AMDGPU::NoRegister) {
+ // It's possible to encode this such that the low register + enabled
+ // components exceeds the register count.
+ return MCDisassembler::Success;
+ }
+
+ MI.setOpcode(NewOpcode);
+ // vaddr will always appear as a single VGPR. This will look different than
+ // how it is usually emitted because the number of register components is not
+ // in the instruction encoding.
+ MI.getOperand(VDataIdx) = MCOperand::createReg(NewVdata);
+ return MCDisassembler::Success;
+}
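+
The new convertMIMGInst re-selects the MIMG opcode so the vdata register class matches the number of channels enabled in dmask, then widens the decoded register accordingly. A standalone sketch of the channel-count step (function name invented for illustration):

    #include <bitset>
    #include <cstdio>

    // The enabled MIMG channel count is the population count of the low four
    // dmask bits; zero or one channel needs no opcode or register change.
    unsigned mimgChannelCount(unsigned DMask) {
      return static_cast<unsigned>(std::bitset<4>(DMask & 0xf).count());
    }

    int main() {
      std::printf("%u %u %u\n", mimgChannelCount(0x1), mimgChannelCount(0x5),
                  mimgChannelCount(0xf));  // 1 2 4
    }
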
+
const char* AMDGPUDisassembler::getRegClassName(unsigned RegClassID) const {
return getContext().getRegisterInfo()->
getRegClassName(&AMDGPUMCRegisterClasses[RegClassID]);
@@ -260,7 +317,7 @@ MCOperand AMDGPUDisassembler::errOperand(unsigned V,
inline
MCOperand AMDGPUDisassembler::createRegOperand(unsigned int RegId) const {
- return MCOperand::createReg(RegId);
+ return MCOperand::createReg(AMDGPU::getMCReg(RegId, STI));
}
inline
@@ -365,6 +422,12 @@ MCOperand AMDGPUDisassembler::decodeOperand_SReg_32_XM0_XEXEC(
return decodeOperand_SReg_32(Val);
}
+MCOperand AMDGPUDisassembler::decodeOperand_SReg_32_XEXEC_HI(
+ unsigned Val) const {
+ // SReg_32_XEXEC_HI is SReg_32 without EXEC_HI
+ return decodeOperand_SReg_32(Val);
+}
+
MCOperand AMDGPUDisassembler::decodeOperand_SReg_64(unsigned Val) const {
return decodeSrcOp(OPW64, Val);
}
@@ -385,7 +448,6 @@ MCOperand AMDGPUDisassembler::decodeOperand_SReg_512(unsigned Val) const {
return createSRegOperand(AMDGPU::SReg_512RegClassID, Val);
}
-
MCOperand AMDGPUDisassembler::decodeLiteralConstant() const {
// For now all literal constants are supposed to be unsigned integer
// ToDo: deal with signed/unsigned 64-bit integer constants
@@ -403,6 +465,7 @@ MCOperand AMDGPUDisassembler::decodeLiteralConstant() const {
MCOperand AMDGPUDisassembler::decodeIntImmed(unsigned Imm) {
using namespace AMDGPU::EncValues;
+
assert(Imm >= INLINE_INTEGER_C_MIN && Imm <= INLINE_INTEGER_C_MAX);
return MCOperand::createImm((Imm <= INLINE_INTEGER_C_POSITIVE_MAX) ?
(static_cast<int64_t>(Imm) - INLINE_INTEGER_C_MIN) :
@@ -505,6 +568,7 @@ MCOperand AMDGPUDisassembler::decodeFPImmed(OpWidthTy Width, unsigned Imm) {
unsigned AMDGPUDisassembler::getVgprClassId(const OpWidthTy Width) const {
using namespace AMDGPU;
+
assert(OPW_FIRST_ <= Width && Width < OPW_LAST_);
switch (Width) {
default: // fall
@@ -519,6 +583,7 @@ unsigned AMDGPUDisassembler::getVgprClassId(const OpWidthTy Width) const {
unsigned AMDGPUDisassembler::getSgprClassId(const OpWidthTy Width) const {
using namespace AMDGPU;
+
assert(OPW_FIRST_ <= Width && Width < OPW_LAST_);
switch (Width) {
default: // fall
@@ -533,6 +598,7 @@ unsigned AMDGPUDisassembler::getSgprClassId(const OpWidthTy Width) const {
unsigned AMDGPUDisassembler::getTtmpClassId(const OpWidthTy Width) const {
using namespace AMDGPU;
+
assert(OPW_FIRST_ <= Width && Width < OPW_LAST_);
switch (Width) {
default: // fall
@@ -545,8 +611,18 @@ unsigned AMDGPUDisassembler::getTtmpClassId(const OpWidthTy Width) const {
}
}
+int AMDGPUDisassembler::getTTmpIdx(unsigned Val) const {
+ using namespace AMDGPU::EncValues;
+
+ unsigned TTmpMin = isGFX9() ? TTMP_GFX9_MIN : TTMP_VI_MIN;
+ unsigned TTmpMax = isGFX9() ? TTMP_GFX9_MAX : TTMP_VI_MAX;
+
+ return (TTmpMin <= Val && Val <= TTmpMax)? Val - TTmpMin : -1;
+}
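+
getTTmpIdx folds a raw source-operand encoding into an index inside the trap-temporary SGPR block, whose start differs between VI and GFX9; anything outside the block yields -1. A sketch with the ranges written out as literals (the concrete bounds are my reading of the VI/GFX9 encodings, not taken from this hunk):

    #include <cstdio>

    // Assumed encoding ranges for illustration: ttmps at 112..123 on VI and
    // 108..123 on GFX9. The real bounds live in AMDGPU::EncValues.
    int getTTmpIdx(unsigned Val, bool IsGFX9) {
      unsigned Min = IsGFX9 ? 108u : 112u;
      unsigned Max = 123u;
      return (Min <= Val && Val <= Max) ? int(Val - Min) : -1;
    }

    int main() {
      std::printf("%d %d %d\n",
                  getTTmpIdx(112, /*IsGFX9=*/false),  // 0: first VI ttmp
                  getTTmpIdx(112, /*IsGFX9=*/true),   // 4: later ttmp on GFX9
                  getTTmpIdx(100, /*IsGFX9=*/true));  // -1: not a ttmp
    }
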
+
MCOperand AMDGPUDisassembler::decodeSrcOp(const OpWidthTy Width, unsigned Val) const {
using namespace AMDGPU::EncValues;
+
assert(Val < 512); // enum9
if (VGPR_MIN <= Val && Val <= VGPR_MAX) {
@@ -556,8 +632,10 @@ MCOperand AMDGPUDisassembler::decodeSrcOp(const OpWidthTy Width, unsigned Val) c
assert(SGPR_MIN == 0); // "SGPR_MIN <= Val" is always true and causes compilation warning.
return createSRegOperand(getSgprClassId(Width), Val - SGPR_MIN);
}
- if (TTMP_MIN <= Val && Val <= TTMP_MAX) {
- return createSRegOperand(getTtmpClassId(Width), Val - TTMP_MIN);
+
+ int TTmpIdx = getTTmpIdx(Val);
+ if (TTmpIdx >= 0) {
+ return createSRegOperand(getTtmpClassId(Width), TTmpIdx);
}
if (INLINE_INTEGER_C_MIN <= Val && Val <= INLINE_INTEGER_C_MAX)
@@ -583,18 +661,19 @@ MCOperand AMDGPUDisassembler::decodeSrcOp(const OpWidthTy Width, unsigned Val) c
MCOperand AMDGPUDisassembler::decodeSpecialReg32(unsigned Val) const {
using namespace AMDGPU;
+
switch (Val) {
- case 102: return createRegOperand(getMCReg(FLAT_SCR_LO, STI));
- case 103: return createRegOperand(getMCReg(FLAT_SCR_HI, STI));
+ case 102: return createRegOperand(FLAT_SCR_LO);
+ case 103: return createRegOperand(FLAT_SCR_HI);
// ToDo: no support for xnack_mask_lo/_hi register
case 104:
case 105: break;
case 106: return createRegOperand(VCC_LO);
case 107: return createRegOperand(VCC_HI);
- case 108: return createRegOperand(TBA_LO);
- case 109: return createRegOperand(TBA_HI);
- case 110: return createRegOperand(TMA_LO);
- case 111: return createRegOperand(TMA_HI);
+ case 108: assert(!isGFX9()); return createRegOperand(TBA_LO);
+ case 109: assert(!isGFX9()); return createRegOperand(TBA_HI);
+ case 110: assert(!isGFX9()); return createRegOperand(TMA_LO);
+ case 111: assert(!isGFX9()); return createRegOperand(TMA_HI);
case 124: return createRegOperand(M0);
case 126: return createRegOperand(EXEC_LO);
case 127: return createRegOperand(EXEC_HI);
@@ -615,11 +694,12 @@ MCOperand AMDGPUDisassembler::decodeSpecialReg32(unsigned Val) const {
MCOperand AMDGPUDisassembler::decodeSpecialReg64(unsigned Val) const {
using namespace AMDGPU;
+
switch (Val) {
- case 102: return createRegOperand(getMCReg(FLAT_SCR, STI));
+ case 102: return createRegOperand(FLAT_SCR);
case 106: return createRegOperand(VCC);
- case 108: return createRegOperand(TBA);
- case 110: return createRegOperand(TMA);
+ case 108: assert(!isGFX9()); return createRegOperand(TBA);
+ case 110: assert(!isGFX9()); return createRegOperand(TMA);
case 126: return createRegOperand(EXEC);
default: break;
}
@@ -643,6 +723,11 @@ MCOperand AMDGPUDisassembler::decodeSDWASrc(const OpWidthTy Width,
return createSRegOperand(getSgprClassId(Width),
Val - SDWA9EncValues::SRC_SGPR_MIN);
}
+ if (SDWA9EncValues::SRC_TTMP_MIN <= Val &&
+ Val <= SDWA9EncValues::SRC_TTMP_MAX) {
+ return createSRegOperand(getTtmpClassId(Width),
+ Val - SDWA9EncValues::SRC_TTMP_MIN);
+ }
return decodeSpecialReg32(Val - SDWA9EncValues::SRC_SGPR_MIN);
} else if (STI.getFeatureBits()[AMDGPU::FeatureVolcanicIslands]) {
@@ -659,7 +744,6 @@ MCOperand AMDGPUDisassembler::decodeSDWASrc32(unsigned Val) const {
return decodeSDWASrc(OPW32, Val);
}
-
MCOperand AMDGPUDisassembler::decodeSDWAVopcDst(unsigned Val) const {
using namespace AMDGPU::SDWA;
@@ -667,7 +751,11 @@ MCOperand AMDGPUDisassembler::decodeSDWAVopcDst(unsigned Val) const {
"SDWAVopcDst should be present only on GFX9");
if (Val & SDWA9EncValues::VOPC_DST_VCC_MASK) {
Val &= SDWA9EncValues::VOPC_DST_SGPR_MASK;
- if (Val > AMDGPU::EncValues::SGPR_MAX) {
+
+ int TTmpIdx = getTTmpIdx(Val);
+ if (TTmpIdx >= 0) {
+ return createSRegOperand(getTtmpClassId(OPW64), TTmpIdx);
+ } else if (Val > AMDGPU::EncValues::SGPR_MAX) {
return decodeSpecialReg64(Val);
} else {
return createSRegOperand(getSgprClassId(OPW64), Val);
@@ -677,6 +765,14 @@ MCOperand AMDGPUDisassembler::decodeSDWAVopcDst(unsigned Val) const {
}
}
+bool AMDGPUDisassembler::isVI() const {
+ return STI.getFeatureBits()[AMDGPU::FeatureVolcanicIslands];
+}
+
+bool AMDGPUDisassembler::isGFX9() const {
+ return STI.getFeatureBits()[AMDGPU::FeatureGFX9];
+}
+
//===----------------------------------------------------------------------===//
// AMDGPUSymbolizer
//===----------------------------------------------------------------------===//
@@ -686,8 +782,8 @@ bool AMDGPUSymbolizer::tryAddingSymbolicOperand(MCInst &Inst,
raw_ostream &/*cStream*/, int64_t Value,
uint64_t /*Address*/, bool IsBranch,
uint64_t /*Offset*/, uint64_t /*InstSize*/) {
- typedef std::tuple<uint64_t, StringRef, uint8_t> SymbolInfoTy;
- typedef std::vector<SymbolInfoTy> SectionSymbolsTy;
+ using SymbolInfoTy = std::tuple<uint64_t, StringRef, uint8_t>;
+ using SectionSymbolsTy = std::vector<SymbolInfoTy>;
if (!IsBranch) {
return false;
@@ -730,7 +826,7 @@ static MCSymbolizer *createAMDGPUSymbolizer(const Triple &/*TT*/,
static MCDisassembler *createAMDGPUDisassembler(const Target &T,
const MCSubtargetInfo &STI,
MCContext &Ctx) {
- return new AMDGPUDisassembler(STI, Ctx);
+ return new AMDGPUDisassembler(STI, Ctx, T.createMCInstrInfo());
}
extern "C" void LLVMInitializeAMDGPUDisassembler() {
diff --git a/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h b/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
index 4c755be09999..ce396eb68c4c 100644
--- a/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
+++ b/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
@@ -1,4 +1,4 @@
-//===-- AMDGPUDisassembler.hpp - Disassembler for AMDGPU ISA ---*- C++ -*--===//
+//===- AMDGPUDisassembler.hpp - Disassembler for AMDGPU ISA -----*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
@@ -17,16 +17,18 @@
#define LLVM_LIB_TARGET_AMDGPU_DISASSEMBLER_AMDGPUDISASSEMBLER_H
#include "llvm/ADT/ArrayRef.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCInstrInfo.h"
#include "llvm/MC/MCDisassembler/MCDisassembler.h"
#include "llvm/MC/MCDisassembler/MCRelocationInfo.h"
#include "llvm/MC/MCDisassembler/MCSymbolizer.h"
+
#include <algorithm>
#include <cstdint>
#include <memory>
namespace llvm {
-class MCContext;
class MCInst;
class MCOperand;
class MCSubtargetInfo;
@@ -38,13 +40,16 @@ class Twine;
class AMDGPUDisassembler : public MCDisassembler {
private:
+ std::unique_ptr<MCInstrInfo const> const MCII;
+ const MCRegisterInfo &MRI;
mutable ArrayRef<uint8_t> Bytes;
mutable uint32_t Literal;
mutable bool HasLiteral;
public:
- AMDGPUDisassembler(const MCSubtargetInfo &STI, MCContext &Ctx) :
- MCDisassembler(STI, Ctx) {}
+ AMDGPUDisassembler(const MCSubtargetInfo &STI, MCContext &Ctx,
+ MCInstrInfo const *MCII) :
+ MCDisassembler(STI, Ctx), MCII(MCII), MRI(*Ctx.getRegisterInfo()) {}
~AMDGPUDisassembler() override = default;
@@ -60,12 +65,11 @@ public:
MCOperand errOperand(unsigned V, const Twine& ErrMsg) const;
- DecodeStatus tryDecodeInst(const uint8_t* Table,
- MCInst &MI,
- uint64_t Inst,
- uint64_t Address) const;
+ DecodeStatus tryDecodeInst(const uint8_t* Table, MCInst &MI, uint64_t Inst,
+ uint64_t Address) const;
DecodeStatus convertSDWAInst(MCInst &MI) const;
+ DecodeStatus convertMIMGInst(MCInst &MI) const;
MCOperand decodeOperand_VGPR_32(unsigned Val) const;
MCOperand decodeOperand_VS_32(unsigned Val) const;
@@ -80,6 +84,7 @@ public:
MCOperand decodeOperand_SReg_32(unsigned Val) const;
MCOperand decodeOperand_SReg_32_XM0_XEXEC(unsigned Val) const;
+ MCOperand decodeOperand_SReg_32_XEXEC_HI(unsigned Val) const;
MCOperand decodeOperand_SReg_64(unsigned Val) const;
MCOperand decodeOperand_SReg_64_XEXEC(unsigned Val) const;
MCOperand decodeOperand_SReg_128(unsigned Val) const;
@@ -112,7 +117,12 @@ public:
MCOperand decodeSDWASrc16(unsigned Val) const;
MCOperand decodeSDWASrc32(unsigned Val) const;
MCOperand decodeSDWAVopcDst(unsigned Val) const;
-};
+
+ int getTTmpIdx(unsigned Val) const;
+
+ bool isVI() const;
+ bool isGFX9() const;
+ };
//===----------------------------------------------------------------------===//
// AMDGPUSymbolizer
diff --git a/lib/Target/AMDGPU/EvergreenInstructions.td b/lib/Target/AMDGPU/EvergreenInstructions.td
index 5480110d8315..5e26f97b0c86 100644
--- a/lib/Target/AMDGPU/EvergreenInstructions.td
+++ b/lib/Target/AMDGPU/EvergreenInstructions.td
@@ -15,20 +15,28 @@
def isEG : Predicate<
"Subtarget->getGeneration() >= AMDGPUSubtarget::EVERGREEN && "
- "Subtarget->getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS && "
+ "Subtarget->getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS && "
"!Subtarget->hasCaymanISA()"
>;
def isEGorCayman : Predicate<
"Subtarget->getGeneration() == AMDGPUSubtarget::EVERGREEN ||"
- "Subtarget->getGeneration() ==AMDGPUSubtarget::NORTHERN_ISLANDS"
+ "Subtarget->getGeneration() == AMDGPUSubtarget::NORTHERN_ISLANDS"
>;
+class EGPat<dag pattern, dag result> : AMDGPUPat<pattern, result> {
+ let SubtargetPredicate = isEG;
+}
+
+class EGOrCaymanPat<dag pattern, dag result> : AMDGPUPat<pattern, result> {
+ let SubtargetPredicate = isEGorCayman;
+}
+
//===----------------------------------------------------------------------===//
// Evergreen / Cayman store instructions
//===----------------------------------------------------------------------===//
-let Predicates = [isEGorCayman] in {
+let SubtargetPredicate = isEGorCayman in {
class CF_MEM_RAT_CACHELESS <bits<6> rat_inst, bits<4> rat_id, bits<4> mask, dag ins,
string name, list<dag> pattern>
@@ -88,13 +96,13 @@ defm RAT_ATOMIC_XOR : RAT_ATOMIC<16, 48, "ATOMIC_XOR">;
defm RAT_ATOMIC_INC_UINT : RAT_ATOMIC<18, 50, "ATOMIC_INC_UINT">;
defm RAT_ATOMIC_DEC_UINT : RAT_ATOMIC<19, 51, "ATOMIC_DEC_UINT">;
-} // End let Predicates = [isEGorCayman]
+} // End SubtargetPredicate = isEGorCayman
//===----------------------------------------------------------------------===//
// Evergreen Only instructions
//===----------------------------------------------------------------------===//
-let Predicates = [isEG] in {
+let SubtargetPredicate = isEG in {
def RECIP_IEEE_eg : RECIP_IEEE_Common<0x86>;
defm DIV_eg : DIV_Common<RECIP_IEEE_eg>;
@@ -116,7 +124,8 @@ def SIN_eg : SIN_Common<0x8D>;
def COS_eg : COS_Common<0x8E>;
def : POW_Common <LOG_IEEE_eg, EXP_IEEE_eg, MUL>;
-def : Pat<(fsqrt f32:$src), (MUL $src, (RECIPSQRT_CLAMPED_eg $src))>;
+def : EGPat<(fsqrt f32:$src), (MUL $src, (RECIPSQRT_CLAMPED_eg $src))>;
+} // End SubtargetPredicate = isEG
//===----------------------------------------------------------------------===//
// Memory read/write instructions
@@ -128,21 +137,21 @@ let usesCustomInserter = 1 in {
def RAT_WRITE_CACHELESS_32_eg : CF_MEM_RAT_CACHELESS <0x2, 0, 0x1,
(ins R600_TReg32_X:$rw_gpr, R600_TReg32_X:$index_gpr, InstFlag:$eop),
"STORE_RAW $rw_gpr, $index_gpr, $eop",
- [(global_store i32:$rw_gpr, i32:$index_gpr)]
+ [(store_global i32:$rw_gpr, i32:$index_gpr)]
>;
// 64-bit store
def RAT_WRITE_CACHELESS_64_eg : CF_MEM_RAT_CACHELESS <0x2, 0, 0x3,
(ins R600_Reg64:$rw_gpr, R600_TReg32_X:$index_gpr, InstFlag:$eop),
"STORE_RAW $rw_gpr.XY, $index_gpr, $eop",
- [(global_store v2i32:$rw_gpr, i32:$index_gpr)]
+ [(store_global v2i32:$rw_gpr, i32:$index_gpr)]
>;
// 128-bit store
def RAT_WRITE_CACHELESS_128_eg : CF_MEM_RAT_CACHELESS <0x2, 0, 0xf,
(ins R600_Reg128:$rw_gpr, R600_TReg32_X:$index_gpr, InstFlag:$eop),
"STORE_RAW $rw_gpr.XYZW, $index_gpr, $eop",
- [(global_store v4i32:$rw_gpr, i32:$index_gpr)]
+ [(store_global v4i32:$rw_gpr, i32:$index_gpr)]
>;
def RAT_STORE_TYPED_eg: CF_MEM_RAT_STORE_TYPED<1>;
@@ -203,8 +212,8 @@ def VTX_READ_32_eg
// to be caused by ALU instructions in the next instruction group that wrote
// to the $src_gpr registers of the VTX_READ.
// e.g.
- // %T3_X<def> = VTX_READ_PARAM_32_eg %T2_X<kill>, 24
- // %T2_X<def> = MOV %ZERO
+ // %t3_x = VTX_READ_PARAM_32_eg killed %t2_x, 24
+ // %t2_x = MOV %zero
// Adding this constraint prevents this from happening.
let Constraints = "$src_gpr.ptr = $dst_gpr";
}
@@ -241,58 +250,56 @@ def VTX_READ_128_eg
//===----------------------------------------------------------------------===//
// VTX Read from parameter memory space
//===----------------------------------------------------------------------===//
-def : Pat<(i32:$dst_gpr (vtx_id3_az_extloadi8 ADDRVTX_READ:$src_gpr)),
+def : EGPat<(i32:$dst_gpr (vtx_id3_az_extloadi8 ADDRVTX_READ:$src_gpr)),
(VTX_READ_8_eg MEMxi:$src_gpr, 3)>;
-def : Pat<(i32:$dst_gpr (vtx_id3_az_extloadi16 ADDRVTX_READ:$src_gpr)),
+def : EGPat<(i32:$dst_gpr (vtx_id3_az_extloadi16 ADDRVTX_READ:$src_gpr)),
(VTX_READ_16_eg MEMxi:$src_gpr, 3)>;
-def : Pat<(i32:$dst_gpr (vtx_id3_load ADDRVTX_READ:$src_gpr)),
+def : EGPat<(i32:$dst_gpr (vtx_id3_load ADDRVTX_READ:$src_gpr)),
(VTX_READ_32_eg MEMxi:$src_gpr, 3)>;
-def : Pat<(v2i32:$dst_gpr (vtx_id3_load ADDRVTX_READ:$src_gpr)),
+def : EGPat<(v2i32:$dst_gpr (vtx_id3_load ADDRVTX_READ:$src_gpr)),
(VTX_READ_64_eg MEMxi:$src_gpr, 3)>;
-def : Pat<(v4i32:$dst_gpr (vtx_id3_load ADDRVTX_READ:$src_gpr)),
+def : EGPat<(v4i32:$dst_gpr (vtx_id3_load ADDRVTX_READ:$src_gpr)),
(VTX_READ_128_eg MEMxi:$src_gpr, 3)>;
//===----------------------------------------------------------------------===//
// VTX Read from constant memory space
//===----------------------------------------------------------------------===//
-def : Pat<(i32:$dst_gpr (vtx_id2_az_extloadi8 ADDRVTX_READ:$src_gpr)),
+def : EGPat<(i32:$dst_gpr (vtx_id2_az_extloadi8 ADDRVTX_READ:$src_gpr)),
(VTX_READ_8_eg MEMxi:$src_gpr, 2)>;
-def : Pat<(i32:$dst_gpr (vtx_id2_az_extloadi16 ADDRVTX_READ:$src_gpr)),
+def : EGPat<(i32:$dst_gpr (vtx_id2_az_extloadi16 ADDRVTX_READ:$src_gpr)),
(VTX_READ_16_eg MEMxi:$src_gpr, 2)>;
-def : Pat<(i32:$dst_gpr (vtx_id2_load ADDRVTX_READ:$src_gpr)),
+def : EGPat<(i32:$dst_gpr (vtx_id2_load ADDRVTX_READ:$src_gpr)),
(VTX_READ_32_eg MEMxi:$src_gpr, 2)>;
-def : Pat<(v2i32:$dst_gpr (vtx_id2_load ADDRVTX_READ:$src_gpr)),
+def : EGPat<(v2i32:$dst_gpr (vtx_id2_load ADDRVTX_READ:$src_gpr)),
(VTX_READ_64_eg MEMxi:$src_gpr, 2)>;
-def : Pat<(v4i32:$dst_gpr (vtx_id2_load ADDRVTX_READ:$src_gpr)),
+def : EGPat<(v4i32:$dst_gpr (vtx_id2_load ADDRVTX_READ:$src_gpr)),
(VTX_READ_128_eg MEMxi:$src_gpr, 2)>;
//===----------------------------------------------------------------------===//
// VTX Read from global memory space
//===----------------------------------------------------------------------===//
-def : Pat<(i32:$dst_gpr (vtx_id1_az_extloadi8 ADDRVTX_READ:$src_gpr)),
+def : EGPat<(i32:$dst_gpr (vtx_id1_az_extloadi8 ADDRVTX_READ:$src_gpr)),
(VTX_READ_8_eg MEMxi:$src_gpr, 1)>;
-def : Pat<(i32:$dst_gpr (vtx_id1_az_extloadi16 ADDRVTX_READ:$src_gpr)),
+def : EGPat<(i32:$dst_gpr (vtx_id1_az_extloadi16 ADDRVTX_READ:$src_gpr)),
(VTX_READ_16_eg MEMxi:$src_gpr, 1)>;
-def : Pat<(i32:$dst_gpr (vtx_id1_load ADDRVTX_READ:$src_gpr)),
+def : EGPat<(i32:$dst_gpr (vtx_id1_load ADDRVTX_READ:$src_gpr)),
(VTX_READ_32_eg MEMxi:$src_gpr, 1)>;
-def : Pat<(v2i32:$dst_gpr (vtx_id1_load ADDRVTX_READ:$src_gpr)),
+def : EGPat<(v2i32:$dst_gpr (vtx_id1_load ADDRVTX_READ:$src_gpr)),
(VTX_READ_64_eg MEMxi:$src_gpr, 1)>;
-def : Pat<(v4i32:$dst_gpr (vtx_id1_load ADDRVTX_READ:$src_gpr)),
+def : EGPat<(v4i32:$dst_gpr (vtx_id1_load ADDRVTX_READ:$src_gpr)),
(VTX_READ_128_eg MEMxi:$src_gpr, 1)>;
-} // End Predicates = [isEG]
-
//===----------------------------------------------------------------------===//
// Evergreen / Cayman Instructions
//===----------------------------------------------------------------------===//
-let Predicates = [isEGorCayman] in {
+let SubtargetPredicate = isEGorCayman in {
multiclass AtomicPat<Instruction inst_ret, Instruction inst_noret,
SDPatternOperator node_ret, SDPatternOperator node_noret> {
// FIXME: Add _RTN version. We need per WI scratch location to store the old value
// EXTRACT_SUBREG here is dummy, we know the node has no uses
- def : Pat<(i32 (node_noret i32:$ptr, i32:$data)),
+ def : EGOrCaymanPat<(i32 (node_noret i32:$ptr, i32:$data)),
(EXTRACT_SUBREG (inst_noret
(INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), $data, sub0), $ptr), sub1)>;
}
@@ -300,7 +307,7 @@ multiclass AtomicIncDecPat<Instruction inst_ret, Instruction inst_noret,
SDPatternOperator node_ret, SDPatternOperator node_noret, int C> {
// FIXME: Add _RTN version. We need per WI scratch location to store the old value
// EXTRACT_SUBREG here is dummy, we know the node has no uses
- def : Pat<(i32 (node_noret i32:$ptr, C)),
+ def : EGOrCaymanPat<(i32 (node_noret i32:$ptr, C)),
(EXTRACT_SUBREG (inst_noret
(INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), (MOV_IMM_I32 -1), sub0), $ptr), sub1)>;
}
@@ -308,7 +315,7 @@ multiclass AtomicIncDecPat<Instruction inst_ret, Instruction inst_noret,
// The CMPSWAP pattern is special
// EXTRACT_SUBREG here is dummy, we know the node has no uses
// FIXME: Add _RTN version. We need per WI scratch location to store the old value
-def : Pat<(i32 (atomic_cmp_swap_global_noret i32:$ptr, i32:$cmp, i32:$data)),
+def : EGOrCaymanPat<(i32 (atomic_cmp_swap_global_noret i32:$ptr, i32:$cmp, i32:$data)),
(EXTRACT_SUBREG (RAT_ATOMIC_CMPXCHG_INT_NORET
(INSERT_SUBREG
(INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), $cmp, sub3),
@@ -395,11 +402,11 @@ def BFI_INT_eg : R600_3OP <0x06, "BFI_INT",
VecALU
>;
-def : Pat<(i32 (sext_inreg i32:$src, i1)),
+def : EGOrCaymanPat<(i32 (sext_inreg i32:$src, i1)),
(BFE_INT_eg i32:$src, (i32 ZERO), (i32 ONE_INT))>;
-def : Pat<(i32 (sext_inreg i32:$src, i8)),
+def : EGOrCaymanPat<(i32 (sext_inreg i32:$src, i8)),
(BFE_INT_eg i32:$src, (i32 ZERO), (MOV_IMM_I32 8))>;
-def : Pat<(i32 (sext_inreg i32:$src, i16)),
+def : EGOrCaymanPat<(i32 (sext_inreg i32:$src, i16)),
(BFE_INT_eg i32:$src, (i32 ZERO), (MOV_IMM_I32 16))>;
defm : BFIPatterns <BFI_INT_eg, MOV_IMM_I32, R600_Reg64>;
@@ -442,7 +449,7 @@ def FLT32_TO_FLT16 : R600_1OP_Helper <0xA2, "FLT32_TO_FLT16", AMDGPUfp_to_f16, V
def FLT16_TO_FLT32 : R600_1OP_Helper <0xA3, "FLT16_TO_FLT32", f16_to_fp, VecALU>;
def BCNT_INT : R600_1OP_Helper <0xAA, "BCNT_INT", ctpop, VecALU>;
def FFBH_UINT : R600_1OP_Helper <0xAB, "FFBH_UINT", AMDGPUffbh_u32, VecALU>;
-def FFBL_INT : R600_1OP_Helper <0xAC, "FFBL_INT", cttz_zero_undef, VecALU>;
+def FFBL_INT : R600_1OP_Helper <0xAC, "FFBL_INT", AMDGPUffbl_b32, VecALU>;
let hasSideEffects = 1 in {
def MOVA_INT_eg : R600_1OP <0xCC, "MOVA_INT", [], VecALU>;
@@ -614,7 +621,7 @@ def LDS_MAX_INT : R600_LDS_1A1D_NORET <0x6, "LDS_MAX_INT", [] >;
def LDS_MIN_UINT : R600_LDS_1A1D_NORET <0x7, "LDS_MIN_UINT", [] >;
def LDS_MAX_UINT : R600_LDS_1A1D_NORET <0x8, "LDS_MAX_UINT", [] >;
def LDS_WRITE : R600_LDS_1A1D_NORET <0xD, "LDS_WRITE",
- [(local_store (i32 R600_Reg32:$src1), R600_Reg32:$src0)]
+ [(store_local (i32 R600_Reg32:$src1), R600_Reg32:$src0)]
>;
def LDS_BYTE_WRITE : R600_LDS_1A1D_NORET<0x12, "LDS_BYTE_WRITE",
[(truncstorei8_local i32:$src1, i32:$src0)]
@@ -653,10 +660,10 @@ def LDS_WRXCHG_RET : R600_LDS_1A1D_RET <0x2d, "LDS_WRXCHG",
[(set i32:$dst, (atomic_swap_local i32:$src0, i32:$src1))]
>;
def LDS_CMPST_RET : R600_LDS_1A2D_RET <0x30, "LDS_CMPST",
- [(set i32:$dst, (atomic_cmp_swap_32_local i32:$src0, i32:$src1, i32:$src2))]
+ [(set i32:$dst, (atomic_cmp_swap_local i32:$src0, i32:$src1, i32:$src2))]
>;
def LDS_READ_RET : R600_LDS_1A <0x32, "LDS_READ_RET",
- [(set (i32 R600_Reg32:$dst), (local_load R600_Reg32:$src0))]
+ [(set (i32 R600_Reg32:$dst), (load_local R600_Reg32:$src0))]
>;
def LDS_BYTE_READ_RET : R600_LDS_1A <0x36, "LDS_BYTE_READ_RET",
[(set i32:$dst, (sextloadi8_local i32:$src0))]
@@ -681,9 +688,9 @@ def LDS_USHORT_READ_RET : R600_LDS_1A <0x39, "LDS_USHORT_READ_RET",
// XXX: Lowering SELECT_CC will sometimes generate fp_to_[su]int nodes,
// which do not need to be truncated since the fp values are 0.0f or 1.0f.
// We should look into handling these cases separately.
-def : Pat<(fp_to_sint f32:$src0), (FLT_TO_INT_eg (TRUNC $src0))>;
+def : EGOrCaymanPat<(fp_to_sint f32:$src0), (FLT_TO_INT_eg (TRUNC $src0))>;
-def : Pat<(fp_to_uint f32:$src0), (FLT_TO_UINT_eg (TRUNC $src0))>;
+def : EGOrCaymanPat<(fp_to_uint f32:$src0), (FLT_TO_UINT_eg (TRUNC $src0))>;
// SHA-256 Patterns
def : SHA256MaPattern <BFI_INT_eg, XOR_INT>;
diff --git a/lib/Target/AMDGPU/FLATInstructions.td b/lib/Target/AMDGPU/FLATInstructions.td
index edca6fcd812c..693869128081 100644
--- a/lib/Target/AMDGPU/FLATInstructions.td
+++ b/lib/Target/AMDGPU/FLATInstructions.td
@@ -8,7 +8,10 @@
//===----------------------------------------------------------------------===//
def FLATAtomic : ComplexPattern<i64, 3, "SelectFlatAtomic", [], [], -10>;
-def FLATOffset : ComplexPattern<i64, 3, "SelectFlat", [], [], -10>;
+def FLATOffset : ComplexPattern<i64, 3, "SelectFlatOffset<false>", [], [], -10>;
+
+def FLATOffsetSigned : ComplexPattern<i64, 3, "SelectFlatOffset<true>", [], [], -10>;
+def FLATSignedAtomic : ComplexPattern<i64, 3, "SelectFlatAtomicSigned", [], [], -10>;
//===----------------------------------------------------------------------===//
// FLAT classes
@@ -22,14 +25,7 @@ class FLAT_Pseudo<string opName, dag outs, dag ins,
let isPseudo = 1;
let isCodeGenOnly = 1;
- let SubtargetPredicate = isCIVI;
-
let FLAT = 1;
- // Internally, FLAT instruction are executed as both an LDS and a
- // Buffer instruction; so, they increment both VM_CNT and LGKM_CNT
- // and are not considered done until both have been decremented.
- let VM_CNT = 1;
- let LGKM_CNT = 1;
let UseNamedOperandTable = 1;
let hasSideEffects = 0;
@@ -42,12 +38,32 @@ class FLAT_Pseudo<string opName, dag outs, dag ins,
bits<1> is_flat_scratch = 0;
bits<1> has_vdst = 1;
+
+ // We need to distinguish having saddr and enabling saddr because
+ // saddr is only valid for scratch and global instructions. Pre-gfx9
+ // these bits were reserved, so we also don't necessarily want to
+ // set these bits to the disabled value for the original flat
+ // segment instructions.
+ bits<1> has_saddr = 0;
+ bits<1> enabled_saddr = 0;
+ bits<7> saddr_value = 0;
+ bits<1> has_vaddr = 1;
+
bits<1> has_data = 1;
bits<1> has_glc = 1;
bits<1> glcValue = 0;
+ let SubtargetPredicate = !if(is_flat_global, HasFlatGlobalInsts,
+ !if(is_flat_scratch, HasFlatScratchInsts, HasFlatAddressSpace));
+
// TODO: M0 if it could possibly access LDS (before gfx9? only)?
let Uses = !if(is_flat_global, [EXEC], [EXEC, FLAT_SCR]);
+
+ // Internally, FLAT instructions are executed as both an LDS and a
+ // Buffer instruction; so, they increment both VM_CNT and LGKM_CNT
+ // and are not considered done until both have been decremented.
+ let VM_CNT = 1;
+ let LGKM_CNT = !if(!or(is_flat_global, is_flat_scratch), 0, 1);
}
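
With this change a true flat access still waits on both the vector-memory and the LDS/GDS counter, while the global and scratch forms only increment VM_CNT, presumably because they are known not to touch LDS. A tiny C++ mirror of the !if expression above (names invented for illustration):

    #include <cstdio>

    struct Counters { bool VM_CNT, LGKM_CNT; };

    // Flat increments both counters; global/scratch segment instructions
    // increment only the vector-memory counter.
    Counters flatCounters(bool IsFlatGlobal, bool IsFlatScratch) {
      return {true, !(IsFlatGlobal || IsFlatScratch)};
    }

    int main() {
      Counters F = flatCounters(false, false), G = flatCounters(true, false);
      std::printf("flat: vm=%d lgkm=%d  global: vm=%d lgkm=%d\n",
                  F.VM_CNT, F.LGKM_CNT, G.VM_CNT, G.LGKM_CNT);
    }
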
class FLAT_Real <bits<7> op, FLAT_Pseudo ps> :
@@ -66,7 +82,9 @@ class FLAT_Real <bits<7> op, FLAT_Pseudo ps> :
// encoding fields
bits<8> vaddr;
bits<8> vdata;
+ bits<7> saddr;
bits<8> vdst;
+
bits<1> slc;
bits<1> glc;
@@ -94,56 +112,143 @@ class FLAT_Real <bits<7> op, FLAT_Pseudo ps> :
let Inst{17} = slc;
let Inst{24-18} = op;
let Inst{31-26} = 0x37; // Encoding.
- let Inst{39-32} = vaddr;
+ let Inst{39-32} = !if(ps.has_vaddr, vaddr, ?);
let Inst{47-40} = !if(ps.has_data, vdata, ?);
+ let Inst{54-48} = !if(ps.has_saddr, !if(ps.enabled_saddr, saddr, 0x7f), 0);
+
// 54-48 is reserved.
let Inst{55} = nv; // nv on GFX9+, TFE before.
let Inst{63-56} = !if(ps.has_vdst, vdst, ?);
}
+// TODO: Is exec allowed for saddr? The disabled value 0x7f is the
+// same encoding value as exec_hi, so it isn't possible to use that if
+// saddr is 32-bit (which isn't handled here yet).
class FLAT_Load_Pseudo <string opName, RegisterClass regClass,
- bit HasSignedOffset = 0> : FLAT_Pseudo<
+ bit HasTiedOutput = 0,
+ bit HasSignedOffset = 0, bit HasSaddr = 0, bit EnableSaddr = 0> : FLAT_Pseudo<
opName,
(outs regClass:$vdst),
- !if(HasSignedOffset,
- (ins VReg_64:$vaddr, offset_s13:$offset, GLC:$glc, slc:$slc),
- (ins VReg_64:$vaddr, offset_u12:$offset, GLC:$glc, slc:$slc)),
- " $vdst, $vaddr$offset$glc$slc"> {
+ !con(
+ !con(
+ !con(
+ !con((ins VReg_64:$vaddr),
+ !if(EnableSaddr, (ins SReg_64:$saddr), (ins))),
+ (ins !if(HasSignedOffset,offset_s13,offset_u12):$offset)),
+ (ins GLC:$glc, slc:$slc)),
+ !if(HasTiedOutput, (ins regClass:$vdst_in), (ins))),
+ " $vdst, $vaddr"#!if(HasSaddr, !if(EnableSaddr, ", $saddr", ", off"), "")#"$offset$glc$slc"> {
let has_data = 0;
let mayLoad = 1;
+ let has_saddr = HasSaddr;
+ let enabled_saddr = EnableSaddr;
+ let PseudoInstr = opName#!if(!and(HasSaddr, EnableSaddr), "_SADDR", "");
+ let maybeAtomic = 1;
+
+ let Constraints = !if(HasTiedOutput, "$vdst = $vdst_in", "");
+ let DisableEncoding = !if(HasTiedOutput, "$vdst_in", "");
+}
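+
The ins list now chooses offset_s13 or offset_u12 per instruction: the original flat forms keep an unsigned 12-bit immediate offset, while the global/scratch forms take a signed 13-bit one. A small range-check sketch (bit widths inferred from the operand names, stated as an assumption rather than a spec quote):

    #include <cstdio>

    // offset_u12: unsigned 12-bit immediate, 0..4095 (assumed).
    // offset_s13: signed 13-bit immediate, -4096..4095 (assumed).
    bool fitsU12(long long Off) { return Off >= 0 && Off < (1 << 12); }
    bool fitsS13(long long Off) { return Off >= -(1 << 12) && Off < (1 << 12); }

    int main() {
      std::printf("%d %d\n", fitsU12(-16), fitsS13(-16));    // 0 1
      std::printf("%d %d\n", fitsU12(4095), fitsS13(4096));  // 1 0
    }
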
+
+class FLAT_Store_Pseudo <string opName, RegisterClass vdataClass,
+ bit HasSignedOffset = 0, bit HasSaddr = 0, bit EnableSaddr = 0> : FLAT_Pseudo<
+ opName,
+ (outs),
+ !con(
+ !con(
+ !con((ins VReg_64:$vaddr, vdataClass:$vdata),
+ !if(EnableSaddr, (ins SReg_64:$saddr), (ins))),
+ (ins !if(HasSignedOffset,offset_s13,offset_u12):$offset)),
+ (ins GLC:$glc, slc:$slc)),
+ " $vaddr, $vdata"#!if(HasSaddr, !if(EnableSaddr, ", $saddr", ", off"), "")#"$offset$glc$slc"> {
+ let mayLoad = 0;
+ let mayStore = 1;
+ let has_vdst = 0;
+ let has_saddr = HasSaddr;
+ let enabled_saddr = EnableSaddr;
+ let PseudoInstr = opName#!if(!and(HasSaddr, EnableSaddr), "_SADDR", "");
+ let maybeAtomic = 1;
}
-class FLAT_Global_Load_Pseudo<string opName, RegisterClass regClass> :
- FLAT_Load_Pseudo<opName, regClass, 1> {
- let is_flat_global = 1;
+multiclass FLAT_Global_Load_Pseudo<string opName, RegisterClass regClass, bit HasTiedInput = 0> {
+ let is_flat_global = 1 in {
+ def "" : FLAT_Load_Pseudo<opName, regClass, HasTiedInput, 1, 1>;
+ def _SADDR : FLAT_Load_Pseudo<opName, regClass, HasTiedInput, 1, 1, 1>;
+ }
}
-class FLAT_Scratch_Load_Pseudo<string opName, RegisterClass regClass> :
- FLAT_Load_Pseudo<opName, regClass, 1> {
- let is_flat_scratch = 1;
+multiclass FLAT_Global_Store_Pseudo<string opName, RegisterClass regClass> {
+ let is_flat_global = 1 in {
+ def "" : FLAT_Store_Pseudo<opName, regClass, 1, 1>;
+ def _SADDR : FLAT_Store_Pseudo<opName, regClass, 1, 1, 1>;
+ }
}
-class FLAT_Store_Pseudo <string opName, RegisterClass vdataClass,
- bit HasSignedOffset = 0> : FLAT_Pseudo<
+class FLAT_Scratch_Load_Pseudo <string opName, RegisterClass regClass,
+ bit EnableSaddr = 0>: FLAT_Pseudo<
+ opName,
+ (outs regClass:$vdst),
+ !if(EnableSaddr,
+ (ins SReg_32_XEXEC_HI:$saddr, offset_s13:$offset, GLC:$glc, slc:$slc),
+ (ins VGPR_32:$vaddr, offset_s13:$offset, GLC:$glc, slc:$slc)),
+ " $vdst, "#!if(EnableSaddr, "off", "$vaddr")#!if(EnableSaddr, ", $saddr", ", off")#"$offset$glc$slc"> {
+ let has_data = 0;
+ let mayLoad = 1;
+ let has_saddr = 1;
+ let enabled_saddr = EnableSaddr;
+ let has_vaddr = !if(EnableSaddr, 0, 1);
+ let PseudoInstr = opName#!if(EnableSaddr, "_SADDR", "");
+ let maybeAtomic = 1;
+}
+
+class FLAT_Scratch_Store_Pseudo <string opName, RegisterClass vdataClass, bit EnableSaddr = 0> : FLAT_Pseudo<
opName,
(outs),
- !if(HasSignedOffset,
- (ins VReg_64:$vaddr, vdataClass:$vdata, offset_s13:$offset, GLC:$glc, slc:$slc),
- (ins VReg_64:$vaddr, vdataClass:$vdata, offset_u12:$offset, GLC:$glc, slc:$slc)),
- " $vaddr, $vdata$offset$glc$slc"> {
+ !if(EnableSaddr,
+ (ins vdataClass:$vdata, SReg_32_XEXEC_HI:$saddr, offset_s13:$offset, GLC:$glc, slc:$slc),
+ (ins vdataClass:$vdata, VGPR_32:$vaddr, offset_s13:$offset, GLC:$glc, slc:$slc)),
+ " "#!if(EnableSaddr, "off", "$vaddr")#", $vdata, "#!if(EnableSaddr, "$saddr", "off")#"$offset$glc$slc"> {
let mayLoad = 0;
let mayStore = 1;
let has_vdst = 0;
+ let has_saddr = 1;
+ let enabled_saddr = EnableSaddr;
+ let has_vaddr = !if(EnableSaddr, 0, 1);
+ let PseudoInstr = opName#!if(EnableSaddr, "_SADDR", "");
+ let maybeAtomic = 1;
}
-class FLAT_Global_Store_Pseudo<string opName, RegisterClass regClass> :
- FLAT_Store_Pseudo<opName, regClass, 1> {
- let is_flat_global = 1;
+multiclass FLAT_Scratch_Load_Pseudo<string opName, RegisterClass regClass> {
+ let is_flat_scratch = 1 in {
+ def "" : FLAT_Scratch_Load_Pseudo<opName, regClass>;
+ def _SADDR : FLAT_Scratch_Load_Pseudo<opName, regClass, 1>;
+ }
}
-class FLAT_Scratch_Store_Pseudo<string opName, RegisterClass regClass> :
- FLAT_Store_Pseudo<opName, regClass, 1> {
- let is_flat_scratch = 1;
+multiclass FLAT_Scratch_Store_Pseudo<string opName, RegisterClass regClass> {
+ let is_flat_scratch = 1 in {
+ def "" : FLAT_Scratch_Store_Pseudo<opName, regClass>;
+ def _SADDR : FLAT_Scratch_Store_Pseudo<opName, regClass, 1>;
+ }
+}
+
+class FLAT_AtomicNoRet_Pseudo<string opName, dag outs, dag ins,
+ string asm, list<dag> pattern = []> :
+ FLAT_Pseudo<opName, outs, ins, asm, pattern> {
+ let mayLoad = 1;
+ let mayStore = 1;
+ let has_glc = 0;
+ let glcValue = 0;
+ let has_vdst = 0;
+ let maybeAtomic = 1;
+}
+
+class FLAT_AtomicRet_Pseudo<string opName, dag outs, dag ins,
+ string asm, list<dag> pattern = []>
+ : FLAT_AtomicNoRet_Pseudo<opName, outs, ins, asm, pattern> {
+ let hasPostISelHook = 1;
+ let has_vdst = 1;
+ let glcValue = 1;
+ let PseudoInstr = NAME # "_RTN";
}
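
The atomic pseudos are now built from a no-return base class and a returning subclass: the _RTN form sets glc and exposes vdst, so the pre-operation value comes back, while the base form performs the same update and discards it. A C++ analogy for that return/no-return split only (not the GPU memory model):

    #include <atomic>
    #include <cstdio>

    int main() {
      std::atomic<int> mem{10};

      // "_RTN" style: the old value is returned (glc set, vdst written).
      int old = mem.fetch_add(5);

      // no-return style: same update, result discarded (glc clear, no vdst).
      mem.fetch_add(5);

      std::printf("old=%d now=%d\n", old, mem.load());  // old=10 now=20
    }
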
multiclass FLAT_Atomic_Pseudo<
@@ -152,40 +257,69 @@ multiclass FLAT_Atomic_Pseudo<
ValueType vt,
SDPatternOperator atomic = null_frag,
ValueType data_vt = vt,
- RegisterClass data_rc = vdst_rc,
- bit HasSignedOffset = 0> {
-
- def "" : FLAT_Pseudo <opName,
+ RegisterClass data_rc = vdst_rc> {
+ def "" : FLAT_AtomicNoRet_Pseudo <opName,
(outs),
- !if(HasSignedOffset,
- (ins VReg_64:$vaddr, data_rc:$vdata, offset_s13:$offset, slc:$slc),
- (ins VReg_64:$vaddr, data_rc:$vdata, offset_u12:$offset, slc:$slc)),
- " $vaddr, $vdata$offset$slc",
- []>,
- AtomicNoRet <NAME, 0> {
- let mayLoad = 1;
- let mayStore = 1;
- let has_glc = 0;
- let glcValue = 0;
- let has_vdst = 0;
+ (ins VReg_64:$vaddr, data_rc:$vdata, offset_u12:$offset, slc:$slc),
+ " $vaddr, $vdata$offset$slc">,
+ AtomicNoRet <opName, 0> {
let PseudoInstr = NAME;
}
- def _RTN : FLAT_Pseudo <opName,
+ def _RTN : FLAT_AtomicRet_Pseudo <opName,
(outs vdst_rc:$vdst),
- !if(HasSignedOffset,
- (ins VReg_64:$vaddr, data_rc:$vdata, offset_s13:$offset, slc:$slc),
- (ins VReg_64:$vaddr, data_rc:$vdata, offset_u12:$offset, slc:$slc)),
+ (ins VReg_64:$vaddr, data_rc:$vdata, offset_u12:$offset, slc:$slc),
" $vdst, $vaddr, $vdata$offset glc$slc",
[(set vt:$vdst,
(atomic (FLATAtomic i64:$vaddr, i16:$offset, i1:$slc), data_vt:$vdata))]>,
- AtomicNoRet <NAME, 1> {
- let mayLoad = 1;
- let mayStore = 1;
- let hasPostISelHook = 1;
- let has_glc = 0;
- let glcValue = 1;
- let PseudoInstr = NAME # "_RTN";
+ AtomicNoRet <opName, 1>;
+}
+
+multiclass FLAT_Global_Atomic_Pseudo<
+ string opName,
+ RegisterClass vdst_rc,
+ ValueType vt,
+ SDPatternOperator atomic = null_frag,
+ ValueType data_vt = vt,
+ RegisterClass data_rc = vdst_rc> {
+
+ def "" : FLAT_AtomicNoRet_Pseudo <opName,
+ (outs),
+ (ins VReg_64:$vaddr, data_rc:$vdata, offset_s13:$offset, slc:$slc),
+ " $vaddr, $vdata, off$offset$slc">,
+ AtomicNoRet <opName, 0> {
+ let has_saddr = 1;
+ let PseudoInstr = NAME;
+ }
+
+ def _RTN : FLAT_AtomicRet_Pseudo <opName,
+ (outs vdst_rc:$vdst),
+ (ins VReg_64:$vaddr, data_rc:$vdata, offset_s13:$offset, slc:$slc),
+ " $vdst, $vaddr, $vdata, off$offset glc$slc",
+ [(set vt:$vdst,
+ (atomic (FLATSignedAtomic i64:$vaddr, i16:$offset, i1:$slc), data_vt:$vdata))]>,
+ AtomicNoRet <opName, 1> {
+ let has_saddr = 1;
+ }
+
+ def _SADDR : FLAT_AtomicNoRet_Pseudo <opName,
+ (outs),
+ (ins VReg_64:$vaddr, data_rc:$vdata, SReg_64:$saddr, offset_s13:$offset, slc:$slc),
+ " $vaddr, $vdata, $saddr$offset$slc">,
+ AtomicNoRet <opName#"_saddr", 0> {
+ let has_saddr = 1;
+ let enabled_saddr = 1;
+ let PseudoInstr = NAME#"_SADDR";
+ }
+
+ def _SADDR_RTN : FLAT_AtomicRet_Pseudo <opName,
+ (outs vdst_rc:$vdst),
+ (ins VReg_64:$vaddr, data_rc:$vdata, SReg_64:$saddr, offset_s13:$offset, slc:$slc),
+ " $vdst, $vaddr, $vdata, $saddr$offset glc$slc">,
+ AtomicNoRet <opName#"_saddr", 1> {
+ let has_saddr = 1;
+ let enabled_saddr = 1;
+ let PseudoInstr = NAME#"_SADDR_RTN";
}
}
@@ -231,6 +365,18 @@ def FLAT_STORE_DWORDX2 : FLAT_Store_Pseudo <"flat_store_dwordx2", VReg_64>;
def FLAT_STORE_DWORDX4 : FLAT_Store_Pseudo <"flat_store_dwordx4", VReg_128>;
def FLAT_STORE_DWORDX3 : FLAT_Store_Pseudo <"flat_store_dwordx3", VReg_96>;
+let SubtargetPredicate = HasD16LoadStore in {
+def FLAT_LOAD_UBYTE_D16 : FLAT_Load_Pseudo <"flat_load_ubyte_d16", VGPR_32, 1>;
+def FLAT_LOAD_UBYTE_D16_HI : FLAT_Load_Pseudo <"flat_load_ubyte_d16_hi", VGPR_32, 1>;
+def FLAT_LOAD_SBYTE_D16 : FLAT_Load_Pseudo <"flat_load_sbyte_d16", VGPR_32, 1>;
+def FLAT_LOAD_SBYTE_D16_HI : FLAT_Load_Pseudo <"flat_load_sbyte_d16_hi", VGPR_32, 1>;
+def FLAT_LOAD_SHORT_D16 : FLAT_Load_Pseudo <"flat_load_short_d16", VGPR_32, 1>;
+def FLAT_LOAD_SHORT_D16_HI : FLAT_Load_Pseudo <"flat_load_short_d16_hi", VGPR_32, 1>;
+
+def FLAT_STORE_BYTE_D16_HI : FLAT_Store_Pseudo <"flat_store_byte_d16_hi", VGPR_32>;
+def FLAT_STORE_SHORT_D16_HI : FLAT_Store_Pseudo <"flat_store_short_d16_hi", VGPR_32>;
+}
+
defm FLAT_ATOMIC_CMPSWAP : FLAT_Atomic_Pseudo <"flat_atomic_cmpswap",
VGPR_32, i32, atomic_cmp_swap_flat,
v2i32, VReg_64>;
@@ -334,108 +480,274 @@ defm FLAT_ATOMIC_FMAX_X2 : FLAT_Atomic_Pseudo <"flat_atomic_fmax_x2",
} // End SubtargetPredicate = isCI
let SubtargetPredicate = HasFlatGlobalInsts in {
-def GLOBAL_LOAD_UBYTE : FLAT_Global_Load_Pseudo <"global_load_ubyte", VGPR_32>;
-def GLOBAL_LOAD_SBYTE : FLAT_Global_Load_Pseudo <"global_load_sbyte", VGPR_32>;
-def GLOBAL_LOAD_USHORT : FLAT_Global_Load_Pseudo <"global_load_ushort", VGPR_32>;
-def GLOBAL_LOAD_SSHORT : FLAT_Global_Load_Pseudo <"global_load_sshort", VGPR_32>;
-def GLOBAL_LOAD_DWORD : FLAT_Global_Load_Pseudo <"global_load_dword", VGPR_32>;
-def GLOBAL_LOAD_DWORDX2 : FLAT_Global_Load_Pseudo <"global_load_dwordx2", VReg_64>;
-def GLOBAL_LOAD_DWORDX3 : FLAT_Global_Load_Pseudo <"global_load_dwordx3", VReg_96>;
-def GLOBAL_LOAD_DWORDX4 : FLAT_Global_Load_Pseudo <"global_load_dwordx4", VReg_128>;
-
-def GLOBAL_STORE_BYTE : FLAT_Global_Store_Pseudo <"global_store_byte", VGPR_32>;
-def GLOBAL_STORE_SHORT : FLAT_Global_Store_Pseudo <"global_store_short", VGPR_32>;
-def GLOBAL_STORE_DWORD : FLAT_Global_Store_Pseudo <"global_store_dword", VGPR_32>;
-def GLOBAL_STORE_DWORDX2 : FLAT_Global_Store_Pseudo <"global_store_dwordx2", VReg_64>;
-def GLOBAL_STORE_DWORDX3 : FLAT_Global_Store_Pseudo <"global_store_dwordx3", VReg_96>;
-def GLOBAL_STORE_DWORDX4 : FLAT_Global_Store_Pseudo <"global_store_dwordx4", VReg_128>;
+defm GLOBAL_LOAD_UBYTE : FLAT_Global_Load_Pseudo <"global_load_ubyte", VGPR_32>;
+defm GLOBAL_LOAD_SBYTE : FLAT_Global_Load_Pseudo <"global_load_sbyte", VGPR_32>;
+defm GLOBAL_LOAD_USHORT : FLAT_Global_Load_Pseudo <"global_load_ushort", VGPR_32>;
+defm GLOBAL_LOAD_SSHORT : FLAT_Global_Load_Pseudo <"global_load_sshort", VGPR_32>;
+defm GLOBAL_LOAD_DWORD : FLAT_Global_Load_Pseudo <"global_load_dword", VGPR_32>;
+defm GLOBAL_LOAD_DWORDX2 : FLAT_Global_Load_Pseudo <"global_load_dwordx2", VReg_64>;
+defm GLOBAL_LOAD_DWORDX3 : FLAT_Global_Load_Pseudo <"global_load_dwordx3", VReg_96>;
+defm GLOBAL_LOAD_DWORDX4 : FLAT_Global_Load_Pseudo <"global_load_dwordx4", VReg_128>;
+
+defm GLOBAL_LOAD_UBYTE_D16 : FLAT_Global_Load_Pseudo <"global_load_ubyte_d16", VGPR_32, 1>;
+defm GLOBAL_LOAD_UBYTE_D16_HI : FLAT_Global_Load_Pseudo <"global_load_ubyte_d16_hi", VGPR_32, 1>;
+defm GLOBAL_LOAD_SBYTE_D16 : FLAT_Global_Load_Pseudo <"global_load_sbyte_d16", VGPR_32, 1>;
+defm GLOBAL_LOAD_SBYTE_D16_HI : FLAT_Global_Load_Pseudo <"global_load_sbyte_d16_hi", VGPR_32, 1>;
+defm GLOBAL_LOAD_SHORT_D16 : FLAT_Global_Load_Pseudo <"global_load_short_d16", VGPR_32, 1>;
+defm GLOBAL_LOAD_SHORT_D16_HI : FLAT_Global_Load_Pseudo <"global_load_short_d16_hi", VGPR_32, 1>;
+
+defm GLOBAL_STORE_BYTE : FLAT_Global_Store_Pseudo <"global_store_byte", VGPR_32>;
+defm GLOBAL_STORE_SHORT : FLAT_Global_Store_Pseudo <"global_store_short", VGPR_32>;
+defm GLOBAL_STORE_DWORD : FLAT_Global_Store_Pseudo <"global_store_dword", VGPR_32>;
+defm GLOBAL_STORE_DWORDX2 : FLAT_Global_Store_Pseudo <"global_store_dwordx2", VReg_64>;
+defm GLOBAL_STORE_DWORDX3 : FLAT_Global_Store_Pseudo <"global_store_dwordx3", VReg_96>;
+defm GLOBAL_STORE_DWORDX4 : FLAT_Global_Store_Pseudo <"global_store_dwordx4", VReg_128>;
+
+defm GLOBAL_STORE_BYTE_D16_HI : FLAT_Global_Store_Pseudo <"global_store_byte_d16_hi", VGPR_32>;
+defm GLOBAL_STORE_SHORT_D16_HI : FLAT_Global_Store_Pseudo <"global_store_short_d16_hi", VGPR_32>;
+
+let is_flat_global = 1 in {
+defm GLOBAL_ATOMIC_CMPSWAP : FLAT_Global_Atomic_Pseudo <"global_atomic_cmpswap",
+ VGPR_32, i32, AMDGPUatomic_cmp_swap_global,
+ v2i32, VReg_64>;
+
+defm GLOBAL_ATOMIC_CMPSWAP_X2 : FLAT_Global_Atomic_Pseudo <"global_atomic_cmpswap_x2",
+ VReg_64, i64, AMDGPUatomic_cmp_swap_global,
+ v2i64, VReg_128>;
+
+defm GLOBAL_ATOMIC_SWAP : FLAT_Global_Atomic_Pseudo <"global_atomic_swap",
+ VGPR_32, i32, atomic_swap_global>;
+
+defm GLOBAL_ATOMIC_SWAP_X2 : FLAT_Global_Atomic_Pseudo <"global_atomic_swap_x2",
+ VReg_64, i64, atomic_swap_global>;
+
+defm GLOBAL_ATOMIC_ADD : FLAT_Global_Atomic_Pseudo <"global_atomic_add",
+ VGPR_32, i32, atomic_add_global>;
+
+defm GLOBAL_ATOMIC_SUB : FLAT_Global_Atomic_Pseudo <"global_atomic_sub",
+ VGPR_32, i32, atomic_sub_global>;
+
+defm GLOBAL_ATOMIC_SMIN : FLAT_Global_Atomic_Pseudo <"global_atomic_smin",
+ VGPR_32, i32, atomic_min_global>;
+
+defm GLOBAL_ATOMIC_UMIN : FLAT_Global_Atomic_Pseudo <"global_atomic_umin",
+ VGPR_32, i32, atomic_umin_global>;
+
+defm GLOBAL_ATOMIC_SMAX : FLAT_Global_Atomic_Pseudo <"global_atomic_smax",
+ VGPR_32, i32, atomic_max_global>;
+
+defm GLOBAL_ATOMIC_UMAX : FLAT_Global_Atomic_Pseudo <"global_atomic_umax",
+ VGPR_32, i32, atomic_umax_global>;
+
+defm GLOBAL_ATOMIC_AND : FLAT_Global_Atomic_Pseudo <"global_atomic_and",
+ VGPR_32, i32, atomic_and_global>;
+
+defm GLOBAL_ATOMIC_OR : FLAT_Global_Atomic_Pseudo <"global_atomic_or",
+ VGPR_32, i32, atomic_or_global>;
+
+defm GLOBAL_ATOMIC_XOR : FLAT_Global_Atomic_Pseudo <"global_atomic_xor",
+ VGPR_32, i32, atomic_xor_global>;
+
+defm GLOBAL_ATOMIC_INC : FLAT_Global_Atomic_Pseudo <"global_atomic_inc",
+ VGPR_32, i32, atomic_inc_global>;
+
+defm GLOBAL_ATOMIC_DEC : FLAT_Global_Atomic_Pseudo <"global_atomic_dec",
+ VGPR_32, i32, atomic_dec_global>;
+
+defm GLOBAL_ATOMIC_ADD_X2 : FLAT_Global_Atomic_Pseudo <"global_atomic_add_x2",
+ VReg_64, i64, atomic_add_global>;
+
+defm GLOBAL_ATOMIC_SUB_X2 : FLAT_Global_Atomic_Pseudo <"global_atomic_sub_x2",
+ VReg_64, i64, atomic_sub_global>;
+
+defm GLOBAL_ATOMIC_SMIN_X2 : FLAT_Global_Atomic_Pseudo <"global_atomic_smin_x2",
+ VReg_64, i64, atomic_min_global>;
+
+defm GLOBAL_ATOMIC_UMIN_X2 : FLAT_Global_Atomic_Pseudo <"global_atomic_umin_x2",
+ VReg_64, i64, atomic_umin_global>;
+
+defm GLOBAL_ATOMIC_SMAX_X2 : FLAT_Global_Atomic_Pseudo <"global_atomic_smax_x2",
+ VReg_64, i64, atomic_max_global>;
+
+defm GLOBAL_ATOMIC_UMAX_X2 : FLAT_Global_Atomic_Pseudo <"global_atomic_umax_x2",
+ VReg_64, i64, atomic_umax_global>;
+
+defm GLOBAL_ATOMIC_AND_X2 : FLAT_Global_Atomic_Pseudo <"global_atomic_and_x2",
+ VReg_64, i64, atomic_and_global>;
+
+defm GLOBAL_ATOMIC_OR_X2 : FLAT_Global_Atomic_Pseudo <"global_atomic_or_x2",
+ VReg_64, i64, atomic_or_global>;
+
+defm GLOBAL_ATOMIC_XOR_X2 : FLAT_Global_Atomic_Pseudo <"global_atomic_xor_x2",
+ VReg_64, i64, atomic_xor_global>;
+
+defm GLOBAL_ATOMIC_INC_X2 : FLAT_Global_Atomic_Pseudo <"global_atomic_inc_x2",
+ VReg_64, i64, atomic_inc_global>;
+
+defm GLOBAL_ATOMIC_DEC_X2 : FLAT_Global_Atomic_Pseudo <"global_atomic_dec_x2",
+ VReg_64, i64, atomic_dec_global>;
+} // End is_flat_global = 1
} // End SubtargetPredicate = HasFlatGlobalInsts
+let SubtargetPredicate = HasFlatScratchInsts in {
+defm SCRATCH_LOAD_UBYTE : FLAT_Scratch_Load_Pseudo <"scratch_load_ubyte", VGPR_32>;
+defm SCRATCH_LOAD_SBYTE : FLAT_Scratch_Load_Pseudo <"scratch_load_sbyte", VGPR_32>;
+defm SCRATCH_LOAD_USHORT : FLAT_Scratch_Load_Pseudo <"scratch_load_ushort", VGPR_32>;
+defm SCRATCH_LOAD_SSHORT : FLAT_Scratch_Load_Pseudo <"scratch_load_sshort", VGPR_32>;
+defm SCRATCH_LOAD_DWORD : FLAT_Scratch_Load_Pseudo <"scratch_load_dword", VGPR_32>;
+defm SCRATCH_LOAD_DWORDX2 : FLAT_Scratch_Load_Pseudo <"scratch_load_dwordx2", VReg_64>;
+defm SCRATCH_LOAD_DWORDX3 : FLAT_Scratch_Load_Pseudo <"scratch_load_dwordx3", VReg_96>;
+defm SCRATCH_LOAD_DWORDX4 : FLAT_Scratch_Load_Pseudo <"scratch_load_dwordx4", VReg_128>;
+
+defm SCRATCH_LOAD_UBYTE_D16 : FLAT_Scratch_Load_Pseudo <"scratch_load_ubyte_d16", VGPR_32>;
+defm SCRATCH_LOAD_UBYTE_D16_HI : FLAT_Scratch_Load_Pseudo <"scratch_load_ubyte_d16_hi", VGPR_32>;
+defm SCRATCH_LOAD_SBYTE_D16 : FLAT_Scratch_Load_Pseudo <"scratch_load_sbyte_d16", VGPR_32>;
+defm SCRATCH_LOAD_SBYTE_D16_HI : FLAT_Scratch_Load_Pseudo <"scratch_load_sbyte_d16_hi", VGPR_32>;
+defm SCRATCH_LOAD_SHORT_D16 : FLAT_Scratch_Load_Pseudo <"scratch_load_short_d16", VGPR_32>;
+defm SCRATCH_LOAD_SHORT_D16_HI : FLAT_Scratch_Load_Pseudo <"scratch_load_short_d16_hi", VGPR_32>;
+
+defm SCRATCH_STORE_BYTE : FLAT_Scratch_Store_Pseudo <"scratch_store_byte", VGPR_32>;
+defm SCRATCH_STORE_SHORT : FLAT_Scratch_Store_Pseudo <"scratch_store_short", VGPR_32>;
+defm SCRATCH_STORE_DWORD : FLAT_Scratch_Store_Pseudo <"scratch_store_dword", VGPR_32>;
+defm SCRATCH_STORE_DWORDX2 : FLAT_Scratch_Store_Pseudo <"scratch_store_dwordx2", VReg_64>;
+defm SCRATCH_STORE_DWORDX3 : FLAT_Scratch_Store_Pseudo <"scratch_store_dwordx3", VReg_96>;
+defm SCRATCH_STORE_DWORDX4 : FLAT_Scratch_Store_Pseudo <"scratch_store_dwordx4", VReg_128>;
+
+defm SCRATCH_STORE_BYTE_D16_HI : FLAT_Scratch_Store_Pseudo <"scratch_store_byte_d16_hi", VGPR_32>;
+defm SCRATCH_STORE_SHORT_D16_HI : FLAT_Scratch_Store_Pseudo <"scratch_store_short_d16_hi", VGPR_32>;
+
+} // End SubtargetPredicate = HasFlatScratchInsts
+
//===----------------------------------------------------------------------===//
// Flat Patterns
//===----------------------------------------------------------------------===//
-class flat_ld <SDPatternOperator ld> : PatFrag<(ops node:$ptr),
- (ld node:$ptr), [{
- auto const AS = cast<MemSDNode>(N)->getAddressSpace();
- return AS == AMDGPUASI.FLAT_ADDRESS ||
- AS == AMDGPUASI.GLOBAL_ADDRESS ||
- AS == AMDGPUASI.CONSTANT_ADDRESS;
-}]>;
-
-class flat_st <SDPatternOperator st> : PatFrag<(ops node:$val, node:$ptr),
- (st node:$val, node:$ptr), [{
- auto const AS = cast<MemSDNode>(N)->getAddressSpace();
- return AS == AMDGPUASI.FLAT_ADDRESS ||
- AS == AMDGPUASI.GLOBAL_ADDRESS;
-}]>;
-
-def atomic_flat_load : flat_ld <atomic_load>;
-def flat_load : flat_ld <load>;
-def flat_az_extloadi8 : flat_ld <az_extloadi8>;
-def flat_sextloadi8 : flat_ld <sextloadi8>;
-def flat_az_extloadi16 : flat_ld <az_extloadi16>;
-def flat_sextloadi16 : flat_ld <sextloadi16>;
-
-def atomic_flat_store : flat_st <atomic_store>;
-def flat_store : flat_st <store>;
-def flat_truncstorei8 : flat_st <truncstorei8>;
-def flat_truncstorei16 : flat_st <truncstorei16>;
-
// Patterns for global loads with no offset.
-class FlatLoadPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : Pat <
- (vt (node (FLATAtomic i64:$vaddr, i16:$offset, i1:$slc))),
+class FlatLoadPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
+ (vt (node (FLATOffset i64:$vaddr, i16:$offset, i1:$slc))),
(inst $vaddr, $offset, 0, $slc)
>;
-class FlatLoadAtomicPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : Pat <
+multiclass FlatLoadPat_Hi16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt = i16> {
+ def : GCNPat <
+ (build_vector vt:$elt0, (vt (node (FLATOffset i64:$vaddr, i16:$offset, i1:$slc)))),
+ (v2i16 (inst $vaddr, $offset, 0, $slc, $elt0))
+ >;
+
+ def : GCNPat <
+ (build_vector f16:$elt0, (f16 (bitconvert (vt (node (FLATOffset i64:$vaddr, i16:$offset, i1:$slc)))))),
+ (v2f16 (inst $vaddr, $offset, 0, $slc, $elt0))
+ >;
+}
+
+multiclass FlatSignedLoadPat_Hi16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt = i16> {
+ def : GCNPat <
+ (build_vector vt:$elt0, (vt (node (FLATOffsetSigned i64:$vaddr, i16:$offset, i1:$slc)))),
+ (v2i16 (inst $vaddr, $offset, 0, $slc, $elt0))
+ >;
+
+ def : GCNPat <
+ (build_vector f16:$elt0, (f16 (bitconvert (vt (node (FLATOffsetSigned i64:$vaddr, i16:$offset, i1:$slc)))))),
+ (v2f16 (inst $vaddr, $offset, 0, $slc, $elt0))
+ >;
+}
+
+multiclass FlatLoadPat_Lo16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt = i16> {
+ def : GCNPat <
+ (build_vector (vt (node (FLATOffset i64:$vaddr, i16:$offset, i1:$slc))), (vt (Hi16Elt vt:$hi))),
+ (v2i16 (inst $vaddr, $offset, 0, $slc, $hi))
+ >;
+
+ def : GCNPat <
+ (build_vector (f16 (bitconvert (vt (node (FLATOffset i64:$vaddr, i16:$offset, i1:$slc))))), (f16 (Hi16Elt f16:$hi))),
+ (v2f16 (inst $vaddr, $offset, 0, $slc, $hi))
+ >;
+}
+
+multiclass FlatSignedLoadPat_Lo16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt = i16> {
+ def : GCNPat <
+ (build_vector (vt (node (FLATOffsetSigned i64:$vaddr, i16:$offset, i1:$slc))), (vt (Hi16Elt vt:$hi))),
+ (v2i16 (inst $vaddr, $offset, 0, $slc, $hi))
+ >;
+
+ def : GCNPat <
+ (build_vector (f16 (bitconvert (vt (node (FLATOffsetSigned i64:$vaddr, i16:$offset, i1:$slc))))), (f16 (Hi16Elt f16:$hi))),
+ (v2f16 (inst $vaddr, $offset, 0, $slc, $hi))
+ >;
+}
+
+class FlatLoadAtomicPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
(vt (node (FLATAtomic i64:$vaddr, i16:$offset, i1:$slc))),
- (inst $vaddr, $offset, 1, $slc)
+ (inst $vaddr, $offset, 0, $slc)
>;
-class FlatStorePat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : Pat <
- (node vt:$data, (FLATAtomic i64:$vaddr, i16:$offset, i1:$slc)),
+class FlatLoadSignedPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
+ (vt (node (FLATOffsetSigned i64:$vaddr, i16:$offset, i1:$slc))),
+ (inst $vaddr, $offset, 0, $slc)
+>;
+
+class FlatStorePat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
+ (node vt:$data, (FLATOffset i64:$vaddr, i16:$offset, i1:$slc)),
(inst $vaddr, $data, $offset, 0, $slc)
>;
-class FlatStoreAtomicPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : Pat <
+class FlatStoreSignedPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
+ (node vt:$data, (FLATOffsetSigned i64:$vaddr, i16:$offset, i1:$slc)),
+ (inst $vaddr, $data, $offset, 0, $slc)
+>;
+
+class FlatStoreAtomicPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
// atomic store follows atomic binop convention so the address comes
// first.
(node (FLATAtomic i64:$vaddr, i16:$offset, i1:$slc), vt:$data),
- (inst $vaddr, $data, $offset, 1, $slc)
+ (inst $vaddr, $data, $offset, 0, $slc)
+>;
+
+class FlatStoreSignedAtomicPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
+ // atomic store follows atomic binop convention so the address comes
+ // first.
+ (node (FLATSignedAtomic i64:$vaddr, i16:$offset, i1:$slc), vt:$data),
+ (inst $vaddr, $data, $offset, 0, $slc)
>;
class FlatAtomicPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt,
- ValueType data_vt = vt> : Pat <
+ ValueType data_vt = vt> : GCNPat <
(vt (node (FLATAtomic i64:$vaddr, i16:$offset, i1:$slc), data_vt:$data)),
(inst $vaddr, $data, $offset, $slc)
>;
-let Predicates = [isCIVI] in {
+class FlatSignedAtomicPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt,
+ ValueType data_vt = vt> : GCNPat <
+ (vt (node (FLATSignedAtomic i64:$vaddr, i16:$offset, i1:$slc), data_vt:$data)),
+ (inst $vaddr, $data, $offset, $slc)
+>;
+
+let OtherPredicates = [HasFlatAddressSpace] in {
-def : FlatLoadPat <FLAT_LOAD_UBYTE, flat_az_extloadi8, i32>;
-def : FlatLoadPat <FLAT_LOAD_SBYTE, flat_sextloadi8, i32>;
-def : FlatLoadPat <FLAT_LOAD_UBYTE, flat_az_extloadi8, i16>;
-def : FlatLoadPat <FLAT_LOAD_SBYTE, flat_sextloadi8, i16>;
-def : FlatLoadPat <FLAT_LOAD_USHORT, flat_az_extloadi16, i32>;
-def : FlatLoadPat <FLAT_LOAD_SSHORT, flat_sextloadi16, i32>;
-def : FlatLoadPat <FLAT_LOAD_DWORD, flat_load, i32>;
-def : FlatLoadPat <FLAT_LOAD_DWORDX2, flat_load, v2i32>;
-def : FlatLoadPat <FLAT_LOAD_DWORDX4, flat_load, v4i32>;
+def : FlatLoadPat <FLAT_LOAD_UBYTE, az_extloadi8_flat, i32>;
+def : FlatLoadPat <FLAT_LOAD_SBYTE, sextloadi8_flat, i32>;
+def : FlatLoadPat <FLAT_LOAD_UBYTE, az_extloadi8_flat, i16>;
+def : FlatLoadPat <FLAT_LOAD_SBYTE, sextloadi8_flat, i16>;
+def : FlatLoadPat <FLAT_LOAD_USHORT, az_extloadi16_flat, i32>;
+def : FlatLoadPat <FLAT_LOAD_USHORT, load_flat, i16>;
+def : FlatLoadPat <FLAT_LOAD_SSHORT, sextloadi16_flat, i32>;
+def : FlatLoadPat <FLAT_LOAD_DWORD, load_flat, i32>;
+def : FlatLoadPat <FLAT_LOAD_DWORDX2, load_flat, v2i32>;
+def : FlatLoadPat <FLAT_LOAD_DWORDX4, load_flat, v4i32>;
-def : FlatLoadAtomicPat <FLAT_LOAD_DWORD, atomic_flat_load, i32>;
-def : FlatLoadAtomicPat <FLAT_LOAD_DWORDX2, atomic_flat_load, i64>;
+def : FlatLoadAtomicPat <FLAT_LOAD_DWORD, atomic_load_flat, i32>;
+def : FlatLoadAtomicPat <FLAT_LOAD_DWORDX2, atomic_load_flat, i64>;
-def : FlatStorePat <FLAT_STORE_BYTE, flat_truncstorei8, i32>;
-def : FlatStorePat <FLAT_STORE_SHORT, flat_truncstorei16, i32>;
-def : FlatStorePat <FLAT_STORE_DWORD, flat_store, i32>;
-def : FlatStorePat <FLAT_STORE_DWORDX2, flat_store, v2i32>;
-def : FlatStorePat <FLAT_STORE_DWORDX4, flat_store, v4i32>;
+def : FlatStorePat <FLAT_STORE_BYTE, truncstorei8_flat, i32>;
+def : FlatStorePat <FLAT_STORE_SHORT, truncstorei16_flat, i32>;
+def : FlatStorePat <FLAT_STORE_DWORD, store_flat, i32>;
+def : FlatStorePat <FLAT_STORE_DWORDX2, store_flat, v2i32>;
+def : FlatStorePat <FLAT_STORE_DWORDX4, store_flat, v4i32>;
-def : FlatStoreAtomicPat <FLAT_STORE_DWORD, atomic_flat_store, i32>;
-def : FlatStoreAtomicPat <FLAT_STORE_DWORDX2, atomic_flat_store, i64>;
+def : FlatStoreAtomicPat <FLAT_STORE_DWORD, atomic_store_flat, i32>;
+def : FlatStoreAtomicPat <FLAT_STORE_DWORDX2, atomic_store_flat, i64>;
def : FlatAtomicPat <FLAT_ATOMIC_ADD_RTN, atomic_add_global, i32>;
def : FlatAtomicPat <FLAT_ATOMIC_SUB_RTN, atomic_sub_global, i32>;
@@ -465,13 +777,100 @@ def : FlatAtomicPat <FLAT_ATOMIC_SWAP_X2_RTN, atomic_swap_global, i64>;
def : FlatAtomicPat <FLAT_ATOMIC_CMPSWAP_X2_RTN, AMDGPUatomic_cmp_swap_global, i64, v2i64>;
def : FlatAtomicPat <FLAT_ATOMIC_XOR_X2_RTN, atomic_xor_global, i64>;
-} // End Predicates = [isCIVI]
+def : FlatStorePat <FLAT_STORE_BYTE, truncstorei8_flat, i16>;
+def : FlatStorePat <FLAT_STORE_SHORT, store_flat, i16>;
-let Predicates = [isVI] in {
- def : FlatStorePat <FLAT_STORE_BYTE, flat_truncstorei8, i16>;
- def : FlatStorePat <FLAT_STORE_SHORT, flat_store, i16>;
+let OtherPredicates = [HasD16LoadStore] in {
+def : FlatStorePat <FLAT_STORE_SHORT_D16_HI, truncstorei16_hi16_flat, i32>;
+def : FlatStorePat <FLAT_STORE_BYTE_D16_HI, truncstorei8_hi16_flat, i32>;
+
+let AddedComplexity = 3 in {
+defm : FlatLoadPat_Hi16 <FLAT_LOAD_UBYTE_D16_HI, az_extloadi8_flat>;
+defm : FlatLoadPat_Hi16 <FLAT_LOAD_SBYTE_D16_HI, sextloadi8_flat>;
+defm : FlatLoadPat_Hi16 <FLAT_LOAD_SHORT_D16_HI, load_flat>;
}
+let AddedComplexity = 9 in {
+defm : FlatLoadPat_Lo16 <FLAT_LOAD_UBYTE_D16, az_extloadi8_flat>;
+defm : FlatLoadPat_Lo16 <FLAT_LOAD_SBYTE_D16, sextloadi8_flat>;
+defm : FlatLoadPat_Lo16 <FLAT_LOAD_SHORT_D16, load_flat>;
+}
+}
+
+} // End OtherPredicates = [HasFlatAddressSpace]
+
+let OtherPredicates = [HasFlatGlobalInsts], AddedComplexity = 10 in {
+
+def : FlatLoadSignedPat <GLOBAL_LOAD_UBYTE, az_extloadi8_global, i32>;
+def : FlatLoadSignedPat <GLOBAL_LOAD_SBYTE, sextloadi8_global, i32>;
+def : FlatLoadSignedPat <GLOBAL_LOAD_UBYTE, az_extloadi8_global, i16>;
+def : FlatLoadSignedPat <GLOBAL_LOAD_SBYTE, sextloadi8_global, i16>;
+def : FlatLoadSignedPat <GLOBAL_LOAD_USHORT, az_extloadi16_global, i32>;
+def : FlatLoadSignedPat <GLOBAL_LOAD_SSHORT, sextloadi16_global, i32>;
+def : FlatLoadSignedPat <GLOBAL_LOAD_USHORT, load_global, i16>;
+
+def : FlatLoadSignedPat <GLOBAL_LOAD_DWORD, load_global, i32>;
+def : FlatLoadSignedPat <GLOBAL_LOAD_DWORDX2, load_global, v2i32>;
+def : FlatLoadSignedPat <GLOBAL_LOAD_DWORDX4, load_global, v4i32>;
+
+def : FlatLoadAtomicPat <GLOBAL_LOAD_DWORD, atomic_load_global, i32>;
+def : FlatLoadAtomicPat <GLOBAL_LOAD_DWORDX2, atomic_load_global, i64>;
+
+def : FlatStoreSignedPat <GLOBAL_STORE_BYTE, truncstorei8_global, i32>;
+def : FlatStoreSignedPat <GLOBAL_STORE_BYTE, truncstorei8_global, i16>;
+def : FlatStoreSignedPat <GLOBAL_STORE_SHORT, truncstorei16_global, i32>;
+def : FlatStoreSignedPat <GLOBAL_STORE_SHORT, store_global, i16>;
+def : FlatStoreSignedPat <GLOBAL_STORE_DWORD, store_global, i32>;
+def : FlatStoreSignedPat <GLOBAL_STORE_DWORDX2, store_global, v2i32>;
+def : FlatStoreSignedPat <GLOBAL_STORE_DWORDX4, store_global, v4i32>;
+
+let OtherPredicates = [HasD16LoadStore] in {
+def : FlatStoreSignedPat <GLOBAL_STORE_SHORT_D16_HI, truncstorei16_hi16_global, i32>;
+def : FlatStoreSignedPat <GLOBAL_STORE_BYTE_D16_HI, truncstorei8_hi16_global, i32>;
+
+defm : FlatSignedLoadPat_Hi16 <GLOBAL_LOAD_UBYTE_D16_HI, az_extloadi8_global>;
+defm : FlatSignedLoadPat_Hi16 <GLOBAL_LOAD_SBYTE_D16_HI, sextloadi8_global>;
+defm : FlatSignedLoadPat_Hi16 <GLOBAL_LOAD_SHORT_D16_HI, load_global>;
+
+defm : FlatSignedLoadPat_Lo16 <GLOBAL_LOAD_UBYTE_D16, az_extloadi8_global>;
+defm : FlatSignedLoadPat_Lo16 <GLOBAL_LOAD_SBYTE_D16, sextloadi8_global>;
+defm : FlatSignedLoadPat_Lo16 <GLOBAL_LOAD_SHORT_D16, load_global>;
+
+}
+
+def : FlatStoreSignedAtomicPat <GLOBAL_STORE_DWORD, store_atomic_global, i32>;
+def : FlatStoreSignedAtomicPat <GLOBAL_STORE_DWORDX2, store_atomic_global, i64>;
+
+def : FlatSignedAtomicPat <GLOBAL_ATOMIC_ADD_RTN, atomic_add_global, i32>;
+def : FlatSignedAtomicPat <GLOBAL_ATOMIC_SUB_RTN, atomic_sub_global, i32>;
+def : FlatSignedAtomicPat <GLOBAL_ATOMIC_INC_RTN, atomic_inc_global, i32>;
+def : FlatSignedAtomicPat <GLOBAL_ATOMIC_DEC_RTN, atomic_dec_global, i32>;
+def : FlatSignedAtomicPat <GLOBAL_ATOMIC_AND_RTN, atomic_and_global, i32>;
+def : FlatSignedAtomicPat <GLOBAL_ATOMIC_SMAX_RTN, atomic_max_global, i32>;
+def : FlatSignedAtomicPat <GLOBAL_ATOMIC_UMAX_RTN, atomic_umax_global, i32>;
+def : FlatSignedAtomicPat <GLOBAL_ATOMIC_SMIN_RTN, atomic_min_global, i32>;
+def : FlatSignedAtomicPat <GLOBAL_ATOMIC_UMIN_RTN, atomic_umin_global, i32>;
+def : FlatSignedAtomicPat <GLOBAL_ATOMIC_OR_RTN, atomic_or_global, i32>;
+def : FlatSignedAtomicPat <GLOBAL_ATOMIC_SWAP_RTN, atomic_swap_global, i32>;
+def : FlatSignedAtomicPat <GLOBAL_ATOMIC_CMPSWAP_RTN, AMDGPUatomic_cmp_swap_global, i32, v2i32>;
+def : FlatSignedAtomicPat <GLOBAL_ATOMIC_XOR_RTN, atomic_xor_global, i32>;
+
+def : FlatSignedAtomicPat <GLOBAL_ATOMIC_ADD_X2_RTN, atomic_add_global, i64>;
+def : FlatSignedAtomicPat <GLOBAL_ATOMIC_SUB_X2_RTN, atomic_sub_global, i64>;
+def : FlatSignedAtomicPat <GLOBAL_ATOMIC_INC_X2_RTN, atomic_inc_global, i64>;
+def : FlatSignedAtomicPat <GLOBAL_ATOMIC_DEC_X2_RTN, atomic_dec_global, i64>;
+def : FlatSignedAtomicPat <GLOBAL_ATOMIC_AND_X2_RTN, atomic_and_global, i64>;
+def : FlatSignedAtomicPat <GLOBAL_ATOMIC_SMAX_X2_RTN, atomic_max_global, i64>;
+def : FlatSignedAtomicPat <GLOBAL_ATOMIC_UMAX_X2_RTN, atomic_umax_global, i64>;
+def : FlatSignedAtomicPat <GLOBAL_ATOMIC_SMIN_X2_RTN, atomic_min_global, i64>;
+def : FlatSignedAtomicPat <GLOBAL_ATOMIC_UMIN_X2_RTN, atomic_umin_global, i64>;
+def : FlatSignedAtomicPat <GLOBAL_ATOMIC_OR_X2_RTN, atomic_or_global, i64>;
+def : FlatSignedAtomicPat <GLOBAL_ATOMIC_SWAP_X2_RTN, atomic_swap_global, i64>;
+def : FlatSignedAtomicPat <GLOBAL_ATOMIC_CMPSWAP_X2_RTN, AMDGPUatomic_cmp_swap_global, i64, v2i64>;
+def : FlatSignedAtomicPat <GLOBAL_ATOMIC_XOR_X2_RTN, atomic_xor_global, i64>;
+
+} // End OtherPredicates = [HasFlatGlobalInsts]
+
//===----------------------------------------------------------------------===//
// Target
@@ -556,6 +955,11 @@ class FLAT_Real_vi <bits<7> op, FLAT_Pseudo ps> :
let DecoderNamespace="VI";
}
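+// Emits the VI real encodings for both addressing forms of a pseudo: the
+// VGPR-addressed base instruction and its _SADDR variant share one opcode.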
+multiclass FLAT_Real_AllAddr_vi<bits<7> op> {
+ def _vi : FLAT_Real_vi<op, !cast<FLAT_Pseudo>(NAME)>;
+ def _SADDR_vi : FLAT_Real_vi<op, !cast<FLAT_Pseudo>(NAME#"_SADDR")>;
+}
+
def FLAT_LOAD_UBYTE_vi : FLAT_Real_vi <0x10, FLAT_LOAD_UBYTE>;
def FLAT_LOAD_SBYTE_vi : FLAT_Real_vi <0x11, FLAT_LOAD_SBYTE>;
def FLAT_LOAD_USHORT_vi : FLAT_Real_vi <0x12, FLAT_LOAD_USHORT>;
@@ -566,17 +970,33 @@ def FLAT_LOAD_DWORDX4_vi : FLAT_Real_vi <0x17, FLAT_LOAD_DWORDX4>;
def FLAT_LOAD_DWORDX3_vi : FLAT_Real_vi <0x16, FLAT_LOAD_DWORDX3>;
def FLAT_STORE_BYTE_vi : FLAT_Real_vi <0x18, FLAT_STORE_BYTE>;
+def FLAT_STORE_BYTE_D16_HI_vi : FLAT_Real_vi <0x19, FLAT_STORE_BYTE_D16_HI>;
def FLAT_STORE_SHORT_vi : FLAT_Real_vi <0x1a, FLAT_STORE_SHORT>;
+def FLAT_STORE_SHORT_D16_HI_vi : FLAT_Real_vi <0x1b, FLAT_STORE_SHORT_D16_HI>;
def FLAT_STORE_DWORD_vi : FLAT_Real_vi <0x1c, FLAT_STORE_DWORD>;
def FLAT_STORE_DWORDX2_vi : FLAT_Real_vi <0x1d, FLAT_STORE_DWORDX2>;
def FLAT_STORE_DWORDX4_vi : FLAT_Real_vi <0x1f, FLAT_STORE_DWORDX4>;
def FLAT_STORE_DWORDX3_vi : FLAT_Real_vi <0x1e, FLAT_STORE_DWORDX3>;
+def FLAT_LOAD_UBYTE_D16_vi : FLAT_Real_vi <0x20, FLAT_LOAD_UBYTE_D16>;
+def FLAT_LOAD_UBYTE_D16_HI_vi : FLAT_Real_vi <0x21, FLAT_LOAD_UBYTE_D16_HI>;
+def FLAT_LOAD_SBYTE_D16_vi : FLAT_Real_vi <0x22, FLAT_LOAD_SBYTE_D16>;
+def FLAT_LOAD_SBYTE_D16_HI_vi : FLAT_Real_vi <0x23, FLAT_LOAD_SBYTE_D16_HI>;
+def FLAT_LOAD_SHORT_D16_vi : FLAT_Real_vi <0x24, FLAT_LOAD_SHORT_D16>;
+def FLAT_LOAD_SHORT_D16_HI_vi : FLAT_Real_vi <0x25, FLAT_LOAD_SHORT_D16_HI>;
+
multiclass FLAT_Real_Atomics_vi <bits<7> op, FLAT_Pseudo ps> {
def _vi : FLAT_Real_vi<op, !cast<FLAT_Pseudo>(ps.PseudoInstr)>;
def _RTN_vi : FLAT_Real_vi<op, !cast<FLAT_Pseudo>(ps.PseudoInstr # "_RTN")>;
}
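+// As FLAT_Real_AllAddr_vi, but also emits the returning _RTN and _SADDR_RTN
+// forms of a global atomic, all sharing the same opcode.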
+multiclass FLAT_Global_Real_Atomics_vi<bits<7> op> :
+ FLAT_Real_AllAddr_vi<op> {
+ def _RTN_vi : FLAT_Real_vi <op, !cast<FLAT_Pseudo>(NAME#"_RTN")>;
+ def _SADDR_RTN_vi : FLAT_Real_vi <op, !cast<FLAT_Pseudo>(NAME#"_SADDR_RTN")>;
+}
+
+
defm FLAT_ATOMIC_SWAP : FLAT_Real_Atomics_vi <0x40, FLAT_ATOMIC_SWAP>;
defm FLAT_ATOMIC_CMPSWAP : FLAT_Real_Atomics_vi <0x41, FLAT_ATOMIC_CMPSWAP>;
defm FLAT_ATOMIC_ADD : FLAT_Real_Atomics_vi <0x42, FLAT_ATOMIC_ADD>;
@@ -604,18 +1024,78 @@ defm FLAT_ATOMIC_XOR_X2 : FLAT_Real_Atomics_vi <0x6a, FLAT_ATOMIC_XOR_X2>;
defm FLAT_ATOMIC_INC_X2 : FLAT_Real_Atomics_vi <0x6b, FLAT_ATOMIC_INC_X2>;
defm FLAT_ATOMIC_DEC_X2 : FLAT_Real_Atomics_vi <0x6c, FLAT_ATOMIC_DEC_X2>;
-def GLOBAL_LOAD_UBYTE_vi : FLAT_Real_vi <0x10, GLOBAL_LOAD_UBYTE>;
-def GLOBAL_LOAD_SBYTE_vi : FLAT_Real_vi <0x11, GLOBAL_LOAD_SBYTE>;
-def GLOBAL_LOAD_USHORT_vi : FLAT_Real_vi <0x12, GLOBAL_LOAD_USHORT>;
-def GLOBAL_LOAD_SSHORT_vi : FLAT_Real_vi <0x13, GLOBAL_LOAD_SSHORT>;
-def GLOBAL_LOAD_DWORD_vi : FLAT_Real_vi <0x14, GLOBAL_LOAD_DWORD>;
-def GLOBAL_LOAD_DWORDX2_vi : FLAT_Real_vi <0x15, GLOBAL_LOAD_DWORDX2>;
-def GLOBAL_LOAD_DWORDX4_vi : FLAT_Real_vi <0x17, GLOBAL_LOAD_DWORDX4>;
-def GLOBAL_LOAD_DWORDX3_vi : FLAT_Real_vi <0x16, GLOBAL_LOAD_DWORDX3>;
-
-def GLOBAL_STORE_BYTE_vi : FLAT_Real_vi <0x18, GLOBAL_STORE_BYTE>;
-def GLOBAL_STORE_SHORT_vi : FLAT_Real_vi <0x1a, GLOBAL_STORE_SHORT>;
-def GLOBAL_STORE_DWORD_vi : FLAT_Real_vi <0x1c, GLOBAL_STORE_DWORD>;
-def GLOBAL_STORE_DWORDX2_vi : FLAT_Real_vi <0x1d, GLOBAL_STORE_DWORDX2>;
-def GLOBAL_STORE_DWORDX4_vi : FLAT_Real_vi <0x1f, GLOBAL_STORE_DWORDX4>;
-def GLOBAL_STORE_DWORDX3_vi : FLAT_Real_vi <0x1e, GLOBAL_STORE_DWORDX3>;
+defm GLOBAL_LOAD_UBYTE : FLAT_Real_AllAddr_vi <0x10>;
+defm GLOBAL_LOAD_SBYTE : FLAT_Real_AllAddr_vi <0x11>;
+defm GLOBAL_LOAD_USHORT : FLAT_Real_AllAddr_vi <0x12>;
+defm GLOBAL_LOAD_SSHORT : FLAT_Real_AllAddr_vi <0x13>;
+defm GLOBAL_LOAD_DWORD : FLAT_Real_AllAddr_vi <0x14>;
+defm GLOBAL_LOAD_DWORDX2 : FLAT_Real_AllAddr_vi <0x15>;
+defm GLOBAL_LOAD_DWORDX3 : FLAT_Real_AllAddr_vi <0x16>;
+defm GLOBAL_LOAD_DWORDX4 : FLAT_Real_AllAddr_vi <0x17>;
+
+defm GLOBAL_LOAD_UBYTE_D16 : FLAT_Real_AllAddr_vi <0x20>;
+defm GLOBAL_LOAD_UBYTE_D16_HI : FLAT_Real_AllAddr_vi <0x21>;
+defm GLOBAL_LOAD_SBYTE_D16 : FLAT_Real_AllAddr_vi <0x22>;
+defm GLOBAL_LOAD_SBYTE_D16_HI : FLAT_Real_AllAddr_vi <0x23>;
+defm GLOBAL_LOAD_SHORT_D16 : FLAT_Real_AllAddr_vi <0x24>;
+defm GLOBAL_LOAD_SHORT_D16_HI : FLAT_Real_AllAddr_vi <0x25>;
+
+defm GLOBAL_STORE_BYTE : FLAT_Real_AllAddr_vi <0x18>;
+defm GLOBAL_STORE_BYTE_D16_HI : FLAT_Real_AllAddr_vi <0x19>;
+defm GLOBAL_STORE_SHORT : FLAT_Real_AllAddr_vi <0x1a>;
+defm GLOBAL_STORE_SHORT_D16_HI : FLAT_Real_AllAddr_vi <0x1b>;
+defm GLOBAL_STORE_DWORD : FLAT_Real_AllAddr_vi <0x1c>;
+defm GLOBAL_STORE_DWORDX2 : FLAT_Real_AllAddr_vi <0x1d>;
+defm GLOBAL_STORE_DWORDX3 : FLAT_Real_AllAddr_vi <0x1e>;
+defm GLOBAL_STORE_DWORDX4 : FLAT_Real_AllAddr_vi <0x1f>;
+
+
+defm GLOBAL_ATOMIC_SWAP : FLAT_Global_Real_Atomics_vi <0x40>;
+defm GLOBAL_ATOMIC_CMPSWAP : FLAT_Global_Real_Atomics_vi <0x41>;
+defm GLOBAL_ATOMIC_ADD : FLAT_Global_Real_Atomics_vi <0x42>;
+defm GLOBAL_ATOMIC_SUB : FLAT_Global_Real_Atomics_vi <0x43>;
+defm GLOBAL_ATOMIC_SMIN : FLAT_Global_Real_Atomics_vi <0x44>;
+defm GLOBAL_ATOMIC_UMIN : FLAT_Global_Real_Atomics_vi <0x45>;
+defm GLOBAL_ATOMIC_SMAX : FLAT_Global_Real_Atomics_vi <0x46>;
+defm GLOBAL_ATOMIC_UMAX : FLAT_Global_Real_Atomics_vi <0x47>;
+defm GLOBAL_ATOMIC_AND : FLAT_Global_Real_Atomics_vi <0x48>;
+defm GLOBAL_ATOMIC_OR : FLAT_Global_Real_Atomics_vi <0x49>;
+defm GLOBAL_ATOMIC_XOR : FLAT_Global_Real_Atomics_vi <0x4a>;
+defm GLOBAL_ATOMIC_INC : FLAT_Global_Real_Atomics_vi <0x4b>;
+defm GLOBAL_ATOMIC_DEC : FLAT_Global_Real_Atomics_vi <0x4c>;
+defm GLOBAL_ATOMIC_SWAP_X2 : FLAT_Global_Real_Atomics_vi <0x60>;
+defm GLOBAL_ATOMIC_CMPSWAP_X2 : FLAT_Global_Real_Atomics_vi <0x61>;
+defm GLOBAL_ATOMIC_ADD_X2 : FLAT_Global_Real_Atomics_vi <0x62>;
+defm GLOBAL_ATOMIC_SUB_X2 : FLAT_Global_Real_Atomics_vi <0x63>;
+defm GLOBAL_ATOMIC_SMIN_X2 : FLAT_Global_Real_Atomics_vi <0x64>;
+defm GLOBAL_ATOMIC_UMIN_X2 : FLAT_Global_Real_Atomics_vi <0x65>;
+defm GLOBAL_ATOMIC_SMAX_X2 : FLAT_Global_Real_Atomics_vi <0x66>;
+defm GLOBAL_ATOMIC_UMAX_X2 : FLAT_Global_Real_Atomics_vi <0x67>;
+defm GLOBAL_ATOMIC_AND_X2 : FLAT_Global_Real_Atomics_vi <0x68>;
+defm GLOBAL_ATOMIC_OR_X2 : FLAT_Global_Real_Atomics_vi <0x69>;
+defm GLOBAL_ATOMIC_XOR_X2 : FLAT_Global_Real_Atomics_vi <0x6a>;
+defm GLOBAL_ATOMIC_INC_X2 : FLAT_Global_Real_Atomics_vi <0x6b>;
+defm GLOBAL_ATOMIC_DEC_X2 : FLAT_Global_Real_Atomics_vi <0x6c>;
+
+defm SCRATCH_LOAD_UBYTE : FLAT_Real_AllAddr_vi <0x10>;
+defm SCRATCH_LOAD_SBYTE : FLAT_Real_AllAddr_vi <0x11>;
+defm SCRATCH_LOAD_USHORT : FLAT_Real_AllAddr_vi <0x12>;
+defm SCRATCH_LOAD_SSHORT : FLAT_Real_AllAddr_vi <0x13>;
+defm SCRATCH_LOAD_DWORD : FLAT_Real_AllAddr_vi <0x14>;
+defm SCRATCH_LOAD_DWORDX2 : FLAT_Real_AllAddr_vi <0x15>;
+defm SCRATCH_LOAD_DWORDX3 : FLAT_Real_AllAddr_vi <0x16>;
+defm SCRATCH_LOAD_DWORDX4 : FLAT_Real_AllAddr_vi <0x17>;
+defm SCRATCH_STORE_BYTE : FLAT_Real_AllAddr_vi <0x18>;
+defm SCRATCH_STORE_BYTE_D16_HI : FLAT_Real_AllAddr_vi <0x19>;
+defm SCRATCH_LOAD_UBYTE_D16 : FLAT_Real_AllAddr_vi <0x20>;
+defm SCRATCH_LOAD_UBYTE_D16_HI : FLAT_Real_AllAddr_vi <0x21>;
+defm SCRATCH_LOAD_SBYTE_D16 : FLAT_Real_AllAddr_vi <0x22>;
+defm SCRATCH_LOAD_SBYTE_D16_HI : FLAT_Real_AllAddr_vi <0x23>;
+defm SCRATCH_LOAD_SHORT_D16 : FLAT_Real_AllAddr_vi <0x24>;
+defm SCRATCH_LOAD_SHORT_D16_HI : FLAT_Real_AllAddr_vi <0x25>;
+defm SCRATCH_STORE_SHORT : FLAT_Real_AllAddr_vi <0x1a>;
+defm SCRATCH_STORE_SHORT_D16_HI : FLAT_Real_AllAddr_vi <0x1b>;
+defm SCRATCH_STORE_DWORD : FLAT_Real_AllAddr_vi <0x1c>;
+defm SCRATCH_STORE_DWORDX2 : FLAT_Real_AllAddr_vi <0x1d>;
+defm SCRATCH_STORE_DWORDX3 : FLAT_Real_AllAddr_vi <0x1e>;
+defm SCRATCH_STORE_DWORDX4 : FLAT_Real_AllAddr_vi <0x1f>;
diff --git a/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
index 025397b1eac0..dd515b0bf2f1 100644
--- a/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
+++ b/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
@@ -40,7 +40,10 @@ GCNHazardRecognizer::GCNHazardRecognizer(const MachineFunction &MF) :
CurrCycleInstr(nullptr),
MF(MF),
ST(MF.getSubtarget<SISubtarget>()),
- TII(*ST.getInstrInfo()) {
+ TII(*ST.getInstrInfo()),
+ TRI(TII.getRegisterInfo()),
+ ClauseUses(TRI.getNumRegUnits()),
+ ClauseDefs(TRI.getNumRegUnits()) {
MaxLookAhead = 5;
}
@@ -84,6 +87,18 @@ static bool isSMovRel(unsigned Opcode) {
}
}
+static bool isSendMsgTraceDataOrGDS(const MachineInstr &MI) {
+ switch (MI.getOpcode()) {
+ case AMDGPU::S_SENDMSG:
+ case AMDGPU::S_SENDMSGHALT:
+ case AMDGPU::S_TTRACEDATA:
+ return true;
+ default:
+ // TODO: GDS
+ return false;
+ }
+}
+
static unsigned getHWReg(const SIInstrInfo *TII, const MachineInstr &RegInstr) {
const MachineOperand *RegOp = TII->getNamedOperand(RegInstr,
AMDGPU::OpName::simm16);
@@ -97,7 +112,10 @@ GCNHazardRecognizer::getHazardType(SUnit *SU, int Stalls) {
if (SIInstrInfo::isSMRD(*MI) && checkSMRDHazards(MI) > 0)
return NoopHazard;
- if (SIInstrInfo::isVMEM(*MI) && checkVMEMHazards(MI) > 0)
+ // FIXME: Should flat be considered vmem?
+ if ((SIInstrInfo::isVMEM(*MI) ||
+ SIInstrInfo::isFLAT(*MI))
+ && checkVMEMHazards(MI) > 0)
return NoopHazard;
if (SIInstrInfo::isVALU(*MI) && checkVALUHazards(MI) > 0)
@@ -121,10 +139,18 @@ GCNHazardRecognizer::getHazardType(SUnit *SU, int Stalls) {
if (isRFE(MI->getOpcode()) && checkRFEHazards(MI) > 0)
return NoopHazard;
- if ((TII.isVINTRP(*MI) || isSMovRel(MI->getOpcode())) &&
+ if (ST.hasReadM0MovRelInterpHazard() &&
+ (TII.isVINTRP(*MI) || isSMovRel(MI->getOpcode())) &&
checkReadM0Hazards(MI) > 0)
return NoopHazard;
+ if (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(*MI) &&
+ checkReadM0Hazards(MI) > 0)
+ return NoopHazard;
+
+ if (MI->isInlineAsm() && checkInlineAsmHazards(MI) > 0)
+ return NoopHazard;
+
if (checkAnyInstHazards(MI) > 0)
return NoopHazard;
@@ -141,26 +167,23 @@ unsigned GCNHazardRecognizer::PreEmitNoops(MachineInstr *MI) {
if (SIInstrInfo::isSMRD(*MI))
return std::max(WaitStates, checkSMRDHazards(MI));
- if (SIInstrInfo::isVALU(*MI)) {
- WaitStates = std::max(WaitStates, checkVALUHazards(MI));
-
- if (SIInstrInfo::isVMEM(*MI))
- WaitStates = std::max(WaitStates, checkVMEMHazards(MI));
+ if (SIInstrInfo::isVALU(*MI))
+ WaitStates = std::max(WaitStates, checkVALUHazards(MI));
- if (SIInstrInfo::isDPP(*MI))
- WaitStates = std::max(WaitStates, checkDPPHazards(MI));
+ if (SIInstrInfo::isVMEM(*MI) || SIInstrInfo::isFLAT(*MI))
+ WaitStates = std::max(WaitStates, checkVMEMHazards(MI));
- if (isDivFMas(MI->getOpcode()))
- WaitStates = std::max(WaitStates, checkDivFMasHazards(MI));
+ if (SIInstrInfo::isDPP(*MI))
+ WaitStates = std::max(WaitStates, checkDPPHazards(MI));
- if (isRWLane(MI->getOpcode()))
- WaitStates = std::max(WaitStates, checkRWLaneHazards(MI));
+ if (isDivFMas(MI->getOpcode()))
+ WaitStates = std::max(WaitStates, checkDivFMasHazards(MI));
- if (TII.isVINTRP(*MI))
- WaitStates = std::max(WaitStates, checkReadM0Hazards(MI));
+ if (isRWLane(MI->getOpcode()))
+ WaitStates = std::max(WaitStates, checkRWLaneHazards(MI));
- return WaitStates;
- }
+ if (MI->isInlineAsm())
+ return std::max(WaitStates, checkInlineAsmHazards(MI));
if (isSGetReg(MI->getOpcode()))
return std::max(WaitStates, checkGetRegHazards(MI));
@@ -171,7 +194,11 @@ unsigned GCNHazardRecognizer::PreEmitNoops(MachineInstr *MI) {
if (isRFE(MI->getOpcode()))
return std::max(WaitStates, checkRFEHazards(MI));
- if (TII.isVINTRP(*MI) || isSMovRel(MI->getOpcode()))
+ if (ST.hasReadM0MovRelInterpHazard() && (TII.isVINTRP(*MI) ||
+ isSMovRel(MI->getOpcode())))
+ return std::max(WaitStates, checkReadM0Hazards(MI));
+
+ if (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(*MI))
return std::max(WaitStates, checkReadM0Hazards(MI));
return WaitStates;
@@ -225,7 +252,8 @@ int GCNHazardRecognizer::getWaitStatesSince(
return WaitStates;
unsigned Opcode = MI->getOpcode();
- if (Opcode == AMDGPU::DBG_VALUE || Opcode == AMDGPU::IMPLICIT_DEF)
+ if (Opcode == AMDGPU::DBG_VALUE || Opcode == AMDGPU::IMPLICIT_DEF ||
+ Opcode == AMDGPU::INLINEASM)
continue;
}
++WaitStates;
@@ -257,19 +285,37 @@ int GCNHazardRecognizer::getWaitStatesSinceSetReg(
// No-op Hazard Detection
//===----------------------------------------------------------------------===//
-static void addRegsToSet(iterator_range<MachineInstr::const_mop_iterator> Ops,
- std::set<unsigned> &Set) {
+static void addRegUnits(const SIRegisterInfo &TRI,
+ BitVector &BV, unsigned Reg) {
+ for (MCRegUnitIterator RUI(Reg, &TRI); RUI.isValid(); ++RUI)
+ BV.set(*RUI);
+}
+
+static void addRegsToSet(const SIRegisterInfo &TRI,
+ iterator_range<MachineInstr::const_mop_iterator> Ops,
+ BitVector &Set) {
for (const MachineOperand &Op : Ops) {
if (Op.isReg())
- Set.insert(Op.getReg());
+ addRegUnits(TRI, Set, Op.getReg());
}
}
-int GCNHazardRecognizer::checkSMEMSoftClauseHazards(MachineInstr *SMEM) {
- // SMEM soft clause are only present on VI+
- if (ST.getGeneration() < SISubtarget::VOLCANIC_ISLANDS)
+void GCNHazardRecognizer::addClauseInst(const MachineInstr &MI) {
+ // XXX: Do we need to worry about implicit operands?
+ addRegsToSet(TRI, MI.defs(), ClauseDefs);
+ addRegsToSet(TRI, MI.uses(), ClauseUses);
+}
+
+int GCNHazardRecognizer::checkSoftClauseHazards(MachineInstr *MEM) {
+ // SMEM soft clauses are only present on VI+, and only matter if XNACK is
+ // enabled.
+ if (!ST.isXNACKEnabled())
return 0;
+ bool IsSMRD = TII.isSMRD(*MEM);
+
+ resetClause();
+
// A soft-clause is any group of consecutive SMEM instructions. The
// instructions in this group may return out of order and/or may be
// replayed (i.e. the same instruction issued more than once).
@@ -280,51 +326,39 @@ int GCNHazardRecognizer::checkSMEMSoftClauseHazards(MachineInstr *SMEM) {
// (including itself). If we encounter this situation, we need to break the
// clause by inserting a non-SMEM instruction.
- std::set<unsigned> ClauseDefs;
- std::set<unsigned> ClauseUses;
-
for (MachineInstr *MI : EmittedInstrs) {
-
// When we hit a non-SMEM instruction then we have passed the start of the
// clause and we can stop.
- if (!MI || !SIInstrInfo::isSMRD(*MI))
+ if (!MI)
break;
- addRegsToSet(MI->defs(), ClauseDefs);
- addRegsToSet(MI->uses(), ClauseUses);
+ if (IsSMRD != SIInstrInfo::isSMRD(*MI))
+ break;
+
+ addClauseInst(*MI);
}
- if (ClauseDefs.empty())
+ if (ClauseDefs.none())
return 0;
- // FIXME: When we support stores, we need to make sure not to put loads and
- // stores in the same clause if they use the same address. For now, just
- // start a new clause whenever we see a store.
- if (SMEM->mayStore())
+ // We need to make sure not to put loads and stores in the same clause if they
+ // use the same address. For now, just start a new clause whenever we see a
+ // store.
+ if (MEM->mayStore())
return 1;
- addRegsToSet(SMEM->defs(), ClauseDefs);
- addRegsToSet(SMEM->uses(), ClauseUses);
-
- std::vector<unsigned> Result(std::max(ClauseDefs.size(), ClauseUses.size()));
- std::vector<unsigned>::iterator End;
-
- End = std::set_intersection(ClauseDefs.begin(), ClauseDefs.end(),
- ClauseUses.begin(), ClauseUses.end(), Result.begin());
+ addClauseInst(*MEM);
// If the set of defs and uses intersect then we cannot add this instruction
// to the clause, so we have a hazard.
- if (End != Result.begin())
- return 1;
-
- return 0;
+ return ClauseDefs.anyCommon(ClauseUses) ? 1 : 0;
}
int GCNHazardRecognizer::checkSMRDHazards(MachineInstr *SMRD) {
const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
int WaitStatesNeeded = 0;
- WaitStatesNeeded = checkSMEMSoftClauseHazards(SMRD);
+ WaitStatesNeeded = checkSoftClauseHazards(SMRD);
// This SMRD hazard only affects SI.
if (ST.getGeneration() != SISubtarget::SOUTHERN_ISLANDS)
@@ -334,6 +368,9 @@ int GCNHazardRecognizer::checkSMRDHazards(MachineInstr *SMRD) {
// SGPR was written by a VALU instruction.
int SmrdSgprWaitStates = 4;
auto IsHazardDefFn = [this] (MachineInstr *MI) { return TII.isVALU(*MI); };
+ auto IsBufferHazardDefFn = [this] (MachineInstr *MI) { return TII.isSALU(*MI); };
+
+ bool IsBufferSMRD = TII.isBufferSMRD(*SMRD);
for (const MachineOperand &Use : SMRD->uses()) {
if (!Use.isReg())
@@ -341,23 +378,35 @@ int GCNHazardRecognizer::checkSMRDHazards(MachineInstr *SMRD) {
int WaitStatesNeededForUse =
SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn);
WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
+
+ // This fixes what appears to be undocumented hardware behavior in SI where
+ // an s_mov writing a descriptor and an s_buffer_load_dword reading that
+ // descriptor need some number of nops in between. We don't know how many we
+ // need, but let's use 4. This probably wasn't discovered before because the
+ // only case where this happens is when we expand a 64-bit pointer into a full
+ // descriptor and use s_buffer_load_dword instead of s_load_dword, which was
+ // probably never encountered in the closed-source land.
+ if (IsBufferSMRD) {
+ int WaitStatesNeededForUse =
+ SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(),
+ IsBufferHazardDefFn);
+ WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
+ }
}
+
return WaitStatesNeeded;
}
int GCNHazardRecognizer::checkVMEMHazards(MachineInstr* VMEM) {
- const SIInstrInfo *TII = ST.getInstrInfo();
-
if (ST.getGeneration() < SISubtarget::VOLCANIC_ISLANDS)
return 0;
- const SIRegisterInfo &TRI = TII->getRegisterInfo();
+ int WaitStatesNeeded = checkSoftClauseHazards(VMEM);
// A read of an SGPR by a VMEM instruction requires 5 wait states when the
// SGPR was written by a VALU Instruction.
- int VmemSgprWaitStates = 5;
- int WaitStatesNeeded = 0;
- auto IsHazardDefFn = [TII] (MachineInstr *MI) { return TII->isVALU(*MI); };
+ const int VmemSgprWaitStates = 5;
+ auto IsHazardDefFn = [this] (MachineInstr *MI) { return TII.isVALU(*MI); };
for (const MachineOperand &Use : VMEM->uses()) {
if (!Use.isReg() || TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
@@ -372,10 +421,13 @@ int GCNHazardRecognizer::checkVMEMHazards(MachineInstr* VMEM) {
int GCNHazardRecognizer::checkDPPHazards(MachineInstr *DPP) {
const SIRegisterInfo *TRI = ST.getRegisterInfo();
+ const SIInstrInfo *TII = ST.getInstrInfo();
- // Check for DPP VGPR read after VALU VGPR write.
+ // Check for DPP VGPR read after VALU VGPR write and EXEC write.
int DppVgprWaitStates = 2;
+ int DppExecWaitStates = 5;
int WaitStatesNeeded = 0;
+ auto IsHazardDefFn = [TII] (MachineInstr *MI) { return TII->isVALU(*MI); };
for (const MachineOperand &Use : DPP->uses()) {
if (!Use.isReg() || !TRI->isVGPR(MF.getRegInfo(), Use.getReg()))
@@ -385,6 +437,10 @@ int GCNHazardRecognizer::checkDPPHazards(MachineInstr *DPP) {
WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
}
+ WaitStatesNeeded = std::max(
+ WaitStatesNeeded,
+ DppExecWaitStates - getWaitStatesSinceDef(AMDGPU::EXEC, IsHazardDefFn));
+
return WaitStatesNeeded;
}
@@ -475,39 +531,76 @@ int GCNHazardRecognizer::createsVALUHazard(const MachineInstr &MI) {
return -1;
}
+int GCNHazardRecognizer::checkVALUHazardsHelper(const MachineOperand &Def,
+ const MachineRegisterInfo &MRI) {
+ // Helper to check for the hazard where VMEM instructions that store more than
+ // 8 bytes can have their store data overwritten by the next instruction.
+ const SIRegisterInfo *TRI = ST.getRegisterInfo();
+
+ const int VALUWaitStates = 1;
+ int WaitStatesNeeded = 0;
+
+ if (!TRI->isVGPR(MRI, Def.getReg()))
+ return WaitStatesNeeded;
+ unsigned Reg = Def.getReg();
+ auto IsHazardFn = [this, Reg, TRI] (MachineInstr *MI) {
+ int DataIdx = createsVALUHazard(*MI);
+ return DataIdx >= 0 &&
+ TRI->regsOverlap(MI->getOperand(DataIdx).getReg(), Reg);
+ };
+ int WaitStatesNeededForDef =
+ VALUWaitStates - getWaitStatesSince(IsHazardFn);
+ WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
+
+ return WaitStatesNeeded;
+}
+
int GCNHazardRecognizer::checkVALUHazards(MachineInstr *VALU) {
// This checks for the hazard where VMEM instructions that store more than
// 8 bytes can have their store data overwritten by the next instruction.
if (!ST.has12DWordStoreHazard())
return 0;
- const SIRegisterInfo *TRI = ST.getRegisterInfo();
- const MachineRegisterInfo &MRI = VALU->getParent()->getParent()->getRegInfo();
-
- const int VALUWaitStates = 1;
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
int WaitStatesNeeded = 0;
for (const MachineOperand &Def : VALU->defs()) {
- if (!TRI->isVGPR(MRI, Def.getReg()))
- continue;
- unsigned Reg = Def.getReg();
- auto IsHazardFn = [this, Reg, TRI] (MachineInstr *MI) {
- int DataIdx = createsVALUHazard(*MI);
- return DataIdx >= 0 &&
- TRI->regsOverlap(MI->getOperand(DataIdx).getReg(), Reg);
- };
- int WaitStatesNeededForDef =
- VALUWaitStates - getWaitStatesSince(IsHazardFn);
- WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
+ WaitStatesNeeded = std::max(WaitStatesNeeded, checkVALUHazardsHelper(Def, MRI));
+ }
+
+ return WaitStatesNeeded;
+}
+
+int GCNHazardRecognizer::checkInlineAsmHazards(MachineInstr *IA) {
+ // This checks for hazards associated with inline asm statements.
+ // Since inline asms can contain just about anything, we use this
+ // to call/leverage other check*Hazard routines. Note that
+ // this function doesn't attempt to address all possible inline asm
+ // hazards (good luck), but is a collection of what has been
+ // problematic thus far.
+
+ // see checkVALUHazards()
+ if (!ST.has12DWordStoreHazard())
+ return 0;
+
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
+ int WaitStatesNeeded = 0;
+
+ for (unsigned I = InlineAsm::MIOp_FirstOperand, E = IA->getNumOperands();
+ I != E; ++I) {
+ const MachineOperand &Op = IA->getOperand(I);
+ if (Op.isReg() && Op.isDef()) {
+ WaitStatesNeeded = std::max(WaitStatesNeeded, checkVALUHazardsHelper(Op, MRI));
+ }
}
+
return WaitStatesNeeded;
}
int GCNHazardRecognizer::checkRWLaneHazards(MachineInstr *RWLane) {
const SIInstrInfo *TII = ST.getInstrInfo();
const SIRegisterInfo *TRI = ST.getRegisterInfo();
- const MachineRegisterInfo &MRI =
- RWLane->getParent()->getParent()->getRegInfo();
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
const MachineOperand *LaneSelectOp =
TII->getNamedOperand(*RWLane, AMDGPU::OpName::src1);
@@ -568,11 +661,8 @@ int GCNHazardRecognizer::checkAnyInstHazards(MachineInstr *MI) {
}
int GCNHazardRecognizer::checkReadM0Hazards(MachineInstr *MI) {
- if (!ST.hasReadM0Hazard())
- return 0;
-
const SIInstrInfo *TII = ST.getInstrInfo();
- int SMovRelWaitStates = 1;
+ const int SMovRelWaitStates = 1;
auto IsHazardFn = [TII] (MachineInstr *MI) {
return TII->isSALU(*MI);
};
diff --git a/lib/Target/AMDGPU/GCNHazardRecognizer.h b/lib/Target/AMDGPU/GCNHazardRecognizer.h
index 5680c3de6a1a..f9a6e395a454 100644
--- a/lib/Target/AMDGPU/GCNHazardRecognizer.h
+++ b/lib/Target/AMDGPU/GCNHazardRecognizer.h
@@ -14,6 +14,7 @@
#ifndef LLVM_LIB_TARGET_AMDGPUHAZARDRECOGNIZERS_H
#define LLVM_LIB_TARGET_AMDGPUHAZARDRECOGNIZERS_H
+#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/CodeGen/ScheduleHazardRecognizer.h"
#include <list>
@@ -22,8 +23,11 @@ namespace llvm {
class MachineFunction;
class MachineInstr;
+class MachineOperand;
+class MachineRegisterInfo;
class ScheduleDAG;
class SIInstrInfo;
+class SIRegisterInfo;
class SISubtarget;
class GCNHazardRecognizer final : public ScheduleHazardRecognizer {
@@ -35,6 +39,20 @@ class GCNHazardRecognizer final : public ScheduleHazardRecognizer {
const MachineFunction &MF;
const SISubtarget &ST;
const SIInstrInfo &TII;
+ const SIRegisterInfo &TRI;
+
+ /// RegUnits of uses in the current soft memory clause.
+ BitVector ClauseUses;
+
+ /// RegUnits of defs in the current soft memory clause.
+ BitVector ClauseDefs;
+
+ void resetClause() {
+ ClauseUses.reset();
+ ClauseDefs.reset();
+ }
+
+ void addClauseInst(const MachineInstr &MI);
int getWaitStatesSince(function_ref<bool(MachineInstr *)> IsHazard);
int getWaitStatesSinceDef(unsigned Reg,
@@ -42,7 +60,7 @@ class GCNHazardRecognizer final : public ScheduleHazardRecognizer {
[](MachineInstr *) { return true; });
int getWaitStatesSinceSetReg(function_ref<bool(MachineInstr *)> IsHazard);
- int checkSMEMSoftClauseHazards(MachineInstr *SMEM);
+ int checkSoftClauseHazards(MachineInstr *SMEM);
int checkSMRDHazards(MachineInstr *SMRD);
int checkVMEMHazards(MachineInstr* VMEM);
int checkDPPHazards(MachineInstr *DPP);
@@ -51,8 +69,10 @@ class GCNHazardRecognizer final : public ScheduleHazardRecognizer {
int checkSetRegHazards(MachineInstr *SetRegInstr);
int createsVALUHazard(const MachineInstr &MI);
int checkVALUHazards(MachineInstr *VALU);
+ int checkVALUHazardsHelper(const MachineOperand &Def, const MachineRegisterInfo &MRI);
int checkRWLaneHazards(MachineInstr *RWLane);
int checkRFEHazards(MachineInstr *RFE);
+ int checkInlineAsmHazards(MachineInstr *IA);
int checkAnyInstHazards(MachineInstr *MI);
int checkReadM0Hazards(MachineInstr *SMovRel);
public:
diff --git a/lib/Target/AMDGPU/GCNILPSched.cpp b/lib/Target/AMDGPU/GCNILPSched.cpp
new file mode 100644
index 000000000000..ba8211b189cf
--- /dev/null
+++ b/lib/Target/AMDGPU/GCNILPSched.cpp
@@ -0,0 +1,364 @@
+//===---------------------------- GCNILPSched.cpp - -----------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/ScheduleDAG.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "machine-scheduler"
+
+namespace {
+
+class GCNILPScheduler {
+ struct Candidate : ilist_node<Candidate> {
+ SUnit *SU;
+
+ Candidate(SUnit *SU_)
+ : SU(SU_) {}
+ };
+
+ SpecificBumpPtrAllocator<Candidate> Alloc;
+ typedef simple_ilist<Candidate> Queue;
+ Queue PendingQueue;
+ Queue AvailQueue;
+ unsigned CurQueueId = 0;
+
+ std::vector<unsigned> SUNumbers;
+
+ /// CurCycle - The current scheduler state corresponds to this cycle.
+ unsigned CurCycle = 0;
+
+ unsigned getNodePriority(const SUnit *SU) const;
+
+ const SUnit *pickBest(const SUnit *left, const SUnit *right);
+ Candidate* pickCandidate();
+
+ void releasePending();
+ void advanceToCycle(unsigned NextCycle);
+ void releasePredecessors(const SUnit* SU);
+
+public:
+ std::vector<const SUnit*> schedule(ArrayRef<const SUnit*> TopRoots,
+ const ScheduleDAG &DAG);
+};
+} // namespace
+
+/// CalcNodeSethiUllmanNumber - Compute Sethi Ullman number.
+/// Smaller number is the higher priority.
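+/// For example, a unit with no non-chain predecessors gets 1, and a unit
+/// whose two predecessors both have the number 1 gets 2 (the maximum plus
+/// one extra for the tie).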
+static unsigned
+CalcNodeSethiUllmanNumber(const SUnit *SU, std::vector<unsigned> &SUNumbers) {
+ unsigned &SethiUllmanNumber = SUNumbers[SU->NodeNum];
+ if (SethiUllmanNumber != 0)
+ return SethiUllmanNumber;
+
+ unsigned Extra = 0;
+ for (const SDep &Pred : SU->Preds) {
+ if (Pred.isCtrl()) continue; // ignore chain preds
+ SUnit *PredSU = Pred.getSUnit();
+ unsigned PredSethiUllman = CalcNodeSethiUllmanNumber(PredSU, SUNumbers);
+ if (PredSethiUllman > SethiUllmanNumber) {
+ SethiUllmanNumber = PredSethiUllman;
+ Extra = 0;
+ }
+ else if (PredSethiUllman == SethiUllmanNumber)
+ ++Extra;
+ }
+
+ SethiUllmanNumber += Extra;
+
+ if (SethiUllmanNumber == 0)
+ SethiUllmanNumber = 1;
+
+ return SethiUllmanNumber;
+}
+
+// Lower priority means schedule further down. For bottom-up scheduling, lower
+// priority SUs are scheduled before higher priority SUs.
+unsigned GCNILPScheduler::getNodePriority(const SUnit *SU) const {
+ assert(SU->NodeNum < SUNumbers.size());
+ if (SU->NumSuccs == 0 && SU->NumPreds != 0)
+ // If SU does not have a register use, i.e. it doesn't produce a value
+ // that would be consumed (e.g. store), then it terminates a chain of
+ // computation. Give it a large SethiUllman number so that it is
+ // scheduled right before its predecessors and does not lengthen
+ // their live ranges.
+ return 0xffff;
+
+ if (SU->NumPreds == 0 && SU->NumSuccs != 0)
+ // If SU does not have a register def, schedule it close to its uses
+ // because it does not lengthen any live ranges.
+ return 0;
+
+ return SUNumbers[SU->NodeNum];
+}
+
+/// closestSucc - Returns the scheduled cycle of the successor which is
+/// closest to the current cycle.
+static unsigned closestSucc(const SUnit *SU) {
+ unsigned MaxHeight = 0;
+ for (const SDep &Succ : SU->Succs) {
+ if (Succ.isCtrl()) continue; // ignore chain succs
+ unsigned Height = Succ.getSUnit()->getHeight();
+ // If there are a bunch of CopyToRegs stacked up, they should be considered
+ // to be at the same position.
+ if (Height > MaxHeight)
+ MaxHeight = Height;
+ }
+ return MaxHeight;
+}
+
+/// calcMaxScratches - Returns a cost estimate of the worst-case requirement
+/// for scratch registers, i.e. number of data dependencies.
+static unsigned calcMaxScratches(const SUnit *SU) {
+ unsigned Scratches = 0;
+ for (const SDep &Pred : SU->Preds) {
+ if (Pred.isCtrl()) continue; // ignore chain preds
+ Scratches++;
+ }
+ return Scratches;
+}
+
+// Return -1 if left has higher priority, 1 if right has higher priority.
+// Return 0 if latency-based priority is equivalent.
+static int BUCompareLatency(const SUnit *left, const SUnit *right) {
+ // Scheduling an instruction that uses a VReg whose postincrement has not yet
+ // been scheduled will induce a copy. Model this as an extra cycle of latency.
+ int LHeight = (int)left->getHeight();
+ int RHeight = (int)right->getHeight();
+
+ // If either node is scheduling for latency, sort them by height/depth
+ // and latency.
+
+ // If neither instruction stalls (!LStall && !RStall) and HazardRecognizer
+ // is enabled, grouping instructions by cycle, then its height is already
+ // covered so only its depth matters. We also reach this point if both stall
+ // but have the same height.
+ if (LHeight != RHeight)
+ return LHeight > RHeight ? 1 : -1;
+
+ int LDepth = left->getDepth();
+ int RDepth = right->getDepth();
+ if (LDepth != RDepth) {
+ DEBUG(dbgs() << " Comparing latency of SU (" << left->NodeNum
+ << ") depth " << LDepth << " vs SU (" << right->NodeNum
+ << ") depth " << RDepth << "\n");
+ return LDepth < RDepth ? 1 : -1;
+ }
+ if (left->Latency != right->Latency)
+ return left->Latency > right->Latency ? 1 : -1;
+
+ return 0;
+}
+
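+/// Picks the better of two ready units for bottom-up scheduling: first by
+/// critical-path depth and height (within MaxReorderWindow), then Sethi-Ullman
+/// priority, distance to the closest successor, scratch count, latency, and
+/// finally queue order.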
+const SUnit *GCNILPScheduler::pickBest(const SUnit *left, const SUnit *right)
+{
+ // TODO: add register pressure lowering checks
+
+ bool const DisableSchedCriticalPath = false;
+ int MaxReorderWindow = 6;
+ if (!DisableSchedCriticalPath) {
+ int spread = (int)left->getDepth() - (int)right->getDepth();
+ if (std::abs(spread) > MaxReorderWindow) {
+ DEBUG(dbgs() << "Depth of SU(" << left->NodeNum << "): "
+ << left->getDepth() << " != SU(" << right->NodeNum << "): "
+ << right->getDepth() << "\n");
+ return left->getDepth() < right->getDepth() ? right : left;
+ }
+ }
+
+ bool const DisableSchedHeight = false;
+ if (!DisableSchedHeight && left->getHeight() != right->getHeight()) {
+ int spread = (int)left->getHeight() - (int)right->getHeight();
+ if (std::abs(spread) > MaxReorderWindow)
+ return left->getHeight() > right->getHeight() ? right : left;
+ }
+
+ // Prioritize by Sethi-Ullman number and push CopyToReg nodes down.
+ unsigned LPriority = getNodePriority(left);
+ unsigned RPriority = getNodePriority(right);
+
+ if (LPriority != RPriority)
+ return LPriority > RPriority ? right : left;
+
+ // Try to schedule def + use closer when Sethi-Ullman numbers are the same.
+ // e.g.
+ // t1 = op t2, c1
+ // t3 = op t4, c2
+ //
+ // and the following instructions are both ready.
+ // t2 = op c3
+ // t4 = op c4
+ //
+ // Then schedule t2 = op first.
+ // i.e.
+ // t4 = op c4
+ // t2 = op c3
+ // t1 = op t2, c1
+ // t3 = op t4, c2
+ //
+ // This creates more short live intervals.
+ unsigned LDist = closestSucc(left);
+ unsigned RDist = closestSucc(right);
+ if (LDist != RDist)
+ return LDist < RDist ? right : left;
+
+ // How many registers become live when the node is scheduled.
+ unsigned LScratch = calcMaxScratches(left);
+ unsigned RScratch = calcMaxScratches(right);
+ if (LScratch != RScratch)
+ return LScratch > RScratch ? right : left;
+
+ bool const DisableSchedCycles = false;
+ if (!DisableSchedCycles) {
+ int result = BUCompareLatency(left, right);
+ if (result != 0)
+ return result > 0 ? right : left;
+ return left;
+ }
+ else {
+ if (left->getHeight() != right->getHeight())
+ return (left->getHeight() > right->getHeight()) ? right : left;
+
+ if (left->getDepth() != right->getDepth())
+ return (left->getDepth() < right->getDepth()) ? right : left;
+ }
+
+ assert(left->NodeQueueId && right->NodeQueueId &&
+ "NodeQueueId cannot be zero");
+ return (left->NodeQueueId > right->NodeQueueId) ? right : left;
+}
+
+GCNILPScheduler::Candidate* GCNILPScheduler::pickCandidate() {
+ if (AvailQueue.empty())
+ return nullptr;
+ auto Best = AvailQueue.begin();
+ for (auto I = std::next(AvailQueue.begin()), E = AvailQueue.end(); I != E; ++I) {
+ auto NewBestSU = pickBest(Best->SU, I->SU);
+ if (NewBestSU != Best->SU) {
+ assert(NewBestSU == I->SU);
+ Best = I;
+ }
+ }
+ return &*Best;
+}
+
+void GCNILPScheduler::releasePending() {
+ // Check to see if any of the pending instructions are ready to issue. If
+ // so, add them to the available queue.
+ for(auto I = PendingQueue.begin(), E = PendingQueue.end(); I != E;) {
+ auto &C = *I++;
+ if (C.SU->getHeight() <= CurCycle) {
+ PendingQueue.remove(C);
+ AvailQueue.push_back(C);
+ C.SU->NodeQueueId = CurQueueId++;
+ }
+ }
+}
+
+/// Move the scheduler state forward by the specified number of Cycles.
+void GCNILPScheduler::advanceToCycle(unsigned NextCycle) {
+ if (NextCycle <= CurCycle)
+ return;
+ CurCycle = NextCycle;
+ releasePending();
+}
+
+void GCNILPScheduler::releasePredecessors(const SUnit* SU) {
+ for (const auto &PredEdge : SU->Preds) {
+ auto PredSU = PredEdge.getSUnit();
+ if (PredEdge.isWeak())
+ continue;
+ assert(PredSU->isBoundaryNode() || PredSU->NumSuccsLeft > 0);
+
+ PredSU->setHeightToAtLeast(SU->getHeight() + PredEdge.getLatency());
+
+ if (!PredSU->isBoundaryNode() && --PredSU->NumSuccsLeft == 0)
+ PendingQueue.push_front(*new (Alloc.Allocate()) Candidate(PredSU));
+ }
+}
+
+std::vector<const SUnit*>
+GCNILPScheduler::schedule(ArrayRef<const SUnit*> BotRoots,
+ const ScheduleDAG &DAG) {
+ auto &SUnits = const_cast<ScheduleDAG&>(DAG).SUnits;
+
+ std::vector<SUnit> SUSavedCopy;
+ SUSavedCopy.resize(SUnits.size());
+
+ // We cannot save only the fields we touch: some of them are private, so
+ // save the units verbatim. This assumes SUnit has value semantics.
+ for (const SUnit &SU : SUnits)
+ SUSavedCopy[SU.NodeNum] = SU;
+
+ SUNumbers.assign(SUnits.size(), 0);
+ for (const SUnit &SU : SUnits)
+ CalcNodeSethiUllmanNumber(&SU, SUNumbers);
+
+ for (auto SU : BotRoots) {
+ AvailQueue.push_back(
+ *new (Alloc.Allocate()) Candidate(const_cast<SUnit*>(SU)));
+ }
+ releasePredecessors(&DAG.ExitSU);
+
+ std::vector<const SUnit*> Schedule;
+ Schedule.reserve(SUnits.size());
+ while (true) {
+ if (AvailQueue.empty() && !PendingQueue.empty()) {
+ auto EarliestSU = std::min_element(
+ PendingQueue.begin(), PendingQueue.end(),
+ [=](const Candidate& C1, const Candidate& C2) {
+ return C1.SU->getHeight() < C2.SU->getHeight();
+ })->SU;
+ advanceToCycle(std::max(CurCycle + 1, EarliestSU->getHeight()));
+ }
+ if (AvailQueue.empty())
+ break;
+
+ DEBUG(
+ dbgs() << "\n=== Picking candidate\n"
+ "Ready queue:";
+ for (auto &C : AvailQueue)
+ dbgs() << ' ' << C.SU->NodeNum;
+ dbgs() << '\n';
+ );
+
+ auto C = pickCandidate();
+ assert(C);
+ AvailQueue.remove(*C);
+ auto SU = C->SU;
+ DEBUG(dbgs() << "Selected "; SU->dump(&DAG));
+
+ advanceToCycle(SU->getHeight());
+
+ releasePredecessors(SU);
+ Schedule.push_back(SU);
+ SU->isScheduled = true;
+ }
+ assert(SUnits.size() == Schedule.size());
+
+ std::reverse(Schedule.begin(), Schedule.end());
+
+ // restore units
+ for (auto &SU : SUnits)
+ SU = SUSavedCopy[SU.NodeNum];
+
+ return Schedule;
+}
+
+namespace llvm {
+std::vector<const SUnit*> makeGCNILPScheduler(ArrayRef<const SUnit*> BotRoots,
+ const ScheduleDAG &DAG) {
+ GCNILPScheduler S;
+ return S.schedule(BotRoots, DAG);
+}
+}
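The ILP picker above selects from the ready queue by reducing it with a pairwise comparison, so the schedule order is determined entirely by pickBest. A minimal standalone sketch of that reduce-over-the-queue pattern follows; Node, its fields, and pickBetter are illustrative stand-ins for SUnit, Candidate, and pickBest, not the scheduler's real types.

// Minimal sketch of "reduce the ready queue with a pairwise comparator",
// the pattern used by GCNILPScheduler::pickCandidate. All types here are
// placeholders, not the real SUnit/Candidate.
#include <iostream>
#include <list>

struct Node {
  unsigned Id;       // stands in for SUnit::NodeNum
  unsigned Priority; // stands in for the Sethi-Ullman number
  unsigned Height;   // stands in for SUnit::getHeight()
};

// Higher priority wins; ties broken by smaller height (example rule only).
static const Node *pickBetter(const Node *A, const Node *B) {
  if (A->Priority != B->Priority)
    return A->Priority > B->Priority ? A : B;
  return A->Height <= B->Height ? A : B;
}

static const Node *pickCandidate(const std::list<Node> &Avail) {
  if (Avail.empty())
    return nullptr;
  const Node *Best = &Avail.front();
  for (const Node &N : Avail)
    Best = pickBetter(Best, &N);
  return Best;
}

int main() {
  std::list<Node> Avail = {{0, 3, 5}, {1, 7, 2}, {2, 7, 9}};
  std::cout << "picked node " << pickCandidate(Avail)->Id << '\n'; // node 1
  return 0;
}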
diff --git a/lib/Target/AMDGPU/GCNIterativeScheduler.cpp b/lib/Target/AMDGPU/GCNIterativeScheduler.cpp
index 2e7641cda375..a0e4f7ff24cb 100644
--- a/lib/Target/AMDGPU/GCNIterativeScheduler.cpp
+++ b/lib/Target/AMDGPU/GCNIterativeScheduler.cpp
@@ -1,4 +1,4 @@
-//===--------------------- GCNIterativeScheduler.cpp - --------------------===//
+//===- GCNIterativeScheduler.cpp ------------------------------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -6,21 +6,40 @@
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
-//
-/// \file
-//
-//===----------------------------------------------------------------------===//
#include "GCNIterativeScheduler.h"
+#include "AMDGPUSubtarget.h"
+#include "GCNRegPressure.h"
#include "GCNSchedStrategy.h"
-#include "SIMachineFunctionInfo.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/LiveIntervals.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/RegisterPressure.h"
+#include "llvm/CodeGen/ScheduleDAG.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+#include <cassert>
+#include <iterator>
+#include <limits>
+#include <memory>
+#include <type_traits>
+#include <vector>
using namespace llvm;
#define DEBUG_TYPE "machine-scheduler"
namespace llvm {
- std::vector<const SUnit*> makeMinRegSchedule(ArrayRef<const SUnit*> TopRoots,
+
+std::vector<const SUnit *> makeMinRegSchedule(ArrayRef<const SUnit *> TopRoots,
+ const ScheduleDAG &DAG);
+
+ std::vector<const SUnit*> makeGCNILPScheduler(ArrayRef<const SUnit*> BotRoots,
const ScheduleDAG &DAG);
}
@@ -44,8 +63,8 @@ static void printRegion(raw_ostream &OS,
unsigned MaxInstNum =
std::numeric_limits<unsigned>::max()) {
auto BB = Begin->getParent();
- OS << BB->getParent()->getName() << ":BB#" << BB->getNumber()
- << ' ' << BB->getName() << ":\n";
+ OS << BB->getParent()->getName() << ":" << printMBBReference(*BB) << ' '
+ << BB->getName() << ":\n";
auto I = Begin;
MaxInstNum = std::max(MaxInstNum, 1u);
for (; I != End && MaxInstNum; ++I, --MaxInstNum) {
@@ -117,13 +136,14 @@ void GCNIterativeScheduler::printSchedRP(raw_ostream &OS,
OS << "RP after: ";
After.print(OS, &ST);
}
-
#endif
// DAG builder helper
class GCNIterativeScheduler::BuildDAG {
GCNIterativeScheduler &Sch;
- SmallVector<SUnit*, 8> TopRoots;
+ SmallVector<SUnit *, 8> TopRoots;
+
+ SmallVector<SUnit*, 8> BotRoots;
public:
BuildDAG(const Region &R, GCNIterativeScheduler &_Sch)
: Sch(_Sch) {
@@ -134,17 +154,20 @@ public:
Sch.buildSchedGraph(Sch.AA, nullptr, nullptr, nullptr,
/*TrackLaneMask*/true);
Sch.Topo.InitDAGTopologicalSorting();
-
- SmallVector<SUnit*, 8> BotRoots;
Sch.findRootsAndBiasEdges(TopRoots, BotRoots);
}
+
~BuildDAG() {
Sch.BaseClass::exitRegion();
Sch.BaseClass::finishBlock();
}
- ArrayRef<const SUnit*> getTopRoots() const {
+
+ ArrayRef<const SUnit *> getTopRoots() const {
return TopRoots;
}
+ ArrayRef<SUnit*> getBottomRoots() const {
+ return BotRoots;
+ }
};
class GCNIterativeScheduler::OverrideLegacyStrategy {
@@ -152,6 +175,7 @@ class GCNIterativeScheduler::OverrideLegacyStrategy {
Region &Rgn;
std::unique_ptr<MachineSchedStrategy> SaveSchedImpl;
GCNRegPressure SaveMaxRP;
+
public:
OverrideLegacyStrategy(Region &R,
MachineSchedStrategy &OverrideStrategy,
@@ -165,12 +189,14 @@ public:
Sch.BaseClass::startBlock(BB);
Sch.BaseClass::enterRegion(BB, R.Begin, R.End, R.NumRegionInstrs);
}
+
~OverrideLegacyStrategy() {
Sch.BaseClass::exitRegion();
Sch.BaseClass::finishBlock();
Sch.SchedImpl.release();
Sch.SchedImpl = std::move(SaveSchedImpl);
}
+
void schedule() {
assert(Sch.RegionBegin == Rgn.Begin && Sch.RegionEnd == Rgn.End);
DEBUG(dbgs() << "\nScheduling ";
@@ -183,6 +209,7 @@ public:
Rgn.Begin = Sch.RegionBegin;
Rgn.MaxPressure.clear();
}
+
void restoreOrder() {
assert(Sch.RegionBegin == Rgn.Begin && Sch.RegionEnd == Rgn.End);
// DAG SUnits are stored using original region's order
@@ -192,6 +219,7 @@ public:
};
namespace {
+
// just a stub to make base class happy
class SchedStrategyStub : public MachineSchedStrategy {
public:
@@ -203,7 +231,8 @@ public:
void releaseTopNode(SUnit *SU) override {}
void releaseBottomNode(SUnit *SU) override {}
};
-} // namespace
+
+} // end anonymous namespace
GCNIterativeScheduler::GCNIterativeScheduler(MachineSchedContext *C,
StrategyKind S)
@@ -298,6 +327,7 @@ void GCNIterativeScheduler::finalizeSchedule() { // overriden
case SCHEDULE_MINREGONLY: scheduleMinReg(); break;
case SCHEDULE_MINREGFORCED: scheduleMinReg(true); break;
case SCHEDULE_LEGACYMAXOCCUPANCY: scheduleLegacyMaxOccupancy(); break;
+ case SCHEDULE_ILP: scheduleILP(false); break;
}
}
@@ -528,3 +558,43 @@ void GCNIterativeScheduler::scheduleMinReg(bool force) {
MaxPressure = RP;
}
}
+
+///////////////////////////////////////////////////////////////////////////////
+// ILP scheduler port
+
+void GCNIterativeScheduler::scheduleILP(
+ bool TryMaximizeOccupancy) {
+ const auto &ST = MF.getSubtarget<SISubtarget>();
+ auto TgtOcc = std::min(ST.getOccupancyWithLocalMemSize(MF),
+ ST.getWavesPerEU(MF.getFunction()).second);
+
+ sortRegionsByPressure(TgtOcc);
+ auto Occ = Regions.front()->MaxPressure.getOccupancy(ST);
+
+ if (TryMaximizeOccupancy && Occ < TgtOcc)
+ Occ = tryMaximizeOccupancy(TgtOcc);
+
+ TgtOcc = std::min(Occ, TgtOcc);
+ DEBUG(dbgs() << "Scheduling using ILP scheduler, "
+ "target occupancy = " << TgtOcc << '\n');
+
+ for (auto R : Regions) {
+ BuildDAG DAG(*R, *this);
+ const auto ILPSchedule = makeGCNILPScheduler(DAG.getBottomRoots(), *this);
+
+ const auto RP = getSchedulePressure(*R, ILPSchedule);
+ DEBUG(printSchedRP(dbgs(), R->MaxPressure, RP));
+
+ if (RP.getOccupancy(ST) < TgtOcc) {
+ DEBUG(dbgs() << "Didn't fit into target occupancy O" << TgtOcc);
+ if (R->BestSchedule.get() &&
+ R->BestSchedule->MaxPressure.getOccupancy(ST) >= TgtOcc) {
+ DEBUG(dbgs() << ", scheduling minimal register\n");
+ scheduleBest(*R);
+ }
+ } else {
+ scheduleRegion(*R, ILPSchedule, RP);
+ DEBUG(printSchedResult(dbgs(), R, RP));
+ }
+ }
+}
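Per region, scheduleILP commits the freshly built ILP order only when its register pressure still meets the occupancy target; otherwise it reverts to a previously recorded schedule if that one fits, and leaves the region untouched when neither does. A small sketch of that three-way decision; the names and types below are illustrative placeholders, not the scheduler's API.

// Sketch of the accept / revert / keep decision applied per region.
#include <iostream>

enum class RegionAction { CommitILP, RevertToBest, KeepOriginal };

static RegionAction decide(unsigned ILPOcc, unsigned TargetOcc,
                           bool HaveBest, unsigned BestOcc) {
  if (ILPOcc >= TargetOcc)
    return RegionAction::CommitILP;      // new order fits the target
  if (HaveBest && BestOcc >= TargetOcc)
    return RegionAction::RevertToBest;   // fall back to the saved schedule
  return RegionAction::KeepOriginal;     // nothing better: leave region as is
}

int main() {
  std::cout << static_cast<int>(decide(5, 4, true, 4)) << '\n';  // 0: commit
  std::cout << static_cast<int>(decide(3, 4, true, 4)) << '\n';  // 1: revert
  std::cout << static_cast<int>(decide(3, 4, false, 0)) << '\n'; // 2: keep
  return 0;
}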
diff --git a/lib/Target/AMDGPU/GCNIterativeScheduler.h b/lib/Target/AMDGPU/GCNIterativeScheduler.h
index df3afce21ebc..14ef5147f32a 100644
--- a/lib/Target/AMDGPU/GCNIterativeScheduler.h
+++ b/lib/Target/AMDGPU/GCNIterativeScheduler.h
@@ -1,4 +1,4 @@
-//===--------- GCNIterativeScheduler.h - GCN Scheduler -*- C++ -*----------===//
+//===- GCNIterativeScheduler.h - GCN Scheduler ------------------*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
@@ -6,27 +6,34 @@
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
-//
-/// \file
-//
-//===----------------------------------------------------------------------===//
#ifndef LLVM_LIB_TARGET_AMDGPU_GCNITERATIVESCHEDULER_H
#define LLVM_LIB_TARGET_AMDGPU_GCNITERATIVESCHEDULER_H
#include "GCNRegPressure.h"
-
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineScheduler.h"
+#include "llvm/Support/Allocator.h"
+#include <limits>
+#include <memory>
+#include <vector>
namespace llvm {
+class MachineInstr;
+class SUnit;
+class raw_ostream;
+
class GCNIterativeScheduler : public ScheduleDAGMILive {
- typedef ScheduleDAGMILive BaseClass;
+ using BaseClass = ScheduleDAGMILive;
+
public:
enum StrategyKind {
SCHEDULE_MINREGONLY,
SCHEDULE_MINREGFORCED,
- SCHEDULE_LEGACYMAXOCCUPANCY
+ SCHEDULE_LEGACYMAXOCCUPANCY,
+ SCHEDULE_ILP
};
GCNIterativeScheduler(MachineSchedContext *C,
@@ -42,11 +49,10 @@ public:
void finalizeSchedule() override;
protected:
-
- typedef ArrayRef<const SUnit*> ScheduleRef;
+ using ScheduleRef = ArrayRef<const SUnit *>;
struct TentativeSchedule {
- std::vector<MachineInstr*> Schedule;
+ std::vector<MachineInstr *> Schedule;
GCNRegPressure MaxPressure;
};
@@ -103,6 +109,7 @@ protected:
void scheduleLegacyMaxOccupancy(bool TryMaximizeOccupancy = true);
void scheduleMinReg(bool force = false);
+ void scheduleILP(bool TryMaximizeOccupancy = true);
void printRegions(raw_ostream &OS) const;
void printSchedResult(raw_ostream &OS,
@@ -113,6 +120,6 @@ protected:
const GCNRegPressure &After) const;
};
-} // End namespace llvm
+} // end namespace llvm
#endif // LLVM_LIB_TARGET_AMDGPU_GCNITERATIVESCHEDULER_H
diff --git a/lib/Target/AMDGPU/GCNMinRegStrategy.cpp b/lib/Target/AMDGPU/GCNMinRegStrategy.cpp
index 0657f67b217d..9904b5f0f4ba 100644
--- a/lib/Target/AMDGPU/GCNMinRegStrategy.cpp
+++ b/lib/Target/AMDGPU/GCNMinRegStrategy.cpp
@@ -1,4 +1,4 @@
-//===----------------------- GCNMinRegStrategy.cpp - ----------------------===//
+//===- GCNMinRegStrategy.cpp ----------------------------------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -6,18 +6,27 @@
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
-//
-/// \file
-//
-//===----------------------------------------------------------------------===//
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/ilist_node.h"
+#include "llvm/ADT/simple_ilist.h"
#include "llvm/CodeGen/ScheduleDAG.h"
+#include "llvm/Support/Allocator.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include <cassert>
+#include <cstdint>
+#include <limits>
+#include <vector>
using namespace llvm;
#define DEBUG_TYPE "machine-scheduler"
namespace {
+
class GCNMinRegScheduler {
struct Candidate : ilist_node<Candidate> {
const SUnit *SU;
@@ -28,7 +37,7 @@ class GCNMinRegScheduler {
};
SpecificBumpPtrAllocator<Candidate> Alloc;
- typedef simple_ilist<Candidate> Queue;
+ using Queue = simple_ilist<Candidate>;
Queue RQ; // Ready queue
std::vector<unsigned> NumPreds;
@@ -72,7 +81,8 @@ public:
std::vector<const SUnit*> schedule(ArrayRef<const SUnit*> TopRoots,
const ScheduleDAG &DAG);
};
-} // namespace
+
+} // end anonymous namespace
void GCNMinRegScheduler::initNumPreds(const decltype(ScheduleDAG::SUnits) &SUnits) {
NumPreds.resize(SUnits.size());
@@ -104,7 +114,9 @@ int GCNMinRegScheduler::getNotReadySuccessors(const SUnit *SU) const {
template <typename Calc>
unsigned GCNMinRegScheduler::findMax(unsigned Num, Calc C) {
assert(!RQ.empty() && Num <= RQ.size());
- typedef decltype(C(*RQ.begin())) T;
+
+ using T = decltype(C(*RQ.begin()));
+
T Max = std::numeric_limits<T>::min();
unsigned NumMax = 0;
for (auto I = RQ.begin(); Num; --Num) {
@@ -260,9 +272,11 @@ GCNMinRegScheduler::schedule(ArrayRef<const SUnit*> TopRoots,
}
namespace llvm {
+
std::vector<const SUnit*> makeMinRegSchedule(ArrayRef<const SUnit*> TopRoots,
const ScheduleDAG &DAG) {
GCNMinRegScheduler S;
return S.schedule(TopRoots, DAG);
}
-}
+
+} // end namespace llvm
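findMax above deduces its rank type from the caller-supplied functor with decltype and then scans the ready queue for the maximum rank. A self-contained sketch of the same functor-plus-decltype pattern, using a placeholder Candidate type rather than the scheduler's:

// Rank queue elements with a caller-supplied functor and count how many
// share the maximum rank. Candidate is a placeholder type.
#include <iostream>
#include <limits>
#include <list>

struct Candidate { unsigned Id; int Readiness; };

template <typename Calc>
unsigned countMax(const std::list<Candidate> &RQ, Calc C) {
  using T = decltype(C(*RQ.begin())); // rank type comes from the functor
  T Max = std::numeric_limits<T>::min();
  unsigned NumMax = 0;
  for (const Candidate &Cand : RQ) {
    T R = C(Cand);
    if (R > Max) {
      Max = R;
      NumMax = 1;
    } else if (R == Max) {
      ++NumMax;
    }
  }
  return NumMax;
}

int main() {
  std::list<Candidate> RQ = {{0, 2}, {1, 5}, {2, 5}};
  std::cout << countMax(RQ, [](const Candidate &C) { return C.Readiness; })
            << '\n'; // 2 candidates share the maximum rank
  return 0;
}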
diff --git a/lib/Target/AMDGPU/GCNProcessors.td b/lib/Target/AMDGPU/GCNProcessors.td
new file mode 100644
index 000000000000..b2a3f652abd8
--- /dev/null
+++ b/lib/Target/AMDGPU/GCNProcessors.td
@@ -0,0 +1,154 @@
+//===-- GCNProcessors.td - GCN Processor definitions ----------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+// The code produced for "generic" is only useful for tests and cannot
+// reasonably be expected to execute on any particular target.
+def : ProcessorModel<"generic", NoSchedModel,
+ [FeatureGCN, FeatureWavefrontSize64]
+>;
+
+//===----------------------------------------------------------------------===//
+// GCN GFX6 (Southern Islands (SI)).
+//===----------------------------------------------------------------------===//
+
+def : ProcessorModel<"gfx600", SIFullSpeedModel,
+ [FeatureISAVersion6_0_0]
+>;
+
+def : ProcessorModel<"tahiti", SIFullSpeedModel,
+ [FeatureISAVersion6_0_0]
+>;
+
+def : ProcessorModel<"gfx601", SIQuarterSpeedModel,
+ [FeatureISAVersion6_0_1]
+>;
+
+def : ProcessorModel<"hainan", SIQuarterSpeedModel,
+ [FeatureISAVersion6_0_1]
+>;
+
+def : ProcessorModel<"oland", SIQuarterSpeedModel,
+ [FeatureISAVersion6_0_1]
+>;
+
+def : ProcessorModel<"pitcairn", SIQuarterSpeedModel,
+ [FeatureISAVersion6_0_1]
+>;
+
+def : ProcessorModel<"verde", SIQuarterSpeedModel,
+ [FeatureISAVersion6_0_1]
+>;
+
+//===----------------------------------------------------------------------===//
+// GCN GFX7 (Sea Islands (CI)).
+//===----------------------------------------------------------------------===//
+
+def : ProcessorModel<"gfx700", SIQuarterSpeedModel,
+ [FeatureISAVersion7_0_0]
+>;
+
+def : ProcessorModel<"kaveri", SIQuarterSpeedModel,
+ [FeatureISAVersion7_0_0]
+>;
+
+def : ProcessorModel<"gfx701", SIFullSpeedModel,
+ [FeatureISAVersion7_0_1]
+>;
+
+def : ProcessorModel<"hawaii", SIFullSpeedModel,
+ [FeatureISAVersion7_0_1]
+>;
+
+def : ProcessorModel<"gfx702", SIQuarterSpeedModel,
+ [FeatureISAVersion7_0_2]
+>;
+
+def : ProcessorModel<"gfx703", SIQuarterSpeedModel,
+ [FeatureISAVersion7_0_3]
+>;
+
+def : ProcessorModel<"kabini", SIQuarterSpeedModel,
+ [FeatureISAVersion7_0_3]
+>;
+
+def : ProcessorModel<"mullins", SIQuarterSpeedModel,
+ [FeatureISAVersion7_0_3]
+>;
+
+def : ProcessorModel<"gfx704", SIQuarterSpeedModel,
+ [FeatureISAVersion7_0_4]
+>;
+
+def : ProcessorModel<"bonaire", SIQuarterSpeedModel,
+ [FeatureISAVersion7_0_4]
+>;
+
+//===----------------------------------------------------------------------===//
+// GCN GFX8 (Volcanic Islands (VI)).
+//===----------------------------------------------------------------------===//
+
+def : ProcessorModel<"gfx800", SIQuarterSpeedModel,
+ [FeatureISAVersion8_0_0]
+>;
+
+def : ProcessorModel<"iceland", SIQuarterSpeedModel,
+ [FeatureISAVersion8_0_0]
+>;
+
+def : ProcessorModel<"gfx801", SIQuarterSpeedModel,
+ [FeatureISAVersion8_0_1]
+>;
+
+def : ProcessorModel<"carrizo", SIQuarterSpeedModel,
+ [FeatureISAVersion8_0_1]
+>;
+
+def : ProcessorModel<"gfx802", SIQuarterSpeedModel,
+ [FeatureISAVersion8_0_2]
+>;
+
+def : ProcessorModel<"tonga", SIQuarterSpeedModel,
+ [FeatureISAVersion8_0_2]
+>;
+
+def : ProcessorModel<"gfx803", SIQuarterSpeedModel,
+ [FeatureISAVersion8_0_3]
+>;
+
+def : ProcessorModel<"fiji", SIQuarterSpeedModel,
+ [FeatureISAVersion8_0_3]
+>;
+
+def : ProcessorModel<"polaris10", SIQuarterSpeedModel,
+ [FeatureISAVersion8_0_3]
+>;
+
+def : ProcessorModel<"polaris11", SIQuarterSpeedModel,
+ [FeatureISAVersion8_0_3]
+>;
+
+def : ProcessorModel<"gfx810", SIQuarterSpeedModel,
+ [FeatureISAVersion8_1_0]
+>;
+
+def : ProcessorModel<"stoney", SIQuarterSpeedModel,
+ [FeatureISAVersion8_1_0]
+>;
+
+//===----------------------------------------------------------------------===//
+// GCN GFX9.
+//===----------------------------------------------------------------------===//
+
+def : ProcessorModel<"gfx900", SIQuarterSpeedModel,
+ [FeatureISAVersion9_0_0]
+>;
+
+def : ProcessorModel<"gfx902", SIQuarterSpeedModel,
+ [FeatureISAVersion9_0_2]
+>;
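Each ProcessorModel line pairs a processor name with an ISA-version feature group and either the full-speed or quarter-speed SI scheduling model. Purely as an illustration of that mapping, here is a hand-written lookup; the entries and field names are a hypothetical subset, not generated from the .td file.

// Illustrative-only mirror of what GCNProcessors.td encodes per processor.
#include <iostream>
#include <map>
#include <string>

struct ProcInfo {
  std::string ISAVersion;   // stands in for the FeatureISAVersionX_Y_Z group
  bool UsesFullSpeedModel;  // SIFullSpeedModel vs. SIQuarterSpeedModel
};

static const std::map<std::string, ProcInfo> &procTable() {
  static const std::map<std::string, ProcInfo> Table = {
      {"tahiti", {"6.0.0", true}},
      {"hawaii", {"7.0.1", true}},
      {"fiji",   {"8.0.3", false}},
      {"gfx900", {"9.0.0", false}},
  };
  return Table;
}

int main() {
  for (const auto &P : procTable())
    std::cout << P.first << " -> gfx" << P.second.ISAVersion
              << (P.second.UsesFullSpeedModel ? " (full-speed model)\n"
                                              : " (quarter-speed model)\n");
  return 0;
}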
diff --git a/lib/Target/AMDGPU/GCNRegPressure.cpp b/lib/Target/AMDGPU/GCNRegPressure.cpp
index 1d02c7fdffbf..992bb7cceb6f 100644
--- a/lib/Target/AMDGPU/GCNRegPressure.cpp
+++ b/lib/Target/AMDGPU/GCNRegPressure.cpp
@@ -1,4 +1,4 @@
-//===------------------------- GCNRegPressure.cpp - -----------------------===//
+//===- GCNRegPressure.cpp -------------------------------------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -6,13 +6,26 @@
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
-//
-/// \file
-//
-//===----------------------------------------------------------------------===//
#include "GCNRegPressure.h"
+#include "AMDGPUSubtarget.h"
+#include "SIRegisterInfo.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/LiveInterval.h"
+#include "llvm/CodeGen/LiveIntervals.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/RegisterPressure.h"
+#include "llvm/CodeGen/SlotIndexes.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/MC/LaneBitmask.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+#include <cassert>
using namespace llvm;
@@ -36,7 +49,7 @@ void llvm::printLivesAt(SlotIndex SI,
for (const auto &S : LI.subranges()) {
if (!S.liveAt(SI)) continue;
if (firstTime) {
- dbgs() << " " << PrintReg(Reg, MRI.getTargetRegisterInfo())
+ dbgs() << " " << printReg(Reg, MRI.getTargetRegisterInfo())
<< '\n';
firstTime = false;
}
@@ -63,7 +76,6 @@ static bool isEqual(const GCNRPTracker::LiveRegSet &S1,
}
return true;
}
-
#endif
///////////////////////////////////////////////////////////////////////////////
@@ -107,7 +119,7 @@ void GCNRegPressure::inc(unsigned Reg,
assert(PrevMask < NewMask);
Value[Kind == SGPR_TUPLE ? SGPR32 : VGPR32] +=
- Sign * countPopulation((~PrevMask & NewMask).getAsInteger());
+ Sign * (~PrevMask & NewMask).getNumLanes();
if (PrevMask.none()) {
assert(NewMask.any());
@@ -177,7 +189,6 @@ void GCNRegPressure::print(raw_ostream &OS, const SISubtarget *ST) const {
}
#endif
-
static LaneBitmask getDefRegMask(const MachineOperand &MO,
const MachineRegisterInfo &MRI) {
assert(MO.isDef() && MO.isReg() &&
@@ -201,7 +212,7 @@ static LaneBitmask getUsedRegMask(const MachineOperand &MO,
return MRI.getTargetRegisterInfo()->getSubRegIndexLaneMask(SubReg);
auto MaxMask = MRI.getMaxLaneMaskForVReg(MO.getReg());
- if (MaxMask.getAsInteger() == 1) // cannot have subregs
+ if (MaxMask == LaneBitmask::getLane(0)) // cannot have subregs
return MaxMask;
// For a tentative schedule LIS isn't updated yet but livemask should remain
@@ -430,12 +441,12 @@ static void reportMismatch(const GCNRPTracker::LiveRegSet &LISLR,
for (auto const &P : TrackedLR) {
auto I = LISLR.find(P.first);
if (I == LISLR.end()) {
- dbgs() << " " << PrintReg(P.first, TRI)
+ dbgs() << " " << printReg(P.first, TRI)
<< ":L" << PrintLaneMask(P.second)
<< " isn't found in LIS reported set\n";
}
else if (I->second != P.second) {
- dbgs() << " " << PrintReg(P.first, TRI)
+ dbgs() << " " << printReg(P.first, TRI)
<< " masks doesn't match: LIS reported "
<< PrintLaneMask(I->second)
<< ", tracked "
@@ -446,7 +457,7 @@ static void reportMismatch(const GCNRPTracker::LiveRegSet &LISLR,
for (auto const &P : LISLR) {
auto I = TrackedLR.find(P.first);
if (I == TrackedLR.end()) {
- dbgs() << " " << PrintReg(P.first, TRI)
+ dbgs() << " " << printReg(P.first, TRI)
<< ":L" << PrintLaneMask(P.second)
<< " isn't found in tracked set\n";
}
@@ -484,7 +495,7 @@ void GCNRPTracker::printLiveRegs(raw_ostream &OS, const LiveRegSet& LiveRegs,
unsigned Reg = TargetRegisterInfo::index2VirtReg(I);
auto It = LiveRegs.find(Reg);
if (It != LiveRegs.end() && It->second.any())
- OS << ' ' << PrintVRegOrUnit(Reg, TRI) << ':'
+ OS << ' ' << printVRegOrUnit(Reg, TRI) << ':'
<< PrintLaneMask(It->second);
}
OS << '\n';
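The inc() change above switches from countPopulation(getAsInteger()) to LaneBitmask::getNumLanes(); either way, the pressure delta is the popcount of the lanes newly covered by the mask. A standalone sketch with a plain 32-bit integer standing in for llvm::LaneBitmask:

// Pressure rises by the number of newly covered lanes: popcount(~Prev & New).
#include <bitset>
#include <cstdint>
#include <iostream>

using LaneMask = uint32_t; // stand-in for llvm::LaneBitmask

static unsigned numLanes(LaneMask M) {
  return static_cast<unsigned>(std::bitset<32>(M).count());
}

static int pressureDelta(LaneMask PrevMask, LaneMask NewMask, int Sign) {
  // Only lanes present in NewMask but not in PrevMask change the pressure.
  return Sign * static_cast<int>(numLanes(~PrevMask & NewMask));
}

int main() {
  // Lanes 0-1 were live, lanes 0-3 become live: two newly covered lanes.
  std::cout << pressureDelta(0x3, 0xF, +1) << '\n'; // 2
  // Symmetric decrement when the change is undone (Sign = -1).
  std::cout << pressureDelta(0x3, 0xF, -1) << '\n'; // -2
  return 0;
}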
diff --git a/lib/Target/AMDGPU/GCNRegPressure.h b/lib/Target/AMDGPU/GCNRegPressure.h
index 5dfe44053e72..e418aa0fe911 100644
--- a/lib/Target/AMDGPU/GCNRegPressure.h
+++ b/lib/Target/AMDGPU/GCNRegPressure.h
@@ -1,4 +1,4 @@
-//===---------------------- GCNRegPressure.h -*- C++ -*--------------------===//
+//===- GCNRegPressure.h -----------------------------------------*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
@@ -6,20 +6,26 @@
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
-//
-/// \file
-//
-//===----------------------------------------------------------------------===//
#ifndef LLVM_LIB_TARGET_AMDGPU_GCNREGPRESSURE_H
#define LLVM_LIB_TARGET_AMDGPU_GCNREGPRESSURE_H
#include "AMDGPUSubtarget.h"
-
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/CodeGen/LiveIntervals.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/SlotIndexes.h"
+#include "llvm/MC/LaneBitmask.h"
+#include "llvm/Support/Debug.h"
+#include <algorithm>
#include <limits>
namespace llvm {
+class MachineRegisterInfo;
+class raw_ostream;
+
struct GCNRegPressure {
enum RegKind {
SGPR32,
@@ -68,7 +74,7 @@ struct GCNRegPressure {
return !(*this == O);
}
- void print(raw_ostream &OS, const SISubtarget *ST=nullptr) const;
+ void print(raw_ostream &OS, const SISubtarget *ST = nullptr) const;
void dump() const { print(dbgs()); }
private:
@@ -89,7 +95,7 @@ inline GCNRegPressure max(const GCNRegPressure &P1, const GCNRegPressure &P2) {
class GCNRPTracker {
public:
- typedef DenseMap<unsigned, LaneBitmask> LiveRegSet;
+ using LiveRegSet = DenseMap<unsigned, LaneBitmask>;
protected:
const LiveIntervals &LIS;
@@ -97,7 +103,9 @@ protected:
GCNRegPressure CurPressure, MaxPressure;
const MachineInstr *LastTrackedMI = nullptr;
mutable const MachineRegisterInfo *MRI = nullptr;
+
GCNRPTracker(const LiveIntervals &LIS_) : LIS(LIS_) {}
+
public:
// live regs for the current state
const decltype(LiveRegs) &getLiveRegs() const { return LiveRegs; }
@@ -111,9 +119,11 @@ public:
MaxPressure.clear();
return Res;
}
+
decltype(LiveRegs) moveLiveRegs() {
return std::move(LiveRegs);
}
+
static void printLiveRegs(raw_ostream &OS, const LiveRegSet& LiveRegs,
const MachineRegisterInfo &MRI);
};
@@ -121,6 +131,7 @@ public:
class GCNUpwardRPTracker : public GCNRPTracker {
public:
GCNUpwardRPTracker(const LiveIntervals &LIS_) : GCNRPTracker(LIS_) {}
+
// reset tracker to the point just below MI
// filling live regs upon this point using LIS
void reset(const MachineInstr &MI, const LiveRegSet *LiveRegs = nullptr);
@@ -202,6 +213,6 @@ void printLivesAt(SlotIndex SI,
const LiveIntervals &LIS,
const MachineRegisterInfo &MRI);
-} // End namespace llvm
+} // end namespace llvm
#endif // LLVM_LIB_TARGET_AMDGPU_GCNREGPRESSURE_H
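GCNRPTracker keys its LiveRegSet by virtual register and stores the currently live lane mask per register. A minimal sketch of that bookkeeping, with standard-library stand-ins for DenseMap and LaneBitmask:

// Map from virtual register to live lane mask: making a subregister live
// ORs its lanes in, and a register with no live lanes is erased.
#include <cstdint>
#include <iostream>
#include <unordered_map>

using LiveRegSet = std::unordered_map<unsigned, uint32_t>; // vreg -> lane mask

static void addLiveLanes(LiveRegSet &LR, unsigned Reg, uint32_t Lanes) {
  LR[Reg] |= Lanes;
}

static void removeLiveLanes(LiveRegSet &LR, unsigned Reg, uint32_t Lanes) {
  auto It = LR.find(Reg);
  if (It == LR.end())
    return;
  It->second &= ~Lanes;
  if (It->second == 0)
    LR.erase(It); // no live lanes left: the register is dead here
}

int main() {
  LiveRegSet LR;
  addLiveLanes(LR, /*Reg=*/42, 0x3); // two low lanes live
  addLiveLanes(LR, 42, 0x4);         // a third lane becomes live
  removeLiveLanes(LR, 42, 0x7);      // all lanes die
  std::cout << "live vregs: " << LR.size() << '\n'; // 0
  return 0;
}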
diff --git a/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index 155b400ba022..d414b899050a 100644
--- a/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -37,7 +37,7 @@ static unsigned getMaxWaves(unsigned SGPRs, unsigned VGPRs,
ST.getOccupancyWithNumVGPRs(VGPRs));
return std::min(MinRegOccupancy,
ST.getOccupancyWithLocalMemSize(MFI->getLDSSize(),
- *MF.getFunction()));
+ MF.getFunction()));
}
void GCNMaxOccupancySchedStrategy::initialize(ScheduleDAGMI *DAG) {
@@ -315,7 +315,7 @@ GCNScheduleDAGMILive::GCNScheduleDAGMILive(MachineSchedContext *C,
ST(MF.getSubtarget<SISubtarget>()),
MFI(*MF.getInfo<SIMachineFunctionInfo>()),
StartingOccupancy(ST.getOccupancyWithLocalMemSize(MFI.getLDSSize(),
- *MF.getFunction())),
+ MF.getFunction())),
MinOccupancy(StartingOccupancy), Stage(0), RegionIdx(0) {
DEBUG(dbgs() << "Starting occupancy is " << StartingOccupancy << ".\n");
@@ -330,8 +330,9 @@ void GCNScheduleDAGMILive::schedule() {
std::vector<MachineInstr*> Unsched;
Unsched.reserve(NumRegionInstrs);
- for (auto &I : *this)
+ for (auto &I : *this) {
Unsched.push_back(&I);
+ }
GCNRegPressure PressureBefore;
if (LIS) {
@@ -387,10 +388,14 @@ void GCNScheduleDAGMILive::schedule() {
DEBUG(dbgs() << "Attempting to revert scheduling.\n");
RegionEnd = RegionBegin;
for (MachineInstr *MI : Unsched) {
+ if (MI->isDebugValue())
+ continue;
+
if (MI->getIterator() != RegionEnd) {
BB->remove(MI);
BB->insert(RegionEnd, MI);
- LIS->handleMove(*MI, true);
+ if (!MI->isDebugValue())
+ LIS->handleMove(*MI, true);
}
// Reset read-undef flags and update them later.
for (auto &Op : MI->operands())
@@ -398,13 +403,15 @@ void GCNScheduleDAGMILive::schedule() {
Op.setIsUndef(false);
RegisterOperands RegOpers;
RegOpers.collect(*MI, *TRI, MRI, ShouldTrackLaneMasks, false);
- if (ShouldTrackLaneMasks) {
- // Adjust liveness and add missing dead+read-undef flags.
- SlotIndex SlotIdx = LIS->getInstructionIndex(*MI).getRegSlot();
- RegOpers.adjustLaneLiveness(*LIS, MRI, SlotIdx, MI);
- } else {
- // Adjust for missing dead-def flags.
- RegOpers.detectDeadDefs(*MI, *LIS);
+ if (!MI->isDebugValue()) {
+ if (ShouldTrackLaneMasks) {
+ // Adjust liveness and add missing dead+read-undef flags.
+ SlotIndex SlotIdx = LIS->getInstructionIndex(*MI).getRegSlot();
+ RegOpers.adjustLaneLiveness(*LIS, MRI, SlotIdx, MI);
+ } else {
+ // Adjust for missing dead-def flags.
+ RegOpers.detectDeadDefs(*MI, *LIS);
+ }
}
RegionEnd = MI->getIterator();
++RegionEnd;
@@ -531,9 +538,8 @@ void GCNScheduleDAGMILive::finalizeSchedule() {
}
DEBUG(dbgs() << "********** MI Scheduling **********\n");
- DEBUG(dbgs() << MF.getName()
- << ":BB#" << MBB->getNumber() << " " << MBB->getName()
- << "\n From: " << *begin() << " To: ";
+ DEBUG(dbgs() << MF.getName() << ":" << printMBBReference(*MBB) << " "
+ << MBB->getName() << "\n From: " << *begin() << " To: ";
if (RegionEnd != MBB->end()) dbgs() << *RegionEnd;
else dbgs() << "End";
dbgs() << " RegionInstrs: " << NumRegionInstrs << '\n');
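The revert path above now skips DBG_VALUE instructions so they are neither moved nor fed to LiveIntervals. A simplified, standalone sketch of that revert loop; Instr and the containers are stand-ins for MachineInstr and the block's instruction list:

// Walk the saved pre-scheduling order and splice instructions back into
// place, leaving debug values where they currently sit.
#include <algorithm>
#include <iostream>
#include <iterator>
#include <list>
#include <string>
#include <vector>

struct Instr {
  std::string Name;
  bool IsDebugValue;
};

static void revertOrder(std::list<Instr *> &Block,
                        const std::vector<Instr *> &Unsched) {
  auto RegionEnd = Block.begin(); // rebuild the region from its start
  for (Instr *I : Unsched) {
    if (I->IsDebugValue)
      continue; // DBG_VALUEs are not moved
    auto It = std::find(Block.begin(), Block.end(), I);
    if (It != RegionEnd)
      Block.splice(RegionEnd, Block, It); // move I right before RegionEnd
    RegionEnd = std::next(It);            // splice keeps It pointing at I
  }
}

int main() {
  Instr A{"a", false}, D{"dbg", true}, B{"b", false};
  std::list<Instr *> Block = {&B, &D, &A};     // tentative schedule
  std::vector<Instr *> Unsched = {&A, &D, &B}; // original order
  revertOrder(Block, Unsched);
  for (Instr *I : Block)
    std::cout << I->Name << ' '; // prints: a b dbg
  std::cout << '\n';
  return 0;
}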
diff --git a/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp b/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp
index a844081db5b2..67663d39967c 100644
--- a/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp
+++ b/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp
@@ -72,9 +72,9 @@ void AMDGPUInstPrinter::printU16ImmDecOperand(const MCInst *MI, unsigned OpNo,
O << formatDec(MI->getOperand(OpNo).getImm() & 0xffff);
}
-void AMDGPUInstPrinter::printS16ImmDecOperand(const MCInst *MI, unsigned OpNo,
+void AMDGPUInstPrinter::printS13ImmDecOperand(const MCInst *MI, unsigned OpNo,
raw_ostream &O) {
- O << formatDec(static_cast<int16_t>(MI->getOperand(OpNo).getImm()));
+ O << formatDec(SignExtend32<13>(MI->getOperand(OpNo).getImm()));
}
void AMDGPUInstPrinter::printU32ImmOperand(const MCInst *MI, unsigned OpNo,
@@ -129,7 +129,7 @@ void AMDGPUInstPrinter::printOffsetS13(const MCInst *MI, unsigned OpNo,
uint16_t Imm = MI->getOperand(OpNo).getImm();
if (Imm != 0) {
O << ((OpNo == 0)? "offset:" : " offset:");
- printS16ImmDecOperand(MI, OpNo, O);
+ printS13ImmDecOperand(MI, OpNo, O);
}
}
@@ -344,16 +344,6 @@ void AMDGPUInstPrinter::printRegOperand(unsigned RegNo, raw_ostream &O,
} else if (MRI.getRegClass(AMDGPU::SReg_512RegClassID).contains(RegNo)) {
O << 's';
NumRegs = 16;
- } else if (MRI.getRegClass(AMDGPU::TTMP_64RegClassID).contains(RegNo)) {
- O << "ttmp";
- NumRegs = 2;
- // Trap temps start at offset 112. TODO: Get this from tablegen.
- RegIdx -= 112;
- } else if (MRI.getRegClass(AMDGPU::TTMP_128RegClassID).contains(RegNo)) {
- O << "ttmp";
- NumRegs = 4;
- // Trap temps start at offset 112. TODO: Get this from tablegen.
- RegIdx -= 112;
} else {
O << getRegisterName(RegNo);
return;
@@ -496,6 +486,11 @@ void AMDGPUInstPrinter::printImmediate64(uint64_t Imm,
void AMDGPUInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI,
raw_ostream &O) {
+ if (!STI.getFeatureBits()[AMDGPU::FeatureGCN]) {
+ static_cast<R600InstPrinter*>(this)->printOperand(MI, OpNo, O);
+ return;
+ }
+
if (OpNo >= MI->getNumOperands()) {
O << "/*Missing OP" << OpNo << "*/";
return;
@@ -503,15 +498,7 @@ void AMDGPUInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
const MCOperand &Op = MI->getOperand(OpNo);
if (Op.isReg()) {
- switch (Op.getReg()) {
- // This is the default predicate state, so we don't need to print it.
- case AMDGPU::PRED_SEL_OFF:
- break;
-
- default:
- printRegOperand(Op.getReg(), O, MRI);
- break;
- }
+ printRegOperand(Op.getReg(), O, MRI);
} else if (Op.isImm()) {
const MCInstrDesc &Desc = MII.get(MI->getOpcode());
switch (Desc.OpInfo[OpNo].OperandType) {
@@ -808,19 +795,25 @@ void AMDGPUInstPrinter::printExpTgt(const MCInst *MI, unsigned OpNo,
}
}
-static bool allOpsDefaultValue(const int* Ops, int NumOps, int Mod) {
- int DefaultValue = (Mod == SISrcMods::OP_SEL_1);
+static bool allOpsDefaultValue(const int* Ops, int NumOps, int Mod,
+ bool IsPacked, bool HasDstSel) {
+ int DefaultValue = IsPacked && (Mod == SISrcMods::OP_SEL_1);
for (int I = 0; I < NumOps; ++I) {
if (!!(Ops[I] & Mod) != DefaultValue)
return false;
}
+ if (HasDstSel && (Ops[0] & SISrcMods::DST_OP_SEL) != 0)
+ return false;
+
return true;
}
-static void printPackedModifier(const MCInst *MI, StringRef Name, unsigned Mod,
- raw_ostream &O) {
+void AMDGPUInstPrinter::printPackedModifier(const MCInst *MI,
+ StringRef Name,
+ unsigned Mod,
+ raw_ostream &O) {
unsigned Opc = MI->getOpcode();
int NumOps = 0;
int Ops[3];
@@ -835,7 +828,15 @@ static void printPackedModifier(const MCInst *MI, StringRef Name, unsigned Mod,
Ops[NumOps++] = MI->getOperand(Idx).getImm();
}
- if (allOpsDefaultValue(Ops, NumOps, Mod))
+ const bool HasDstSel =
+ NumOps > 0 &&
+ Mod == SISrcMods::OP_SEL_0 &&
+ MII.get(MI->getOpcode()).TSFlags & SIInstrFlags::VOP3_OPSEL;
+
+ const bool IsPacked =
+ MII.get(MI->getOpcode()).TSFlags & SIInstrFlags::IsPacked;
+
+ if (allOpsDefaultValue(Ops, NumOps, Mod, IsPacked, HasDstSel))
return;
O << Name;
@@ -846,6 +847,10 @@ static void printPackedModifier(const MCInst *MI, StringRef Name, unsigned Mod,
O << !!(Ops[I] & Mod);
}
+ if (HasDstSel) {
+ O << ',' << !!(Ops[0] & SISrcMods::DST_OP_SEL);
+ }
+
O << ']';
}
@@ -931,6 +936,11 @@ void AMDGPUInstPrinter::printVGPRIndexMode(const MCInst *MI, unsigned OpNo,
void AMDGPUInstPrinter::printMemOperand(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI,
raw_ostream &O) {
+ if (!STI.getFeatureBits()[AMDGPU::FeatureGCN]) {
+ static_cast<R600InstPrinter*>(this)->printMemOperand(MI, OpNo, O);
+ return;
+ }
+
printOperand(MI, OpNo, STI, O);
O << ", ";
printOperand(MI, OpNo + 1, STI, O);
@@ -958,12 +968,19 @@ void AMDGPUInstPrinter::printIfSet(const MCInst *MI, unsigned OpNo,
void AMDGPUInstPrinter::printAbs(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI, raw_ostream &O) {
- printIfSet(MI, OpNo, O, '|');
+ static_cast<R600InstPrinter*>(this)->printAbs(MI, OpNo, O);
}
void AMDGPUInstPrinter::printClamp(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI, raw_ostream &O) {
- printIfSet(MI, OpNo, O, "_SAT");
+ static_cast<R600InstPrinter*>(this)->printClamp(MI, OpNo, O);
+}
+
+void AMDGPUInstPrinter::printHigh(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI,
+ raw_ostream &O) {
+ if (MI->getOperand(OpNo).getImm())
+ O << " high";
}
void AMDGPUInstPrinter::printClampSI(const MCInst *MI, unsigned OpNo,
@@ -988,172 +1005,65 @@ void AMDGPUInstPrinter::printOModSI(const MCInst *MI, unsigned OpNo,
void AMDGPUInstPrinter::printLiteral(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI,
raw_ostream &O) {
- const MCOperand &Op = MI->getOperand(OpNo);
- assert(Op.isImm() || Op.isExpr());
- if (Op.isImm()) {
- int64_t Imm = Op.getImm();
- O << Imm << '(' << BitsToFloat(Imm) << ')';
- }
- if (Op.isExpr()) {
- Op.getExpr()->print(O << '@', &MAI);
- }
+ static_cast<R600InstPrinter*>(this)->printLiteral(MI, OpNo, O);
}
void AMDGPUInstPrinter::printLast(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI, raw_ostream &O) {
- printIfSet(MI, OpNo, O, "*", " ");
+ static_cast<R600InstPrinter*>(this)->printLast(MI, OpNo, O);
}
void AMDGPUInstPrinter::printNeg(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI, raw_ostream &O) {
- printIfSet(MI, OpNo, O, '-');
+ static_cast<R600InstPrinter*>(this)->printNeg(MI, OpNo, O);
}
void AMDGPUInstPrinter::printOMOD(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI, raw_ostream &O) {
- switch (MI->getOperand(OpNo).getImm()) {
- default: break;
- case 1:
- O << " * 2.0";
- break;
- case 2:
- O << " * 4.0";
- break;
- case 3:
- O << " / 2.0";
- break;
- }
+ static_cast<R600InstPrinter*>(this)->printOMOD(MI, OpNo, O);
}
void AMDGPUInstPrinter::printRel(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI, raw_ostream &O) {
- printIfSet(MI, OpNo, O, '+');
+ static_cast<R600InstPrinter*>(this)->printRel(MI, OpNo, O);
}
void AMDGPUInstPrinter::printUpdateExecMask(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI,
raw_ostream &O) {
- printIfSet(MI, OpNo, O, "ExecMask,");
+ static_cast<R600InstPrinter*>(this)->printUpdateExecMask(MI, OpNo, O);
}
void AMDGPUInstPrinter::printUpdatePred(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI,
raw_ostream &O) {
- printIfSet(MI, OpNo, O, "Pred,");
+ static_cast<R600InstPrinter*>(this)->printUpdatePred(MI, OpNo, O);
}
void AMDGPUInstPrinter::printWrite(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI, raw_ostream &O) {
- const MCOperand &Op = MI->getOperand(OpNo);
- if (Op.getImm() == 0) {
- O << " (MASKED)";
- }
-}
-
-void AMDGPUInstPrinter::printSel(const MCInst *MI, unsigned OpNo,
- raw_ostream &O) {
- const char * chans = "XYZW";
- int sel = MI->getOperand(OpNo).getImm();
-
- int chan = sel & 3;
- sel >>= 2;
-
- if (sel >= 512) {
- sel -= 512;
- int cb = sel >> 12;
- sel &= 4095;
- O << cb << '[' << sel << ']';
- } else if (sel >= 448) {
- sel -= 448;
- O << sel;
- } else if (sel >= 0){
- O << sel;
- }
-
- if (sel >= 0)
- O << '.' << chans[chan];
+ static_cast<R600InstPrinter*>(this)->printWrite(MI, OpNo, O);
}
void AMDGPUInstPrinter::printBankSwizzle(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI,
raw_ostream &O) {
- int BankSwizzle = MI->getOperand(OpNo).getImm();
- switch (BankSwizzle) {
- case 1:
- O << "BS:VEC_021/SCL_122";
- break;
- case 2:
- O << "BS:VEC_120/SCL_212";
- break;
- case 3:
- O << "BS:VEC_102/SCL_221";
- break;
- case 4:
- O << "BS:VEC_201";
- break;
- case 5:
- O << "BS:VEC_210";
- break;
- default:
- break;
- }
+ static_cast<R600InstPrinter*>(this)->printBankSwizzle(MI, OpNo, O);
}
void AMDGPUInstPrinter::printRSel(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI, raw_ostream &O) {
- unsigned Sel = MI->getOperand(OpNo).getImm();
- switch (Sel) {
- case 0:
- O << 'X';
- break;
- case 1:
- O << 'Y';
- break;
- case 2:
- O << 'Z';
- break;
- case 3:
- O << 'W';
- break;
- case 4:
- O << '0';
- break;
- case 5:
- O << '1';
- break;
- case 7:
- O << '_';
- break;
- default:
- break;
- }
+ static_cast<R600InstPrinter*>(this)->printRSel(MI, OpNo, O);
}
void AMDGPUInstPrinter::printCT(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI, raw_ostream &O) {
- unsigned CT = MI->getOperand(OpNo).getImm();
- switch (CT) {
- case 0:
- O << 'U';
- break;
- case 1:
- O << 'N';
- break;
- default:
- break;
- }
+ static_cast<R600InstPrinter*>(this)->printCT(MI, OpNo, O);
}
void AMDGPUInstPrinter::printKCache(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI, raw_ostream &O) {
- int KCacheMode = MI->getOperand(OpNo).getImm();
- if (KCacheMode > 0) {
- int KCacheBank = MI->getOperand(OpNo - 2).getImm();
- O << "CB" << KCacheBank << ':';
- int KCacheAddr = MI->getOperand(OpNo + 2).getImm();
- int LineSize = (KCacheMode == 1) ? 16 : 32;
- O << KCacheAddr * 16 << '-' << KCacheAddr * 16 + LineSize;
- }
+ static_cast<R600InstPrinter*>(this)->printKCache(MI, OpNo, O);
}
void AMDGPUInstPrinter::printSendMsg(const MCInst *MI, unsigned OpNo,
@@ -1356,3 +1266,198 @@ void AMDGPUInstPrinter::printHwreg(const MCInst *MI, unsigned OpNo,
}
#include "AMDGPUGenAsmWriter.inc"
+
+void R600InstPrinter::printAbs(const MCInst *MI, unsigned OpNo,
+ raw_ostream &O) {
+ AMDGPUInstPrinter::printIfSet(MI, OpNo, O, '|');
+}
+
+void R600InstPrinter::printBankSwizzle(const MCInst *MI, unsigned OpNo,
+ raw_ostream &O) {
+ int BankSwizzle = MI->getOperand(OpNo).getImm();
+ switch (BankSwizzle) {
+ case 1:
+ O << "BS:VEC_021/SCL_122";
+ break;
+ case 2:
+ O << "BS:VEC_120/SCL_212";
+ break;
+ case 3:
+ O << "BS:VEC_102/SCL_221";
+ break;
+ case 4:
+ O << "BS:VEC_201";
+ break;
+ case 5:
+ O << "BS:VEC_210";
+ break;
+ default:
+ break;
+ }
+}
+
+void R600InstPrinter::printClamp(const MCInst *MI, unsigned OpNo,
+ raw_ostream &O) {
+ AMDGPUInstPrinter::printIfSet(MI, OpNo, O, "_SAT");
+}
+
+void R600InstPrinter::printCT(const MCInst *MI, unsigned OpNo,
+ raw_ostream &O) {
+ unsigned CT = MI->getOperand(OpNo).getImm();
+ switch (CT) {
+ case 0:
+ O << 'U';
+ break;
+ case 1:
+ O << 'N';
+ break;
+ default:
+ break;
+ }
+}
+
+void R600InstPrinter::printKCache(const MCInst *MI, unsigned OpNo,
+ raw_ostream &O) {
+ int KCacheMode = MI->getOperand(OpNo).getImm();
+ if (KCacheMode > 0) {
+ int KCacheBank = MI->getOperand(OpNo - 2).getImm();
+ O << "CB" << KCacheBank << ':';
+ int KCacheAddr = MI->getOperand(OpNo + 2).getImm();
+ int LineSize = (KCacheMode == 1) ? 16 : 32;
+ O << KCacheAddr * 16 << '-' << KCacheAddr * 16 + LineSize;
+ }
+}
+
+void R600InstPrinter::printLast(const MCInst *MI, unsigned OpNo,
+ raw_ostream &O) {
+ AMDGPUInstPrinter::printIfSet(MI, OpNo, O, "*", " ");
+}
+
+void R600InstPrinter::printLiteral(const MCInst *MI, unsigned OpNo,
+ raw_ostream &O) {
+ const MCOperand &Op = MI->getOperand(OpNo);
+ assert(Op.isImm() || Op.isExpr());
+ if (Op.isImm()) {
+ int64_t Imm = Op.getImm();
+ O << Imm << '(' << BitsToFloat(Imm) << ')';
+ }
+ if (Op.isExpr()) {
+ Op.getExpr()->print(O << '@', &MAI);
+ }
+}
+
+void R600InstPrinter::printNeg(const MCInst *MI, unsigned OpNo,
+ raw_ostream &O) {
+ AMDGPUInstPrinter::printIfSet(MI, OpNo, O, '-');
+}
+
+void R600InstPrinter::printOMOD(const MCInst *MI, unsigned OpNo,
+ raw_ostream &O) {
+ switch (MI->getOperand(OpNo).getImm()) {
+ default: break;
+ case 1:
+ O << " * 2.0";
+ break;
+ case 2:
+ O << " * 4.0";
+ break;
+ case 3:
+ O << " / 2.0";
+ break;
+ }
+}
+
+void R600InstPrinter::printMemOperand(const MCInst *MI, unsigned OpNo,
+ raw_ostream &O) {
+ printOperand(MI, OpNo, O);
+ O << ", ";
+ printOperand(MI, OpNo + 1, O);
+}
+
+void R600InstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
+ raw_ostream &O) {
+ if (OpNo >= MI->getNumOperands()) {
+ O << "/*Missing OP" << OpNo << "*/";
+ return;
+ }
+
+ const MCOperand &Op = MI->getOperand(OpNo);
+ if (Op.isReg()) {
+ switch (Op.getReg()) {
+ // This is the default predicate state, so we don't need to print it.
+ case AMDGPU::PRED_SEL_OFF:
+ break;
+
+ default:
+ O << getRegisterName(Op.getReg());
+ break;
+ }
+ } else if (Op.isImm()) {
+ O << Op.getImm();
+ } else if (Op.isFPImm()) {
+ // We special case 0.0 because otherwise it will be printed as an integer.
+ if (Op.getFPImm() == 0.0)
+ O << "0.0";
+ else {
+ O << Op.getFPImm();
+ }
+ } else if (Op.isExpr()) {
+ const MCExpr *Exp = Op.getExpr();
+ Exp->print(O, &MAI);
+ } else {
+ O << "/*INV_OP*/";
+ }
+}
+
+void R600InstPrinter::printRel(const MCInst *MI, unsigned OpNo,
+ raw_ostream &O) {
+ AMDGPUInstPrinter::printIfSet(MI, OpNo, O, '+');
+}
+
+void R600InstPrinter::printRSel(const MCInst *MI, unsigned OpNo,
+ raw_ostream &O) {
+ unsigned Sel = MI->getOperand(OpNo).getImm();
+ switch (Sel) {
+ case 0:
+ O << 'X';
+ break;
+ case 1:
+ O << 'Y';
+ break;
+ case 2:
+ O << 'Z';
+ break;
+ case 3:
+ O << 'W';
+ break;
+ case 4:
+ O << '0';
+ break;
+ case 5:
+ O << '1';
+ break;
+ case 7:
+ O << '_';
+ break;
+ default:
+ break;
+ }
+}
+
+void R600InstPrinter::printUpdateExecMask(const MCInst *MI, unsigned OpNo,
+ raw_ostream &O) {
+ AMDGPUInstPrinter::printIfSet(MI, OpNo, O, "ExecMask,");
+}
+
+void R600InstPrinter::printUpdatePred(const MCInst *MI, unsigned OpNo,
+ raw_ostream &O) {
+ AMDGPUInstPrinter::printIfSet(MI, OpNo, O, "Pred,");
+}
+
+void R600InstPrinter::printWrite(const MCInst *MI, unsigned OpNo,
+ raw_ostream &O) {
+ const MCOperand &Op = MI->getOperand(OpNo);
+ if (Op.getImm() == 0) {
+ O << " (MASKED)";
+ }
+}
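Several printers above now test a subtarget feature bit and, for non-GCN targets, hand the work off to the R600 printing path (the in-tree code does this by casting to R600InstPrinter). A minimal standalone illustration of the same feature-gated dispatch, with a plain bool standing in for the FeatureBits check and simplified print routines:

// Feature-gated dispatch: the common printer forwards non-GCN operands to
// an R600-specific routine.
#include <iostream>
#include <string>

struct Printer {
  bool HasGCNFeature; // stand-in for STI.getFeatureBits()[FeatureGCN]

  void printOperand(const std::string &Op, std::ostream &OS) const {
    if (!HasGCNFeature) {
      printR600Operand(Op, OS); // delegate the pre-GCN (R600) syntax
      return;
    }
    OS << "gcn:" << Op;
  }

  void printR600Operand(const std::string &Op, std::ostream &OS) const {
    OS << "r600:" << Op;
  }
};

int main() {
  Printer GCN{true}, R600{false};
  GCN.printOperand("v0", std::cout);
  std::cout << '\n';
  R600.printOperand("T0.X", std::cout);
  std::cout << '\n';
  return 0;
}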
diff --git a/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h b/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h
index 7bbf99a85f40..d97f04689e18 100644
--- a/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h
+++ b/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h
@@ -19,8 +19,8 @@ namespace llvm {
class AMDGPUInstPrinter : public MCInstPrinter {
public:
- AMDGPUInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII,
- const MCRegisterInfo &MRI)
+ AMDGPUInstPrinter(const MCAsmInfo &MAI,
+ const MCInstrInfo &MII, const MCRegisterInfo &MRI)
: MCInstPrinter(MAI, MII, MRI) {}
//Autogenerated by tblgen
@@ -42,7 +42,7 @@ private:
void printU4ImmDecOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
void printU8ImmDecOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
void printU16ImmDecOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
- void printS16ImmDecOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ void printS13ImmDecOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
void printU32ImmOperand(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI, raw_ostream &O);
void printNamedBit(const MCInst *MI, unsigned OpNo, raw_ostream &O,
@@ -127,6 +127,8 @@ private:
const MCSubtargetInfo &STI, raw_ostream &O);
void printSDWADstUnused(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI, raw_ostream &O);
+ void printPackedModifier(const MCInst *MI, StringRef Name, unsigned Mod,
+ raw_ostream &O);
void printOpSel(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI, raw_ostream &O);
void printOpSelHi(const MCInst *MI, unsigned OpNo,
@@ -162,12 +164,16 @@ private:
void printExpTgt(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI, raw_ostream &O);
+public:
static void printIfSet(const MCInst *MI, unsigned OpNo, raw_ostream &O,
StringRef Asm, StringRef Default = "");
static void printIfSet(const MCInst *MI, unsigned OpNo, raw_ostream &O,
char Asm);
+protected:
void printAbs(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
raw_ostream &O);
+ void printHigh(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
+ raw_ostream &O);
void printClamp(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
raw_ostream &O);
void printClampSI(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
@@ -190,7 +196,6 @@ private:
const MCSubtargetInfo &STI, raw_ostream &O);
void printWrite(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
raw_ostream &O);
- void printSel(const MCInst *MI, unsigned OpNo, raw_ostream &O);
void printBankSwizzle(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI, raw_ostream &O);
void printRSel(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
@@ -209,6 +214,32 @@ private:
raw_ostream &O);
};
+// FIXME: R600-specific parts of AMDGPUInstPrinter should be moved here, and
+// MCTargetDesc should be using R600InstPrinter for the R600 target.
+class R600InstPrinter : public AMDGPUInstPrinter {
+public:
+ R600InstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII,
+ const MCRegisterInfo &MRI)
+ : AMDGPUInstPrinter(MAI, MII, MRI) {}
+
+ void printAbs(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ void printBankSwizzle(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ void printClamp(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ void printCT(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ void printKCache(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ void printLast(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ void printLiteral(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ void printMemOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ void printNeg(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ void printOMOD(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ void printRel(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ void printRSel(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ void printUpdateExecMask(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ void printUpdatePred(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ void printWrite(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+};
+
} // End namespace llvm
#endif
diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp
index a50e3eb8d9ce..778d4a7ba9d0 100644
--- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp
+++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp
@@ -11,6 +11,7 @@
#include "MCTargetDesc/AMDGPUFixupKinds.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/StringRef.h"
+#include "llvm/BinaryFormat/ELF.h"
#include "llvm/MC/MCAsmBackend.h"
#include "llvm/MC/MCAssembler.h"
#include "llvm/MC/MCContext.h"
@@ -43,6 +44,8 @@ public:
llvm_unreachable("Not implemented");
}
bool mayNeedRelaxation(const MCInst &Inst) const override { return false; }
+
+ unsigned getMinimumNopSize() const override;
bool writeNopData(uint64_t Count, MCObjectWriter *OW) const override;
const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const override;
@@ -76,7 +79,7 @@ static uint64_t adjustFixupValue(const MCFixup &Fixup, uint64_t Value,
MCContext *Ctx) {
int64_t SignedValue = static_cast<int64_t>(Value);
- switch (Fixup.getKind()) {
+ switch (static_cast<unsigned>(Fixup.getKind())) {
case AMDGPU::fixup_si_sopp_br: {
int64_t BrImm = (SignedValue - 4) / 4;
@@ -133,6 +136,10 @@ const MCFixupKindInfo &AMDGPUAsmBackend::getFixupKindInfo(
return Infos[Kind - FirstTargetFixupKind];
}
+unsigned AMDGPUAsmBackend::getMinimumNopSize() const {
+ return 4;
+}
+
bool AMDGPUAsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const {
// If the count is not 4-byte aligned, we must be writing data into the text
// section (otherwise we have unaligned instructions, and thus have far
@@ -161,14 +168,30 @@ namespace {
class ELFAMDGPUAsmBackend : public AMDGPUAsmBackend {
bool Is64Bit;
bool HasRelocationAddend;
+ uint8_t OSABI = ELF::ELFOSABI_NONE;
public:
ELFAMDGPUAsmBackend(const Target &T, const Triple &TT) :
AMDGPUAsmBackend(T), Is64Bit(TT.getArch() == Triple::amdgcn),
- HasRelocationAddend(TT.getOS() == Triple::AMDHSA) { }
+ HasRelocationAddend(TT.getOS() == Triple::AMDHSA) {
+ switch (TT.getOS()) {
+ case Triple::AMDHSA:
+ OSABI = ELF::ELFOSABI_AMDGPU_HSA;
+ break;
+ case Triple::AMDPAL:
+ OSABI = ELF::ELFOSABI_AMDGPU_PAL;
+ break;
+ case Triple::Mesa3D:
+ OSABI = ELF::ELFOSABI_AMDGPU_MESA3D;
+ break;
+ default:
+ break;
+ }
+ }
- MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override {
- return createAMDGPUELFObjectWriter(Is64Bit, HasRelocationAddend, OS);
+ std::unique_ptr<MCObjectWriter>
+ createObjectWriter(raw_pwrite_stream &OS) const override {
+ return createAMDGPUELFObjectWriter(Is64Bit, OSABI, HasRelocationAddend, OS);
}
};
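ELFAMDGPUAsmBackend now derives the ELF OSABI byte from the triple's OS instead of hard-coding the HSA value. A sketch of that selection; the enumerators and numeric values below are placeholders for Triple::OSType and the ELF::ELFOSABI_* constants, chosen only to keep the example self-contained.

// Select an OSABI byte from the target OS; unknown OSes keep ELFOSABI_NONE.
#include <cstdint>
#include <iostream>

enum class OSType { UnknownOS, AMDHSA, AMDPAL, Mesa3D };

constexpr uint8_t OSABI_NONE = 0x00;           // placeholder values
constexpr uint8_t OSABI_AMDGPU_HSA = 0x40;
constexpr uint8_t OSABI_AMDGPU_PAL = 0x41;
constexpr uint8_t OSABI_AMDGPU_MESA3D = 0x42;

static uint8_t selectOSABI(OSType OS) {
  switch (OS) {
  case OSType::AMDHSA: return OSABI_AMDGPU_HSA;
  case OSType::AMDPAL: return OSABI_AMDGPU_PAL;
  case OSType::Mesa3D: return OSABI_AMDGPU_MESA3D;
  default:             return OSABI_NONE;
  }
}

int main() {
  std::cout << std::hex << static_cast<int>(selectOSABI(OSType::AMDPAL))
            << '\n'; // 41
  return 0;
}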
diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp
index 6abe7f3d37d5..e443b0729606 100644
--- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp
+++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp
@@ -12,6 +12,7 @@
#include "llvm/MC/MCELFObjectWriter.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCFixup.h"
+#include "llvm/MC/MCObjectWriter.h"
#include "llvm/MC/MCSymbol.h"
#include "llvm/MC/MCValue.h"
#include "llvm/Support/ErrorHandling.h"
@@ -22,7 +23,7 @@ namespace {
class AMDGPUELFObjectWriter : public MCELFObjectTargetWriter {
public:
- AMDGPUELFObjectWriter(bool Is64Bit, bool HasRelocationAddend);
+ AMDGPUELFObjectWriter(bool Is64Bit, uint8_t OSABI, bool HasRelocationAddend);
protected:
unsigned getRelocType(MCContext &Ctx, const MCValue &Target,
@@ -33,10 +34,9 @@ protected:
} // end anonymous namespace
AMDGPUELFObjectWriter::AMDGPUELFObjectWriter(bool Is64Bit,
+ uint8_t OSABI,
bool HasRelocationAddend)
- : MCELFObjectTargetWriter(Is64Bit,
- ELF::ELFOSABI_AMDGPU_HSA,
- ELF::EM_AMDGPU,
+ : MCELFObjectTargetWriter(Is64Bit, OSABI, ELF::EM_AMDGPU,
HasRelocationAddend) {}
unsigned AMDGPUELFObjectWriter::getRelocType(MCContext &Ctx,
@@ -82,10 +82,11 @@ unsigned AMDGPUELFObjectWriter::getRelocType(MCContext &Ctx,
llvm_unreachable("unhandled relocation type");
}
-MCObjectWriter *llvm::createAMDGPUELFObjectWriter(bool Is64Bit,
- bool HasRelocationAddend,
- raw_pwrite_stream &OS) {
- MCELFObjectTargetWriter *MOTW =
- new AMDGPUELFObjectWriter(Is64Bit, HasRelocationAddend);
- return createELFObjectWriter(MOTW, OS, true);
+std::unique_ptr<MCObjectWriter>
+llvm::createAMDGPUELFObjectWriter(bool Is64Bit, uint8_t OSABI,
+ bool HasRelocationAddend,
+ raw_pwrite_stream &OS) {
+ auto MOTW = llvm::make_unique<AMDGPUELFObjectWriter>(Is64Bit, OSABI,
+ HasRelocationAddend);
+ return createELFObjectWriter(std::move(MOTW), OS, true);
}
diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.cpp b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.cpp
index 43338a5bebd2..1497edc7a054 100644
--- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.cpp
+++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.cpp
@@ -9,13 +9,40 @@
#include "AMDGPUELFStreamer.h"
#include "Utils/AMDGPUBaseInfo.h"
+#include "llvm/BinaryFormat/ELF.h"
+#include "llvm/MC/MCAsmBackend.h"
+#include "llvm/MC/MCCodeEmitter.h"
using namespace llvm;
-MCELFStreamer *llvm::createAMDGPUELFStreamer(MCContext &Context,
- MCAsmBackend &MAB,
- raw_pwrite_stream &OS,
- MCCodeEmitter *Emitter,
- bool RelaxAll) {
- return new AMDGPUELFStreamer(Context, MAB, OS, Emitter);
+AMDGPUELFStreamer::AMDGPUELFStreamer(const Triple &T, MCContext &Context,
+ std::unique_ptr<MCAsmBackend> MAB,
+ raw_pwrite_stream &OS,
+ std::unique_ptr<MCCodeEmitter> Emitter)
+ : MCELFStreamer(Context, std::move(MAB), OS, std::move(Emitter)) {
+ unsigned Arch = ELF::EF_AMDGPU_ARCH_NONE;
+ switch (T.getArch()) {
+ case Triple::r600:
+ Arch = ELF::EF_AMDGPU_ARCH_R600;
+ break;
+ case Triple::amdgcn:
+ Arch = ELF::EF_AMDGPU_ARCH_GCN;
+ break;
+ default:
+ break;
+ }
+
+ MCAssembler &MCA = getAssembler();
+ unsigned EFlags = MCA.getELFHeaderEFlags();
+ EFlags &= ~ELF::EF_AMDGPU_ARCH;
+ EFlags |= Arch;
+ MCA.setELFHeaderEFlags(EFlags);
+}
+
+MCELFStreamer *llvm::createAMDGPUELFStreamer(
+ const Triple &T, MCContext &Context, std::unique_ptr<MCAsmBackend> MAB,
+ raw_pwrite_stream &OS, std::unique_ptr<MCCodeEmitter> Emitter,
+ bool RelaxAll) {
+ return new AMDGPUELFStreamer(T, Context, std::move(MAB), OS,
+ std::move(Emitter));
}
diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.h b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.h
index 5319b65d65f9..0cc0a4c5cd5d 100644
--- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.h
+++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.h
@@ -25,15 +25,16 @@ class MCSubtargetInfo;
class AMDGPUELFStreamer : public MCELFStreamer {
public:
- AMDGPUELFStreamer(MCContext &Context, MCAsmBackend &MAB, raw_pwrite_stream &OS,
- MCCodeEmitter *Emitter)
- : MCELFStreamer(Context, MAB, OS, Emitter) { }
-
+ AMDGPUELFStreamer(const Triple &T, MCContext &Context,
+ std::unique_ptr<MCAsmBackend> MAB, raw_pwrite_stream &OS,
+ std::unique_ptr<MCCodeEmitter> Emitter);
};
-MCELFStreamer *createAMDGPUELFStreamer(MCContext &Context, MCAsmBackend &MAB,
- raw_pwrite_stream &OS,
- MCCodeEmitter *Emitter, bool RelaxAll);
+MCELFStreamer *createAMDGPUELFStreamer(const Triple &T, MCContext &Context,
+ std::unique_ptr<MCAsmBackend> MAB,
+ raw_pwrite_stream &OS,
+ std::unique_ptr<MCCodeEmitter> Emitter,
+ bool RelaxAll);
} // namespace llvm.
#endif
diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUCodeObjectMetadataStreamer.cpp b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUHSAMetadataStreamer.cpp
index 4e828a791e09..463e700f13b7 100644
--- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUCodeObjectMetadataStreamer.cpp
+++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUHSAMetadataStreamer.cpp
@@ -1,4 +1,4 @@
-//===--- AMDGPUCodeObjectMetadataStreamer.cpp -------------------*- C++ -*-===//
+//===--- AMDGPUHSAMetadataStreamer.cpp --------------------------*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
@@ -8,12 +8,12 @@
//===----------------------------------------------------------------------===//
//
/// \file
-/// \brief AMDGPU Code Object Metadata Streamer.
+/// \brief AMDGPU HSA Metadata Streamer.
///
//
//===----------------------------------------------------------------------===//
-#include "AMDGPUCodeObjectMetadataStreamer.h"
+#include "AMDGPUHSAMetadataStreamer.h"
#include "AMDGPU.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/IR/Constants.h"
@@ -22,39 +22,40 @@
namespace llvm {
-static cl::opt<bool> DumpCodeObjectMetadata(
- "amdgpu-dump-comd",
- cl::desc("Dump AMDGPU Code Object Metadata"));
-static cl::opt<bool> VerifyCodeObjectMetadata(
- "amdgpu-verify-comd",
- cl::desc("Verify AMDGPU Code Object Metadata"));
+static cl::opt<bool> DumpHSAMetadata(
+ "amdgpu-dump-hsa-metadata",
+ cl::desc("Dump AMDGPU HSA Metadata"));
+static cl::opt<bool> VerifyHSAMetadata(
+ "amdgpu-verify-hsa-metadata",
+ cl::desc("Verify AMDGPU HSA Metadata"));
namespace AMDGPU {
-namespace CodeObject {
+namespace HSAMD {
-void MetadataStreamer::dump(StringRef YamlString) const {
- errs() << "AMDGPU Code Object Metadata:\n" << YamlString << '\n';
+void MetadataStreamer::dump(StringRef HSAMetadataString) const {
+ errs() << "AMDGPU HSA Metadata:\n" << HSAMetadataString << '\n';
}
-void MetadataStreamer::verify(StringRef YamlString) const {
- errs() << "AMDGPU Code Object Metadata Parser Test: ";
+void MetadataStreamer::verify(StringRef HSAMetadataString) const {
+ errs() << "AMDGPU HSA Metadata Parser Test: ";
- CodeObject::Metadata FromYamlString;
- if (Metadata::fromYamlString(YamlString, FromYamlString)) {
+ HSAMD::Metadata FromHSAMetadataString;
+ if (fromString(HSAMetadataString, FromHSAMetadataString)) {
errs() << "FAIL\n";
return;
}
- std::string ToYamlString;
- if (Metadata::toYamlString(FromYamlString, ToYamlString)) {
+ std::string ToHSAMetadataString;
+ if (toString(FromHSAMetadataString, ToHSAMetadataString)) {
errs() << "FAIL\n";
return;
}
- errs() << (YamlString == ToYamlString ? "PASS" : "FAIL") << '\n';
- if (YamlString != ToYamlString) {
- errs() << "Original input: " << YamlString << '\n'
- << "Produced output: " << ToYamlString << '\n';
+ errs() << (HSAMetadataString == ToHSAMetadataString ? "PASS" : "FAIL")
+ << '\n';
+ if (HSAMetadataString != ToHSAMetadataString) {
+ errs() << "Original input: " << HSAMetadataString << '\n'
+ << "Produced output: " << ToHSAMetadataString << '\n';
}
}
@@ -196,14 +197,14 @@ std::vector<uint32_t> MetadataStreamer::getWorkGroupDimensions(
}
void MetadataStreamer::emitVersion() {
- auto &Version = CodeObjectMetadata.mVersion;
+ auto &Version = HSAMetadata.mVersion;
- Version.push_back(MetadataVersionMajor);
- Version.push_back(MetadataVersionMinor);
+ Version.push_back(VersionMajor);
+ Version.push_back(VersionMinor);
}
void MetadataStreamer::emitPrintf(const Module &Mod) {
- auto &Printf = CodeObjectMetadata.mPrintf;
+ auto &Printf = HSAMetadata.mPrintf;
auto Node = Mod.getNamedMetadata("llvm.printf.fmts");
if (!Node)
@@ -215,7 +216,7 @@ void MetadataStreamer::emitPrintf(const Module &Mod) {
}
void MetadataStreamer::emitKernelLanguage(const Function &Func) {
- auto &Kernel = CodeObjectMetadata.mKernels.back();
+ auto &Kernel = HSAMetadata.mKernels.back();
// TODO: What about other languages?
auto Node = Func.getParent()->getNamedMetadata("opencl.ocl.version");
@@ -233,7 +234,7 @@ void MetadataStreamer::emitKernelLanguage(const Function &Func) {
}
void MetadataStreamer::emitKernelAttrs(const Function &Func) {
- auto &Attrs = CodeObjectMetadata.mKernels.back().mAttrs;
+ auto &Attrs = HSAMetadata.mKernels.back().mAttrs;
if (auto Node = Func.getMetadata("reqd_work_group_size"))
Attrs.mReqdWorkGroupSize = getWorkGroupDimensions(Node);
@@ -244,6 +245,10 @@ void MetadataStreamer::emitKernelAttrs(const Function &Func) {
cast<ValueAsMetadata>(Node->getOperand(0))->getType(),
mdconst::extract<ConstantInt>(Node->getOperand(1))->getZExtValue());
}
+ if (Func.hasFnAttribute("runtime-handle")) {
+ Attrs.mRuntimeHandle =
+ Func.getFnAttribute("runtime-handle").getValueAsString().str();
+ }
}
void MetadataStreamer::emitKernelArgs(const Function &Func) {
@@ -261,12 +266,21 @@ void MetadataStreamer::emitKernelArgs(const Function &Func) {
emitKernelArg(DL, Int64Ty, ValueKind::HiddenGlobalOffsetY);
emitKernelArg(DL, Int64Ty, ValueKind::HiddenGlobalOffsetZ);
- if (!Func.getParent()->getNamedMetadata("llvm.printf.fmts"))
- return;
-
auto Int8PtrTy = Type::getInt8PtrTy(Func.getContext(),
AMDGPUASI.GLOBAL_ADDRESS);
- emitKernelArg(DL, Int8PtrTy, ValueKind::HiddenPrintfBuffer);
+ auto CallsPrintf = Func.getParent()->getNamedMetadata("llvm.printf.fmts");
+ if (CallsPrintf)
+ emitKernelArg(DL, Int8PtrTy, ValueKind::HiddenPrintfBuffer);
+ if (Func.hasFnAttribute("calls-enqueue-kernel")) {
+ if (!CallsPrintf) {
+ // Emit a dummy argument so that the remaining hidden arguments
+ // have a fixed position relative to the first hidden argument.
+ // This makes it easier for library code to access the hidden arguments.
+ emitKernelArg(DL, Int8PtrTy, ValueKind::HiddenNone);
+ }
+ emitKernelArg(DL, Int8PtrTy, ValueKind::HiddenDefaultQueue);
+ emitKernelArg(DL, Int8PtrTy, ValueKind::HiddenCompletionAction);
+ }
}
void MetadataStreamer::emitKernelArg(const Argument &Arg) {
@@ -274,10 +288,17 @@ void MetadataStreamer::emitKernelArg(const Argument &Arg) {
auto ArgNo = Arg.getArgNo();
const MDNode *Node;
- StringRef TypeQual;
- Node = Func->getMetadata("kernel_arg_type_qual");
+ StringRef Name;
+ Node = Func->getMetadata("kernel_arg_name");
if (Node && ArgNo < Node->getNumOperands())
- TypeQual = cast<MDString>(Node->getOperand(ArgNo))->getString();
+ Name = cast<MDString>(Node->getOperand(ArgNo))->getString();
+ else if (Arg.hasName())
+ Name = Arg.getName();
+
+ StringRef TypeName;
+ Node = Func->getMetadata("kernel_arg_type");
+ if (Node && ArgNo < Node->getNumOperands())
+ TypeName = cast<MDString>(Node->getOperand(ArgNo))->getString();
StringRef BaseTypeName;
Node = Func->getMetadata("kernel_arg_base_type");
@@ -294,28 +315,25 @@ void MetadataStreamer::emitKernelArg(const Argument &Arg) {
AccQual = cast<MDString>(Node->getOperand(ArgNo))->getString();
}
- StringRef Name;
- Node = Func->getMetadata("kernel_arg_name");
- if (Node && ArgNo < Node->getNumOperands())
- Name = cast<MDString>(Node->getOperand(ArgNo))->getString();
-
- StringRef TypeName;
- Node = Func->getMetadata("kernel_arg_type");
+ StringRef TypeQual;
+ Node = Func->getMetadata("kernel_arg_type_qual");
if (Node && ArgNo < Node->getNumOperands())
- TypeName = cast<MDString>(Node->getOperand(ArgNo))->getString();
+ TypeQual = cast<MDString>(Node->getOperand(ArgNo))->getString();
emitKernelArg(Func->getParent()->getDataLayout(), Arg.getType(),
- getValueKind(Arg.getType(), TypeQual, BaseTypeName), TypeQual,
- BaseTypeName, AccQual, Name, TypeName);
+ getValueKind(Arg.getType(), TypeQual, BaseTypeName), Name,
+ TypeName, BaseTypeName, AccQual, TypeQual);
}
void MetadataStreamer::emitKernelArg(const DataLayout &DL, Type *Ty,
- ValueKind ValueKind, StringRef TypeQual,
- StringRef BaseTypeName, StringRef AccQual,
- StringRef Name, StringRef TypeName) {
- CodeObjectMetadata.mKernels.back().mArgs.push_back(Kernel::Arg::Metadata());
- auto &Arg = CodeObjectMetadata.mKernels.back().mArgs.back();
+ ValueKind ValueKind, StringRef Name,
+ StringRef TypeName, StringRef BaseTypeName,
+ StringRef AccQual, StringRef TypeQual) {
+ HSAMetadata.mKernels.back().mArgs.push_back(Kernel::Arg::Metadata());
+ auto &Arg = HSAMetadata.mKernels.back().mArgs.back();
+ Arg.mName = Name;
+ Arg.mTypeName = TypeName;
Arg.mSize = DL.getTypeAllocSize(Ty);
Arg.mAlign = DL.getABITypeAlignment(Ty);
Arg.mValueKind = ValueKind;
@@ -327,62 +345,25 @@ void MetadataStreamer::emitKernelArg(const DataLayout &DL, Type *Ty,
Arg.mPointeeAlign = DL.getABITypeAlignment(ElTy);
}
- Arg.mAccQual = getAccessQualifier(AccQual);
-
if (auto PtrTy = dyn_cast<PointerType>(Ty))
Arg.mAddrSpaceQual = getAddressSpaceQualifer(PtrTy->getAddressSpace());
+ Arg.mAccQual = getAccessQualifier(AccQual);
+
+ // TODO: Emit Arg.mActualAccQual.
+
SmallVector<StringRef, 1> SplitTypeQuals;
TypeQual.split(SplitTypeQuals, " ", -1, false);
for (StringRef Key : SplitTypeQuals) {
auto P = StringSwitch<bool*>(Key)
.Case("const", &Arg.mIsConst)
- .Case("pipe", &Arg.mIsPipe)
.Case("restrict", &Arg.mIsRestrict)
.Case("volatile", &Arg.mIsVolatile)
+ .Case("pipe", &Arg.mIsPipe)
.Default(nullptr);
if (P)
*P = true;
}
-
- Arg.mName = Name;
- Arg.mTypeName = TypeName;
-}
-
-void MetadataStreamer::emitKernelCodeProps(
- const amd_kernel_code_t &KernelCode) {
- auto &CodeProps = CodeObjectMetadata.mKernels.back().mCodeProps;
-
- CodeProps.mKernargSegmentSize = KernelCode.kernarg_segment_byte_size;
- CodeProps.mWorkgroupGroupSegmentSize =
- KernelCode.workgroup_group_segment_byte_size;
- CodeProps.mWorkitemPrivateSegmentSize =
- KernelCode.workitem_private_segment_byte_size;
- CodeProps.mWavefrontNumSGPRs = KernelCode.wavefront_sgpr_count;
- CodeProps.mWorkitemNumVGPRs = KernelCode.workitem_vgpr_count;
- CodeProps.mKernargSegmentAlign = KernelCode.kernarg_segment_alignment;
- CodeProps.mGroupSegmentAlign = KernelCode.group_segment_alignment;
- CodeProps.mPrivateSegmentAlign = KernelCode.private_segment_alignment;
- CodeProps.mWavefrontSize = KernelCode.wavefront_size;
-}
-
-void MetadataStreamer::emitKernelDebugProps(
- const amd_kernel_code_t &KernelCode) {
- if (!(KernelCode.code_properties & AMD_CODE_PROPERTY_IS_DEBUG_SUPPORTED))
- return;
-
- auto &DebugProps = CodeObjectMetadata.mKernels.back().mDebugProps;
-
- // FIXME: Need to pass down debugger ABI version through features. This is ok
- // for now because we only have one version.
- DebugProps.mDebuggerABIVersion.push_back(1);
- DebugProps.mDebuggerABIVersion.push_back(0);
- DebugProps.mReservedNumVGPRs = KernelCode.reserved_vgpr_count;
- DebugProps.mReservedFirstVGPR = KernelCode.reserved_vgpr_first;
- DebugProps.mPrivateSegmentBufferSGPR =
- KernelCode.debug_private_segment_buffer_sgpr;
- DebugProps.mWavefrontPrivateSegmentOffsetSGPR =
- KernelCode.debug_wavefront_private_segment_offset_sgpr;
}
void MetadataStreamer::begin(const Module &Mod) {
@@ -391,42 +372,36 @@ void MetadataStreamer::begin(const Module &Mod) {
emitPrintf(Mod);
}
-void MetadataStreamer::emitKernel(const Function &Func,
- const amd_kernel_code_t &KernelCode) {
+void MetadataStreamer::end() {
+ std::string HSAMetadataString;
+ if (toString(HSAMetadata, HSAMetadataString))
+ return;
+
+ if (DumpHSAMetadata)
+ dump(HSAMetadataString);
+ if (VerifyHSAMetadata)
+ verify(HSAMetadataString);
+}
+
+void MetadataStreamer::emitKernel(
+ const Function &Func,
+ const Kernel::CodeProps::Metadata &CodeProps,
+ const Kernel::DebugProps::Metadata &DebugProps) {
if (Func.getCallingConv() != CallingConv::AMDGPU_KERNEL)
return;
- CodeObjectMetadata.mKernels.push_back(Kernel::Metadata());
- auto &Kernel = CodeObjectMetadata.mKernels.back();
+ HSAMetadata.mKernels.push_back(Kernel::Metadata());
+ auto &Kernel = HSAMetadata.mKernels.back();
Kernel.mName = Func.getName();
+ Kernel.mSymbolName = (Twine(Func.getName()) + Twine("@kd")).str();
emitKernelLanguage(Func);
emitKernelAttrs(Func);
emitKernelArgs(Func);
- emitKernelCodeProps(KernelCode);
- emitKernelDebugProps(KernelCode);
-}
-
-ErrorOr<std::string> MetadataStreamer::toYamlString() {
- std::string YamlString;
- if (auto Error = Metadata::toYamlString(CodeObjectMetadata, YamlString))
- return Error;
-
- if (DumpCodeObjectMetadata)
- dump(YamlString);
- if (VerifyCodeObjectMetadata)
- verify(YamlString);
-
- return YamlString;
-}
-
-ErrorOr<std::string> MetadataStreamer::toYamlString(StringRef YamlString) {
- if (auto Error = Metadata::fromYamlString(YamlString, CodeObjectMetadata))
- return Error;
-
- return toYamlString();
+ HSAMetadata.mKernels.back().mCodeProps = CodeProps;
+ HSAMetadata.mKernels.back().mDebugProps = DebugProps;
}
-} // end namespace CodeObject
+} // end namespace HSAMD
} // end namespace AMDGPU
} // end namespace llvm
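
The emitKernelArgs change above reorders the hidden kernel arguments: the three global-offset arguments always come first, a HiddenPrintfBuffer pointer is added only when the module carries llvm.printf.fmts metadata, and a kernel marked calls-enqueue-kernel gets a HiddenNone placeholder when the printf buffer is absent, so HiddenDefaultQueue and HiddenCompletionAction stay at fixed offsets. A small sketch of that ordering rule, with hiddenArgOrder as a hypothetical helper (all hidden arguments assumed pointer-sized):

#include <cstdio>
#include <string>
#include <vector>

// Mirrors the hidden-argument ordering in emitKernelArgs above; the names
// are the ValueKind spellings, the layout rule is the point of the sketch.
std::vector<std::string> hiddenArgOrder(bool CallsPrintf, bool EnqueuesKernel) {
  std::vector<std::string> Args = {"HiddenGlobalOffsetX",
                                   "HiddenGlobalOffsetY",
                                   "HiddenGlobalOffsetZ"};
  if (CallsPrintf)
    Args.push_back("HiddenPrintfBuffer");
  if (EnqueuesKernel) {
    if (!CallsPrintf)
      Args.push_back("HiddenNone"); // placeholder keeps later offsets stable
    Args.push_back("HiddenDefaultQueue");
    Args.push_back("HiddenCompletionAction");
  }
  return Args;
}

int main() {
  for (const std::string &A : hiddenArgOrder(/*CallsPrintf=*/false,
                                             /*EnqueuesKernel=*/true))
    std::printf("%s\n", A.c_str());
  return 0;
}
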
diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUCodeObjectMetadataStreamer.h b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUHSAMetadataStreamer.h
index c6681431d74d..bd6515521a74 100644
--- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUCodeObjectMetadataStreamer.h
+++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUHSAMetadataStreamer.h
@@ -1,4 +1,4 @@
-//===--- AMDGPUCodeObjectMetadataStreamer.h ---------------------*- C++ -*-===//
+//===--- AMDGPUHSAMetadataStreamer.h ----------------------------*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
@@ -8,19 +8,18 @@
//===----------------------------------------------------------------------===//
//
/// \file
-/// \brief AMDGPU Code Object Metadata Streamer.
+/// \brief AMDGPU HSA Metadata Streamer.
///
//
//===----------------------------------------------------------------------===//
-#ifndef LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUCODEOBJECTMETADATASTREAMER_H
-#define LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUCODEOBJECTMETADATASTREAMER_H
+#ifndef LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUHSAMETADATASTREAMER_H
+#define LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUHSAMETADATASTREAMER_H
#include "AMDGPU.h"
#include "AMDKernelCodeT.h"
#include "llvm/ADT/StringRef.h"
-#include "llvm/Support/AMDGPUCodeObjectMetadata.h"
-#include "llvm/Support/ErrorOr.h"
+#include "llvm/Support/AMDGPUMetadata.h"
namespace llvm {
@@ -32,16 +31,16 @@ class Module;
class Type;
namespace AMDGPU {
-namespace CodeObject {
+namespace HSAMD {
class MetadataStreamer final {
private:
- Metadata CodeObjectMetadata;
+ Metadata HSAMetadata;
AMDGPUAS AMDGPUASI;
- void dump(StringRef YamlString) const;
+ void dump(StringRef HSAMetadataString) const;
- void verify(StringRef YamlString) const;
+ void verify(StringRef HSAMetadataString) const;
AccessQualifier getAccessQualifier(StringRef AccQual) const;
@@ -69,31 +68,29 @@ private:
void emitKernelArg(const Argument &Arg);
void emitKernelArg(const DataLayout &DL, Type *Ty, ValueKind ValueKind,
- StringRef TypeQual = "", StringRef BaseTypeName = "",
- StringRef AccQual = "", StringRef Name = "",
- StringRef TypeName = "");
-
- void emitKernelCodeProps(const amd_kernel_code_t &KernelCode);
-
- void emitKernelDebugProps(const amd_kernel_code_t &KernelCode);
+ StringRef Name = "", StringRef TypeName = "",
+ StringRef BaseTypeName = "", StringRef AccQual = "",
+ StringRef TypeQual = "");
public:
MetadataStreamer() = default;
~MetadataStreamer() = default;
- void begin(const Module &Mod);
+ const Metadata &getHSAMetadata() const {
+ return HSAMetadata;
+ }
- void end() {}
-
- void emitKernel(const Function &Func, const amd_kernel_code_t &KernelCode);
+ void begin(const Module &Mod);
- ErrorOr<std::string> toYamlString();
+ void end();
- ErrorOr<std::string> toYamlString(StringRef YamlString);
+ void emitKernel(const Function &Func,
+ const Kernel::CodeProps::Metadata &CodeProps,
+ const Kernel::DebugProps::Metadata &DebugProps);
};
-} // end namespace CodeObject
+} // end namespace HSAMD
} // end namespace AMDGPU
} // end namespace llvm
-#endif // LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUCODEOBJECTMETADATASTREAMER_H
+#endif // LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUHSAMETADATASTREAMER_H
diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp
index 136e6ec4ceb5..2b321c04fb30 100644
--- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp
+++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp
@@ -18,6 +18,8 @@
#include "AMDGPUTargetStreamer.h"
#include "InstPrinter/AMDGPUInstPrinter.h"
#include "SIDefines.h"
+#include "llvm/MC/MCAsmBackend.h"
+#include "llvm/MC/MCCodeEmitter.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCInstrInfo.h"
#include "llvm/MC/MCRegisterInfo.h"
@@ -60,7 +62,8 @@ static MCInstPrinter *createAMDGPUMCInstPrinter(const Triple &T,
const MCAsmInfo &MAI,
const MCInstrInfo &MII,
const MCRegisterInfo &MRI) {
- return new AMDGPUInstPrinter(MAI, MII, MRI);
+ return T.getArch() == Triple::r600 ? new R600InstPrinter(MAI, MII, MRI) :
+ new AMDGPUInstPrinter(MAI, MII, MRI);
}
static MCTargetStreamer *createAMDGPUAsmTargetStreamer(MCStreamer &S,
@@ -77,12 +80,12 @@ static MCTargetStreamer * createAMDGPUObjectTargetStreamer(
}
static MCStreamer *createMCStreamer(const Triple &T, MCContext &Context,
- MCAsmBackend &MAB, raw_pwrite_stream &OS,
- MCCodeEmitter *Emitter, bool RelaxAll) {
- if (T.getOS() == Triple::AMDHSA)
- return createAMDGPUELFStreamer(Context, MAB, OS, Emitter, RelaxAll);
-
- return createELFStreamer(Context, MAB, OS, Emitter, RelaxAll);
+ std::unique_ptr<MCAsmBackend> &&MAB,
+ raw_pwrite_stream &OS,
+ std::unique_ptr<MCCodeEmitter> &&Emitter,
+ bool RelaxAll) {
+ return createAMDGPUELFStreamer(T, Context, std::move(MAB), OS,
+ std::move(Emitter), RelaxAll);
}
extern "C" void LLVMInitializeAMDGPUTargetMC() {
diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h
index f80b5f3a6dba..0b3563303ad0 100644
--- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h
+++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h
@@ -18,6 +18,8 @@
#include "llvm/Support/DataTypes.h"
+#include <memory>
+
namespace llvm {
class MCAsmBackend;
class MCCodeEmitter;
@@ -47,9 +49,9 @@ MCAsmBackend *createAMDGPUAsmBackend(const Target &T, const MCRegisterInfo &MRI,
const Triple &TT, StringRef CPU,
const MCTargetOptions &Options);
-MCObjectWriter *createAMDGPUELFObjectWriter(bool Is64Bit,
- bool HasRelocationAddend,
- raw_pwrite_stream &OS);
+std::unique_ptr<MCObjectWriter>
+createAMDGPUELFObjectWriter(bool Is64Bit, uint8_t OSABI,
+ bool HasRelocationAddend, raw_pwrite_stream &OS);
} // End llvm namespace
#define GET_REGINFO_ENUM
@@ -58,7 +60,9 @@ MCObjectWriter *createAMDGPUELFObjectWriter(bool Is64Bit,
#define GET_INSTRINFO_ENUM
#define GET_INSTRINFO_OPERAND_ENUM
+#define GET_INSTRINFO_SCHED_ENUM
#include "AMDGPUGenInstrInfo.inc"
+#undef GET_INSTRINFO_SCHED_ENUM
#undef GET_INSTRINFO_OPERAND_ENUM
#undef GET_INSTRINFO_ENUM
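
Both files above move the MC plumbing to explicit ownership: createMCStreamer now receives the asm backend and code emitter as std::unique_ptr and forwards them with std::move, and createAMDGPUELFObjectWriter returns a std::unique_ptr<MCObjectWriter>. A minimal sketch of that ownership-transfer shape, using made-up Backend/Streamer types rather than the real MC classes:

#include <memory>
#include <utility>

struct Backend { };

struct Streamer {
  explicit Streamer(std::unique_ptr<Backend> B) : B(std::move(B)) {}
  std::unique_ptr<Backend> B;
};

// Mirrors the factory shape of createMCStreamer above: the backend arrives
// by unique_ptr and is moved into the object being created, so there is
// exactly one owner at every point.
Streamer *createStreamer(std::unique_ptr<Backend> &&B) {
  return new Streamer(std::move(B));
}

int main() {
  auto S = std::unique_ptr<Streamer>(createStreamer(std::make_unique<Backend>()));
  return S->B ? 0 : 1;
}
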
diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
index 2a0032fc9adc..d897956daccf 100644
--- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
+++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
@@ -39,21 +39,12 @@ using namespace llvm::AMDGPU;
// AMDGPUTargetStreamer
//===----------------------------------------------------------------------===//
-AMDGPUTargetStreamer::AMDGPUTargetStreamer(MCStreamer &S)
- : MCTargetStreamer(S) {}
-
-void AMDGPUTargetStreamer::EmitStartOfCodeObjectMetadata(const Module &Mod) {
- CodeObjectMetadataStreamer.begin(Mod);
-}
-
-void AMDGPUTargetStreamer::EmitKernelCodeObjectMetadata(
- const Function &Func, const amd_kernel_code_t &KernelCode) {
- CodeObjectMetadataStreamer.emitKernel(Func, KernelCode);
-}
+bool AMDGPUTargetStreamer::EmitHSAMetadata(StringRef HSAMetadataString) {
+ HSAMD::Metadata HSAMetadata;
+ if (HSAMD::fromString(HSAMetadataString, HSAMetadata))
+ return false;
-void AMDGPUTargetStreamer::EmitEndOfCodeObjectMetadata() {
- CodeObjectMetadataStreamer.end();
- EmitCodeObjectMetadata(CodeObjectMetadataStreamer.toYamlString().get());
+ return EmitHSAMetadata(HSAMetadata);
}
//===----------------------------------------------------------------------===//
@@ -100,15 +91,30 @@ void AMDGPUTargetAsmStreamer::EmitAMDGPUSymbolType(StringRef SymbolName,
}
}
-bool AMDGPUTargetAsmStreamer::EmitCodeObjectMetadata(StringRef YamlString) {
- auto VerifiedYamlString = CodeObjectMetadataStreamer.toYamlString(YamlString);
- if (!VerifiedYamlString)
+bool AMDGPUTargetAsmStreamer::EmitISAVersion(StringRef IsaVersionString) {
+ OS << "\t.amd_amdgpu_isa \"" << IsaVersionString << "\"\n";
+ return true;
+}
+
+bool AMDGPUTargetAsmStreamer::EmitHSAMetadata(
+ const AMDGPU::HSAMD::Metadata &HSAMetadata) {
+ std::string HSAMetadataString;
+ if (HSAMD::toString(HSAMetadata, HSAMetadataString))
return false;
- OS << '\t' << AMDGPU::CodeObject::MetadataAssemblerDirectiveBegin << '\n';
- OS << VerifiedYamlString.get();
- OS << '\t' << AMDGPU::CodeObject::MetadataAssemblerDirectiveEnd << '\n';
+ OS << '\t' << HSAMD::AssemblerDirectiveBegin << '\n';
+ OS << HSAMetadataString << '\n';
+ OS << '\t' << HSAMD::AssemblerDirectiveEnd << '\n';
+ return true;
+}
+bool AMDGPUTargetAsmStreamer::EmitPALMetadata(
+ const PALMD::Metadata &PALMetadata) {
+ std::string PALMetadataString;
+ if (PALMD::toString(PALMetadata, PALMetadataString))
+ return false;
+
+ OS << '\t' << PALMD::AssemblerDirective << PALMetadataString << '\n';
return true;
}
@@ -124,7 +130,7 @@ MCELFStreamer &AMDGPUTargetELFStreamer::getStreamer() {
}
void AMDGPUTargetELFStreamer::EmitAMDGPUNote(
- const MCExpr *DescSZ, ElfNote::NoteType Type,
+ const MCExpr *DescSZ, unsigned NoteType,
function_ref<void(MCELFStreamer &)> EmitDesc) {
auto &S = getStreamer();
auto &Context = S.getContext();
@@ -136,7 +142,7 @@ void AMDGPUTargetELFStreamer::EmitAMDGPUNote(
ElfNote::SectionName, ELF::SHT_NOTE, ELF::SHF_ALLOC));
S.EmitIntValue(NameSZ, 4); // namesz
S.EmitValue(DescSZ, 4); // descz
- S.EmitIntValue(Type, 4); // type
+ S.EmitIntValue(NoteType, 4); // type
S.EmitBytes(StringRef(ElfNote::NoteName, NameSZ)); // name
S.EmitValueToAlignment(4, 0, 1, 0); // padding 0
EmitDesc(S); // desc
@@ -204,9 +210,32 @@ void AMDGPUTargetELFStreamer::EmitAMDGPUSymbolType(StringRef SymbolName,
Symbol->setType(ELF::STT_AMDGPU_HSA_KERNEL);
}
-bool AMDGPUTargetELFStreamer::EmitCodeObjectMetadata(StringRef YamlString) {
- auto VerifiedYamlString = CodeObjectMetadataStreamer.toYamlString(YamlString);
- if (!VerifiedYamlString)
+bool AMDGPUTargetELFStreamer::EmitISAVersion(StringRef IsaVersionString) {
+ // Create two labels to mark the beginning and end of the desc field
+ // and a MCExpr to calculate the size of the desc field.
+ auto &Context = getContext();
+ auto *DescBegin = Context.createTempSymbol();
+ auto *DescEnd = Context.createTempSymbol();
+ auto *DescSZ = MCBinaryExpr::createSub(
+ MCSymbolRefExpr::create(DescEnd, Context),
+ MCSymbolRefExpr::create(DescBegin, Context), Context);
+
+ EmitAMDGPUNote(
+ DescSZ,
+ ELF::NT_AMD_AMDGPU_ISA,
+ [&](MCELFStreamer &OS) {
+ OS.EmitLabel(DescBegin);
+ OS.EmitBytes(IsaVersionString);
+ OS.EmitLabel(DescEnd);
+ }
+ );
+ return true;
+}
+
+bool AMDGPUTargetELFStreamer::EmitHSAMetadata(
+ const AMDGPU::HSAMD::Metadata &HSAMetadata) {
+ std::string HSAMetadataString;
+ if (HSAMD::toString(HSAMetadata, HSAMetadataString))
return false;
// Create two labels to mark the beginning and end of the desc field
@@ -220,13 +249,25 @@ bool AMDGPUTargetELFStreamer::EmitCodeObjectMetadata(StringRef YamlString) {
EmitAMDGPUNote(
DescSZ,
- ElfNote::NT_AMDGPU_HSA_CODE_OBJECT_METADATA,
+ ELF::NT_AMD_AMDGPU_HSA_METADATA,
[&](MCELFStreamer &OS) {
OS.EmitLabel(DescBegin);
- OS.EmitBytes(VerifiedYamlString.get());
+ OS.EmitBytes(HSAMetadataString);
OS.EmitLabel(DescEnd);
}
);
+ return true;
+}
+bool AMDGPUTargetELFStreamer::EmitPALMetadata(
+ const PALMD::Metadata &PALMetadata) {
+ EmitAMDGPUNote(
+ MCConstantExpr::create(PALMetadata.size() * sizeof(uint32_t), getContext()),
+ ELF::NT_AMD_AMDGPU_PAL_METADATA,
+ [&](MCELFStreamer &OS){
+ for (auto I : PALMetadata)
+ OS.EmitIntValue(I, sizeof(uint32_t));
+ }
+ );
return true;
}
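
EmitAMDGPUNote, reused above for the ISA-version, HSA-metadata, and PAL-metadata notes, writes a standard SHT_NOTE record: 4-byte namesz, descsz, and type words, then the NUL-terminated name and the desc payload with 4-byte alignment padding (descsz is an MCExpr so it can be resolved from the begin/end labels). A rough byte-level sketch of that layout, assuming little-endian words and an illustrative type value; buildNote is a hypothetical helper, not part of the MC layer:

#include <cstdint>
#include <cstdio>
#include <string>
#include <vector>

// Serializes one ELF note record the way EmitAMDGPUNote lays it out:
// namesz, descsz, type (4 bytes each), then the NUL-terminated name and
// the desc blob, each padded to 4-byte alignment. Little-endian assumed.
std::vector<uint8_t> buildNote(const std::string &Name, uint32_t Type,
                               const std::vector<uint8_t> &Desc) {
  std::vector<uint8_t> Out;
  auto emit32 = [&Out](uint32_t V) {
    for (int I = 0; I < 4; ++I)
      Out.push_back(static_cast<uint8_t>(V >> (8 * I)));
  };
  auto padTo4 = [&Out]() {
    while (Out.size() % 4 != 0)
      Out.push_back(0);
  };

  emit32(static_cast<uint32_t>(Name.size()) + 1);  // namesz, includes NUL
  emit32(static_cast<uint32_t>(Desc.size()));      // descsz
  emit32(Type);                                    // type
  Out.insert(Out.end(), Name.begin(), Name.end()); // name
  Out.push_back(0);                                // NUL terminator
  padTo4();                                        // pad name to 4 bytes
  Out.insert(Out.end(), Desc.begin(), Desc.end()); // desc
  padTo4();                                        // pad desc to 4 bytes
  return Out;
}

int main() {
  std::vector<uint8_t> Desc = {'9', '.', '0', '.', '0'};
  std::vector<uint8_t> Note = buildNote("AMD", 11 /* assumed type value */, Desc);
  std::printf("note size = %zu bytes\n", Note.size());
  return 0;
}
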
diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h
index 968128e94d0b..0919b754480d 100644
--- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h
+++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h
@@ -10,9 +10,10 @@
#ifndef LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUTARGETSTREAMER_H
#define LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUTARGETSTREAMER_H
-#include "AMDGPUCodeObjectMetadataStreamer.h"
#include "AMDKernelCodeT.h"
#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/AMDGPUMetadata.h"
namespace llvm {
#include "AMDGPUPTNote.h"
@@ -27,11 +28,11 @@ class Type;
class AMDGPUTargetStreamer : public MCTargetStreamer {
protected:
- AMDGPU::CodeObject::MetadataStreamer CodeObjectMetadataStreamer;
MCContext &getContext() const { return Streamer.getContext(); }
public:
- AMDGPUTargetStreamer(MCStreamer &S);
+ AMDGPUTargetStreamer(MCStreamer &S) : MCTargetStreamer(S) {}
+
virtual void EmitDirectiveHSACodeObjectVersion(uint32_t Major,
uint32_t Minor) = 0;
@@ -44,15 +45,17 @@ public:
virtual void EmitAMDGPUSymbolType(StringRef SymbolName, unsigned Type) = 0;
- virtual void EmitStartOfCodeObjectMetadata(const Module &Mod);
+ /// \returns True on success, false on failure.
+ virtual bool EmitISAVersion(StringRef IsaVersionString) = 0;
- virtual void EmitKernelCodeObjectMetadata(
- const Function &Func, const amd_kernel_code_t &KernelCode);
+ /// \returns True on success, false on failure.
+ virtual bool EmitHSAMetadata(StringRef HSAMetadataString);
- virtual void EmitEndOfCodeObjectMetadata();
+ /// \returns True on success, false on failure.
+ virtual bool EmitHSAMetadata(const AMDGPU::HSAMD::Metadata &HSAMetadata) = 0;
/// \returns True on success, false on failure.
- virtual bool EmitCodeObjectMetadata(StringRef YamlString) = 0;
+ virtual bool EmitPALMetadata(const AMDGPU::PALMD::Metadata &PALMetadata) = 0;
};
class AMDGPUTargetAsmStreamer final : public AMDGPUTargetStreamer {
@@ -71,14 +74,19 @@ public:
void EmitAMDGPUSymbolType(StringRef SymbolName, unsigned Type) override;
/// \returns True on success, false on failure.
- bool EmitCodeObjectMetadata(StringRef YamlString) override;
+ bool EmitISAVersion(StringRef IsaVersionString) override;
+
+ /// \returns True on success, false on failure.
+ bool EmitHSAMetadata(const AMDGPU::HSAMD::Metadata &HSAMetadata) override;
+
+ /// \returns True on success, false on failure.
+ bool EmitPALMetadata(const AMDGPU::PALMD::Metadata &PALMetadata) override;
};
class AMDGPUTargetELFStreamer final : public AMDGPUTargetStreamer {
MCStreamer &Streamer;
- void EmitAMDGPUNote(const MCExpr *DescSize,
- AMDGPU::ElfNote::NoteType Type,
+ void EmitAMDGPUNote(const MCExpr *DescSize, unsigned NoteType,
function_ref<void(MCELFStreamer &)> EmitDesc);
public:
@@ -98,7 +106,13 @@ public:
void EmitAMDGPUSymbolType(StringRef SymbolName, unsigned Type) override;
/// \returns True on success, false on failure.
- bool EmitCodeObjectMetadata(StringRef YamlString) override;
+ bool EmitISAVersion(StringRef IsaVersionString) override;
+
+ /// \returns True on success, false on failure.
+ bool EmitHSAMetadata(const AMDGPU::HSAMD::Metadata &HSAMetadata) override;
+
+ /// \returns True on success, false on failure.
+ bool EmitPALMetadata(const AMDGPU::PALMD::Metadata &PALMetadata) override;
};
}
diff --git a/lib/Target/AMDGPU/MCTargetDesc/CMakeLists.txt b/lib/Target/AMDGPU/MCTargetDesc/CMakeLists.txt
index 09e3efad10af..f9cb4678dc51 100644
--- a/lib/Target/AMDGPU/MCTargetDesc/CMakeLists.txt
+++ b/lib/Target/AMDGPU/MCTargetDesc/CMakeLists.txt
@@ -1,8 +1,8 @@
add_llvm_library(LLVMAMDGPUDesc
AMDGPUAsmBackend.cpp
- AMDGPUCodeObjectMetadataStreamer.cpp
AMDGPUELFObjectWriter.cpp
AMDGPUELFStreamer.cpp
+ AMDGPUHSAMetadataStreamer.cpp
AMDGPUMCAsmInfo.cpp
AMDGPUMCCodeEmitter.cpp
AMDGPUMCTargetDesc.cpp
diff --git a/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp b/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp
index 376c9bfe5ccf..94c0157edeb5 100644
--- a/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp
+++ b/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp
@@ -278,7 +278,7 @@ void SIMCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS,
return;
// Check for additional literals in SRC0/1/2 (Op 1/2/3)
- for (unsigned i = 0, e = MI.getNumOperands(); i < e; ++i) {
+ for (unsigned i = 0, e = Desc.getNumOperands(); i < e; ++i) {
// Check if this operand should be encoded as [SV]Src
if (!AMDGPU::isSISrcOperand(Desc, i))
diff --git a/lib/Target/AMDGPU/MIMGInstructions.td b/lib/Target/AMDGPU/MIMGInstructions.td
index 06e2c11b0193..30a2df510386 100644
--- a/lib/Target/AMDGPU/MIMGInstructions.td
+++ b/lib/Target/AMDGPU/MIMGInstructions.td
@@ -63,13 +63,13 @@ multiclass MIMG_NoSampler <bits<7> op, string asm> {
class MIMG_Store_Helper <bits<7> op, string asm,
RegisterClass data_rc,
- RegisterClass addr_rc> : MIMG_Helper <
+ RegisterClass addr_rc,
+ string dns = ""> : MIMG_Helper <
(outs),
(ins data_rc:$vdata, addr_rc:$vaddr, SReg_256:$srsrc,
dmask:$dmask, unorm:$unorm, GLC:$glc, slc:$slc,
r128:$r128, tfe:$tfe, lwe:$lwe, da:$da),
- asm#" $vdata, $vaddr, $srsrc$dmask$unorm$glc$slc$r128$tfe$lwe$da"
- >, MIMGe<op> {
+ asm#" $vdata, $vaddr, $srsrc$dmask$unorm$glc$slc$r128$tfe$lwe$da", dns>, MIMGe<op> {
let ssamp = 0;
let mayLoad = 1; // TableGen requires this for matching with the intrinsics
let mayStore = 1;
@@ -81,7 +81,8 @@ class MIMG_Store_Helper <bits<7> op, string asm,
multiclass MIMG_Store_Addr_Helper <bits<7> op, string asm,
RegisterClass data_rc,
int channels> {
- def _V1 : MIMG_Store_Helper <op, asm, data_rc, VGPR_32>,
+ def _V1 : MIMG_Store_Helper <op, asm, data_rc, VGPR_32,
+ !if(!eq(channels, 1), "AMDGPU", "")>,
MIMG_Mask<asm#"_V1", channels>;
def _V2 : MIMG_Store_Helper <op, asm, data_rc, VReg_64>,
MIMG_Mask<asm#"_V2", channels>;
@@ -257,7 +258,11 @@ defm IMAGE_STORE : MIMG_Store <0x00000008, "image_store">;
defm IMAGE_STORE_MIP : MIMG_Store <0x00000009, "image_store_mip">;
//def IMAGE_STORE_PCK : MIMG_NoPattern_ <"image_store_pck", 0x0000000a>;
//def IMAGE_STORE_MIP_PCK : MIMG_NoPattern_ <"image_store_mip_pck", 0x0000000b>;
+
+let mayLoad = 0, mayStore = 0 in {
defm IMAGE_GET_RESINFO : MIMG_NoSampler <0x0000000e, "image_get_resinfo">;
+}
+
defm IMAGE_ATOMIC_SWAP : MIMG_Atomic <mimg<0x0f, 0x10>, "image_atomic_swap">;
defm IMAGE_ATOMIC_CMPSWAP : MIMG_Atomic <mimg<0x10, 0x11>, "image_atomic_cmpswap", VReg_64>;
defm IMAGE_ATOMIC_ADD : MIMG_Atomic <mimg<0x11, 0x12>, "image_atomic_add">;
@@ -331,7 +336,11 @@ defm IMAGE_GATHER4_C_L_O : MIMG_Gather <0x0000005c, "image_gather4_c_l_o">;
defm IMAGE_GATHER4_C_B_O : MIMG_Gather_WQM <0x0000005d, "image_gather4_c_b_o">;
defm IMAGE_GATHER4_C_B_CL_O : MIMG_Gather_WQM <0x0000005e, "image_gather4_c_b_cl_o">;
defm IMAGE_GATHER4_C_LZ_O : MIMG_Gather <0x0000005f, "image_gather4_c_lz_o">;
+
+let mayLoad = 0, mayStore = 0 in {
defm IMAGE_GET_LOD : MIMG_Sampler_WQM <0x00000060, "image_get_lod">;
+}
+
defm IMAGE_SAMPLE_CD : MIMG_Sampler <0x00000068, "image_sample_cd">;
defm IMAGE_SAMPLE_CD_CL : MIMG_Sampler <0x00000069, "image_sample_cd_cl">;
defm IMAGE_SAMPLE_C_CD : MIMG_Sampler <0x0000006a, "image_sample_c_cd">;
@@ -349,7 +358,7 @@ defm IMAGE_SAMPLE_C_CD_CL_O : MIMG_Sampler <0x0000006f, "image_sample_c_cd_cl_o"
/********** ======================= **********/
// Image + sampler
-class SampleRawPattern<SDPatternOperator name, MIMG opcode, ValueType vt> : Pat <
+class SampleRawPattern<SDPatternOperator name, MIMG opcode, ValueType vt> : GCNPat <
(name vt:$addr, v8i32:$rsrc, v4i32:$sampler, i32:$dmask, i32:$unorm,
i32:$r128, i32:$da, i32:$glc, i32:$slc, i32:$tfe, i32:$lwe),
(opcode $addr, $rsrc, $sampler,
@@ -371,7 +380,7 @@ multiclass SampleRawPatterns<SDPatternOperator name, string opcode> {
// 2. Handle v4i32 rsrc type (Register Class for the instruction to be SReg_128).
// 3. Add A16 support when we pass address of half type.
multiclass AMDGCNSamplePattern<SDPatternOperator name, MIMG opcode, ValueType dt, ValueType vt> {
- def : Pat<
+ def : GCNPat<
(dt (name vt:$addr, v8i32:$rsrc, v4i32:$sampler, i32:$dmask, i1:$unorm, i1:$glc,
i1:$slc, i1:$lwe, i1:$da)),
(opcode $addr, $rsrc, $sampler,
@@ -396,7 +405,7 @@ multiclass AMDGCNSamplePatterns<SDPatternOperator name, string opcode> {
}
// Image only
-class ImagePattern<SDPatternOperator name, MIMG opcode, ValueType vt> : Pat <
+class ImagePattern<SDPatternOperator name, MIMG opcode, ValueType vt> : GCNPat <
(name vt:$addr, v8i32:$rsrc, imm:$dmask, imm:$unorm,
imm:$r128, imm:$da, imm:$glc, imm:$slc, imm:$tfe, imm:$lwe),
(opcode $addr, $rsrc,
@@ -411,7 +420,7 @@ multiclass ImagePatterns<SDPatternOperator name, string opcode> {
}
multiclass ImageLoadPattern<SDPatternOperator name, MIMG opcode, ValueType dt, ValueType vt> {
- def : Pat <
+ def : GCNPat <
(dt (name vt:$addr, v8i32:$rsrc, i32:$dmask, i1:$glc, i1:$slc, i1:$lwe,
i1:$da)),
(opcode $addr, $rsrc,
@@ -434,7 +443,7 @@ multiclass ImageLoadPatterns<SDPatternOperator name, string opcode> {
}
multiclass ImageStorePattern<SDPatternOperator name, MIMG opcode, ValueType dt, ValueType vt> {
- def : Pat <
+ def : GCNPat <
(name dt:$data, vt:$addr, v8i32:$rsrc, i32:$dmask, i1:$glc, i1:$slc,
i1:$lwe, i1:$da),
(opcode $data, $addr, $rsrc,
@@ -456,7 +465,7 @@ multiclass ImageStorePatterns<SDPatternOperator name, string opcode> {
defm : ImageStoreDataPatterns<name, !cast<string>(opcode # _V4), v4f32>;
}
-class ImageAtomicPattern<SDPatternOperator name, MIMG opcode, ValueType vt> : Pat <
+class ImageAtomicPattern<SDPatternOperator name, MIMG opcode, ValueType vt> : GCNPat <
(name i32:$vdata, vt:$addr, v8i32:$rsrc, imm:$r128, imm:$da, imm:$slc),
(opcode $vdata, $addr, $rsrc, 1, 1, 1, (as_i1imm $slc), (as_i1imm $r128), 0, 0, (as_i1imm $da))
>;
@@ -467,7 +476,7 @@ multiclass ImageAtomicPatterns<SDPatternOperator name, string opcode> {
def : ImageAtomicPattern<name, !cast<MIMG>(opcode # _V4), v4i32>;
}
-class ImageAtomicCmpSwapPattern<MIMG opcode, ValueType vt> : Pat <
+class ImageAtomicCmpSwapPattern<MIMG opcode, ValueType vt> : GCNPat <
(int_amdgcn_image_atomic_cmpswap i32:$vsrc, i32:$vcmp, vt:$addr, v8i32:$rsrc,
imm:$r128, imm:$da, imm:$slc),
(EXTRACT_SUBREG
@@ -584,34 +593,34 @@ defm : ImageAtomicPatterns<int_amdgcn_image_atomic_inc, "IMAGE_ATOMIC_INC">;
defm : ImageAtomicPatterns<int_amdgcn_image_atomic_dec, "IMAGE_ATOMIC_DEC">;
/* SIsample for simple 1D texture lookup */
-def : Pat <
+def : GCNPat <
(SIsample i32:$addr, v8i32:$rsrc, v4i32:$sampler, imm),
(IMAGE_SAMPLE_V4_V1 $addr, $rsrc, $sampler, 0xf, 0, 0, 0, 0, 0, 0, 0)
>;
-class SamplePattern<SDNode name, MIMG opcode, ValueType vt> : Pat <
+class SamplePattern<SDNode name, MIMG opcode, ValueType vt> : GCNPat <
(name vt:$addr, v8i32:$rsrc, v4i32:$sampler, imm),
(opcode $addr, $rsrc, $sampler, 0xf, 0, 0, 0, 0, 0, 0, 0)
>;
-class SampleRectPattern<SDNode name, MIMG opcode, ValueType vt> : Pat <
+class SampleRectPattern<SDNode name, MIMG opcode, ValueType vt> : GCNPat <
(name vt:$addr, v8i32:$rsrc, v4i32:$sampler, TEX_RECT),
(opcode $addr, $rsrc, $sampler, 0xf, 1, 0, 0, 0, 0, 0, 0)
>;
-class SampleArrayPattern<SDNode name, MIMG opcode, ValueType vt> : Pat <
+class SampleArrayPattern<SDNode name, MIMG opcode, ValueType vt> : GCNPat <
(name vt:$addr, v8i32:$rsrc, v4i32:$sampler, TEX_ARRAY),
(opcode $addr, $rsrc, $sampler, 0xf, 0, 0, 0, 0, 0, 0, 1)
>;
class SampleShadowPattern<SDNode name, MIMG opcode,
- ValueType vt> : Pat <
+ ValueType vt> : GCNPat <
(name vt:$addr, v8i32:$rsrc, v4i32:$sampler, TEX_SHADOW),
(opcode $addr, $rsrc, $sampler, 0xf, 0, 0, 0, 0, 0, 0, 0)
>;
class SampleShadowArrayPattern<SDNode name, MIMG opcode,
- ValueType vt> : Pat <
+ ValueType vt> : GCNPat <
(name vt:$addr, v8i32:$rsrc, v4i32:$sampler, TEX_SHADOW_ARRAY),
(opcode $addr, $rsrc, $sampler, 0xf, 0, 0, 0, 0, 0, 0, 1)
>;
diff --git a/lib/Target/AMDGPU/Processors.td b/lib/Target/AMDGPU/Processors.td
index d30d1d382588..d50dae78e247 100644
--- a/lib/Target/AMDGPU/Processors.td
+++ b/lib/Target/AMDGPU/Processors.td
@@ -1,4 +1,4 @@
-//===-- Processors.td - R600 Processor definitions ------------------------===//
+//===-- Processors.td - AMDGPU Processor definitions ----------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -7,217 +7,6 @@
//
//===----------------------------------------------------------------------===//
-class Proc<string Name, ProcessorItineraries itin, list<SubtargetFeature> Features>
-: Processor<Name, itin, Features>;
-
-//===----------------------------------------------------------------------===//
-// R600
-//===----------------------------------------------------------------------===//
-def : Proc<"r600", R600_VLIW5_Itin,
- [FeatureR600, FeatureVertexCache, FeatureWavefrontSize64]>;
-
-def : Proc<"r630", R600_VLIW5_Itin,
- [FeatureR600, FeatureVertexCache, FeatureWavefrontSize32]>;
-
-def : Proc<"rs880", R600_VLIW5_Itin,
- [FeatureR600, FeatureWavefrontSize16]>;
-
-def : Proc<"rv670", R600_VLIW5_Itin,
- [FeatureR600, FeatureFP64, FeatureVertexCache, FeatureWavefrontSize64]>;
-
-//===----------------------------------------------------------------------===//
-// R700
-//===----------------------------------------------------------------------===//
-
-def : Proc<"rv710", R600_VLIW5_Itin,
- [FeatureR700, FeatureVertexCache, FeatureWavefrontSize32]>;
-
-def : Proc<"rv730", R600_VLIW5_Itin,
- [FeatureR700, FeatureVertexCache, FeatureWavefrontSize32]>;
-
-def : Proc<"rv770", R600_VLIW5_Itin,
- [FeatureR700, FeatureFP64, FeatureVertexCache, FeatureWavefrontSize64]>;
-
-//===----------------------------------------------------------------------===//
-// Evergreen
-//===----------------------------------------------------------------------===//
-
-def : Proc<"cedar", R600_VLIW5_Itin,
- [FeatureEvergreen, FeatureVertexCache, FeatureWavefrontSize32,
- FeatureCFALUBug]>;
-
-def : Proc<"redwood", R600_VLIW5_Itin,
- [FeatureEvergreen, FeatureVertexCache, FeatureWavefrontSize64,
- FeatureCFALUBug]>;
-
-def : Proc<"sumo", R600_VLIW5_Itin,
- [FeatureEvergreen, FeatureWavefrontSize64, FeatureCFALUBug]>;
-
-def : Proc<"juniper", R600_VLIW5_Itin,
- [FeatureEvergreen, FeatureVertexCache, FeatureWavefrontSize64]>;
-
-def : Proc<"cypress", R600_VLIW5_Itin,
- [FeatureEvergreen, FeatureFP64, FeatureVertexCache,
- FeatureWavefrontSize64]>;
-
-//===----------------------------------------------------------------------===//
-// Northern Islands
-//===----------------------------------------------------------------------===//
-
-def : Proc<"barts", R600_VLIW5_Itin,
- [FeatureNorthernIslands, FeatureVertexCache, FeatureCFALUBug]>;
-
-def : Proc<"turks", R600_VLIW5_Itin,
- [FeatureNorthernIslands, FeatureVertexCache, FeatureCFALUBug]>;
-
-def : Proc<"caicos", R600_VLIW5_Itin,
- [FeatureNorthernIslands, FeatureCFALUBug]>;
-
-def : Proc<"cayman", R600_VLIW4_Itin,
- [FeatureNorthernIslands, FeatureFP64, FeatureCaymanISA]>;
-
-//===----------------------------------------------------------------------===//
-// Southern Islands
-//===----------------------------------------------------------------------===//
-
-def : ProcessorModel<"gfx600", SIFullSpeedModel,
- [FeatureISAVersion6_0_0]>;
-
-def : ProcessorModel<"SI", SIFullSpeedModel,
- [FeatureISAVersion6_0_0]
->;
-
-def : ProcessorModel<"tahiti", SIFullSpeedModel,
- [FeatureISAVersion6_0_0]
->;
-
-def : ProcessorModel<"gfx601", SIQuarterSpeedModel,
- [FeatureISAVersion6_0_1]
->;
-
-def : ProcessorModel<"pitcairn", SIQuarterSpeedModel,
- [FeatureISAVersion6_0_1]>;
-
-def : ProcessorModel<"verde", SIQuarterSpeedModel,
- [FeatureISAVersion6_0_1]>;
-
-def : ProcessorModel<"oland", SIQuarterSpeedModel,
- [FeatureISAVersion6_0_1]>;
-
-def : ProcessorModel<"hainan", SIQuarterSpeedModel, [FeatureISAVersion6_0_1]>;
-
-//===----------------------------------------------------------------------===//
-// Sea Islands
-//===----------------------------------------------------------------------===//
-
-def : ProcessorModel<"gfx700", SIQuarterSpeedModel,
- [FeatureISAVersion7_0_0]
->;
-
-def : ProcessorModel<"bonaire", SIQuarterSpeedModel,
- [FeatureISAVersion7_0_0]
->;
-
-def : ProcessorModel<"kaveri", SIQuarterSpeedModel,
- [FeatureISAVersion7_0_0]
->;
-
-def : ProcessorModel<"gfx701", SIFullSpeedModel,
- [FeatureISAVersion7_0_1]
->;
-
-def : ProcessorModel<"hawaii", SIFullSpeedModel,
- [FeatureISAVersion7_0_1]
->;
-
-def : ProcessorModel<"gfx702", SIQuarterSpeedModel,
- [FeatureISAVersion7_0_2]
->;
-
-def : ProcessorModel<"gfx703", SIQuarterSpeedModel,
- [FeatureISAVersion7_0_3]
->;
-
-def : ProcessorModel<"kabini", SIQuarterSpeedModel,
- [FeatureISAVersion7_0_3]
->;
-
-def : ProcessorModel<"mullins", SIQuarterSpeedModel,
- [FeatureISAVersion7_0_3]>;
-
-//===----------------------------------------------------------------------===//
-// Volcanic Islands
-//===----------------------------------------------------------------------===//
-
-def : ProcessorModel<"tonga", SIQuarterSpeedModel,
- [FeatureISAVersion8_0_2]
->;
-
-def : ProcessorModel<"iceland", SIQuarterSpeedModel,
- [FeatureISAVersion8_0_0]
->;
-
-def : ProcessorModel<"carrizo", SIQuarterSpeedModel,
- [FeatureISAVersion8_0_1]
->;
-
-def : ProcessorModel<"fiji", SIQuarterSpeedModel,
- [FeatureISAVersion8_0_3]
->;
-
-def : ProcessorModel<"stoney", SIQuarterSpeedModel,
- [FeatureISAVersion8_1_0]
->;
-
-def : ProcessorModel<"polaris10", SIQuarterSpeedModel,
- [FeatureISAVersion8_0_3]
->;
-
-def : ProcessorModel<"polaris11", SIQuarterSpeedModel,
- [FeatureISAVersion8_0_3]
->;
-
-def : ProcessorModel<"gfx800", SIQuarterSpeedModel,
- [FeatureISAVersion8_0_0]
->;
-
-def : ProcessorModel<"gfx801", SIQuarterSpeedModel,
- [FeatureISAVersion8_0_1]
->;
-
-def : ProcessorModel<"gfx802", SIQuarterSpeedModel,
- [FeatureISAVersion8_0_2]
->;
-
-def : ProcessorModel<"gfx803", SIQuarterSpeedModel,
- [FeatureISAVersion8_0_3]
->;
-
-def : ProcessorModel<"gfx804", SIQuarterSpeedModel,
- [FeatureISAVersion8_0_4]
->;
-
-def : ProcessorModel<"gfx810", SIQuarterSpeedModel,
- [FeatureISAVersion8_1_0]
->;
-
-//===----------------------------------------------------------------------===//
-// GFX9
-//===----------------------------------------------------------------------===//
-
-def : ProcessorModel<"gfx900", SIQuarterSpeedModel,
- [FeatureISAVersion9_0_0]
->;
-
-def : ProcessorModel<"gfx901", SIQuarterSpeedModel,
- [FeatureISAVersion9_0_1]
->;
-
-def : ProcessorModel<"gfx902", SIQuarterSpeedModel,
- [FeatureISAVersion9_0_2]
->;
-
-def : ProcessorModel<"gfx903", SIQuarterSpeedModel,
- [FeatureISAVersion9_0_3]
->;
-
+FIXME: Deleting this file broke buildbots that don't do full rebuilds. This
+file is no longer used by the backend, so it can be deleted once all
+the buildbots update their dependencies.
diff --git a/lib/Target/AMDGPU/R600ClauseMergePass.cpp b/lib/Target/AMDGPU/R600ClauseMergePass.cpp
index fbe45cb222d9..5e1ba6b506da 100644
--- a/lib/Target/AMDGPU/R600ClauseMergePass.cpp
+++ b/lib/Target/AMDGPU/R600ClauseMergePass.cpp
@@ -44,7 +44,6 @@ static bool isCFAlu(const MachineInstr &MI) {
class R600ClauseMergePass : public MachineFunctionPass {
private:
- static char ID;
const R600InstrInfo *TII;
unsigned getCFAluSize(const MachineInstr &MI) const;
@@ -62,6 +61,8 @@ private:
const MachineInstr &LatrCFAlu) const;
public:
+ static char ID;
+
R600ClauseMergePass() : MachineFunctionPass(ID) { }
bool runOnMachineFunction(MachineFunction &MF) override;
@@ -69,8 +70,17 @@ public:
StringRef getPassName() const override;
};
+} // end anonymous namespace
+
+INITIALIZE_PASS_BEGIN(R600ClauseMergePass, DEBUG_TYPE,
+ "R600 Clause Merge", false, false)
+INITIALIZE_PASS_END(R600ClauseMergePass, DEBUG_TYPE,
+ "R600 Clause Merge", false, false)
+
char R600ClauseMergePass::ID = 0;
+char &llvm::R600ClauseMergePassID = R600ClauseMergePass::ID;
+
unsigned R600ClauseMergePass::getCFAluSize(const MachineInstr &MI) const {
assert(isCFAlu(MI));
return MI
@@ -170,7 +180,7 @@ bool R600ClauseMergePass::mergeIfPossible(MachineInstr &RootCFAlu,
}
bool R600ClauseMergePass::runOnMachineFunction(MachineFunction &MF) {
- if (skipFunction(*MF.getFunction()))
+ if (skipFunction(MF.getFunction()))
return false;
const R600Subtarget &ST = MF.getSubtarget<R600Subtarget>();
@@ -205,9 +215,6 @@ StringRef R600ClauseMergePass::getPassName() const {
return "R600 Merge Clause Markers Pass";
}
-} // end anonymous namespace
-
-
llvm::FunctionPass *llvm::createR600ClauseMergePass() {
return new R600ClauseMergePass();
}
diff --git a/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp b/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp
index 00cbd24b84fb..0e788df1c9c0 100644
--- a/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp
+++ b/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp
@@ -1,4 +1,4 @@
-//===-- R600ControlFlowFinalizer.cpp - Finalize Control Flow Inst----------===//
+//===- R600ControlFlowFinalizer.cpp - Finalize Control Flow Inst ----------===//
//
// The LLVM Compiler Infrastructure
//
@@ -9,7 +9,8 @@
//
/// \file
/// This pass turns all control flow pseudo instructions into native ones,
-/// computing their address on the fly ; it also sets STACK_SIZE info.
+/// computing their address on the fly; it also sets STACK_SIZE info.
+//
//===----------------------------------------------------------------------===//
#include "AMDGPU.h"
@@ -29,13 +30,15 @@
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/DebugLoc.h"
+#include "llvm/IR/Function.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
-#include <new>
#include <set>
#include <utility>
#include <vector>
@@ -47,7 +50,6 @@ using namespace llvm;
namespace {
struct CFStack {
-
enum StackItem {
ENTRY = 0,
SUB_ENTRY = 1,
@@ -214,7 +216,7 @@ void CFStack::popLoop() {
class R600ControlFlowFinalizer : public MachineFunctionPass {
private:
- typedef std::pair<MachineInstr *, std::vector<MachineInstr *>> ClauseFile;
+ using ClauseFile = std::pair<MachineInstr *, std::vector<MachineInstr *>>;
enum ControlFlowInstruction {
CF_TC,
@@ -230,7 +232,6 @@ private:
CF_END
};
- static char ID;
const R600InstrInfo *TII = nullptr;
const R600RegisterInfo *TRI = nullptr;
unsigned MaxFetchInst;
@@ -499,6 +500,8 @@ private:
}
public:
+ static char ID;
+
R600ControlFlowFinalizer() : MachineFunctionPass(ID) {}
bool runOnMachineFunction(MachineFunction &MF) override {
@@ -509,14 +512,14 @@ public:
R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
- CFStack CFStack(ST, MF.getFunction()->getCallingConv());
+ CFStack CFStack(ST, MF.getFunction().getCallingConv());
for (MachineFunction::iterator MB = MF.begin(), ME = MF.end(); MB != ME;
++MB) {
MachineBasicBlock &MBB = *MB;
unsigned CfCount = 0;
std::vector<std::pair<unsigned, std::set<MachineInstr *>>> LoopStack;
std::vector<MachineInstr * > IfThenElseStack;
- if (MF.getFunction()->getCallingConv() == CallingConv::AMDGPU_VS) {
+ if (MF.getFunction().getCallingConv() == CallingConv::AMDGPU_VS) {
BuildMI(MBB, MBB.begin(), MBB.findDebugLoc(MBB.begin()),
getHWInstrDesc(CF_CALL_FS));
CfCount++;
@@ -702,9 +705,16 @@ public:
}
};
+} // end anonymous namespace
+
+INITIALIZE_PASS_BEGIN(R600ControlFlowFinalizer, DEBUG_TYPE,
+ "R600 Control Flow Finalizer", false, false)
+INITIALIZE_PASS_END(R600ControlFlowFinalizer, DEBUG_TYPE,
+ "R600 Control Flow Finalizer", false, false)
+
char R600ControlFlowFinalizer::ID = 0;
-} // end anonymous namespace
+char &llvm::R600ControlFlowFinalizerID = R600ControlFlowFinalizer::ID;
FunctionPass *llvm::createR600ControlFlowFinalizer() {
return new R600ControlFlowFinalizer();
diff --git a/lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp b/lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp
index 66def2d29caf..ffea231ee4d0 100644
--- a/lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp
+++ b/lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp
@@ -1,4 +1,4 @@
-//===-- R600ExpandSpecialInstrs.cpp - Expand special instructions ---------===//
+//===- R600ExpandSpecialInstrs.cpp - Expand special instructions ----------===//
//
// The LLVM Compiler Infrastructure
//
@@ -18,27 +18,35 @@
#include "AMDGPUSubtarget.h"
#include "R600Defines.h"
#include "R600InstrInfo.h"
-#include "R600MachineFunctionInfo.h"
#include "R600RegisterInfo.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/Pass.h"
+#include <cassert>
+#include <cstdint>
+#include <iterator>
using namespace llvm;
+#define DEBUG_TYPE "r600-expand-special-instrs"
+
namespace {
class R600ExpandSpecialInstrsPass : public MachineFunctionPass {
private:
- static char ID;
- const R600InstrInfo *TII;
+ const R600InstrInfo *TII = nullptr;
void SetFlagInNewMI(MachineInstr *NewMI, const MachineInstr *OldMI,
unsigned Op);
public:
- R600ExpandSpecialInstrsPass() : MachineFunctionPass(ID),
- TII(nullptr) { }
+ static char ID;
+
+ R600ExpandSpecialInstrsPass() : MachineFunctionPass(ID) {}
bool runOnMachineFunction(MachineFunction &MF) override;
@@ -47,10 +55,17 @@ public:
}
};
-} // End anonymous namespace
+} // end anonymous namespace
+
+INITIALIZE_PASS_BEGIN(R600ExpandSpecialInstrsPass, DEBUG_TYPE,
+ "R600 Expand Special Instrs", false, false)
+INITIALIZE_PASS_END(R600ExpandSpecialInstrsPass, DEBUG_TYPE,
+ "R600ExpandSpecialInstrs", false, false)
char R600ExpandSpecialInstrsPass::ID = 0;
+char &llvm::R600ExpandSpecialInstrsPassID = R600ExpandSpecialInstrsPass::ID;
+
FunctionPass *llvm::createR600ExpandSpecialInstrsPass() {
return new R600ExpandSpecialInstrsPass();
}
@@ -117,7 +132,6 @@ bool R600ExpandSpecialInstrsPass::runOnMachineFunction(MachineFunction &MF) {
continue;
}
case AMDGPU::DOT_4: {
-
const R600RegisterInfo &TRI = TII->getRegisterInfo();
unsigned DstReg = MI.getOperand(0).getReg();
diff --git a/lib/Target/AMDGPU/R600FrameLowering.h b/lib/Target/AMDGPU/R600FrameLowering.h
index 142f70967eda..fe367d73682f 100644
--- a/lib/Target/AMDGPU/R600FrameLowering.h
+++ b/lib/Target/AMDGPU/R600FrameLowering.h
@@ -27,6 +27,10 @@ public:
MachineBasicBlock &MBB) const override {}
int getFrameIndexReference(const MachineFunction &MF, int FI,
unsigned &FrameReg) const override;
+
+ bool hasFP(const MachineFunction &MF) const override {
+ return false;
+ }
};
} // end namespace llvm
diff --git a/lib/Target/AMDGPU/R600ISelLowering.cpp b/lib/Target/AMDGPU/R600ISelLowering.cpp
index 69a63b6941ef..66291d0be4e6 100644
--- a/lib/Target/AMDGPU/R600ISelLowering.cpp
+++ b/lib/Target/AMDGPU/R600ISelLowering.cpp
@@ -211,6 +211,11 @@ R600TargetLowering::R600TargetLowering(const TargetMachine &TM,
setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom);
setOperationAction(ISD::SRA_PARTS, MVT::i32, Custom);
+ if (!Subtarget->hasFMA()) {
+ setOperationAction(ISD::FMA, MVT::f32, Expand);
+ setOperationAction(ISD::FMA, MVT::f64, Expand);
+ }
+
setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 };
@@ -1145,7 +1150,9 @@ SDValue R600TargetLowering::lowerPrivateTruncStore(StoreSDNode *Store,
// Load dword
// TODO: can we be smarter about machine pointer info?
- SDValue Dst = DAG.getLoad(MVT::i32, DL, Chain, Ptr, MachinePointerInfo());
+ MachinePointerInfo PtrInfo(UndefValue::get(
+ Type::getInt32PtrTy(*DAG.getContext(), AMDGPUASI.PRIVATE_ADDRESS)));
+ SDValue Dst = DAG.getLoad(MVT::i32, DL, Chain, Ptr, PtrInfo);
Chain = Dst.getValue(1);
@@ -1184,7 +1191,7 @@ SDValue R600TargetLowering::lowerPrivateTruncStore(StoreSDNode *Store,
// Store dword
// TODO: Can we be smarter about MachinePointerInfo?
- SDValue NewStore = DAG.getStore(Chain, DL, Value, Ptr, MachinePointerInfo());
+ SDValue NewStore = DAG.getStore(Chain, DL, Value, Ptr, PtrInfo);
// If we are part of expanded vector, make our neighbors depend on this store
if (VectorTrunc) {
@@ -1308,39 +1315,39 @@ SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
// return (512 + (kc_bank << 12)
static int
-ConstantAddressBlock(unsigned AddressSpace, AMDGPUAS AMDGPUASI) {
+ConstantAddressBlock(unsigned AddressSpace) {
switch (AddressSpace) {
- case AMDGPUASI.CONSTANT_BUFFER_0:
+ case AMDGPUAS::CONSTANT_BUFFER_0:
return 512;
- case AMDGPUASI.CONSTANT_BUFFER_1:
+ case AMDGPUAS::CONSTANT_BUFFER_1:
return 512 + 4096;
- case AMDGPUASI.CONSTANT_BUFFER_2:
+ case AMDGPUAS::CONSTANT_BUFFER_2:
return 512 + 4096 * 2;
- case AMDGPUASI.CONSTANT_BUFFER_3:
+ case AMDGPUAS::CONSTANT_BUFFER_3:
return 512 + 4096 * 3;
- case AMDGPUASI.CONSTANT_BUFFER_4:
+ case AMDGPUAS::CONSTANT_BUFFER_4:
return 512 + 4096 * 4;
- case AMDGPUASI.CONSTANT_BUFFER_5:
+ case AMDGPUAS::CONSTANT_BUFFER_5:
return 512 + 4096 * 5;
- case AMDGPUASI.CONSTANT_BUFFER_6:
+ case AMDGPUAS::CONSTANT_BUFFER_6:
return 512 + 4096 * 6;
- case AMDGPUASI.CONSTANT_BUFFER_7:
+ case AMDGPUAS::CONSTANT_BUFFER_7:
return 512 + 4096 * 7;
- case AMDGPUASI.CONSTANT_BUFFER_8:
+ case AMDGPUAS::CONSTANT_BUFFER_8:
return 512 + 4096 * 8;
- case AMDGPUASI.CONSTANT_BUFFER_9:
+ case AMDGPUAS::CONSTANT_BUFFER_9:
return 512 + 4096 * 9;
- case AMDGPUASI.CONSTANT_BUFFER_10:
+ case AMDGPUAS::CONSTANT_BUFFER_10:
return 512 + 4096 * 10;
- case AMDGPUASI.CONSTANT_BUFFER_11:
+ case AMDGPUAS::CONSTANT_BUFFER_11:
return 512 + 4096 * 11;
- case AMDGPUASI.CONSTANT_BUFFER_12:
+ case AMDGPUAS::CONSTANT_BUFFER_12:
return 512 + 4096 * 12;
- case AMDGPUASI.CONSTANT_BUFFER_13:
+ case AMDGPUAS::CONSTANT_BUFFER_13:
return 512 + 4096 * 13;
- case AMDGPUASI.CONSTANT_BUFFER_14:
+ case AMDGPUAS::CONSTANT_BUFFER_14:
return 512 + 4096 * 14;
- case AMDGPUASI.CONSTANT_BUFFER_15:
+ case AMDGPUAS::CONSTANT_BUFFER_15:
return 512 + 4096 * 15;
default:
return -1;
@@ -1371,7 +1378,9 @@ SDValue R600TargetLowering::lowerPrivateExtLoad(SDValue Op,
// Load dword
// TODO: can we be smarter about machine pointer info?
- SDValue Read = DAG.getLoad(MVT::i32, DL, Chain, Ptr, MachinePointerInfo());
+ MachinePointerInfo PtrInfo(UndefValue::get(
+ Type::getInt32PtrTy(*DAG.getContext(), AMDGPUASI.PRIVATE_ADDRESS)));
+ SDValue Read = DAG.getLoad(MVT::i32, DL, Chain, Ptr, PtrInfo);
// Get offset within the register.
SDValue ByteIdx = DAG.getNode(ISD::AND, DL, MVT::i32,
@@ -1424,8 +1433,7 @@ SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
return scalarizeVectorLoad(LoadNode, DAG);
}
- int ConstantBlock = ConstantAddressBlock(LoadNode->getAddressSpace(),
- AMDGPUASI);
+ int ConstantBlock = ConstantAddressBlock(LoadNode->getAddressSpace());
if (ConstantBlock > -1 &&
((LoadNode->getExtensionType() == ISD::NON_EXTLOAD) ||
(LoadNode->getExtensionType() == ISD::ZEXTLOAD))) {
diff --git a/lib/Target/AMDGPU/R600InstrFormats.td b/lib/Target/AMDGPU/R600InstrFormats.td
index 68fcc545916a..61106ed42e64 100644
--- a/lib/Target/AMDGPU/R600InstrFormats.td
+++ b/lib/Target/AMDGPU/R600InstrFormats.td
@@ -11,9 +11,18 @@
//
//===----------------------------------------------------------------------===//
+def isR600 : Predicate<"Subtarget->getGeneration() <= R600Subtarget::R700">;
+
+def isR600toCayman : Predicate<
+ "Subtarget->getGeneration() <= R600Subtarget::NORTHERN_ISLANDS">;
+
+class R600Pat<dag pattern, dag result> : AMDGPUPat<pattern, result> {
+ let SubtargetPredicate = isR600toCayman;
+}
+
class InstR600 <dag outs, dag ins, string asm, list<dag> pattern,
- InstrItinClass itin>
- : AMDGPUInst <outs, ins, asm, pattern> {
+ InstrItinClass itin = NoItinerary>
+ : AMDGPUInst <outs, ins, asm, pattern>, PredicateControl {
field bits<64> Inst;
bit Trig = 0;
@@ -31,6 +40,7 @@ class InstR600 <dag outs, dag ins, string asm, list<dag> pattern,
bit IsExport = 0;
bit LDS_1A2D = 0;
+ let SubtargetPredicate = isR600toCayman;
let Namespace = "AMDGPU";
let OutOperandList = outs;
let InOperandList = ins;
diff --git a/lib/Target/AMDGPU/R600InstrInfo.cpp b/lib/Target/AMDGPU/R600InstrInfo.cpp
index c5da5e404200..23e646c8147c 100644
--- a/lib/Target/AMDGPU/R600InstrInfo.cpp
+++ b/lib/Target/AMDGPU/R600InstrInfo.cpp
@@ -30,9 +30,9 @@
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Target/TargetRegisterInfo.h"
-#include "llvm/Target/TargetSubtargetInfo.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
@@ -197,7 +197,7 @@ bool R600InstrInfo::usesVertexCache(unsigned Opcode) const {
bool R600InstrInfo::usesVertexCache(const MachineInstr &MI) const {
const MachineFunction *MF = MI.getParent()->getParent();
- return !AMDGPU::isCompute(MF->getFunction()->getCallingConv()) &&
+ return !AMDGPU::isCompute(MF->getFunction().getCallingConv()) &&
usesVertexCache(MI.getOpcode());
}
@@ -207,7 +207,7 @@ bool R600InstrInfo::usesTextureCache(unsigned Opcode) const {
bool R600InstrInfo::usesTextureCache(const MachineInstr &MI) const {
const MachineFunction *MF = MI.getParent()->getParent();
- return (AMDGPU::isCompute(MF->getFunction()->getCallingConv()) &&
+ return (AMDGPU::isCompute(MF->getFunction().getCallingConv()) &&
usesVertexCache(MI.getOpcode())) ||
usesTextureCache(MI.getOpcode());
}
@@ -1186,10 +1186,8 @@ int R600InstrInfo::getIndirectIndexBegin(const MachineFunction &MF) const {
}
const TargetRegisterClass *IndirectRC = getIndirectAddrRegClass();
- for (MachineRegisterInfo::livein_iterator LI = MRI.livein_begin(),
- LE = MRI.livein_end();
- LI != LE; ++LI) {
- unsigned Reg = LI->first;
+ for (std::pair<unsigned, unsigned> LI : MRI.liveins()) {
+ unsigned Reg = LI.first;
if (TargetRegisterInfo::isVirtualRegister(Reg) ||
!IndirectRC->contains(Reg))
continue;
@@ -1495,3 +1493,21 @@ void R600InstrInfo::clearFlag(MachineInstr &MI, unsigned Operand,
FlagOp.setImm(InstFlags);
}
}
+
+unsigned R600InstrInfo::getAddressSpaceForPseudoSourceKind(
+ PseudoSourceValue::PSVKind Kind) const {
+ switch (Kind) {
+ case PseudoSourceValue::Stack:
+ case PseudoSourceValue::FixedStack:
+ return AMDGPUASI.PRIVATE_ADDRESS;
+ case PseudoSourceValue::ConstantPool:
+ case PseudoSourceValue::GOT:
+ case PseudoSourceValue::JumpTable:
+ case PseudoSourceValue::GlobalValueCallEntry:
+ case PseudoSourceValue::ExternalSymbolCallEntry:
+ case PseudoSourceValue::TargetCustom:
+ return AMDGPUASI.CONSTANT_ADDRESS;
+ }
+ llvm_unreachable("Invalid pseudo source kind");
+ return AMDGPUASI.PRIVATE_ADDRESS;
+}
diff --git a/lib/Target/AMDGPU/R600InstrInfo.h b/lib/Target/AMDGPU/R600InstrInfo.h
index 3b828006807e..abaa37450758 100644
--- a/lib/Target/AMDGPU/R600InstrInfo.h
+++ b/lib/Target/AMDGPU/R600InstrInfo.h
@@ -318,6 +318,9 @@ public:
bool isRegisterLoad(const MachineInstr &MI) const {
return get(MI.getOpcode()).TSFlags & R600InstrFlags::REGISTER_LOAD;
}
+
+ unsigned getAddressSpaceForPseudoSourceKind(
+ PseudoSourceValue::PSVKind Kind) const override;
};
namespace AMDGPU {
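The getAddressSpaceForPseudoSourceKind() override above gives generic code a way to map memory operands backed by pseudo source values (spill slots, constant pools, jump tables) onto R600 address spaces: the stack kinds resolve to PRIVATE_ADDRESS and everything else to CONSTANT_ADDRESS, matching the switch in R600InstrInfo.cpp. A hedged sketch of a hypothetical caller, assuming only the standard MachineMemOperand/PseudoSourceValue API (classifyPSVAddrSpace is an invented name, not part of this patch):

// Hypothetical caller: classify a memory operand whose underlying value is a
// PseudoSourceValue rather than an IR Value.
#include "R600InstrInfo.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/PseudoSourceValue.h"

static unsigned classifyPSVAddrSpace(const llvm::MachineMemOperand &MMO,
                                     const llvm::R600InstrInfo &TII,
                                     unsigned DefaultAS) {
  if (const llvm::PseudoSourceValue *PSV = MMO.getPseudoValue())
    return TII.getAddressSpaceForPseudoSourceKind(PSV->kind());
  return DefaultAS; // plain IR values carry their own address space
}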
diff --git a/lib/Target/AMDGPU/R600Instructions.td b/lib/Target/AMDGPU/R600Instructions.td
index bac557ba989e..801e4e61fca6 100644
--- a/lib/Target/AMDGPU/R600Instructions.td
+++ b/lib/Target/AMDGPU/R600Instructions.td
@@ -15,6 +15,13 @@
include "R600Intrinsics.td"
include "R600InstrFormats.td"
+// FIXME: Should not be arbitrarily split from other R600 inst classes.
+class R600WrapperInst <dag outs, dag ins, string asm = "", list<dag> pattern = []> :
+ AMDGPUInst<outs, ins, asm, pattern>, PredicateControl {
+ let SubtargetPredicate = isR600toCayman;
+}
+
+
class InstR600ISA <dag outs, dag ins, string asm, list<dag> pattern = []> :
InstR600 <outs, ins, asm, pattern, NullALU> {
@@ -38,9 +45,7 @@ class InstFlag<string PM = "printOperand", int Default = 0>
}
// src_sel for ALU src operands, see also ALU_CONST, ALU_PARAM registers
-def SEL : OperandWithDefaultOps <i32, (ops (i32 -1))> {
- let PrintMethod = "printSel";
-}
+def SEL : OperandWithDefaultOps <i32, (ops (i32 -1))>;
def BANK_SWIZZLE : OperandWithDefaultOps <i32, (ops (i32 0))> {
let PrintMethod = "printBankSwizzle";
}
@@ -348,12 +353,6 @@ def vtx_id2_az_extloadi8 : LoadVtxId2 <az_extloadi8>;
def vtx_id2_az_extloadi16 : LoadVtxId2 <az_extloadi16>;
def vtx_id2_load : LoadVtxId2 <load>;
-def isR600 : Predicate<"Subtarget->getGeneration() <= R600Subtarget::R700">;
-
-def isR600toCayman
- : Predicate<
- "Subtarget->getGeneration() <= R600Subtarget::NORTHERN_ISLANDS">;
-
//===----------------------------------------------------------------------===//
// R600 SDNodes
//===----------------------------------------------------------------------===//
@@ -395,7 +394,7 @@ def TEXTURE_FETCH_Type : SDTypeProfile<1, 19, [SDTCisFP<0>]>;
def TEXTURE_FETCH: SDNode<"AMDGPUISD::TEXTURE_FETCH", TEXTURE_FETCH_Type, []>;
multiclass TexPattern<bits<32> TextureOp, Instruction inst, ValueType vt = v4f32> {
-def : Pat<(TEXTURE_FETCH (i32 TextureOp), vt:$SRC_GPR,
+def : R600Pat<(TEXTURE_FETCH (i32 TextureOp), vt:$SRC_GPR,
(i32 imm:$srcx), (i32 imm:$srcy), (i32 imm:$srcz), (i32 imm:$srcw),
(i32 imm:$offsetx), (i32 imm:$offsety), (i32 imm:$offsetz),
(i32 imm:$DST_SEL_X), (i32 imm:$DST_SEL_Y), (i32 imm:$DST_SEL_Z),
@@ -481,7 +480,7 @@ class ExportBufWord1 {
}
multiclass ExportPattern<Instruction ExportInst, bits<8> cf_inst> {
- def : Pat<(R600_EXPORT (v4f32 R600_Reg128:$src), (i32 imm:$base), (i32 imm:$type),
+ def : R600Pat<(R600_EXPORT (v4f32 R600_Reg128:$src), (i32 imm:$base), (i32 imm:$type),
(i32 imm:$swz_x), (i32 imm:$swz_y), (i32 imm:$swz_z), (i32 imm:$swz_w)),
(ExportInst R600_Reg128:$src, imm:$type, imm:$base,
imm:$swz_x, imm:$swz_y, imm:$swz_z, imm:$swz_w, cf_inst, 0)
@@ -492,22 +491,22 @@ multiclass ExportPattern<Instruction ExportInst, bits<8> cf_inst> {
multiclass SteamOutputExportPattern<Instruction ExportInst,
bits<8> buf0inst, bits<8> buf1inst, bits<8> buf2inst, bits<8> buf3inst> {
// Stream0
- def : Pat<(int_r600_store_stream_output (v4f32 R600_Reg128:$src),
+ def : R600Pat<(int_r600_store_stream_output (v4f32 R600_Reg128:$src),
(i32 imm:$arraybase), (i32 0), (i32 imm:$mask)),
(ExportInst R600_Reg128:$src, 0, imm:$arraybase,
4095, imm:$mask, buf0inst, 0)>;
// Stream1
- def : Pat<(int_r600_store_stream_output (v4f32 R600_Reg128:$src),
+ def : R600Pat<(int_r600_store_stream_output (v4f32 R600_Reg128:$src),
(i32 imm:$arraybase), (i32 1), (i32 imm:$mask)),
(ExportInst $src, 0, imm:$arraybase,
4095, imm:$mask, buf1inst, 0)>;
// Stream2
- def : Pat<(int_r600_store_stream_output (v4f32 R600_Reg128:$src),
+ def : R600Pat<(int_r600_store_stream_output (v4f32 R600_Reg128:$src),
(i32 imm:$arraybase), (i32 2), (i32 imm:$mask)),
(ExportInst $src, 0, imm:$arraybase,
4095, imm:$mask, buf2inst, 0)>;
// Stream3
- def : Pat<(int_r600_store_stream_output (v4f32 R600_Reg128:$src),
+ def : R600Pat<(int_r600_store_stream_output (v4f32 R600_Reg128:$src),
(i32 imm:$arraybase), (i32 3), (i32 imm:$mask)),
(ExportInst $src, 0, imm:$arraybase,
4095, imm:$mask, buf3inst, 0)>;
@@ -551,7 +550,7 @@ class ExportBufInst : InstR600ISA<(
def KCACHE : InstFlag<"printKCache">;
-class ALU_CLAUSE<bits<4> inst, string OpName> : AMDGPUInst <(outs),
+class ALU_CLAUSE<bits<4> inst, string OpName> : R600WrapperInst <(outs),
(ins i32imm:$ADDR, i32imm:$KCACHE_BANK0, i32imm:$KCACHE_BANK1,
KCACHE:$KCACHE_MODE0, KCACHE:$KCACHE_MODE1,
i32imm:$KCACHE_ADDR0, i32imm:$KCACHE_ADDR1,
@@ -580,7 +579,7 @@ class CF_WORD0_R600 {
let Word0 = ADDR;
}
-class CF_CLAUSE_R600 <bits<7> inst, dag ins, string AsmPrint> : AMDGPUInst <(outs),
+class CF_CLAUSE_R600 <bits<7> inst, dag ins, string AsmPrint> : R600WrapperInst <(outs),
ins, AsmPrint, [] >, CF_WORD0_R600, CF_WORD1_R600 {
field bits<64> Inst;
bits<4> CNT;
@@ -600,7 +599,7 @@ ins, AsmPrint, [] >, CF_WORD0_R600, CF_WORD1_R600 {
let Inst{63-32} = Word1;
}
-class CF_CLAUSE_EG <bits<8> inst, dag ins, string AsmPrint> : AMDGPUInst <(outs),
+class CF_CLAUSE_EG <bits<8> inst, dag ins, string AsmPrint> : R600WrapperInst <(outs),
ins, AsmPrint, [] >, CF_WORD0_EG, CF_WORD1_EG {
field bits<64> Inst;
@@ -623,7 +622,7 @@ def CF_ALU_CONTINUE : ALU_CLAUSE<13, "ALU_CONTINUE">;
def CF_ALU_BREAK : ALU_CLAUSE<14, "ALU_BREAK">;
def CF_ALU_ELSE_AFTER : ALU_CLAUSE<15, "ALU_ELSE_AFTER">;
-def FETCH_CLAUSE : AMDGPUInst <(outs),
+def FETCH_CLAUSE : R600WrapperInst <(outs),
(ins i32imm:$addr), "Fetch clause starting at $addr:", [] > {
field bits<8> Inst;
bits<8> num;
@@ -631,7 +630,7 @@ def FETCH_CLAUSE : AMDGPUInst <(outs),
let isCodeGenOnly = 1;
}
-def ALU_CLAUSE : AMDGPUInst <(outs),
+def ALU_CLAUSE : R600WrapperInst <(outs),
(ins i32imm:$addr), "ALU clause starting at $addr:", [] > {
field bits<8> Inst;
bits<8> num;
@@ -639,7 +638,7 @@ def ALU_CLAUSE : AMDGPUInst <(outs),
let isCodeGenOnly = 1;
}
-def LITERALS : AMDGPUInst <(outs),
+def LITERALS : R600WrapperInst <(outs),
(ins LITERAL:$literal1, LITERAL:$literal2), "$literal1, $literal2", [] > {
let isCodeGenOnly = 1;
@@ -651,16 +650,68 @@ def LITERALS : AMDGPUInst <(outs),
let Inst{63-32} = literal2;
}
-def PAD : AMDGPUInst <(outs), (ins), "PAD", [] > {
+def PAD : R600WrapperInst <(outs), (ins), "PAD", [] > {
field bits<64> Inst;
}
-let Predicates = [isR600toCayman] in {
-
//===----------------------------------------------------------------------===//
// Common Instructions R600, R700, Evergreen, Cayman
//===----------------------------------------------------------------------===//
+let isCodeGenOnly = 1, isPseudo = 1 in {
+
+let usesCustomInserter = 1 in {
+
+class CLAMP <RegisterClass rc> : AMDGPUShaderInst <
+ (outs rc:$dst),
+ (ins rc:$src0),
+ "CLAMP $dst, $src0",
+ [(set f32:$dst, (AMDGPUclamp f32:$src0))]
+>;
+
+class FABS <RegisterClass rc> : AMDGPUShaderInst <
+ (outs rc:$dst),
+ (ins rc:$src0),
+ "FABS $dst, $src0",
+ [(set f32:$dst, (fabs f32:$src0))]
+>;
+
+class FNEG <RegisterClass rc> : AMDGPUShaderInst <
+ (outs rc:$dst),
+ (ins rc:$src0),
+ "FNEG $dst, $src0",
+ [(set f32:$dst, (fneg f32:$src0))]
+>;
+
+} // usesCustomInserter = 1
+
+multiclass RegisterLoadStore <RegisterClass dstClass, Operand addrClass,
+ ComplexPattern addrPat> {
+let UseNamedOperandTable = 1 in {
+
+ def RegisterLoad : AMDGPUShaderInst <
+ (outs dstClass:$dst),
+ (ins addrClass:$addr, i32imm:$chan),
+ "RegisterLoad $dst, $addr",
+ [(set i32:$dst, (AMDGPUregister_load addrPat:$addr, (i32 timm:$chan)))]
+ > {
+ let isRegisterLoad = 1;
+ }
+
+ def RegisterStore : AMDGPUShaderInst <
+ (outs),
+ (ins dstClass:$val, addrClass:$addr, i32imm:$chan),
+ "RegisterStore $val, $addr",
+ [(AMDGPUregister_store i32:$val, addrPat:$addr, (i32 timm:$chan))]
+ > {
+ let isRegisterStore = 1;
+ }
+}
+}
+
+} // End isCodeGenOnly = 1, isPseudo = 1
+
+
def ADD : R600_2OP_Helper <0x0, "ADD", fadd>;
// Non-IEEE MUL: 0 * anything = 0
def MUL : R600_2OP_Helper <0x1, "MUL NON-IEEE">;
@@ -732,7 +783,7 @@ def MOV : R600_1OP <0x19, "MOV", []>;
// Most DUMMY_CHAINs should be eliminated during legalization, but undef
// values can sneak in some to selection.
let isPseudo = 1, isCodeGenOnly = 1 in {
-def DUMMY_CHAIN : AMDGPUInst <
+def DUMMY_CHAIN : R600WrapperInst <
(outs),
(ins),
"DUMMY_CHAIN",
@@ -743,7 +794,7 @@ def DUMMY_CHAIN : AMDGPUInst <
let isPseudo = 1, isCodeGenOnly = 1, usesCustomInserter = 1 in {
-class MOV_IMM <ValueType vt, Operand immType> : AMDGPUInst <
+class MOV_IMM <ValueType vt, Operand immType> : R600WrapperInst <
(outs R600_Reg32:$dst),
(ins immType:$imm),
"",
@@ -753,20 +804,20 @@ class MOV_IMM <ValueType vt, Operand immType> : AMDGPUInst <
} // end let isPseudo = 1, isCodeGenOnly = 1, usesCustomInserter = 1
def MOV_IMM_I32 : MOV_IMM<i32, i32imm>;
-def : Pat <
+def : R600Pat <
(imm:$val),
(MOV_IMM_I32 imm:$val)
>;
def MOV_IMM_GLOBAL_ADDR : MOV_IMM<iPTR, i32imm>;
-def : Pat <
+def : R600Pat <
(AMDGPUconstdata_ptr tglobaladdr:$addr),
(MOV_IMM_GLOBAL_ADDR tglobaladdr:$addr)
>;
def MOV_IMM_F32 : MOV_IMM<f32, f32imm>;
-def : Pat <
+def : R600Pat <
(fpimm:$val),
(MOV_IMM_F32 fpimm:$val)
>;
@@ -938,7 +989,10 @@ class MULADD_IEEE_Common <bits<5> inst> : R600_3OP <
class FMA_Common <bits<5> inst> : R600_3OP <
inst, "FMA",
[(set f32:$dst, (fma f32:$src0, f32:$src1, f32:$src2))], VecALU
->;
+>
+{
+ let OtherPredicates = [FMA];
+}
class CNDE_Common <bits<5> inst> : R600_3OP <
inst, "CNDE",
@@ -1149,7 +1203,7 @@ def FNEG_R600 : FNEG<R600_Reg32>;
// FIXME: Should be predicated on unsafe fp math.
multiclass DIV_Common <InstR600 recip_ieee> {
-def : Pat<
+def : R600Pat<
(fdiv f32:$src0, f32:$src1),
(MUL_IEEE $src0, (recip_ieee $src1))
>;
@@ -1196,7 +1250,7 @@ let Predicates = [isR600] in {
defm DIV_r600 : DIV_Common<RECIP_IEEE_r600>;
def : POW_Common <LOG_IEEE_r600, EXP_IEEE_r600, MUL>;
- def : Pat<(fsqrt f32:$src), (MUL $src, (RECIPSQRT_CLAMPED_r600 $src))>;
+ def : R600Pat<(fsqrt f32:$src), (MUL $src, (RECIPSQRT_CLAMPED_r600 $src))>;
def : RsqPat<RECIPSQRT_IEEE_r600, f32>;
def R600_ExportSwz : ExportSwzInst {
@@ -1284,11 +1338,11 @@ defm R600_ : RegisterLoadStore <R600_Reg32, FRAMEri, ADDRIndirect>;
// Hardcode channel to 0
// NOTE: LSHR is not available here. LSHR is per family instruction
-def : Pat <
+def : R600Pat <
(i32 (load_private ADDRIndirect:$addr) ),
(R600_RegisterLoad FRAMEri:$addr, (i32 0))
>;
-def : Pat <
+def : R600Pat <
(store_private i32:$val, ADDRIndirect:$addr),
(R600_RegisterStore i32:$val, FRAMEri:$addr, (i32 0))
>;
@@ -1639,7 +1693,7 @@ def R600_INSERT_ELT_V2 : InsertVertical <R600_Reg64Vertical>;
def R600_INSERT_ELT_V4 : InsertVertical <R600_Reg128Vertical>;
class ExtractVerticalPat <Instruction inst, ValueType vec_ty,
- ValueType scalar_ty> : Pat <
+ ValueType scalar_ty> : R600Pat <
(scalar_ty (extractelt vec_ty:$vec, i32:$index)),
(inst $vec, $index)
>;
@@ -1650,7 +1704,7 @@ def : ExtractVerticalPat <R600_EXTRACT_ELT_V4, v4i32, i32>;
def : ExtractVerticalPat <R600_EXTRACT_ELT_V4, v4f32, f32>;
class InsertVerticalPat <Instruction inst, ValueType vec_ty,
- ValueType scalar_ty> : Pat <
+ ValueType scalar_ty> : R600Pat <
(vec_ty (insertelt vec_ty:$vec, scalar_ty:$value, i32:$index)),
(inst $vec, $value, $index)
>;
@@ -1664,9 +1718,11 @@ def : InsertVerticalPat <R600_INSERT_ELT_V4, v4f32, f32>;
// ISel Patterns
//===----------------------------------------------------------------------===//
+let SubtargetPredicate = isR600toCayman in {
+
// CND*_INT Patterns for f32 True / False values
-class CND_INT_f32 <InstR600 cnd, CondCode cc> : Pat <
+class CND_INT_f32 <InstR600 cnd, CondCode cc> : R600Pat <
(selectcc i32:$src0, 0, f32:$src1, f32:$src2, cc),
(cnd $src0, $src1, $src2)
>;
@@ -1676,18 +1732,18 @@ def : CND_INT_f32 <CNDGT_INT, SETGT>;
def : CND_INT_f32 <CNDGE_INT, SETGE>;
//CNDGE_INT extra pattern
-def : Pat <
+def : R600Pat <
(selectcc i32:$src0, -1, i32:$src1, i32:$src2, COND_SGT),
(CNDGE_INT $src0, $src1, $src2)
>;
// KIL Patterns
-def KILP : Pat <
+def KILP : R600Pat <
(int_AMDGPU_kilp),
(MASK_WRITE (KILLGT (f32 ONE), (f32 ZERO)))
>;
-def KIL : Pat <
+def KIL : R600Pat <
(int_AMDGPU_kill f32:$src0),
(MASK_WRITE (KILLGT (f32 ZERO), $src0))
>;
@@ -1736,7 +1792,7 @@ def : BitConvert <v4i32, v4f32, R600_Reg128>;
// DWORDADDR pattern
def : DwordAddrPat <i32, R600_Reg32>;
-} // End isR600toCayman Predicate
+} // End SubtargetPredicate = isR600toCayman
def getLDSNoRetOp : InstrMapping {
let FilterClass = "R600_LDS_1A1D";
diff --git a/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp b/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp
index 502dd3bce97e..4a14d95f1cc4 100644
--- a/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp
+++ b/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp
@@ -1,4 +1,4 @@
-//===--------------------- R600MergeVectorRegisters.cpp -------------------===//
+//===- R600MergeVectorRegisters.cpp ---------------------------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -12,16 +12,16 @@
/// common data and/or have enough undef subreg using swizzle abilities.
///
/// For instance let's consider the following pseudo code :
-/// vreg5<def> = REG_SEQ vreg1, sub0, vreg2, sub1, vreg3, sub2, undef, sub3
+/// %5 = REG_SEQ %1, sub0, %2, sub1, %3, sub2, undef, sub3
/// ...
-/// vreg7<def> = REG_SEQ vreg1, sub0, vreg3, sub1, undef, sub2, vreg4, sub3
-/// (swizzable Inst) vreg7, SwizzleMask : sub0, sub1, sub2, sub3
+/// %7 = REG_SEQ %1, sub0, %3, sub1, undef, sub2, %4, sub3
+/// (swizzable Inst) %7, SwizzleMask : sub0, sub1, sub2, sub3
///
/// is turned into :
-/// vreg5<def> = REG_SEQ vreg1, sub0, vreg2, sub1, vreg3, sub2, undef, sub3
+/// %5 = REG_SEQ %1, sub0, %2, sub1, %3, sub2, undef, sub3
/// ...
-/// vreg7<def> = INSERT_SUBREG vreg4, sub3
-/// (swizzable Inst) vreg7, SwizzleMask : sub0, sub2, sub1, sub3
+/// %7 = INSERT_SUBREG %4, sub3
+/// (swizzable Inst) %7, SwizzleMask : sub0, sub2, sub1, sub3
///
/// This allow regalloc to reduce register pressure for vector registers and
/// to reduce MOV count.
@@ -44,7 +44,7 @@
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/IR/DebugLoc.h"
-#include "llvm/PassAnalysisSupport.h"
+#include "llvm/Pass.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
@@ -98,8 +98,13 @@ public:
class R600VectorRegMerger : public MachineFunctionPass {
private:
+ using InstructionSetMap = DenseMap<unsigned, std::vector<MachineInstr *>>;
+
MachineRegisterInfo *MRI;
- const R600InstrInfo *TII;
+ const R600InstrInfo *TII = nullptr;
+ DenseMap<MachineInstr *, RegSeqInfo> PreviousRegSeq;
+ InstructionSetMap PreviousRegSeqByReg;
+ InstructionSetMap PreviousRegSeqByUndefCount;
bool canSwizzle(const MachineInstr &MI) const;
bool areAllUsesSwizzeable(unsigned Reg) const;
@@ -116,16 +121,10 @@ private:
void RemoveMI(MachineInstr *);
void trackRSI(const RegSeqInfo &RSI);
- typedef DenseMap<unsigned, std::vector<MachineInstr *>> InstructionSetMap;
- DenseMap<MachineInstr *, RegSeqInfo> PreviousRegSeq;
- InstructionSetMap PreviousRegSeqByReg;
- InstructionSetMap PreviousRegSeqByUndefCount;
-
public:
static char ID;
- R600VectorRegMerger() : MachineFunctionPass(ID),
- TII(nullptr) { }
+ R600VectorRegMerger() : MachineFunctionPass(ID) {}
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesCFG();
@@ -143,10 +142,17 @@ public:
bool runOnMachineFunction(MachineFunction &Fn) override;
};
-} // end anonymous namespace.
+} // end anonymous namespace
+
+INITIALIZE_PASS_BEGIN(R600VectorRegMerger, DEBUG_TYPE,
+ "R600 Vector Reg Merger", false, false)
+INITIALIZE_PASS_END(R600VectorRegMerger, DEBUG_TYPE,
+ "R600 Vector Reg Merger", false, false)
char R600VectorRegMerger::ID = 0;
+char &llvm::R600VectorRegMergerID = R600VectorRegMerger::ID;
+
bool R600VectorRegMerger::canSwizzle(const MachineInstr &MI)
const {
if (TII->get(MI.getOpcode()).TSFlags & R600_InstFlag::TEX_INST)
@@ -330,7 +336,7 @@ void R600VectorRegMerger::trackRSI(const RegSeqInfo &RSI) {
}
bool R600VectorRegMerger::runOnMachineFunction(MachineFunction &Fn) {
- if (skipFunction(*Fn.getFunction()))
+ if (skipFunction(Fn.getFunction()))
return false;
const R600Subtarget &ST = Fn.getSubtarget<R600Subtarget>();
diff --git a/lib/Target/AMDGPU/R600Packetizer.cpp b/lib/Target/AMDGPU/R600Packetizer.cpp
index 1cb40938cee7..7340318d2d88 100644
--- a/lib/Target/AMDGPU/R600Packetizer.cpp
+++ b/lib/Target/AMDGPU/R600Packetizer.cpp
@@ -51,7 +51,6 @@ public:
bool runOnMachineFunction(MachineFunction &Fn) override;
};
-char R600Packetizer::ID = 0;
class R600PacketizerList : public VLIWPacketizerList {
private:
@@ -404,6 +403,15 @@ bool R600Packetizer::runOnMachineFunction(MachineFunction &Fn) {
} // end anonymous namespace
+INITIALIZE_PASS_BEGIN(R600Packetizer, DEBUG_TYPE,
+ "R600 Packetizer", false, false)
+INITIALIZE_PASS_END(R600Packetizer, DEBUG_TYPE,
+ "R600 Packetizer", false, false)
+
+char R600Packetizer::ID = 0;
+
+char &llvm::R600PacketizerID = R600Packetizer::ID;
+
llvm::FunctionPass *llvm::createR600Packetizer() {
return new R600Packetizer();
}
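The R600VectorRegMerger and R600Packetizer hunks above both switch to the standard INITIALIZE_PASS boilerplate and export a char& pass ID. As a rough, self-contained sketch of that pattern (ExamplePass and ExamplePassID are illustrative names, not symbols added by this patch):

#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/Pass.h"

using namespace llvm;

#define DEBUG_TYPE "example-pass"

namespace llvm {
// These declarations normally live in a target header such as AMDGPU.h.
void initializeExamplePassPass(PassRegistry &);
extern char &ExamplePassID;
} // end namespace llvm

namespace {
class ExamplePass : public MachineFunctionPass {
public:
  static char ID;
  ExamplePass() : MachineFunctionPass(ID) {
    initializeExamplePassPass(*PassRegistry::getPassRegistry());
  }
  bool runOnMachineFunction(MachineFunction &MF) override { return false; }
};
} // end anonymous namespace

INITIALIZE_PASS(ExamplePass, DEBUG_TYPE, "Example pass", false, false)

char ExamplePass::ID = 0;

// Exporting the ID by reference lets the target pass config schedule the
// pass with addPass(&ExamplePassID) instead of a create*() factory call.
char &llvm::ExamplePassID = ExamplePass::ID;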
diff --git a/lib/Target/AMDGPU/R600Processors.td b/lib/Target/AMDGPU/R600Processors.td
new file mode 100644
index 000000000000..89194dc1bdf6
--- /dev/null
+++ b/lib/Target/AMDGPU/R600Processors.td
@@ -0,0 +1,90 @@
+//===-- R600Processors.td - R600 Processor definitions --------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Radeon HD 2000/3000 Series (R600).
+//===----------------------------------------------------------------------===//
+
+def : Processor<"r600", R600_VLIW5_Itin,
+ [FeatureR600, FeatureWavefrontSize64, FeatureVertexCache]
+>;
+
+def : Processor<"r630", R600_VLIW5_Itin,
+ [FeatureR600, FeatureWavefrontSize32, FeatureVertexCache]
+>;
+
+def : Processor<"rs880", R600_VLIW5_Itin,
+ [FeatureR600, FeatureWavefrontSize16]
+>;
+
+def : Processor<"rv670", R600_VLIW5_Itin,
+ [FeatureR600, FeatureWavefrontSize64, FeatureVertexCache]
+>;
+
+//===----------------------------------------------------------------------===//
+// Radeon HD 4000 Series (R700).
+//===----------------------------------------------------------------------===//
+
+def : Processor<"rv710", R600_VLIW5_Itin,
+ [FeatureR700, FeatureWavefrontSize32, FeatureVertexCache]
+>;
+
+def : Processor<"rv730", R600_VLIW5_Itin,
+ [FeatureR700, FeatureWavefrontSize32, FeatureVertexCache]
+>;
+
+def : Processor<"rv770", R600_VLIW5_Itin,
+ [FeatureR700, FeatureWavefrontSize64, FeatureVertexCache]
+>;
+
+//===----------------------------------------------------------------------===//
+// Radeon HD 5000 Series (Evergreen).
+//===----------------------------------------------------------------------===//
+
+def : Processor<"cedar", R600_VLIW5_Itin,
+ [FeatureEvergreen, FeatureWavefrontSize32, FeatureVertexCache,
+ FeatureCFALUBug]
+>;
+
+def : Processor<"cypress", R600_VLIW5_Itin,
+ [FeatureEvergreen, FeatureWavefrontSize64, FeatureVertexCache, FeatureFMA]
+>;
+
+def : Processor<"juniper", R600_VLIW5_Itin,
+ [FeatureEvergreen, FeatureWavefrontSize64, FeatureVertexCache]
+>;
+
+def : Processor<"redwood", R600_VLIW5_Itin,
+ [FeatureEvergreen, FeatureWavefrontSize64, FeatureVertexCache,
+ FeatureCFALUBug]
+>;
+
+def : Processor<"sumo", R600_VLIW5_Itin,
+ [FeatureEvergreen, FeatureWavefrontSize64, FeatureCFALUBug]
+>;
+
+//===----------------------------------------------------------------------===//
+// Radeon HD 6000 Series (Northern Islands).
+//===----------------------------------------------------------------------===//
+
+def : Processor<"barts", R600_VLIW5_Itin,
+ [FeatureNorthernIslands, FeatureVertexCache, FeatureCFALUBug]
+>;
+
+def : Processor<"caicos", R600_VLIW5_Itin,
+ [FeatureNorthernIslands, FeatureCFALUBug]
+>;
+
+def : Processor<"cayman", R600_VLIW4_Itin,
+ [FeatureNorthernIslands, FeatureCaymanISA, FeatureFMA]
+>;
+
+def : Processor<"turks", R600_VLIW5_Itin,
+ [FeatureNorthernIslands, FeatureVertexCache, FeatureCFALUBug]
+>;
diff --git a/lib/Target/AMDGPU/R600RegisterInfo.td b/lib/Target/AMDGPU/R600RegisterInfo.td
index 3c1e8527284c..84ab328bdb2b 100644
--- a/lib/Target/AMDGPU/R600RegisterInfo.td
+++ b/lib/Target/AMDGPU/R600RegisterInfo.td
@@ -147,6 +147,7 @@ def PRED_SEL_OFF: R600Reg<"Pred_sel_off", 0>;
def PRED_SEL_ZERO : R600Reg<"Pred_sel_zero", 2>;
def PRED_SEL_ONE : R600Reg<"Pred_sel_one", 3>;
def AR_X : R600Reg<"AR.x", 0>;
+def INDIRECT_BASE_ADDR : R600Reg <"INDIRECT_BASE_ADDR", 0>;
def R600_ArrayBase : RegisterClass <"AMDGPU", [f32, i32], 32,
(add (sequence "ArrayBase%u", 448, 480))>;
diff --git a/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp b/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp
index 8cb35c506135..150d8c3dc3d3 100644
--- a/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp
+++ b/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp
@@ -1,4 +1,4 @@
-//===-- SIAnnotateControlFlow.cpp - ------------------===//
+//===- SIAnnotateControlFlow.cpp ------------------------------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -14,16 +14,32 @@
#include "AMDGPU.h"
#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/DivergenceAnalysis.h"
#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Module.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/ValueHandle.h"
#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Local.h"
-#include "llvm/Transforms/Utils/SSAUpdater.h"
+#include <cassert>
+#include <utility>
using namespace llvm;
@@ -32,8 +48,8 @@ using namespace llvm;
namespace {
// Complex types used in this pass
-typedef std::pair<BasicBlock *, Value *> StackEntry;
-typedef SmallVector<StackEntry, 16> StackVector;
+using StackEntry = std::pair<BasicBlock *, Value *>;
+using StackVector = SmallVector<StackEntry, 16>;
class SIAnnotateControlFlow : public FunctionPass {
DivergenceAnalysis *DA;
@@ -89,8 +105,7 @@ class SIAnnotateControlFlow : public FunctionPass {
public:
static char ID;
- SIAnnotateControlFlow():
- FunctionPass(ID) { }
+ SIAnnotateControlFlow() : FunctionPass(ID) {}
bool doInitialization(Module &M) override;
@@ -105,7 +120,6 @@ public:
AU.addPreserved<DominatorTreeWrapperPass>();
FunctionPass::getAnalysisUsage(AU);
}
-
};
} // end anonymous namespace
@@ -186,7 +200,7 @@ bool SIAnnotateControlFlow::isElse(PHINode *Phi) {
// \brief Erase "Phi" if it is not used any more
void SIAnnotateControlFlow::eraseIfUnused(PHINode *Phi) {
- if (llvm::RecursivelyDeleteDeadPHINode(Phi)) {
+ if (RecursivelyDeleteDeadPHINode(Phi)) {
DEBUG(dbgs() << "Erased unused condition phi\n");
}
}
@@ -215,7 +229,6 @@ void SIAnnotateControlFlow::insertElse(BranchInst *Term) {
Value *SIAnnotateControlFlow::handleLoopCondition(
Value *Cond, PHINode *Broken, llvm::Loop *L, BranchInst *Term,
SmallVectorImpl<WeakTrackingVH> &LoopPhiConditions) {
-
// Only search through PHI nodes which are inside the loop. If we try this
// with PHI nodes that are outside of the loop, we end up inserting new PHI
// nodes outside of the loop which depend on values defined inside the loop.
@@ -223,7 +236,6 @@ Value *SIAnnotateControlFlow::handleLoopCondition(
// 'Instruction does not dominate all users!' errors.
PHINode *Phi = nullptr;
if ((Phi = dyn_cast<PHINode>(Cond)) && L->contains(Phi)) {
-
BasicBlock *Parent = Phi->getParent();
PHINode *NewPhi = PHINode::Create(Int64, 0, "loop.phi", &Parent->front());
Value *Ret = NewPhi;
@@ -333,7 +345,7 @@ void SIAnnotateControlFlow::handleLoop(BranchInst *Term) {
Term->setCondition(CallInst::Create(Loop, Arg, "", Term));
- for (WeakTrackingVH Val : reverse(LoopPhiConditions)) {
+ for (WeakTrackingVH Val : llvm::reverse(LoopPhiConditions)) {
if (PHINode *Cond = cast_or_null<PHINode>(Val))
eraseIfUnused(Cond);
}
@@ -360,7 +372,7 @@ void SIAnnotateControlFlow::closeControlFlow(BasicBlock *BB) {
Preds.push_back(Pred);
}
- BB = llvm::SplitBlockPredecessors(BB, Preds, "endcf.split", DT, LI, false);
+ BB = SplitBlockPredecessors(BB, Preds, "endcf.split", DT, LI, false);
}
Value *Exec = popSaved();
diff --git a/lib/Target/AMDGPU/SIDefines.h b/lib/Target/AMDGPU/SIDefines.h
index 3915c0e5bdbe..a9f6069e798a 100644
--- a/lib/Target/AMDGPU/SIDefines.h
+++ b/lib/Target/AMDGPU/SIDefines.h
@@ -67,7 +67,25 @@ enum : uint64_t {
SCALAR_STORE = UINT64_C(1) << 39,
FIXED_SIZE = UINT64_C(1) << 40,
VOPAsmPrefer32Bit = UINT64_C(1) << 41,
- HasFPClamp = UINT64_C(1) << 42
+ VOP3_OPSEL = UINT64_C(1) << 42,
+ maybeAtomic = UINT64_C(1) << 43,
+ renamedInGFX9 = UINT64_C(1) << 44,
+
+ // Is a clamp on FP type.
+ FPClamp = UINT64_C(1) << 45,
+
+ // Is an integer clamp
+ IntClamp = UINT64_C(1) << 46,
+
+ // Clamps lo component of register.
+ ClampLo = UINT64_C(1) << 47,
+
+ // Clamps hi component of register.
+ // ClampLo and ClampHi set for packed clamp.
+ ClampHi = UINT64_C(1) << 48,
+
+ // Is a packed VOP3P instruction.
+ IsPacked = UINT64_C(1) << 49
};
// v_cmp_class_* etc. use a 10-bit mask for what operation is checked.
@@ -137,7 +155,8 @@ namespace SISrcMods {
SEXT = 1 << 0, // Integer sign-extend modifier
NEG_HI = ABS, // Floating-point negate high packed component modifier.
OP_SEL_0 = 1 << 2,
- OP_SEL_1 = 1 << 3
+ OP_SEL_1 = 1 << 3,
+ DST_OP_SEL = 1 << 3 // VOP3 dst op_sel (share mask with OP_SEL_1)
};
}
@@ -175,8 +194,10 @@ namespace EncValues { // Encoding values of enum9/8/7 operands
enum {
SGPR_MIN = 0,
SGPR_MAX = 101,
- TTMP_MIN = 112,
- TTMP_MAX = 123,
+ TTMP_VI_MIN = 112,
+ TTMP_VI_MAX = 123,
+ TTMP_GFX9_MIN = 108,
+ TTMP_GFX9_MAX = 123,
INLINE_INTEGER_C_MIN = 128,
INLINE_INTEGER_C_POSITIVE_MAX = 192, // 64
INLINE_INTEGER_C_MAX = 208,
@@ -349,6 +370,8 @@ enum SDWA9EncValues{
SRC_VGPR_MAX = 255,
SRC_SGPR_MIN = 256,
SRC_SGPR_MAX = 357,
+ SRC_TTMP_MIN = 364,
+ SRC_TTMP_MAX = 379,
};
} // namespace SDWA
@@ -359,7 +382,9 @@ enum SDWA9EncValues{
#define S_00B02C_EXTRA_LDS_SIZE(x) (((x) & 0xFF) << 8)
#define R_00B128_SPI_SHADER_PGM_RSRC1_VS 0x00B128
#define R_00B228_SPI_SHADER_PGM_RSRC1_GS 0x00B228
+#define R_00B328_SPI_SHADER_PGM_RSRC1_ES 0x00B328
#define R_00B428_SPI_SHADER_PGM_RSRC1_HS 0x00B428
+#define R_00B528_SPI_SHADER_PGM_RSRC1_LS 0x00B528
#define R_00B848_COMPUTE_PGM_RSRC1 0x00B848
#define S_00B028_VGPRS(x) (((x) & 0x3F) << 0)
#define S_00B028_SGPRS(x) (((x) & 0x0F) << 6)
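For orientation, the new TSFlags bits above are normally consumed by masking an instruction descriptor's TSFlags. A minimal sketch, assuming these enumerators live in the usual SIInstrFlags namespace from SIDefines.h (the helper names here are illustrative, not necessarily the accessors this patch adds elsewhere):

#include "SIDefines.h"
#include "llvm/CodeGen/MachineInstr.h"
#include <cstdint>

using namespace llvm;

// Is this a packed VOP3P instruction?
static bool isPackedInst(const MachineInstr &MI) {
  return MI.getDesc().TSFlags & SIInstrFlags::IsPacked;
}

// A packed clamp sets both the lo and hi clamp bits.
static bool isPackedClamp(const MachineInstr &MI) {
  uint64_t Flags = MI.getDesc().TSFlags;
  return (Flags & SIInstrFlags::ClampLo) && (Flags & SIInstrFlags::ClampHi);
}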
diff --git a/lib/Target/AMDGPU/SIFixControlFlowLiveIntervals.cpp b/lib/Target/AMDGPU/SIFixControlFlowLiveIntervals.cpp
deleted file mode 100644
index d4d3959658e7..000000000000
--- a/lib/Target/AMDGPU/SIFixControlFlowLiveIntervals.cpp
+++ /dev/null
@@ -1,88 +0,0 @@
-//===-- SIFixControlFlowLiveIntervals.cpp - Fix CF live intervals ---------===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-/// \file
-/// \brief Spilling of EXEC masks used for control flow messes up control flow
-/// lowering, so mark all live intervals associated with CF instructions as
-/// non-spillable.
-///
-//===----------------------------------------------------------------------===//
-
-#include "AMDGPU.h"
-#include "SIInstrInfo.h"
-#include "llvm/CodeGen/LiveIntervalAnalysis.h"
-#include "llvm/CodeGen/MachineFunctionPass.h"
-#include "llvm/CodeGen/MachineRegisterInfo.h"
-
-using namespace llvm;
-
-#define DEBUG_TYPE "si-fix-cf-live-intervals"
-
-namespace {
-
-class SIFixControlFlowLiveIntervals : public MachineFunctionPass {
-public:
- static char ID;
-
-public:
- SIFixControlFlowLiveIntervals() : MachineFunctionPass(ID) {
- initializeSIFixControlFlowLiveIntervalsPass(*PassRegistry::getPassRegistry());
- }
-
- bool runOnMachineFunction(MachineFunction &MF) override;
-
- StringRef getPassName() const override { return "SI Fix CF Live Intervals"; }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<LiveIntervals>();
- AU.setPreservesAll();
- MachineFunctionPass::getAnalysisUsage(AU);
- }
-};
-
-} // End anonymous namespace.
-
-INITIALIZE_PASS_BEGIN(SIFixControlFlowLiveIntervals, DEBUG_TYPE,
- "SI Fix CF Live Intervals", false, false)
-INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
-INITIALIZE_PASS_END(SIFixControlFlowLiveIntervals, DEBUG_TYPE,
- "SI Fix CF Live Intervals", false, false)
-
-char SIFixControlFlowLiveIntervals::ID = 0;
-
-char &llvm::SIFixControlFlowLiveIntervalsID = SIFixControlFlowLiveIntervals::ID;
-
-FunctionPass *llvm::createSIFixControlFlowLiveIntervalsPass() {
- return new SIFixControlFlowLiveIntervals();
-}
-
-bool SIFixControlFlowLiveIntervals::runOnMachineFunction(MachineFunction &MF) {
- LiveIntervals *LIS = &getAnalysis<LiveIntervals>();
-
- for (const MachineBasicBlock &MBB : MF) {
- for (const MachineInstr &MI : MBB) {
- switch (MI.getOpcode()) {
- case AMDGPU::SI_IF:
- case AMDGPU::SI_ELSE:
- case AMDGPU::SI_BREAK:
- case AMDGPU::SI_IF_BREAK:
- case AMDGPU::SI_ELSE_BREAK:
- case AMDGPU::SI_END_CF: {
- unsigned Reg = MI.getOperand(0).getReg();
- LIS->getInterval(Reg).markNotSpillable();
- break;
- }
- default:
- break;
- }
- }
- }
-
- return false;
-}
diff --git a/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
index 0a795c99f94e..8b155c2d2780 100644
--- a/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
+++ b/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
@@ -1,4 +1,4 @@
-//===-- SIFixSGPRCopies.cpp - Remove potential VGPR => SGPR copies --------===//
+//===- SIFixSGPRCopies.cpp - Remove potential VGPR => SGPR copies ---------===//
//
// The LLVM Compiler Infrastructure
//
@@ -14,46 +14,46 @@
/// Register Class <vsrc> is the union of <vgpr> and <sgpr>
///
/// BB0:
-/// %vreg0 <sgpr> = SCALAR_INST
-/// %vreg1 <vsrc> = COPY %vreg0 <sgpr>
+/// %0 <sgpr> = SCALAR_INST
+/// %1 <vsrc> = COPY %0 <sgpr>
/// ...
/// BRANCH %cond BB1, BB2
/// BB1:
-/// %vreg2 <vgpr> = VECTOR_INST
-/// %vreg3 <vsrc> = COPY %vreg2 <vgpr>
+/// %2 <vgpr> = VECTOR_INST
+/// %3 <vsrc> = COPY %2 <vgpr>
/// BB2:
-/// %vreg4 <vsrc> = PHI %vreg1 <vsrc>, <BB#0>, %vreg3 <vrsc>, <BB#1>
-/// %vreg5 <vgpr> = VECTOR_INST %vreg4 <vsrc>
+/// %4 <vsrc> = PHI %1 <vsrc>, <%bb.0>, %3 <vrsc>, <%bb.1>
+/// %5 <vgpr> = VECTOR_INST %4 <vsrc>
///
///
/// The coalescer will begin at BB0 and eliminate its copy, then the resulting
/// code will look like this:
///
/// BB0:
-/// %vreg0 <sgpr> = SCALAR_INST
+/// %0 <sgpr> = SCALAR_INST
/// ...
/// BRANCH %cond BB1, BB2
/// BB1:
-/// %vreg2 <vgpr> = VECTOR_INST
-/// %vreg3 <vsrc> = COPY %vreg2 <vgpr>
+/// %2 <vgpr> = VECTOR_INST
+/// %3 <vsrc> = COPY %2 <vgpr>
/// BB2:
-/// %vreg4 <sgpr> = PHI %vreg0 <sgpr>, <BB#0>, %vreg3 <vsrc>, <BB#1>
-/// %vreg5 <vgpr> = VECTOR_INST %vreg4 <sgpr>
+/// %4 <sgpr> = PHI %0 <sgpr>, <%bb.0>, %3 <vsrc>, <%bb.1>
+/// %5 <vgpr> = VECTOR_INST %4 <sgpr>
///
/// Now that the result of the PHI instruction is an SGPR, the register
-/// allocator is now forced to constrain the register class of %vreg3 to
+/// allocator is now forced to constrain the register class of %3 to
/// <sgpr> so we end up with final code like this:
///
/// BB0:
-/// %vreg0 <sgpr> = SCALAR_INST
+/// %0 <sgpr> = SCALAR_INST
/// ...
/// BRANCH %cond BB1, BB2
/// BB1:
-/// %vreg2 <vgpr> = VECTOR_INST
-/// %vreg3 <sgpr> = COPY %vreg2 <vgpr>
+/// %2 <vgpr> = VECTOR_INST
+/// %3 <sgpr> = COPY %2 <vgpr>
/// BB2:
-/// %vreg4 <sgpr> = PHI %vreg0 <sgpr>, <BB#0>, %vreg3 <sgpr>, <BB#1>
-/// %vreg5 <vgpr> = VECTOR_INST %vreg4 <sgpr>
+/// %4 <sgpr> = PHI %0 <sgpr>, <%bb.0>, %3 <sgpr>, <%bb.1>
+/// %5 <vgpr> = VECTOR_INST %4 <sgpr>
///
/// Now this code contains an illegal copy from a VGPR to an SGPR.
///
@@ -68,14 +68,34 @@
#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
+#include "SIRegisterInfo.h"
#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/MachinePostDominators.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CodeGen.h"
+#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"
+#include <cassert>
+#include <cstdint>
+#include <iterator>
+#include <list>
+#include <map>
+#include <tuple>
+#include <utility>
using namespace llvm;
@@ -89,13 +109,17 @@ static cl::opt<bool> EnableM0Merge(
namespace {
class SIFixSGPRCopies : public MachineFunctionPass {
-
MachineDominatorTree *MDT;
-
+ MachinePostDominatorTree *MPDT;
+ DenseMap<MachineBasicBlock *, SetVector<MachineBasicBlock*>> PDF;
+ void computePDF(MachineFunction * MF);
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+ void printPDF();
+#endif
public:
static char ID;
- SIFixSGPRCopies() : MachineFunctionPass(ID) { }
+ SIFixSGPRCopies() : MachineFunctionPass(ID) {}
bool runOnMachineFunction(MachineFunction &MF) override;
@@ -104,12 +128,14 @@ public:
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<MachineDominatorTree>();
AU.addPreserved<MachineDominatorTree>();
+ AU.addRequired<MachinePostDominatorTree>();
+ AU.addPreserved<MachinePostDominatorTree>();
AU.setPreservesCFG();
MachineFunctionPass::getAnalysisUsage(AU);
}
};
-} // End anonymous namespace
+} // end anonymous namespace
INITIALIZE_PASS_BEGIN(SIFixSGPRCopies, DEBUG_TYPE,
"SI Fix SGPR copies", false, false)
@@ -117,7 +143,6 @@ INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
INITIALIZE_PASS_END(SIFixSGPRCopies, DEBUG_TYPE,
"SI Fix SGPR copies", false, false)
-
char SIFixSGPRCopies::ID = 0;
char &llvm::SIFixSGPRCopiesID = SIFixSGPRCopies::ID;
@@ -287,7 +312,6 @@ static bool phiHasVGPROperands(const MachineInstr &PHI,
const MachineRegisterInfo &MRI,
const SIRegisterInfo *TRI,
const SIInstrInfo *TII) {
-
for (unsigned i = 1; i < PHI.getNumOperands(); i += 2) {
unsigned Reg = PHI.getOperand(i).getReg();
if (TRI->hasVGPRs(MRI.getRegClass(Reg)))
@@ -295,10 +319,10 @@ static bool phiHasVGPROperands(const MachineInstr &PHI,
}
return false;
}
+
static bool phiHasBreakDef(const MachineInstr &PHI,
const MachineRegisterInfo &MRI,
SmallSet<unsigned, 8> &Visited) {
-
for (unsigned i = 1; i < PHI.getNumOperands(); i += 2) {
unsigned Reg = PHI.getOperand(i).getReg();
if (Visited.count(Reg))
@@ -337,6 +361,8 @@ static bool isSafeToFoldImmIntoCopy(const MachineInstr *Copy,
const SIInstrInfo *TII,
unsigned &SMovOp,
int64_t &Imm) {
+ if (Copy->getOpcode() != AMDGPU::COPY)
+ return false;
if (!MoveImm->isMoveImmediate())
return false;
@@ -368,13 +394,12 @@ template <class UnaryPredicate>
bool searchPredecessors(const MachineBasicBlock *MBB,
const MachineBasicBlock *CutOff,
UnaryPredicate Predicate) {
-
if (MBB == CutOff)
return false;
- DenseSet<const MachineBasicBlock*> Visited;
- SmallVector<MachineBasicBlock*, 4> Worklist(MBB->pred_begin(),
- MBB->pred_end());
+ DenseSet<const MachineBasicBlock *> Visited;
+ SmallVector<MachineBasicBlock *, 4> Worklist(MBB->pred_begin(),
+ MBB->pred_end());
while (!Worklist.empty()) {
MachineBasicBlock *MBB = Worklist.pop_back_val();
@@ -392,12 +417,6 @@ bool searchPredecessors(const MachineBasicBlock *MBB,
return false;
}
-static bool predsHasDivergentTerminator(MachineBasicBlock *MBB,
- const TargetRegisterInfo *TRI) {
- return searchPredecessors(MBB, nullptr, [TRI](MachineBasicBlock *MBB) {
- return hasTerminatorThatModifiesExec(*MBB, *TRI); });
-}
-
// Checks if there is potential path From instruction To instruction.
// If CutOff is specified and it sits in between of that path we ignore
// a higher portion of the path and report it is not reachable.
@@ -430,7 +449,7 @@ static bool hoistAndMergeSGPRInits(unsigned Reg,
const MachineRegisterInfo &MRI,
MachineDominatorTree &MDT) {
// List of inits by immediate value.
- typedef std::map<unsigned, std::list<MachineInstr*>> InitListMap;
+ using InitListMap = std::map<unsigned, std::list<MachineInstr *>>;
InitListMap Inits;
// List of clobbering instructions.
SmallVector<MachineInstr*, 8> Clobbers;
@@ -487,16 +506,18 @@ static bool hoistAndMergeSGPRInits(unsigned Reg,
MDT.properlyDominates(Clobber->getParent(), MBBTo));
};
- return (any_of(Clobbers, interferes)) ||
- (any_of(Inits, [&](InitListMap::value_type &C) {
- return C.first != Init.first && any_of(C.second, interferes);
+ return (llvm::any_of(Clobbers, interferes)) ||
+ (llvm::any_of(Inits, [&](InitListMap::value_type &C) {
+ return C.first != Init.first &&
+ llvm::any_of(C.second, interferes);
}));
};
if (MDT.dominates(MI1, MI2)) {
if (!intereferes(MI2, MI1)) {
- DEBUG(dbgs() << "Erasing from BB#" << MI2->getParent()->getNumber()
- << " " << *MI2);
+ DEBUG(dbgs() << "Erasing from "
+ << printMBBReference(*MI2->getParent()) << " "
+ << *MI2);
MI2->eraseFromParent();
Defs.erase(I2++);
Changed = true;
@@ -504,8 +525,9 @@ static bool hoistAndMergeSGPRInits(unsigned Reg,
}
} else if (MDT.dominates(MI2, MI1)) {
if (!intereferes(MI1, MI2)) {
- DEBUG(dbgs() << "Erasing from BB#" << MI1->getParent()->getNumber()
- << " " << *MI1);
+ DEBUG(dbgs() << "Erasing from "
+ << printMBBReference(*MI1->getParent()) << " "
+ << *MI1);
MI1->eraseFromParent();
Defs.erase(I1++);
Changed = true;
@@ -521,10 +543,11 @@ static bool hoistAndMergeSGPRInits(unsigned Reg,
MachineBasicBlock::iterator I = MBB->getFirstNonPHI();
if (!intereferes(MI1, I) && !intereferes(MI2, I)) {
- DEBUG(dbgs() << "Erasing from BB#" << MI1->getParent()->getNumber()
- << " " << *MI1 << "and moving from BB#"
- << MI2->getParent()->getNumber() << " to BB#"
- << I->getParent()->getNumber() << " " << *MI2);
+ DEBUG(dbgs() << "Erasing from "
+ << printMBBReference(*MI1->getParent()) << " " << *MI1
+ << "and moving from "
+ << printMBBReference(*MI2->getParent()) << " to "
+ << printMBBReference(*I->getParent()) << " " << *MI2);
I->getParent()->splice(I, MI2->getParent(), MI2);
MI1->eraseFromParent();
Defs.erase(I1++);
@@ -544,18 +567,52 @@ static bool hoistAndMergeSGPRInits(unsigned Reg,
return Changed;
}
+void SIFixSGPRCopies::computePDF(MachineFunction *MF) {
+ MachineFunction::iterator B = MF->begin();
+ MachineFunction::iterator E = MF->end();
+ for (; B != E; ++B) {
+ if (B->succ_size() > 1) {
+ for (auto S : B->successors()) {
+ MachineDomTreeNode *runner = MPDT->getNode(&*S);
+ MachineDomTreeNode *sentinel = MPDT->getNode(&*B)->getIDom();
+ while (runner && runner != sentinel) {
+ PDF[runner->getBlock()].insert(&*B);
+ runner = runner->getIDom();
+ }
+ }
+ }
+ }
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+void SIFixSGPRCopies::printPDF() {
+ dbgs() << "\n######## PostDominanceFrontiers set #########\n";
+ for (auto &I : PDF) {
+ dbgs() << "PDF[ " << I.first->getNumber() << "] : ";
+ for (auto &J : I.second) {
+ dbgs() << J->getNumber() << ' ';
+ }
+ dbgs() << '\n';
+ }
+ dbgs() << "\n##############################################\n";
+}
+#endif
+
bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {
const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
MachineRegisterInfo &MRI = MF.getRegInfo();
const SIRegisterInfo *TRI = ST.getRegisterInfo();
const SIInstrInfo *TII = ST.getInstrInfo();
MDT = &getAnalysis<MachineDominatorTree>();
+ MPDT = &getAnalysis<MachinePostDominatorTree>();
+ PDF.clear();
+ computePDF(&MF);
+ DEBUG(printPDF());
SmallVector<MachineInstr *, 16> Worklist;
for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
BI != BE; ++BI) {
-
MachineBasicBlock &MBB = *BI;
for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
I != E; ++I) {
@@ -564,7 +621,9 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {
switch (MI.getOpcode()) {
default:
continue;
- case AMDGPU::COPY: {
+ case AMDGPU::COPY:
+ case AMDGPU::WQM:
+ case AMDGPU::WWM: {
// If the destination register is a physical register there isn't really
// much we can do to fix this.
if (!TargetRegisterInfo::isVirtualRegister(MI.getOperand(0).getReg()))
@@ -602,14 +661,27 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {
if (!TRI->isSGPRClass(MRI.getRegClass(Reg)))
break;
- // We don't need to fix the PHI if the common dominator of the
- // two incoming blocks terminates with a uniform branch.
- if (MI.getNumExplicitOperands() == 5) {
- MachineBasicBlock *MBB0 = MI.getOperand(2).getMBB();
- MachineBasicBlock *MBB1 = MI.getOperand(4).getMBB();
-
- if (!predsHasDivergentTerminator(MBB0, TRI) &&
- !predsHasDivergentTerminator(MBB1, TRI)) {
+ // We don't need to fix the PHI if all the source blocks

+ // have no divergent control dependencies.
+ bool HasVGPROperand = phiHasVGPROperands(MI, MRI, TRI, TII);
+ if (!HasVGPROperand) {
+ bool Uniform = true;
+ MachineBasicBlock * Join = MI.getParent();
+ for (auto &O : MI.explicit_operands()) {
+ if (O.isMBB()) {
+ MachineBasicBlock * Source = O.getMBB();
+ SetVector<MachineBasicBlock*> &SourcePDF = PDF[Source];
+ SetVector<MachineBasicBlock*> &JoinPDF = PDF[Join];
+ SetVector<MachineBasicBlock*> CDList;
+ for (auto &I : SourcePDF) {
+ if (!JoinPDF.count(I) || /* back edge */MDT->dominates(Join, I)) {
+ if (hasTerminatorThatModifiesExec(*I, *TRI))
+ Uniform = false;
+ }
+ }
+ }
+ }
+ if (Uniform) {
DEBUG(dbgs() << "Not fixing PHI for uniform branch: " << MI << '\n');
break;
}
@@ -649,14 +721,13 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {
// is no chance for values to be over-written.
SmallSet<unsigned, 8> Visited;
- if (phiHasVGPROperands(MI, MRI, TRI, TII) ||
- !phiHasBreakDef(MI, MRI, Visited)) {
+ if (HasVGPROperand || !phiHasBreakDef(MI, MRI, Visited)) {
DEBUG(dbgs() << "Fixing PHI: " << MI);
TII->moveToVALU(MI);
}
break;
}
- case AMDGPU::REG_SEQUENCE: {
+ case AMDGPU::REG_SEQUENCE:
if (TRI->hasVGPRs(TII->getOpRegClass(MI, 0)) ||
!hasVGPROperands(MI, TRI)) {
foldVGPRCopyIntoRegSequence(MI, TRI, TII, MRI);
@@ -667,7 +738,6 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {
TII->moveToVALU(MI);
break;
- }
case AMDGPU::INSERT_SUBREG: {
const TargetRegisterClass *DstRC, *Src0RC, *Src1RC;
DstRC = MRI.getRegClass(MI.getOperand(0).getReg());
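To make the computePDF()/PHI-uniformity logic above concrete, here is a small worked example, traced directly from the code shown, on a hypothetical diamond CFG (block numbers are illustrative):

//        bb.0   (terminator modifies EXEC, i.e. a divergent branch)
//        /  \
//     bb.1    bb.2
//        \  /
//        bb.3   (PHI joining values from bb.1 and bb.2)
//
// bb.3 immediately post-dominates bb.0, bb.1 and bb.2, so the walk from each
// successor of bb.0 stops at bb.3's node in the post-dominator tree:
//   PDF[bb.1] = { bb.0 }    PDF[bb.2] = { bb.0 }    PDF[bb.3] = { }
//
// For the PHI in bb.3, each source block's PDF contains bb.0, which is not in
// PDF[bb.3]; since bb.0's terminator modifies EXEC, the PHI is treated as
// divergent and moved to the VALU. If bb.0 ended in a uniform scalar branch
// instead, the check would leave the PHI in SGPRs.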
diff --git a/lib/Target/AMDGPU/SIFixWWMLiveness.cpp b/lib/Target/AMDGPU/SIFixWWMLiveness.cpp
new file mode 100644
index 000000000000..3493c7775f0c
--- /dev/null
+++ b/lib/Target/AMDGPU/SIFixWWMLiveness.cpp
@@ -0,0 +1,202 @@
+//===-- SIFixWWMLiveness.cpp - Fix WWM live intervals ---------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief Computations in WWM can overwrite values in inactive channels for
+/// variables that the register allocator thinks are dead. This pass adds fake
+/// uses of those variables to WWM instructions to make sure that they aren't
+/// overwritten.
+///
+/// As an example, consider this snippet:
+/// %vgpr0 = V_MOV_B32_e32 0.0
+/// if (...) {
+/// %vgpr1 = ...
+/// %vgpr2 = WWM killed %vgpr1
+/// ... = killed %vgpr2
+/// %vgpr0 = V_MOV_B32_e32 1.0
+/// }
+/// ... = %vgpr0
+///
+/// The live intervals of %vgpr0 don't overlap with those of %vgpr1. Normally,
+/// we can safely allocate %vgpr0 and %vgpr1 in the same register, since
+/// writing %vgpr1 would only write to channels that would be clobbered by the
+/// second write to %vgpr0 anyways. But if %vgpr1 is written with WWM enabled,
+/// it would clobber even the inactive channels for which the if-condition is
+/// false, for which %vgpr0 is supposed to be 0. This pass adds an implicit use
+/// of %vgpr0 to the WWM instruction to make sure they aren't allocated to the
+/// same register.
+///
+/// In general, we need to figure out which registers might have inactive
+/// channels that are eventually used but could be accidentally clobbered by
+/// a WWM instruction. We approximate this using two conditions:
+///
+/// 1. A definition of the variable reaches the WWM instruction.
+/// 2. The variable would be live at the WWM instruction if all its defs were
+/// partial defs (i.e. considered as a use), ignoring normal uses.
+///
+/// If a register matches both conditions, then we add an implicit use of it to
+/// the WWM instruction. Condition #2 is the heart of the matter: every
+/// definition is really a partial definition, since every VALU instruction is
+/// implicitly predicated. We can usually ignore this, but WWM forces us not
+/// to. Condition #1 prevents false positives if the variable is undefined at
+/// the WWM instruction anyways. This is overly conservative in certain cases,
+/// especially in uniform control flow, but this is a workaround anyways until
+/// LLVM gains the notion of predicated uses and definitions of variables.
+///
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "AMDGPUSubtarget.h"
+#include "SIInstrInfo.h"
+#include "SIRegisterInfo.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/SparseBitVector.h"
+#include "llvm/CodeGen/LiveIntervals.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "si-fix-wwm-liveness"
+
+namespace {
+
+class SIFixWWMLiveness : public MachineFunctionPass {
+private:
+ LiveIntervals *LIS = nullptr;
+ const SIRegisterInfo *TRI;
+ MachineRegisterInfo *MRI;
+
+public:
+ static char ID;
+
+ SIFixWWMLiveness() : MachineFunctionPass(ID) {
+ initializeSIFixWWMLivenessPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ bool runOnWWMInstruction(MachineInstr &MI);
+
+ void addDefs(const MachineInstr &MI, SparseBitVector<> &set);
+
+ StringRef getPassName() const override { return "SI Fix WWM Liveness"; }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ // Should preserve the same set that TwoAddressInstructions does.
+ AU.addPreserved<SlotIndexes>();
+ AU.addPreserved<LiveIntervals>();
+ AU.addPreservedID(LiveVariablesID);
+ AU.addPreservedID(MachineLoopInfoID);
+ AU.addPreservedID(MachineDominatorsID);
+ AU.setPreservesCFG();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+};
+
+} // End anonymous namespace.
+
+INITIALIZE_PASS(SIFixWWMLiveness, DEBUG_TYPE,
+ "SI fix WWM liveness", false, false)
+
+char SIFixWWMLiveness::ID = 0;
+
+char &llvm::SIFixWWMLivenessID = SIFixWWMLiveness::ID;
+
+FunctionPass *llvm::createSIFixWWMLivenessPass() {
+ return new SIFixWWMLiveness();
+}
+
+void SIFixWWMLiveness::addDefs(const MachineInstr &MI, SparseBitVector<> &Regs)
+{
+ for (const MachineOperand &Op : MI.defs()) {
+ if (Op.isReg()) {
+ unsigned Reg = Op.getReg();
+ if (TRI->isVGPR(*MRI, Reg))
+ Regs.set(Reg);
+ }
+ }
+}
+
+bool SIFixWWMLiveness::runOnWWMInstruction(MachineInstr &WWM) {
+ MachineBasicBlock *MBB = WWM.getParent();
+
+ // Compute the registers that are live out of the WWM instruction by
+ // figuring out which defs are reachable from it.
+ SparseBitVector<> LiveOut;
+
+ for (auto II = MachineBasicBlock::iterator(WWM), IE =
+ MBB->end(); II != IE; ++II) {
+ addDefs(*II, LiveOut);
+ }
+
+ for (df_iterator<MachineBasicBlock *> I = ++df_begin(MBB),
+ E = df_end(MBB);
+ I != E; ++I) {
+ for (const MachineInstr &MI : **I) {
+ addDefs(MI, LiveOut);
+ }
+ }
+
+ // Compute the registers whose defs reach the WWM instruction.
+ SparseBitVector<> Reachable;
+
+ for (auto II = ++MachineBasicBlock::reverse_iterator(WWM), IE =
+ MBB->rend(); II != IE; ++II) {
+ addDefs(*II, Reachable);
+ }
+
+ for (idf_iterator<MachineBasicBlock *> I = ++idf_begin(MBB),
+ E = idf_end(MBB);
+ I != E; ++I) {
+ for (const MachineInstr &MI : **I) {
+ addDefs(MI, Reachable);
+ }
+ }
+
+ // Find the intersection and add implicit uses.
+ LiveOut &= Reachable;
+
+ bool Modified = false;
+ for (unsigned Reg : LiveOut) {
+ WWM.addOperand(MachineOperand::CreateReg(Reg, false, /*isImp=*/true));
+ if (LIS) {
+ // FIXME: is there a better way to update the live interval?
+ LIS->removeInterval(Reg);
+ LIS->createAndComputeVirtRegInterval(Reg);
+ }
+ Modified = true;
+ }
+
+ return Modified;
+}
+
+bool SIFixWWMLiveness::runOnMachineFunction(MachineFunction &MF) {
+ bool Modified = false;
+
+ // This doesn't actually need LiveIntervals, but we can preserve them.
+ LIS = getAnalysisIfAvailable<LiveIntervals>();
+
+ const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+ const SIInstrInfo *TII = ST.getInstrInfo();
+
+ TRI = &TII->getRegisterInfo();
+ MRI = &MF.getRegInfo();
+
+ for (MachineBasicBlock &MBB : MF) {
+ for (MachineInstr &MI : MBB) {
+ if (MI.getOpcode() == AMDGPU::EXIT_WWM) {
+ Modified |= runOnWWMInstruction(MI);
+ }
+ }
+ }
+
+ return Modified;
+}
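Concretely, for the snippet in the file header above (the pass itself keys on the EXIT_WWM opcode), the only change is an appended fake operand; a schematic view, with unrelated operands elided:

// %vgpr0 reaches the WWM exit (condition #1) and would be live there if its
// later redefinition were treated as a partial def (condition #2), so the
// pass rewrites
//   %vgpr2 = EXIT_WWM killed %vgpr1
// into
//   %vgpr2 = EXIT_WWM killed %vgpr1, implicit %vgpr0
// which forces the register allocator to keep %vgpr0 and %vgpr1 apart.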
diff --git a/lib/Target/AMDGPU/SIFoldOperands.cpp b/lib/Target/AMDGPU/SIFoldOperands.cpp
index 0aad8f0843d6..783181980342 100644
--- a/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -14,7 +14,7 @@
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/ADT/DepthFirstIterator.h"
-#include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
@@ -290,11 +290,11 @@ void SIFoldOperands::foldOperand(
// copy since a subregister use tied to a full register def doesn't really
// make sense. e.g. don't fold:
//
- // %vreg1 = COPY %vreg0:sub1
- // %vreg2<tied3> = V_MAC_{F16, F32} %vreg3, %vreg4, %vreg1<tied0>
+ // %1 = COPY %0:sub1
+ // %2<tied3> = V_MAC_{F16, F32} %3, %4, %1<tied0>
//
// into
- // %vreg2<tied3> = V_MAC_{F16, F32} %vreg3, %vreg4, %vreg0:sub1<tied0>
+ // %2<tied3> = V_MAC_{F16, F32} %3, %4, %0:sub1<tied0>
if (UseOp.isTied() && OpToFold.getSubReg() != AMDGPU::NoSubRegister)
return;
}
@@ -628,7 +628,7 @@ void SIFoldOperands::foldInstOperand(MachineInstr &MI,
MachineOperand *NonInlineUse = nullptr;
int NonInlineUseOpNo = -1;
- MachineRegisterInfo::use_iterator NextUse, NextInstUse;
+ MachineRegisterInfo::use_iterator NextUse;
for (MachineRegisterInfo::use_iterator
Use = MRI->use_begin(Dst.getReg()), E = MRI->use_end();
Use != E; Use = NextUse) {
@@ -723,12 +723,15 @@ void SIFoldOperands::foldInstOperand(MachineInstr &MI,
}
}
+// Clamp patterns are canonically selected to v_max_* instructions, so only
+// handle them.
const MachineOperand *SIFoldOperands::isClamp(const MachineInstr &MI) const {
unsigned Op = MI.getOpcode();
switch (Op) {
case AMDGPU::V_MAX_F32_e64:
case AMDGPU::V_MAX_F16_e64:
- case AMDGPU::V_MAX_F64: {
+ case AMDGPU::V_MAX_F64:
+ case AMDGPU::V_PK_MAX_F16: {
if (!TII->getNamedOperand(MI, AMDGPU::OpName::clamp)->getImm())
return nullptr;
@@ -736,14 +739,24 @@ const MachineOperand *SIFoldOperands::isClamp(const MachineInstr &MI) const {
const MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
if (!Src0->isReg() || !Src1->isReg() ||
+ Src0->getReg() != Src1->getReg() ||
Src0->getSubReg() != Src1->getSubReg() ||
Src0->getSubReg() != AMDGPU::NoSubRegister)
return nullptr;
// Can't fold up if we have modifiers.
- if (TII->hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers) ||
- TII->hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers) ||
- TII->hasModifiersSet(MI, AMDGPU::OpName::omod))
+ if (TII->hasModifiersSet(MI, AMDGPU::OpName::omod))
+ return nullptr;
+
+ unsigned Src0Mods
+ = TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)->getImm();
+ unsigned Src1Mods
+ = TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers)->getImm();
+
+ // Having a 0 op_sel_hi would require swizzling the output in the source
+ // instruction, which we can't do.
+ unsigned UnsetMods = (Op == AMDGPU::V_PK_MAX_F16) ? SISrcMods::OP_SEL_1 : 0;
+ if (Src0Mods != UnsetMods && Src1Mods != UnsetMods)
return nullptr;
return Src0;
}
@@ -765,14 +778,18 @@ static bool hasOneNonDBGUseInst(const MachineRegisterInfo &MRI, unsigned Reg) {
return true;
}
+// FIXME: Clamp for v_mad_mixhi_f16 handled during isel.
bool SIFoldOperands::tryFoldClamp(MachineInstr &MI) {
const MachineOperand *ClampSrc = isClamp(MI);
if (!ClampSrc || !hasOneNonDBGUseInst(*MRI, ClampSrc->getReg()))
return false;
MachineInstr *Def = MRI->getVRegDef(ClampSrc->getReg());
- if (!TII->hasFPClamp(*Def))
+
+ // The type of clamp must be compatible.
+ if (TII->getClampMask(*Def) != TII->getClampMask(MI))
return false;
+
MachineOperand *DefClamp = TII->getNamedOperand(*Def, AMDGPU::OpName::clamp);
if (!DefClamp)
return false;
@@ -909,7 +926,7 @@ bool SIFoldOperands::tryFoldOMod(MachineInstr &MI) {
}
bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) {
- if (skipFunction(*MF.getFunction()))
+ if (skipFunction(MF.getFunction()))
return false;
MRI = &MF.getRegInfo();
@@ -954,9 +971,9 @@ bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) {
// Prevent folding operands backwards in the function. For example,
// the COPY opcode must not be replaced by 1 in this example:
//
- // %vreg3<def> = COPY %VGPR0; VGPR_32:%vreg3
+ // %3 = COPY %vgpr0; VGPR_32:%3
// ...
- // %VGPR0<def> = V_MOV_B32_e32 1, %EXEC<imp-use>
+ // %vgpr0 = V_MOV_B32_e32 1, implicit %exec
MachineOperand &Dst = MI.getOperand(0);
if (Dst.isReg() &&
!TargetRegisterInfo::isVirtualRegister(Dst.getReg()))
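For context on the isClamp()/tryFoldClamp() changes above, the fold now also covers packed clamps, roughly as sketched below (schematic MIR with modifier operands elided; the packed form is only accepted when both sources are the same register with default op_sel_hi, and the clamp masks of the def and the max must match):

// Before:
//   %a = V_PK_ADD_F16 ..., clamp(0)
//   %b = V_PK_MAX_F16 %a, %a, clamp(1)   ; sole non-debug use of %a
//   ... uses of %b ...
// After tryFoldClamp:
//   %a = V_PK_ADD_F16 ..., clamp(1)
//   ... uses of %a ...                   ; the V_PK_MAX_F16 is erased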
diff --git a/lib/Target/AMDGPU/SIFrameLowering.cpp b/lib/Target/AMDGPU/SIFrameLowering.cpp
index 7334781916d8..89bb98dbd028 100644
--- a/lib/Target/AMDGPU/SIFrameLowering.cpp
+++ b/lib/Target/AMDGPU/SIFrameLowering.cpp
@@ -38,6 +38,7 @@ void SIFrameLowering::emitFlatScratchInit(const SISubtarget &ST,
MachineBasicBlock &MBB) const {
const SIInstrInfo *TII = ST.getInstrInfo();
const SIRegisterInfo* TRI = &TII->getRegisterInfo();
+ const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
// We don't need this if we only have spills since there is no user facing
// scratch.
@@ -55,7 +56,7 @@ void SIFrameLowering::emitFlatScratchInit(const SISubtarget &ST,
MachineBasicBlock::iterator I = MBB.begin();
unsigned FlatScratchInitReg
- = TRI->getPreloadedValue(MF, SIRegisterInfo::FLAT_SCRATCH_INIT);
+ = MFI->getPreloadedReg(AMDGPUFunctionArgInfo::FLAT_SCRATCH_INIT);
MachineRegisterInfo &MRI = MF.getRegInfo();
MRI.addLiveIn(FlatScratchInitReg);
@@ -64,7 +65,6 @@ void SIFrameLowering::emitFlatScratchInit(const SISubtarget &ST,
unsigned FlatScrInitLo = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub0);
unsigned FlatScrInitHi = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub1);
- const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
unsigned ScratchWaveOffsetReg = MFI->getScratchWaveOffsetReg();
// Do a 64-bit pointer add.
@@ -219,7 +219,6 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
// Emit debugger prologue if "amdgpu-debugger-emit-prologue" attribute was
// specified.
const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
- auto AMDGPUASI = ST.getAMDGPUAS();
if (ST.debuggerEmitPrologue())
emitDebuggerPrologue(MF, MBB);
@@ -283,13 +282,13 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
}
// We need to insert initialization of the scratch resource descriptor.
- unsigned PreloadedScratchWaveOffsetReg = TRI->getPreloadedValue(
- MF, SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
+ unsigned PreloadedScratchWaveOffsetReg = MFI->getPreloadedReg(
+ AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
unsigned PreloadedPrivateBufferReg = AMDGPU::NoRegister;
if (ST.isAmdCodeObjectV2(MF)) {
- PreloadedPrivateBufferReg = TRI->getPreloadedValue(
- MF, SIRegisterInfo::PRIVATE_SEGMENT_BUFFER);
+ PreloadedPrivateBufferReg = MFI->getPreloadedReg(
+ AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER);
}
bool OffsetRegUsed = MRI.isPhysRegUsed(ScratchWaveOffsetReg);
@@ -356,7 +355,64 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
.addReg(PreloadedPrivateBufferReg, RegState::Kill);
}
- if (ResourceRegUsed && (ST.isMesaGfxShader(MF) || (PreloadedPrivateBufferReg == AMDGPU::NoRegister))) {
+ if (ResourceRegUsed)
+ emitEntryFunctionScratchSetup(ST, MF, MBB, MFI, I,
+ PreloadedPrivateBufferReg, ScratchRsrcReg);
+}
+
+// Emit scratch setup code for AMDPAL or Mesa, assuming ResourceRegUsed is set.
+void SIFrameLowering::emitEntryFunctionScratchSetup(const SISubtarget &ST,
+ MachineFunction &MF, MachineBasicBlock &MBB, SIMachineFunctionInfo *MFI,
+ MachineBasicBlock::iterator I, unsigned PreloadedPrivateBufferReg,
+ unsigned ScratchRsrcReg) const {
+
+ const SIInstrInfo *TII = ST.getInstrInfo();
+ const SIRegisterInfo *TRI = &TII->getRegisterInfo();
+ DebugLoc DL;
+
+ if (ST.isAmdPalOS()) {
+ // The pointer to the GIT is formed from the offset passed in and either
+ // the amdgpu-git-ptr-high function attribute or the top part of the PC
+ unsigned RsrcLo = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0);
+ unsigned RsrcHi = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1);
+ unsigned Rsrc01 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0_sub1);
+
+ const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32);
+
+ if (MFI->getGITPtrHigh() != 0xffffffff) {
+ BuildMI(MBB, I, DL, SMovB32, RsrcHi)
+ .addImm(MFI->getGITPtrHigh())
+ .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
+ } else {
+ const MCInstrDesc &GetPC64 = TII->get(AMDGPU::S_GETPC_B64);
+ BuildMI(MBB, I, DL, GetPC64, Rsrc01);
+ }
+ BuildMI(MBB, I, DL, SMovB32, RsrcLo)
+ .addReg(AMDGPU::SGPR0) // Low address passed in
+ .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
+
+ // We now have the GIT ptr - now get the scratch descriptor from the entry
+ // at offset 0.
+ PointerType *PtrTy =
+ PointerType::get(Type::getInt64Ty(MF.getFunction().getContext()),
+ AMDGPUAS::CONSTANT_ADDRESS);
+ MachinePointerInfo PtrInfo(UndefValue::get(PtrTy));
+ const MCInstrDesc &LoadDwordX4 = TII->get(AMDGPU::S_LOAD_DWORDX4_IMM);
+ auto MMO = MF.getMachineMemOperand(PtrInfo,
+ MachineMemOperand::MOLoad |
+ MachineMemOperand::MOInvariant |
+ MachineMemOperand::MODereferenceable,
+ 0, 0);
+ BuildMI(MBB, I, DL, LoadDwordX4, ScratchRsrcReg)
+ .addReg(Rsrc01)
+ .addImm(0) // offset
+ .addImm(0) // glc
+ .addReg(ScratchRsrcReg, RegState::ImplicitDefine)
+ .addMemOperand(MMO);
+ return;
+ }
+ if (ST.isMesaGfxShader(MF)
+ || (PreloadedPrivateBufferReg == AMDGPU::NoRegister)) {
assert(!ST.isAmdCodeObjectV2(MF));
const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32);
@@ -369,7 +425,7 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
if (MFI->hasImplicitBufferPtr()) {
unsigned Rsrc01 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0_sub1);
- if (AMDGPU::isCompute(MF.getFunction()->getCallingConv())) {
+ if (AMDGPU::isCompute(MF.getFunction().getCallingConv())) {
const MCInstrDesc &Mov64 = TII->get(AMDGPU::S_MOV_B64);
BuildMI(MBB, I, DL, Mov64, Rsrc01)
@@ -379,8 +435,8 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
const MCInstrDesc &LoadDwordX2 = TII->get(AMDGPU::S_LOAD_DWORDX2_IMM);
PointerType *PtrTy =
- PointerType::get(Type::getInt64Ty(MF.getFunction()->getContext()),
- AMDGPUASI.CONSTANT_ADDRESS);
+ PointerType::get(Type::getInt64Ty(MF.getFunction().getContext()),
+ AMDGPUAS::CONSTANT_ADDRESS);
MachinePointerInfo PtrInfo(UndefValue::get(PtrTy));
auto MMO = MF.getMachineMemOperand(PtrInfo,
MachineMemOperand::MOLoad |
@@ -454,6 +510,15 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF,
.addImm(NumBytes * ST.getWavefrontSize())
.setMIFlag(MachineInstr::FrameSetup);
}
+
+ for (const SIMachineFunctionInfo::SGPRSpillVGPRCSR &Reg
+ : FuncInfo->getSGPRSpillVGPRs()) {
+ if (!Reg.FI.hasValue())
+ continue;
+ TII->storeRegToStackSlot(MBB, MBBI, Reg.VGPR, true,
+ Reg.FI.getValue(), &AMDGPU::VGPR_32RegClass,
+ &TII->getRegisterInfo());
+ }
}
void SIFrameLowering::emitEpilogue(MachineFunction &MF,
@@ -462,6 +527,19 @@ void SIFrameLowering::emitEpilogue(MachineFunction &MF,
if (FuncInfo->isEntryFunction())
return;
+ const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+ const SIInstrInfo *TII = ST.getInstrInfo();
+ MachineBasicBlock::iterator MBBI = MBB.getFirstTerminator();
+
+ for (const SIMachineFunctionInfo::SGPRSpillVGPRCSR &Reg
+ : FuncInfo->getSGPRSpillVGPRs()) {
+ if (!Reg.FI.hasValue())
+ continue;
+ TII->loadRegFromStackSlot(MBB, MBBI, Reg.VGPR,
+ Reg.FI.getValue(), &AMDGPU::VGPR_32RegClass,
+ &TII->getRegisterInfo());
+ }
+
unsigned StackPtrReg = FuncInfo->getStackPtrOffsetReg();
if (StackPtrReg == AMDGPU::NoRegister)
return;
@@ -469,9 +547,6 @@ void SIFrameLowering::emitEpilogue(MachineFunction &MF,
const MachineFrameInfo &MFI = MF.getFrameInfo();
uint32_t NumBytes = MFI.getStackSize();
- const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
- const SIInstrInfo *TII = ST.getInstrInfo();
- MachineBasicBlock::iterator MBBI = MBB.getFirstTerminator();
DebugLoc DL;
// FIXME: Clarify distinction between no set SP and SP. For callee functions,
@@ -575,6 +650,50 @@ void SIFrameLowering::processFunctionBeforeFrameFinalized(
}
}
+void SIFrameLowering::determineCalleeSaves(MachineFunction &MF, BitVector &SavedRegs,
+ RegScavenger *RS) const {
+ TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS);
+ const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+
+ // The SP is specifically managed and we don't want extra spills of it.
+ SavedRegs.reset(MFI->getStackPtrOffsetReg());
+}
+
+MachineBasicBlock::iterator SIFrameLowering::eliminateCallFramePseudoInstr(
+ MachineFunction &MF,
+ MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I) const {
+ int64_t Amount = I->getOperand(0).getImm();
+ if (Amount == 0)
+ return MBB.erase(I);
+
+ const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+ const SIInstrInfo *TII = ST.getInstrInfo();
+ const DebugLoc &DL = I->getDebugLoc();
+ unsigned Opc = I->getOpcode();
+ bool IsDestroy = Opc == TII->getCallFrameDestroyOpcode();
+ uint64_t CalleePopAmount = IsDestroy ? I->getOperand(1).getImm() : 0;
+
+ const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering();
+ if (!TFI->hasReservedCallFrame(MF)) {
+ unsigned Align = getStackAlignment();
+
+ Amount = alignTo(Amount, Align);
+ assert(isUInt<32>(Amount) && "exceeded stack address space size");
+ const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+ unsigned SPReg = MFI->getStackPtrOffsetReg();
+
+ unsigned Op = IsDestroy ? AMDGPU::S_SUB_U32 : AMDGPU::S_ADD_U32;
+ BuildMI(MBB, I, DL, TII->get(Op), SPReg)
+ .addReg(SPReg)
+ .addImm(Amount * ST.getWavefrontSize());
+ } else if (CalleePopAmount != 0) {
+ llvm_unreachable("is this used?");
+ }
+
+ return MBB.erase(I);
+}
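For intuition, a scalar sketch of the adjustment emitted above (the helper name is hypothetical, not part of the patch): the call-frame amount is in per-lane bytes, while the SGPR stack pointer tracks a wave-scaled byte offset, so the immediate is the aligned amount times the wavefront size.

// Illustrative only: mirrors the Amount * ST.getWavefrontSize() immediate above.
// E.g. a 16-byte per-lane call frame on a 64-lane wave moves SP by 1024 bytes.
static uint64_t waveScaledSPDelta(uint64_t PerLaneBytes, unsigned WavefrontSize) {
  return PerLaneBytes * WavefrontSize;
}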
+
void SIFrameLowering::emitDebuggerPrologue(MachineFunction &MF,
MachineBasicBlock &MBB) const {
const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
diff --git a/lib/Target/AMDGPU/SIFrameLowering.h b/lib/Target/AMDGPU/SIFrameLowering.h
index d4dfa1c7eaa8..df6f1632a316 100644
--- a/lib/Target/AMDGPU/SIFrameLowering.h
+++ b/lib/Target/AMDGPU/SIFrameLowering.h
@@ -35,10 +35,18 @@ public:
int getFrameIndexReference(const MachineFunction &MF, int FI,
unsigned &FrameReg) const override;
+ void determineCalleeSaves(MachineFunction &MF, BitVector &SavedRegs,
+ RegScavenger *RS = nullptr) const override;
+
void processFunctionBeforeFrameFinalized(
MachineFunction &MF,
RegScavenger *RS = nullptr) const override;
+ MachineBasicBlock::iterator
+ eliminateCallFramePseudoInstr(MachineFunction &MF,
+ MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI) const override;
+
private:
void emitFlatScratchInit(const SISubtarget &ST,
MachineFunction &MF,
@@ -61,6 +69,12 @@ private:
/// \brief Emits debugger prologue.
void emitDebuggerPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const;
+ // Emit scratch setup code for AMDPAL or Mesa, assuming ResourceRegUsed is set.
+ void emitEntryFunctionScratchSetup(const SISubtarget &ST, MachineFunction &MF,
+ MachineBasicBlock &MBB, SIMachineFunctionInfo *MFI,
+ MachineBasicBlock::iterator I, unsigned PreloadedPrivateBufferReg,
+ unsigned ScratchRsrcReg) const;
+
public:
bool hasFP(const MachineFunction &MF) const override;
bool hasSP(const MachineFunction &MF) const;
diff --git a/lib/Target/AMDGPU/SIISelLowering.cpp b/lib/Target/AMDGPU/SIISelLowering.cpp
index 2356405f0919..50ee88fa635a 100644
--- a/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -32,6 +32,7 @@
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/ADT/Twine.h"
@@ -45,11 +46,14 @@
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/MachineValueType.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
+#include "llvm/CodeGen/TargetCallingConv.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
@@ -70,9 +74,7 @@
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/MathExtras.h"
-#include "llvm/Target/TargetCallingConv.h"
#include "llvm/Target/TargetOptions.h"
-#include "llvm/Target/TargetRegisterInfo.h"
#include <cassert>
#include <cmath>
#include <cstdint>
@@ -83,11 +85,21 @@
using namespace llvm;
+#define DEBUG_TYPE "si-lower"
+
+STATISTIC(NumTailCalls, "Number of tail calls");
+
static cl::opt<bool> EnableVGPRIndexMode(
"amdgpu-vgpr-index-mode",
cl::desc("Use GPR indexing mode instead of movrel for vector indexing"),
cl::init(false));
+static cl::opt<unsigned> AssumeFrameIndexHighZeroBits(
+ "amdgpu-frame-index-zero-bits",
+ cl::desc("High bits of frame index assumed to be zero"),
+ cl::init(5),
+ cl::ReallyHidden);
+
static unsigned findFirstFreeSGPR(CCState &CCInfo) {
unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
for (unsigned Reg = 0; Reg < NumSGPRs; ++Reg) {
@@ -214,6 +226,14 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction(ISD::ADDCARRY, MVT::i32, Legal);
setOperationAction(ISD::SUBCARRY, MVT::i32, Legal);
+#if 0
+ setOperationAction(ISD::ADDCARRY, MVT::i64, Legal);
+ setOperationAction(ISD::SUBCARRY, MVT::i64, Legal);
+#endif
+
+ //setOperationAction(ISD::ADDC, MVT::i64, Expand);
+ //setOperationAction(ISD::SUBC, MVT::i64, Expand);
+
// We only support LOAD/STORE and vector manipulation ops for vectors
// with > 4 elements.
for (MVT VT : {MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32,
@@ -462,6 +482,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i16, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f16, Custom);
+ setOperationAction(ISD::ANY_EXTEND, MVT::v2i32, Expand);
setOperationAction(ISD::ZERO_EXTEND, MVT::v2i32, Expand);
setOperationAction(ISD::SIGN_EXTEND, MVT::v2i32, Expand);
setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Expand);
@@ -496,6 +517,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setTargetDAGCombine(ISD::SCALAR_TO_VECTOR);
setTargetDAGCombine(ISD::ZERO_EXTEND);
setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
+ setTargetDAGCombine(ISD::BUILD_VECTOR);
// All memory operations. Some folding on the pointer operand is done to help
// matching the constant offsets in the addressing modes.
@@ -528,8 +550,7 @@ const SISubtarget *SITargetLowering::getSubtarget() const {
// TargetLowering queries
//===----------------------------------------------------------------------===//
-bool SITargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &,
- EVT) const {
+bool SITargetLowering::isShuffleMaskLegal(ArrayRef<int>, EVT) const {
// SI has some legal vector types, but no legal vector operations. Say no
// shuffles are legal in order to prefer scalarizing some vector operations.
return false;
@@ -537,6 +558,7 @@ bool SITargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &,
bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
const CallInst &CI,
+ MachineFunction &MF,
unsigned IntrID) const {
switch (IntrID) {
case Intrinsic::amdgcn_atomic_inc:
@@ -545,11 +567,12 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
Info.memVT = MVT::getVT(CI.getType());
Info.ptrVal = CI.getOperand(0);
Info.align = 0;
+ Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
const ConstantInt *Vol = dyn_cast<ConstantInt>(CI.getOperand(4));
- Info.vol = !Vol || !Vol->isZero();
- Info.readMem = true;
- Info.writeMem = true;
+ if (!Vol || !Vol->isZero())
+ Info.flags |= MachineMemOperand::MOVolatile;
+
return true;
}
default:
@@ -587,6 +610,26 @@ bool SITargetLowering::isLegalFlatAddressingMode(const AddrMode &AM) const {
return isUInt<12>(AM.BaseOffs) && AM.Scale == 0;
}
+bool SITargetLowering::isLegalGlobalAddressingMode(const AddrMode &AM) const {
+ if (Subtarget->hasFlatGlobalInsts())
+ return isInt<13>(AM.BaseOffs) && AM.Scale == 0;
+
+ if (!Subtarget->hasAddr64() || Subtarget->useFlatForGlobal()) {
+ // Assume we will use FLAT for all global memory accesses
+ // on VI.
+ // FIXME: This assumption is currently wrong. On VI we still use
+ // MUBUF instructions for the r + i addressing mode. As currently
+ // implemented, the MUBUF instructions only work on buffer < 4GB.
+ // It may be possible to support > 4GB buffers with MUBUF instructions,
+ // by setting the stride value in the resource descriptor which would
+ // increase the size limit to (stride * 4GB). However, this is risky,
+ // because it has never been validated.
+ return isLegalFlatAddressingMode(AM);
+ }
+
+ return isLegalMUBUFAddressingMode(AM);
+}
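A quick sanity check of the isInt<13> bound used above for flat global instructions (illustrative helper, not part of the patch): directly encodable immediate offsets span [-4096, 4095].

// Equivalent to isInt<13>(Off), the signed 13-bit offset range assumed above.
static bool fitsGlobalImmOffset(int64_t Off) {
  return Off >= -4096 && Off <= 4095;
}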
+
bool SITargetLowering::isLegalMUBUFAddressingMode(const AddrMode &AM) const {
// MUBUF / MTBUF instructions have a 12-bit unsigned byte offset, and
// additionally can do r + r + i with addr64. 32-bit has more addressing
@@ -624,27 +667,15 @@ bool SITargetLowering::isLegalMUBUFAddressingMode(const AddrMode &AM) const {
bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL,
const AddrMode &AM, Type *Ty,
- unsigned AS) const {
+ unsigned AS, Instruction *I) const {
// No global is ever allowed as a base.
if (AM.BaseGV)
return false;
- if (AS == AMDGPUASI.GLOBAL_ADDRESS) {
- if (Subtarget->getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
- // Assume the we will use FLAT for all global memory accesses
- // on VI.
- // FIXME: This assumption is currently wrong. On VI we still use
- // MUBUF instructions for the r + i addressing mode. As currently
- // implemented, the MUBUF instructions only work on buffer < 4GB.
- // It may be possible to support > 4GB buffers with MUBUF instructions,
- // by setting the stride value in the resource descriptor which would
- // increase the size limit to (stride * 4GB). However, this is risky,
- // because it has never been validated.
- return isLegalFlatAddressingMode(AM);
- }
+ if (AS == AMDGPUASI.GLOBAL_ADDRESS)
+ return isLegalGlobalAddressingMode(AM);
- return isLegalMUBUFAddressingMode(AM);
- } else if (AS == AMDGPUASI.CONSTANT_ADDRESS) {
+ if (AS == AMDGPUASI.CONSTANT_ADDRESS) {
// If the offset isn't a multiple of 4, it probably isn't going to be
// correctly aligned.
// FIXME: Can we get the real alignment here?
@@ -656,7 +687,7 @@ bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL,
// FIXME?: We also need to do this if unaligned, but we don't know the
// alignment here.
if (DL.getTypeStoreSize(Ty) < 4)
- return isLegalMUBUFAddressingMode(AM);
+ return isLegalGlobalAddressingMode(AM);
if (Subtarget->getGeneration() == SISubtarget::SOUTHERN_ISLANDS) {
// SMRD instructions have an 8-bit, dword offset on SI.
@@ -888,18 +919,30 @@ SDValue SITargetLowering::lowerKernArgParameterPtr(SelectionDAG &DAG,
uint64_t Offset) const {
const DataLayout &DL = DAG.getDataLayout();
MachineFunction &MF = DAG.getMachineFunction();
- const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
- unsigned InputPtrReg = TRI->getPreloadedValue(MF,
- SIRegisterInfo::KERNARG_SEGMENT_PTR);
+ const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
+
+ const ArgDescriptor *InputPtrReg;
+ const TargetRegisterClass *RC;
+
+ std::tie(InputPtrReg, RC)
+ = Info->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
MVT PtrVT = getPointerTy(DL, AMDGPUASI.CONSTANT_ADDRESS);
SDValue BasePtr = DAG.getCopyFromReg(Chain, SL,
- MRI.getLiveInVirtReg(InputPtrReg), PtrVT);
+ MRI.getLiveInVirtReg(InputPtrReg->getRegister()), PtrVT);
+
return DAG.getNode(ISD::ADD, SL, PtrVT, BasePtr,
DAG.getConstant(Offset, SL, PtrVT));
}
+SDValue SITargetLowering::getImplicitArgPtr(SelectionDAG &DAG,
+ const SDLoc &SL) const {
+ auto MFI = DAG.getMachineFunction().getInfo<SIMachineFunctionInfo>();
+ uint64_t Offset = getImplicitParameterOffset(MFI, FIRST_IMPLICIT);
+ return lowerKernArgParameterPtr(DAG, SL, DAG.getEntryNode(), Offset);
+}
+
SDValue SITargetLowering::convertArgType(SelectionDAG &DAG, EVT VT, EVT MemVT,
const SDLoc &SL, SDValue Val,
bool Signed,
@@ -991,6 +1034,17 @@ SDValue SITargetLowering::lowerStackParameter(SelectionDAG &DAG, CCValAssign &VA
return ArgValue;
}
+SDValue SITargetLowering::getPreloadedValue(SelectionDAG &DAG,
+ const SIMachineFunctionInfo &MFI,
+ EVT VT,
+ AMDGPUFunctionArgInfo::PreloadedValue PVID) const {
+ const ArgDescriptor *Reg;
+ const TargetRegisterClass *RC;
+
+ std::tie(Reg, RC) = MFI.getPreloadedValue(PVID);
+ return CreateLiveInRegister(DAG, RC, Reg->getRegister(), VT);
+}
+
static void processShaderInputArgs(SmallVectorImpl<ISD::InputArg> &Splits,
CallingConv::ID CallConv,
ArrayRef<ISD::InputArg> Ins,
@@ -1041,29 +1095,131 @@ static void processShaderInputArgs(SmallVectorImpl<ISD::InputArg> &Splits,
}
// Allocate special inputs passed in VGPRs.
-static void allocateSpecialInputVGPRs(CCState &CCInfo,
- MachineFunction &MF,
- const SIRegisterInfo &TRI,
- SIMachineFunctionInfo &Info) {
+static void allocateSpecialEntryInputVGPRs(CCState &CCInfo,
+ MachineFunction &MF,
+ const SIRegisterInfo &TRI,
+ SIMachineFunctionInfo &Info) {
if (Info.hasWorkItemIDX()) {
- unsigned Reg = TRI.getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_X);
+ unsigned Reg = AMDGPU::VGPR0;
MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
+
CCInfo.AllocateReg(Reg);
+ Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg));
}
if (Info.hasWorkItemIDY()) {
- unsigned Reg = TRI.getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_Y);
+ unsigned Reg = AMDGPU::VGPR1;
MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
+
CCInfo.AllocateReg(Reg);
+ Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg));
}
if (Info.hasWorkItemIDZ()) {
- unsigned Reg = TRI.getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_Z);
+ unsigned Reg = AMDGPU::VGPR2;
MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
+
CCInfo.AllocateReg(Reg);
+ Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg));
}
}
+// Try to allocate a VGPR at the end of the argument list, or if no argument
+// VGPRs are left, allocate a stack slot.
+static ArgDescriptor allocateVGPR32Input(CCState &CCInfo) {
+ ArrayRef<MCPhysReg> ArgVGPRs
+ = makeArrayRef(AMDGPU::VGPR_32RegClass.begin(), 32);
+ unsigned RegIdx = CCInfo.getFirstUnallocated(ArgVGPRs);
+ if (RegIdx == ArgVGPRs.size()) {
+ // Spill to stack required.
+ int64_t Offset = CCInfo.AllocateStack(4, 4);
+
+ return ArgDescriptor::createStack(Offset);
+ }
+
+ unsigned Reg = ArgVGPRs[RegIdx];
+ Reg = CCInfo.AllocateReg(Reg);
+ assert(Reg != AMDGPU::NoRegister);
+
+ MachineFunction &MF = CCInfo.getMachineFunction();
+ MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
+ return ArgDescriptor::createRegister(Reg);
+}
+
+static ArgDescriptor allocateSGPR32InputImpl(CCState &CCInfo,
+ const TargetRegisterClass *RC,
+ unsigned NumArgRegs) {
+ ArrayRef<MCPhysReg> ArgSGPRs = makeArrayRef(RC->begin(), 32);
+ unsigned RegIdx = CCInfo.getFirstUnallocated(ArgSGPRs);
+ if (RegIdx == ArgSGPRs.size())
+ report_fatal_error("ran out of SGPRs for arguments");
+
+ unsigned Reg = ArgSGPRs[RegIdx];
+ Reg = CCInfo.AllocateReg(Reg);
+ assert(Reg != AMDGPU::NoRegister);
+
+ MachineFunction &MF = CCInfo.getMachineFunction();
+ MF.addLiveIn(Reg, RC);
+ return ArgDescriptor::createRegister(Reg);
+}
+
+static ArgDescriptor allocateSGPR32Input(CCState &CCInfo) {
+ return allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_32RegClass, 32);
+}
+
+static ArgDescriptor allocateSGPR64Input(CCState &CCInfo) {
+ return allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_64RegClass, 16);
+}
+
+static void allocateSpecialInputVGPRs(CCState &CCInfo,
+ MachineFunction &MF,
+ const SIRegisterInfo &TRI,
+ SIMachineFunctionInfo &Info) {
+ if (Info.hasWorkItemIDX())
+ Info.setWorkItemIDX(allocateVGPR32Input(CCInfo));
+
+ if (Info.hasWorkItemIDY())
+ Info.setWorkItemIDY(allocateVGPR32Input(CCInfo));
+
+ if (Info.hasWorkItemIDZ())
+ Info.setWorkItemIDZ(allocateVGPR32Input(CCInfo));
+}
+
+static void allocateSpecialInputSGPRs(CCState &CCInfo,
+ MachineFunction &MF,
+ const SIRegisterInfo &TRI,
+ SIMachineFunctionInfo &Info) {
+ auto &ArgInfo = Info.getArgInfo();
+
+ // TODO: Unify handling with private memory pointers.
+
+ if (Info.hasDispatchPtr())
+ ArgInfo.DispatchPtr = allocateSGPR64Input(CCInfo);
+
+ if (Info.hasQueuePtr())
+ ArgInfo.QueuePtr = allocateSGPR64Input(CCInfo);
+
+ if (Info.hasKernargSegmentPtr())
+ ArgInfo.KernargSegmentPtr = allocateSGPR64Input(CCInfo);
+
+ if (Info.hasDispatchID())
+ ArgInfo.DispatchID = allocateSGPR64Input(CCInfo);
+
+ // flat_scratch_init is not applicable for non-kernel functions.
+
+ if (Info.hasWorkGroupIDX())
+ ArgInfo.WorkGroupIDX = allocateSGPR32Input(CCInfo);
+
+ if (Info.hasWorkGroupIDY())
+ ArgInfo.WorkGroupIDY = allocateSGPR32Input(CCInfo);
+
+ if (Info.hasWorkGroupIDZ())
+ ArgInfo.WorkGroupIDZ = allocateSGPR32Input(CCInfo);
+
+ if (Info.hasImplicitArgPtr())
+ ArgInfo.ImplicitArgPtr = allocateSGPR64Input(CCInfo);
+}
+
// Allocate special inputs passed in user SGPRs.
static void allocateHSAUserSGPRs(CCState &CCInfo,
MachineFunction &MF,
@@ -1187,20 +1343,38 @@ static void reservePrivateMemoryRegs(const TargetMachine &TM,
if (TM.getOptLevel() == CodeGenOpt::None)
HasStackObjects = true;
+ // For now assume stack access is needed in any callee function, so we need
+ // the scratch registers to pass in.
+ bool RequiresStackAccess = HasStackObjects || MFI.hasCalls();
+
const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
if (ST.isAmdCodeObjectV2(MF)) {
- if (HasStackObjects) {
+ if (RequiresStackAccess) {
// If we have stack objects, we unquestionably need the private buffer
// resource. For the Code Object V2 ABI, this will be the first 4 user
// SGPR inputs. We can reserve those and use them directly.
- unsigned PrivateSegmentBufferReg = TRI.getPreloadedValue(
- MF, SIRegisterInfo::PRIVATE_SEGMENT_BUFFER);
+ unsigned PrivateSegmentBufferReg = Info.getPreloadedReg(
+ AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER);
Info.setScratchRSrcReg(PrivateSegmentBufferReg);
- unsigned PrivateSegmentWaveByteOffsetReg = TRI.getPreloadedValue(
- MF, SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
- Info.setScratchWaveOffsetReg(PrivateSegmentWaveByteOffsetReg);
+ if (MFI.hasCalls()) {
+ // If we have calls, we need to keep the frame register in a register
+ // that won't be clobbered by a call, so ensure it is copied somewhere.
+
+ // This is not a problem for the scratch wave offset, because the same
+ // registers are reserved in all functions.
+
+ // FIXME: Nothing is really ensuring this is a call preserved register,
+ // it's just selected from the end so it happens to be.
+ unsigned ReservedOffsetReg
+ = TRI.reservedPrivateSegmentWaveByteOffsetReg(MF);
+ Info.setScratchWaveOffsetReg(ReservedOffsetReg);
+ } else {
+ unsigned PrivateSegmentWaveByteOffsetReg = Info.getPreloadedReg(
+ AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
+ Info.setScratchWaveOffsetReg(PrivateSegmentWaveByteOffsetReg);
+ }
} else {
unsigned ReservedBufferReg
= TRI.reservedPrivateSegmentBufferReg(MF);
@@ -1223,9 +1397,9 @@ static void reservePrivateMemoryRegs(const TargetMachine &TM,
// offset is still in an input SGPR.
Info.setScratchRSrcReg(ReservedBufferReg);
- if (HasStackObjects) {
- unsigned ScratchWaveOffsetReg = TRI.getPreloadedValue(
- MF, SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
+ if (HasStackObjects && !MFI.hasCalls()) {
+ unsigned ScratchWaveOffsetReg = Info.getPreloadedReg(
+ AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
Info.setScratchWaveOffsetReg(ScratchWaveOffsetReg);
} else {
unsigned ReservedOffsetReg
@@ -1235,6 +1409,50 @@ static void reservePrivateMemoryRegs(const TargetMachine &TM,
}
}
+bool SITargetLowering::supportSplitCSR(MachineFunction *MF) const {
+ const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
+ return !Info->isEntryFunction();
+}
+
+void SITargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
+
+}
+
+void SITargetLowering::insertCopiesSplitCSR(
+ MachineBasicBlock *Entry,
+ const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
+ const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
+
+ const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
+ if (!IStart)
+ return;
+
+ const TargetInstrInfo *TII = Subtarget->getInstrInfo();
+ MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
+ MachineBasicBlock::iterator MBBI = Entry->begin();
+ for (const MCPhysReg *I = IStart; *I; ++I) {
+ const TargetRegisterClass *RC = nullptr;
+ if (AMDGPU::SReg_64RegClass.contains(*I))
+ RC = &AMDGPU::SGPR_64RegClass;
+ else if (AMDGPU::SReg_32RegClass.contains(*I))
+ RC = &AMDGPU::SGPR_32RegClass;
+ else
+ llvm_unreachable("Unexpected register class in CSRsViaCopy!");
+
+ unsigned NewVR = MRI->createVirtualRegister(RC);
+ // Create copy from CSR to a virtual register.
+ Entry->addLiveIn(*I);
+ BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
+ .addReg(*I);
+
+ // Insert the copy-back instructions right before the terminator.
+ for (auto *Exit : Exits)
+ BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
+ TII->get(TargetOpcode::COPY), *I)
+ .addReg(NewVR);
+ }
+}
+
SDValue SITargetLowering::LowerFormalArguments(
SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
@@ -1242,14 +1460,14 @@ SDValue SITargetLowering::LowerFormalArguments(
const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
MachineFunction &MF = DAG.getMachineFunction();
- FunctionType *FType = MF.getFunction()->getFunctionType();
+ FunctionType *FType = MF.getFunction().getFunctionType();
SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
if (Subtarget->isAmdHsaOS() && AMDGPU::isShader(CallConv)) {
- const Function *Fn = MF.getFunction();
+ const Function &Fn = MF.getFunction();
DiagnosticInfoUnsupported NoGraphicsHSA(
- *Fn, "unsupported non-compute shaders with HSA", DL.getDebugLoc());
+ Fn, "unsupported non-compute shaders with HSA", DL.getDebugLoc());
DAG.getContext()->diagnose(NoGraphicsHSA);
return DAG.getEntryNode();
}
@@ -1269,6 +1487,12 @@ SDValue SITargetLowering::LowerFormalArguments(
bool IsKernel = AMDGPU::isKernel(CallConv);
bool IsEntryFunc = AMDGPU::isEntryFunctionCC(CallConv);
+ if (!IsEntryFunc) {
+ // 4 bytes are reserved at offset 0 for the emergency stack slot. Skip over
+ // this when allocating argument fixed offsets.
+ CCInfo.AllocateStack(4, 4);
+ }
+
if (IsShader) {
processShaderInputArgs(Splits, CallConv, Ins, Skipped, FType, Info);
@@ -1285,14 +1509,31 @@ SDValue SITargetLowering::LowerFormalArguments(
// - At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled.
// - If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be
// enabled too.
- if (CallConv == CallingConv::AMDGPU_PS &&
- ((Info->getPSInputAddr() & 0x7F) == 0 ||
- ((Info->getPSInputAddr() & 0xF) == 0 &&
- Info->isPSInputAllocated(11)))) {
- CCInfo.AllocateReg(AMDGPU::VGPR0);
- CCInfo.AllocateReg(AMDGPU::VGPR1);
- Info->markPSInputAllocated(0);
- Info->markPSInputEnabled(0);
+ if (CallConv == CallingConv::AMDGPU_PS) {
+ if ((Info->getPSInputAddr() & 0x7F) == 0 ||
+ ((Info->getPSInputAddr() & 0xF) == 0 &&
+ Info->isPSInputAllocated(11))) {
+ CCInfo.AllocateReg(AMDGPU::VGPR0);
+ CCInfo.AllocateReg(AMDGPU::VGPR1);
+ Info->markPSInputAllocated(0);
+ Info->markPSInputEnabled(0);
+ }
+ if (Subtarget->isAmdPalOS()) {
+ // For isAmdPalOS, the user does not enable some bits after compilation
+ // based on run-time states; the register values being generated here are
+ // the final ones set in hardware. Therefore we need to apply the
+ // workaround to PSInputAddr and PSInputEnable together. (The case where
+ // a bit is set in PSInputAddr but not PSInputEnable is where the
+ // frontend set up an input arg for a particular interpolation mode, but
+ // nothing uses that input arg. Really we should have an earlier pass
+ // that removes such an arg.)
+ unsigned PsInputBits = Info->getPSInputAddr() & Info->getPSInputEnable();
+ if ((PsInputBits & 0x7F) == 0 ||
+ ((PsInputBits & 0xF) == 0 &&
+ (PsInputBits >> 11 & 1)))
+ Info->markPSInputEnabled(
+ countTrailingZeros(Info->getPSInputAddr(), ZB_Undefined));
+ }
}
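A standalone sketch of the bit test above (hypothetical helper; assumes PSInputAddr is nonzero): Bits is the set of inputs both requested and enabled, and if no PERSP_*/LINEAR_* bit survives, or only POS_W_FLOAT does without any PERSP_* bit, the lowest requested input is force-enabled.

// Mirrors the PSInputEnable workaround above; illustrative only.
static int pickPSInputToForceEnable(unsigned PSInputAddr, unsigned PSInputEnable) {
  unsigned Bits = PSInputAddr & PSInputEnable;
  if ((Bits & 0x7F) == 0 || ((Bits & 0xF) == 0 && ((Bits >> 11) & 1)))
    return countTrailingZeros(PSInputAddr); // lowest requested input index
  return -1;                                // nothing needs forcing
}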
assert(!Info->hasDispatchPtr() &&
@@ -1308,7 +1549,7 @@ SDValue SITargetLowering::LowerFormalArguments(
}
if (IsEntryFunc) {
- allocateSpecialInputVGPRs(CCInfo, MF, *TRI, *Info);
+ allocateSpecialEntryInputVGPRs(CCInfo, MF, *TRI, *Info);
allocateHSAUserSGPRs(CCInfo, MF, *TRI, *Info);
}
@@ -1375,6 +1616,17 @@ SDValue SITargetLowering::LowerFormalArguments(
Reg = MF.addLiveIn(Reg, RC);
SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, VT);
+ if (Arg.Flags.isSRet() && !getSubtarget()->enableHugePrivateBuffer()) {
+ // The return object should be reasonably addressable.
+
+ // FIXME: This helps when the return is a real sret. If it is an
+ // automatically inserted sret (i.e. CanLowerReturn returns false), an
+ // extra copy is inserted in SelectionDAGBuilder which obscures this.
+ unsigned NumBits = 32 - AssumeFrameIndexHighZeroBits;
+ Val = DAG.getNode(ISD::AssertZext, DL, VT, Val,
+ DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), NumBits)));
+ }
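For intuition (illustrative helper, not in the patch): with the default amdgpu-frame-index-zero-bits value of 5, NumBits is 27, so the AssertZext above asserts that sret frame addresses stay below 1 << 27, i.e. within 128 MiB of private memory.

// Largest frame-index value the AssertZext above still permits.
static uint64_t maxAssertedFrameIndexValue(unsigned AssumedHighZeroBits) {
  unsigned NumBits = 32 - AssumedHighZeroBits; // 27 with the default of 5
  return (1ull << NumBits) - 1;                // 0x7FFFFFF
}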
+
// If this is an 8 or 16-bit value, it is really passed promoted
// to 32 bits. Insert an assert[sz]ext to capture this, then
// truncate to the right size.
@@ -1427,6 +1679,11 @@ SDValue SITargetLowering::LowerFormalArguments(
InVals.push_back(Val);
}
+ if (!IsEntryFunc) {
+ // Special inputs come after user arguments.
+ allocateSpecialInputVGPRs(CCInfo, MF, *TRI, *Info);
+ }
+
// Start adding system SGPRs.
if (IsEntryFunc) {
allocateSystemSGPRs(CCInfo, MF, *Info, CallConv, IsShader);
@@ -1434,8 +1691,16 @@ SDValue SITargetLowering::LowerFormalArguments(
CCInfo.AllocateReg(Info->getScratchRSrcReg());
CCInfo.AllocateReg(Info->getScratchWaveOffsetReg());
CCInfo.AllocateReg(Info->getFrameOffsetReg());
+ allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info);
}
+ auto &ArgUsageInfo =
+ DAG.getPass()->getAnalysis<AMDGPUArgumentUsageInfo>();
+ ArgUsageInfo.setFuncArgInfo(MF.getFunction(), Info->getArgInfo());
+
+ unsigned StackArgSize = CCInfo.getNextStackOffset();
+ Info->setBytesInStackArgArea(StackArgSize);
+
return Chains.empty() ? Chain :
DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
}
@@ -1575,6 +1840,22 @@ SITargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
}
// FIXME: Does sret work properly?
+ if (!Info->isEntryFunction()) {
+ const SIRegisterInfo *TRI
+ = static_cast<const SISubtarget *>(Subtarget)->getRegisterInfo();
+ const MCPhysReg *I =
+ TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
+ if (I) {
+ for (; *I; ++I) {
+ if (AMDGPU::SReg_64RegClass.contains(*I))
+ RetOps.push_back(DAG.getRegister(*I, MVT::i64));
+ else if (AMDGPU::SReg_32RegClass.contains(*I))
+ RetOps.push_back(DAG.getRegister(*I, MVT::i32));
+ else
+ llvm_unreachable("Unexpected register class in CSRsViaCopy!");
+ }
+ }
+ }
// Update chain and glue.
RetOps[0] = Chain;
@@ -1587,6 +1868,563 @@ SITargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
return DAG.getNode(Opc, DL, MVT::Other, RetOps);
}
+SDValue SITargetLowering::LowerCallResult(
+ SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool IsVarArg,
+ const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
+ SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool IsThisReturn,
+ SDValue ThisVal) const {
+ CCAssignFn *RetCC = CCAssignFnForReturn(CallConv, IsVarArg);
+
+ // Assign locations to each value returned by this call.
+ SmallVector<CCValAssign, 16> RVLocs;
+ CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs,
+ *DAG.getContext());
+ CCInfo.AnalyzeCallResult(Ins, RetCC);
+
+ // Copy all of the result registers out of their specified physreg.
+ for (unsigned i = 0; i != RVLocs.size(); ++i) {
+ CCValAssign VA = RVLocs[i];
+ SDValue Val;
+
+ if (VA.isRegLoc()) {
+ Val = DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InFlag);
+ Chain = Val.getValue(1);
+ InFlag = Val.getValue(2);
+ } else if (VA.isMemLoc()) {
+ report_fatal_error("TODO: return values in memory");
+ } else
+ llvm_unreachable("unknown argument location type");
+
+ switch (VA.getLocInfo()) {
+ case CCValAssign::Full:
+ break;
+ case CCValAssign::BCvt:
+ Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val);
+ break;
+ case CCValAssign::ZExt:
+ Val = DAG.getNode(ISD::AssertZext, DL, VA.getLocVT(), Val,
+ DAG.getValueType(VA.getValVT()));
+ Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
+ break;
+ case CCValAssign::SExt:
+ Val = DAG.getNode(ISD::AssertSext, DL, VA.getLocVT(), Val,
+ DAG.getValueType(VA.getValVT()));
+ Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
+ break;
+ case CCValAssign::AExt:
+ Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
+ break;
+ default:
+ llvm_unreachable("Unknown loc info!");
+ }
+
+ InVals.push_back(Val);
+ }
+
+ return Chain;
+}
+
+// Add code to pass the special inputs required by the features in use, separate
+// from the explicit user arguments present in the IR.
+void SITargetLowering::passSpecialInputs(
+ CallLoweringInfo &CLI,
+ const SIMachineFunctionInfo &Info,
+ SmallVectorImpl<std::pair<unsigned, SDValue>> &RegsToPass,
+ SmallVectorImpl<SDValue> &MemOpChains,
+ SDValue Chain,
+ SDValue StackPtr) const {
+ // If we don't have a call site, this was a call inserted by
+ // legalization. These can never use special inputs.
+ if (!CLI.CS)
+ return;
+
+ const Function *CalleeFunc = CLI.CS.getCalledFunction();
+ assert(CalleeFunc);
+
+ SelectionDAG &DAG = CLI.DAG;
+ const SDLoc &DL = CLI.DL;
+
+ const SISubtarget *ST = getSubtarget();
+ const SIRegisterInfo *TRI = ST->getRegisterInfo();
+
+ auto &ArgUsageInfo =
+ DAG.getPass()->getAnalysis<AMDGPUArgumentUsageInfo>();
+ const AMDGPUFunctionArgInfo &CalleeArgInfo
+ = ArgUsageInfo.lookupFuncArgInfo(*CalleeFunc);
+
+ const AMDGPUFunctionArgInfo &CallerArgInfo = Info.getArgInfo();
+
+ // TODO: Unify with private memory register handling. This is complicated by
+ // the fact that at least in kernels, the input argument is not necessarily
+ // in the same location as the input.
+ AMDGPUFunctionArgInfo::PreloadedValue InputRegs[] = {
+ AMDGPUFunctionArgInfo::DISPATCH_PTR,
+ AMDGPUFunctionArgInfo::QUEUE_PTR,
+ AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR,
+ AMDGPUFunctionArgInfo::DISPATCH_ID,
+ AMDGPUFunctionArgInfo::WORKGROUP_ID_X,
+ AMDGPUFunctionArgInfo::WORKGROUP_ID_Y,
+ AMDGPUFunctionArgInfo::WORKGROUP_ID_Z,
+ AMDGPUFunctionArgInfo::WORKITEM_ID_X,
+ AMDGPUFunctionArgInfo::WORKITEM_ID_Y,
+ AMDGPUFunctionArgInfo::WORKITEM_ID_Z,
+ AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR
+ };
+
+ for (auto InputID : InputRegs) {
+ const ArgDescriptor *OutgoingArg;
+ const TargetRegisterClass *ArgRC;
+
+ std::tie(OutgoingArg, ArgRC) = CalleeArgInfo.getPreloadedValue(InputID);
+ if (!OutgoingArg)
+ continue;
+
+ const ArgDescriptor *IncomingArg;
+ const TargetRegisterClass *IncomingArgRC;
+ std::tie(IncomingArg, IncomingArgRC)
+ = CallerArgInfo.getPreloadedValue(InputID);
+ assert(IncomingArgRC == ArgRC);
+
+ // All special arguments are ints for now.
+ EVT ArgVT = TRI->getSpillSize(*ArgRC) == 8 ? MVT::i64 : MVT::i32;
+ SDValue InputReg;
+
+ if (IncomingArg) {
+ InputReg = loadInputValue(DAG, ArgRC, ArgVT, DL, *IncomingArg);
+ } else {
+ // The implicit arg ptr is special because it doesn't have a corresponding
+ // input for kernels, and is computed from the kernarg segment pointer.
+ assert(InputID == AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
+ InputReg = getImplicitArgPtr(DAG, DL);
+ }
+
+ if (OutgoingArg->isRegister()) {
+ RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
+ } else {
+ SDValue ArgStore = storeStackInputValue(DAG, DL, Chain, StackPtr,
+ InputReg,
+ OutgoingArg->getStackOffset());
+ MemOpChains.push_back(ArgStore);
+ }
+ }
+}
+
+static bool canGuaranteeTCO(CallingConv::ID CC) {
+ return CC == CallingConv::Fast;
+}
+
+/// Return true if we might ever do TCO for calls with this calling convention.
+static bool mayTailCallThisCC(CallingConv::ID CC) {
+ switch (CC) {
+ case CallingConv::C:
+ return true;
+ default:
+ return canGuaranteeTCO(CC);
+ }
+}
+
+bool SITargetLowering::isEligibleForTailCallOptimization(
+ SDValue Callee, CallingConv::ID CalleeCC, bool IsVarArg,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ const SmallVectorImpl<SDValue> &OutVals,
+ const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
+ if (!mayTailCallThisCC(CalleeCC))
+ return false;
+
+ MachineFunction &MF = DAG.getMachineFunction();
+ const Function &CallerF = MF.getFunction();
+ CallingConv::ID CallerCC = CallerF.getCallingConv();
+ const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
+ const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
+
+ // Kernels aren't callable, and don't have a live in return address so it
+ // doesn't make sense to do a tail call with entry functions.
+ if (!CallerPreserved)
+ return false;
+
+ bool CCMatch = CallerCC == CalleeCC;
+
+ if (DAG.getTarget().Options.GuaranteedTailCallOpt) {
+ if (canGuaranteeTCO(CalleeCC) && CCMatch)
+ return true;
+ return false;
+ }
+
+ // TODO: Can we handle var args?
+ if (IsVarArg)
+ return false;
+
+ for (const Argument &Arg : CallerF.args()) {
+ if (Arg.hasByValAttr())
+ return false;
+ }
+
+ LLVMContext &Ctx = *DAG.getContext();
+
+ // Check that the call results are passed in the same way.
+ if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, Ctx, Ins,
+ CCAssignFnForCall(CalleeCC, IsVarArg),
+ CCAssignFnForCall(CallerCC, IsVarArg)))
+ return false;
+
+ // The callee has to preserve all registers the caller needs to preserve.
+ if (!CCMatch) {
+ const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
+ if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
+ return false;
+ }
+
+ // Nothing more to check if the callee is taking no arguments.
+ if (Outs.empty())
+ return true;
+
+ SmallVector<CCValAssign, 16> ArgLocs;
+ CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, Ctx);
+
+ CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, IsVarArg));
+
+ const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
+ // If the stack arguments for this call do not fit into our own save area then
+ // the call cannot be made tail.
+ // TODO: Is this really necessary?
+ if (CCInfo.getNextStackOffset() > FuncInfo->getBytesInStackArgArea())
+ return false;
+
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
+ return parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals);
+}
+
+bool SITargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
+ if (!CI->isTailCall())
+ return false;
+
+ const Function *ParentFn = CI->getParent()->getParent();
+ if (AMDGPU::isEntryFunctionCC(ParentFn->getCallingConv()))
+ return false;
+
+ auto Attr = ParentFn->getFnAttribute("disable-tail-calls");
+ return (Attr.getValueAsString() != "true");
+}
+
+// The wave scratch offset register is used as the global base pointer.
+SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
+ SmallVectorImpl<SDValue> &InVals) const {
+ SelectionDAG &DAG = CLI.DAG;
+ const SDLoc &DL = CLI.DL;
+ SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
+ SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
+ SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
+ SDValue Chain = CLI.Chain;
+ SDValue Callee = CLI.Callee;
+ bool &IsTailCall = CLI.IsTailCall;
+ CallingConv::ID CallConv = CLI.CallConv;
+ bool IsVarArg = CLI.IsVarArg;
+ bool IsSibCall = false;
+ bool IsThisReturn = false;
+ MachineFunction &MF = DAG.getMachineFunction();
+
+ if (IsVarArg) {
+ return lowerUnhandledCall(CLI, InVals,
+ "unsupported call to variadic function ");
+ }
+
+ if (!CLI.CS.getCalledFunction()) {
+ return lowerUnhandledCall(CLI, InVals,
+ "unsupported indirect call to function ");
+ }
+
+ if (IsTailCall && MF.getTarget().Options.GuaranteedTailCallOpt) {
+ return lowerUnhandledCall(CLI, InVals,
+ "unsupported required tail call to function ");
+ }
+
+ // The first 4 bytes are reserved for the callee's emergency stack slot.
+ const unsigned CalleeUsableStackOffset = 4;
+
+ if (IsTailCall) {
+ IsTailCall = isEligibleForTailCallOptimization(
+ Callee, CallConv, IsVarArg, Outs, OutVals, Ins, DAG);
+ if (!IsTailCall && CLI.CS && CLI.CS.isMustTailCall()) {
+ report_fatal_error("failed to perform tail call elimination on a call "
+ "site marked musttail");
+ }
+
+ bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
+
+ // A sibling call is one where we're under the usual C ABI and not planning
+ // to change that but can still do a tail call:
+ if (!TailCallOpt && IsTailCall)
+ IsSibCall = true;
+
+ if (IsTailCall)
+ ++NumTailCalls;
+ }
+
+ if (GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(Callee)) {
+ // FIXME: Remove this hack for function pointer types after removing
+ // support of the old address space mapping. In the new address space
+ // mapping, the pointer in the default address space is 64-bit, so this
+ // hack is not needed.
+ if (Callee.getValueType() == MVT::i32) {
+ const GlobalValue *GV = GA->getGlobal();
+ Callee = DAG.getGlobalAddress(GV, DL, MVT::i64, GA->getOffset(), false,
+ GA->getTargetFlags());
+ }
+ }
+ assert(Callee.getValueType() == MVT::i64);
+
+ const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
+
+ // Analyze operands of the call, assigning locations to each operand.
+ SmallVector<CCValAssign, 16> ArgLocs;
+ CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
+ CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, IsVarArg);
+ CCInfo.AnalyzeCallOperands(Outs, AssignFn);
+
+ // Get a count of how many bytes are to be pushed on the stack.
+ unsigned NumBytes = CCInfo.getNextStackOffset();
+
+ if (IsSibCall) {
+ // Since we're not changing the ABI to make this a tail call, the memory
+ // operands are already available in the caller's incoming argument space.
+ NumBytes = 0;
+ }
+
+ // FPDiff is the byte offset of the call's argument area from the callee's.
+ // Stores to callee stack arguments will be placed in FixedStackSlots offset
+ // by this amount for a tail call. In a sibling call it must be 0 because the
+ // caller will deallocate the entire stack and the callee still expects its
+ // arguments to begin at SP+0. Completely unused for non-tail calls.
+ int32_t FPDiff = 0;
+ MachineFrameInfo &MFI = MF.getFrameInfo();
+ SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
+
+ SDValue CallerSavedFP;
+
+ // Adjust the stack pointer for the new arguments...
+ // These operations are automatically eliminated by the prolog/epilog pass
+ if (!IsSibCall) {
+ Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
+
+ unsigned OffsetReg = Info->getScratchWaveOffsetReg();
+
+ // In the HSA case, this should be an identity copy.
+ SDValue ScratchRSrcReg
+ = DAG.getCopyFromReg(Chain, DL, Info->getScratchRSrcReg(), MVT::v4i32);
+ RegsToPass.emplace_back(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, ScratchRSrcReg);
+
+ // TODO: Don't hardcode these registers and get from the callee function.
+ SDValue ScratchWaveOffsetReg
+ = DAG.getCopyFromReg(Chain, DL, OffsetReg, MVT::i32);
+ RegsToPass.emplace_back(AMDGPU::SGPR4, ScratchWaveOffsetReg);
+
+ if (!Info->isEntryFunction()) {
+ // Avoid clobbering this function's FP value. In the current convention
+ // callee will overwrite this, so do save/restore around the call site.
+ CallerSavedFP = DAG.getCopyFromReg(Chain, DL,
+ Info->getFrameOffsetReg(), MVT::i32);
+ }
+ }
+
+ // Stack pointer relative accesses are done by changing the offset SGPR. This
+ // is just the VGPR offset component.
+ SDValue StackPtr = DAG.getConstant(CalleeUsableStackOffset, DL, MVT::i32);
+
+ SmallVector<SDValue, 8> MemOpChains;
+ MVT PtrVT = MVT::i32;
+
+ // Walk the register/memloc assignments, inserting copies/loads.
+ for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size(); i != e;
+ ++i, ++realArgIdx) {
+ CCValAssign &VA = ArgLocs[i];
+ SDValue Arg = OutVals[realArgIdx];
+
+ // Promote the value if needed.
+ switch (VA.getLocInfo()) {
+ case CCValAssign::Full:
+ break;
+ case CCValAssign::BCvt:
+ Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
+ break;
+ case CCValAssign::ZExt:
+ Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
+ break;
+ case CCValAssign::SExt:
+ Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
+ break;
+ case CCValAssign::AExt:
+ Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
+ break;
+ case CCValAssign::FPExt:
+ Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg);
+ break;
+ default:
+ llvm_unreachable("Unknown loc info!");
+ }
+
+ if (VA.isRegLoc()) {
+ RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
+ } else {
+ assert(VA.isMemLoc());
+
+ SDValue DstAddr;
+ MachinePointerInfo DstInfo;
+
+ unsigned LocMemOffset = VA.getLocMemOffset();
+ int32_t Offset = LocMemOffset;
+
+ SDValue PtrOff = DAG.getObjectPtrOffset(DL, StackPtr, Offset);
+
+ if (IsTailCall) {
+ ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags;
+ unsigned OpSize = Flags.isByVal() ?
+ Flags.getByValSize() : VA.getValVT().getStoreSize();
+
+ Offset = Offset + FPDiff;
+ int FI = MFI.CreateFixedObject(OpSize, Offset, true);
+
+ DstAddr = DAG.getObjectPtrOffset(DL, DAG.getFrameIndex(FI, PtrVT),
+ StackPtr);
+ DstInfo = MachinePointerInfo::getFixedStack(MF, FI);
+
+ // Make sure any stack arguments overlapping with where we're storing
+ // are loaded before this eventual operation. Otherwise they'll be
+ // clobbered.
+
+ // FIXME: Why is this really necessary? This seems to just result in a
+ // lot of code that copies the stack arguments and writes them back to the
+ // same locations, which are supposed to be immutable?
+ Chain = addTokenForArgument(Chain, DAG, MFI, FI);
+ } else {
+ DstAddr = PtrOff;
+ DstInfo = MachinePointerInfo::getStack(MF, LocMemOffset);
+ }
+
+ if (Outs[i].Flags.isByVal()) {
+ SDValue SizeNode =
+ DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i32);
+ SDValue Cpy = DAG.getMemcpy(
+ Chain, DL, DstAddr, Arg, SizeNode, Outs[i].Flags.getByValAlign(),
+ /*isVol = */ false, /*AlwaysInline = */ true,
+ /*isTailCall = */ false, DstInfo,
+ MachinePointerInfo(UndefValue::get(Type::getInt8PtrTy(
+ *DAG.getContext(), AMDGPUASI.PRIVATE_ADDRESS))));
+
+ MemOpChains.push_back(Cpy);
+ } else {
+ SDValue Store = DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo);
+ MemOpChains.push_back(Store);
+ }
+ }
+ }
+
+ // Copy special input registers after user input arguments.
+ passSpecialInputs(CLI, *Info, RegsToPass, MemOpChains, Chain, StackPtr);
+
+ if (!MemOpChains.empty())
+ Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
+
+ // Build a sequence of copy-to-reg nodes chained together with token chain
+ // and flag operands which copy the outgoing args into the appropriate regs.
+ SDValue InFlag;
+ for (auto &RegToPass : RegsToPass) {
+ Chain = DAG.getCopyToReg(Chain, DL, RegToPass.first,
+ RegToPass.second, InFlag);
+ InFlag = Chain.getValue(1);
+ }
+
+
+ SDValue PhysReturnAddrReg;
+ if (IsTailCall) {
+ // Since the return is being combined with the call, we need to pass on the
+ // return address.
+
+ const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
+ SDValue ReturnAddrReg = CreateLiveInRegister(
+ DAG, &AMDGPU::SReg_64RegClass, TRI->getReturnAddressReg(MF), MVT::i64);
+
+ PhysReturnAddrReg = DAG.getRegister(TRI->getReturnAddressReg(MF),
+ MVT::i64);
+ Chain = DAG.getCopyToReg(Chain, DL, PhysReturnAddrReg, ReturnAddrReg, InFlag);
+ InFlag = Chain.getValue(1);
+ }
+
+ // We don't usually want to end the call sequence here because we would tidy
+ // the frame up *after* the call; however, in the ABI-changing tail-call case
+ // we've carefully laid out the parameters so that when SP is reset they'll be
+ // in the correct location.
+ if (IsTailCall && !IsSibCall) {
+ Chain = DAG.getCALLSEQ_END(Chain,
+ DAG.getTargetConstant(NumBytes, DL, MVT::i32),
+ DAG.getTargetConstant(0, DL, MVT::i32),
+ InFlag, DL);
+ InFlag = Chain.getValue(1);
+ }
+
+ std::vector<SDValue> Ops;
+ Ops.push_back(Chain);
+ Ops.push_back(Callee);
+
+ if (IsTailCall) {
+ // Each tail call may have to adjust the stack by a different amount, so
+ // this information must travel along with the operation for eventual
+ // consumption by emitEpilogue.
+ Ops.push_back(DAG.getTargetConstant(FPDiff, DL, MVT::i32));
+
+ Ops.push_back(PhysReturnAddrReg);
+ }
+
+ // Add argument registers to the end of the list so that they are known live
+ // into the call.
+ for (auto &RegToPass : RegsToPass) {
+ Ops.push_back(DAG.getRegister(RegToPass.first,
+ RegToPass.second.getValueType()));
+ }
+
+ // Add a register mask operand representing the call-preserved registers.
+
+ const AMDGPURegisterInfo *TRI = Subtarget->getRegisterInfo();
+ const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv);
+ assert(Mask && "Missing call preserved mask for calling convention");
+ Ops.push_back(DAG.getRegisterMask(Mask));
+
+ if (InFlag.getNode())
+ Ops.push_back(InFlag);
+
+ SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
+
+ // If we're doing a tail call, use a TC_RETURN here rather than an
+ // actual call instruction.
+ if (IsTailCall) {
+ MFI.setHasTailCall();
+ return DAG.getNode(AMDGPUISD::TC_RETURN, DL, NodeTys, Ops);
+ }
+
+ // Returns a chain and a flag for retval copy to use.
+ SDValue Call = DAG.getNode(AMDGPUISD::CALL, DL, NodeTys, Ops);
+ Chain = Call.getValue(0);
+ InFlag = Call.getValue(1);
+
+ if (CallerSavedFP) {
+ SDValue FPReg = DAG.getRegister(Info->getFrameOffsetReg(), MVT::i32);
+ Chain = DAG.getCopyToReg(Chain, DL, FPReg, CallerSavedFP, InFlag);
+ InFlag = Chain.getValue(1);
+ }
+
+ uint64_t CalleePopBytes = NumBytes;
+ Chain = DAG.getCALLSEQ_END(Chain, DAG.getTargetConstant(0, DL, MVT::i32),
+ DAG.getTargetConstant(CalleePopBytes, DL, MVT::i32),
+ InFlag, DL);
+ if (!Ins.empty())
+ InFlag = Chain.getValue(1);
+
+ // Handle result values, copying them out of physregs into vregs that we
+ // return.
+ return LowerCallResult(Chain, InFlag, CallConv, IsVarArg, Ins, DL, DAG,
+ InVals, IsThisReturn,
+ IsThisReturn ? OutVals[0] : SDValue());
+}
+
unsigned SITargetLowering::getRegisterByName(const char* RegName, EVT VT,
SelectionDAG &DAG) const {
unsigned Reg = StringSwitch<unsigned>(RegName)
@@ -1644,7 +2482,7 @@ MachineBasicBlock *SITargetLowering::splitKillBlock(MachineInstr &MI,
if (SplitPoint == BB->end()) {
// Don't bother with a new block.
- MI.setDesc(TII->get(AMDGPU::SI_KILL_TERMINATOR));
+ MI.setDesc(TII->getKillTerminatorFromPseudo(MI.getOpcode()));
return BB;
}
@@ -1658,7 +2496,7 @@ MachineBasicBlock *SITargetLowering::splitKillBlock(MachineInstr &MI,
SplitBB->transferSuccessorsAndUpdatePHIs(BB);
BB->addSuccessor(SplitBB);
- MI.setDesc(TII->get(AMDGPU::SI_KILL_TERMINATOR));
+ MI.setDesc(TII->getKillTerminatorFromPseudo(MI.getOpcode()));
return SplitBB;
}
@@ -1775,8 +2613,8 @@ static MachineBasicBlock::iterator loadM0FromVGPR(const SIInstrInfo *TII,
MachineBasicBlock::iterator I(&MI);
unsigned DstReg = MI.getOperand(0).getReg();
- unsigned SaveExec = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
- unsigned TmpExec = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
+ unsigned SaveExec = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
+ unsigned TmpExec = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), TmpExec);
@@ -2121,19 +2959,66 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
if (MI.mayLoad())
Flags |= MachineMemOperand::MOLoad;
- auto MMO = MF->getMachineMemOperand(PtrInfo, Flags, 0, 0);
- MI.addMemOperand(*MF, MMO);
+ if (Flags != MachineMemOperand::MODereferenceable) {
+ auto MMO = MF->getMachineMemOperand(PtrInfo, Flags, 0, 0);
+ MI.addMemOperand(*MF, MMO);
+ }
+
return BB;
}
switch (MI.getOpcode()) {
- case AMDGPU::SI_INIT_M0:
+ case AMDGPU::S_ADD_U64_PSEUDO:
+ case AMDGPU::S_SUB_U64_PSEUDO: {
+ MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
+ const DebugLoc &DL = MI.getDebugLoc();
+
+ MachineOperand &Dest = MI.getOperand(0);
+ MachineOperand &Src0 = MI.getOperand(1);
+ MachineOperand &Src1 = MI.getOperand(2);
+
+ unsigned DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+ unsigned DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+
+ MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(MI, MRI,
+ Src0, &AMDGPU::SReg_64RegClass, AMDGPU::sub0,
+ &AMDGPU::SReg_32_XM0RegClass);
+ MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(MI, MRI,
+ Src0, &AMDGPU::SReg_64RegClass, AMDGPU::sub1,
+ &AMDGPU::SReg_32_XM0RegClass);
+
+ MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(MI, MRI,
+ Src1, &AMDGPU::SReg_64RegClass, AMDGPU::sub0,
+ &AMDGPU::SReg_32_XM0RegClass);
+ MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(MI, MRI,
+ Src1, &AMDGPU::SReg_64RegClass, AMDGPU::sub1,
+ &AMDGPU::SReg_32_XM0RegClass);
+
+ bool IsAdd = (MI.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);
+
+ unsigned LoOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
+ unsigned HiOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
+ BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0)
+ .add(Src0Sub0)
+ .add(Src1Sub0);
+ BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1)
+ .add(Src0Sub1)
+ .add(Src1Sub1);
+ BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg())
+ .addReg(DestSub0)
+ .addImm(AMDGPU::sub0)
+ .addReg(DestSub1)
+ .addImm(AMDGPU::sub1);
+ MI.eraseFromParent();
+ return BB;
+ }
+ case AMDGPU::SI_INIT_M0: {
BuildMI(*BB, MI.getIterator(), MI.getDebugLoc(),
TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
.add(MI.getOperand(0));
MI.eraseFromParent();
return BB;
-
+ }
case AMDGPU::SI_INIT_EXEC:
// This should be before all vector instructions.
BuildMI(*BB, &*BB->begin(), MI.getDebugLoc(), TII->get(AMDGPU::S_MOV_B64),
@@ -2212,7 +3097,8 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
case AMDGPU::SI_INDIRECT_DST_V8:
case AMDGPU::SI_INDIRECT_DST_V16:
return emitIndirectDst(MI, *BB, *getSubtarget());
- case AMDGPU::SI_KILL:
+ case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
+ case AMDGPU::SI_KILL_I1_PSEUDO:
return splitKillBlock(MI, BB);
case AMDGPU::V_CNDMASK_B64_PSEUDO: {
MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
@@ -2225,15 +3111,18 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
unsigned DstLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
unsigned DstHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ unsigned SrcCondCopy = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
+ BuildMI(*BB, MI, DL, TII->get(AMDGPU::COPY), SrcCondCopy)
+ .addReg(SrcCond);
BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstLo)
.addReg(Src0, 0, AMDGPU::sub0)
.addReg(Src1, 0, AMDGPU::sub0)
- .addReg(SrcCond);
+ .addReg(SrcCondCopy);
BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstHi)
.addReg(Src0, 0, AMDGPU::sub1)
.addReg(Src1, 0, AMDGPU::sub1)
- .addReg(SrcCond);
+ .addReg(SrcCondCopy);
BuildMI(*BB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE), Dst)
.addReg(DstLo)
@@ -2252,11 +3141,57 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
MI.eraseFromParent();
return BB;
}
+ case AMDGPU::ADJCALLSTACKUP:
+ case AMDGPU::ADJCALLSTACKDOWN: {
+ const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
+ MachineInstrBuilder MIB(*MF, &MI);
+ MIB.addReg(Info->getStackPtrOffsetReg(), RegState::ImplicitDefine)
+ .addReg(Info->getStackPtrOffsetReg(), RegState::Implicit);
+ return BB;
+ }
+ case AMDGPU::SI_CALL_ISEL:
+ case AMDGPU::SI_TCRETURN_ISEL: {
+ const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
+ const DebugLoc &DL = MI.getDebugLoc();
+ unsigned ReturnAddrReg = TII->getRegisterInfo().getReturnAddressReg(*MF);
+
+ MachineRegisterInfo &MRI = MF->getRegInfo();
+ unsigned GlobalAddrReg = MI.getOperand(0).getReg();
+ MachineInstr *PCRel = MRI.getVRegDef(GlobalAddrReg);
+ assert(PCRel->getOpcode() == AMDGPU::SI_PC_ADD_REL_OFFSET);
+
+ const GlobalValue *G = PCRel->getOperand(1).getGlobal();
+
+ MachineInstrBuilder MIB;
+ if (MI.getOpcode() == AMDGPU::SI_CALL_ISEL) {
+ MIB = BuildMI(*BB, MI, DL, TII->get(AMDGPU::SI_CALL), ReturnAddrReg)
+ .add(MI.getOperand(0))
+ .addGlobalAddress(G);
+ } else {
+ MIB = BuildMI(*BB, MI, DL, TII->get(AMDGPU::SI_TCRETURN))
+ .add(MI.getOperand(0))
+ .addGlobalAddress(G);
+
+ // There is an additional imm operand for tcreturn, but it should be in the
+ // right place already.
+ }
+
+ for (unsigned I = 1, E = MI.getNumOperands(); I != E; ++I)
+ MIB.add(MI.getOperand(I));
+
+ MIB.setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
+ MI.eraseFromParent();
+ return BB;
+ }
default:
return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
}
}
+bool SITargetLowering::hasBitPreservingFPLogic(EVT VT) const {
+ return isTypeLegal(VT.getScalarType());
+}
+
bool SITargetLowering::enableAggressiveFMAFusion(EVT VT) const {
// This currently forces unfolding various combinations of fsub into fma with
// free fneg'd operands. As long as we have fast FMA (controlled by
@@ -2356,7 +3291,6 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
return lowerEXTRACT_VECTOR_ELT(Op, DAG);
case ISD::FP_ROUND:
return lowerFP_ROUND(Op, DAG);
-
case ISD::TRAP:
case ISD::DEBUGTRAP:
return lowerTRAP(Op, DAG);
@@ -2660,11 +3594,11 @@ SDValue SITargetLowering::lowerTRAP(SDValue Op, SelectionDAG &DAG) const {
case SISubtarget::TrapIDLLVMTrap:
return DAG.getNode(AMDGPUISD::ENDPGM, SL, MVT::Other, Chain);
case SISubtarget::TrapIDLLVMDebugTrap: {
- DiagnosticInfoUnsupported NoTrap(*MF.getFunction(),
+ DiagnosticInfoUnsupported NoTrap(MF.getFunction(),
"debugtrap handler not supported",
Op.getDebugLoc(),
DS_Warning);
- LLVMContext &Ctx = MF.getFunction()->getContext();
+ LLVMContext &Ctx = MF.getFunction().getContext();
Ctx.diagnose(NoTrap);
return Chain;
}
@@ -2709,8 +3643,7 @@ SDValue SITargetLowering::getSegmentAperture(unsigned AS, const SDLoc &DL,
// private_segment_aperture_base_hi.
uint32_t StructOffset = (AS == AMDGPUASI.LOCAL_ADDRESS) ? 0x40 : 0x44;
- SDValue Ptr = DAG.getNode(ISD::ADD, DL, MVT::i64, QueuePtr,
- DAG.getConstant(StructOffset, DL, MVT::i64));
+ SDValue Ptr = DAG.getObjectPtrOffset(DL, QueuePtr, StructOffset);
// TODO: Use custom target PseudoSourceValue.
// TODO: We should use the value from the IR intrinsic call, but it might not
@@ -2778,7 +3711,7 @@ SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op,
const MachineFunction &MF = DAG.getMachineFunction();
DiagnosticInfoUnsupported InvalidAddrSpaceCast(
- *MF.getFunction(), "invalid addrspacecast", SL.getDebugLoc());
+ MF.getFunction(), "invalid addrspacecast", SL.getDebugLoc());
DAG.getContext()->diagnose(InvalidAddrSpaceCast);
return DAG.getUNDEF(ASC->getValueType(0));
@@ -2917,13 +3850,16 @@ SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
SDValue Op,
SelectionDAG &DAG) const {
GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Op);
+ const GlobalValue *GV = GSD->getGlobal();
if (GSD->getAddressSpace() != AMDGPUASI.CONSTANT_ADDRESS &&
- GSD->getAddressSpace() != AMDGPUASI.GLOBAL_ADDRESS)
+ GSD->getAddressSpace() != AMDGPUASI.GLOBAL_ADDRESS &&
+ // FIXME: It isn't correct to rely on the type of the pointer. This should
+ // be removed when address space 0 is 64-bit.
+ !GV->getType()->getElementType()->isFunctionTy())
return AMDGPUTargetLowering::LowerGlobalAddress(MFI, Op, DAG);
SDLoc DL(GSD);
- const GlobalValue *GV = GSD->getGlobal();
EVT PtrVT = Op.getValueType();
if (shouldEmitFixup(GV))
@@ -2977,7 +3913,7 @@ SDValue SITargetLowering::lowerImplicitZextParam(SelectionDAG &DAG,
static SDValue emitNonHSAIntrinsicError(SelectionDAG &DAG, const SDLoc &DL,
EVT VT) {
- DiagnosticInfoUnsupported BadIntrin(*DAG.getMachineFunction().getFunction(),
+ DiagnosticInfoUnsupported BadIntrin(DAG.getMachineFunction().getFunction(),
"non-hsa intrinsic with hsa target",
DL.getDebugLoc());
DAG.getContext()->diagnose(BadIntrin);
@@ -2986,7 +3922,7 @@ static SDValue emitNonHSAIntrinsicError(SelectionDAG &DAG, const SDLoc &DL,
static SDValue emitRemovedIntrinsicError(SelectionDAG &DAG, const SDLoc &DL,
EVT VT) {
- DiagnosticInfoUnsupported BadIntrin(*DAG.getMachineFunction().getFunction(),
+ DiagnosticInfoUnsupported BadIntrin(DAG.getMachineFunction().getFunction(),
"intrinsic not supported on subtarget",
DL.getDebugLoc());
DAG.getContext()->diagnose(BadIntrin);
@@ -2997,7 +3933,6 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
SelectionDAG &DAG) const {
MachineFunction &MF = DAG.getMachineFunction();
auto MFI = MF.getInfo<SIMachineFunctionInfo>();
- const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
EVT VT = Op.getValueType();
SDLoc DL(Op);
@@ -3009,38 +3944,35 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
case Intrinsic::amdgcn_implicit_buffer_ptr: {
if (getSubtarget()->isAmdCodeObjectV2(MF))
return emitNonHSAIntrinsicError(DAG, DL, VT);
-
- unsigned Reg = TRI->getPreloadedValue(MF,
- SIRegisterInfo::IMPLICIT_BUFFER_PTR);
- return CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass, Reg, VT);
+ return getPreloadedValue(DAG, *MFI, VT,
+ AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
}
case Intrinsic::amdgcn_dispatch_ptr:
case Intrinsic::amdgcn_queue_ptr: {
if (!Subtarget->isAmdCodeObjectV2(MF)) {
DiagnosticInfoUnsupported BadIntrin(
- *MF.getFunction(), "unsupported hsa intrinsic without hsa target",
+ MF.getFunction(), "unsupported hsa intrinsic without hsa target",
DL.getDebugLoc());
DAG.getContext()->diagnose(BadIntrin);
return DAG.getUNDEF(VT);
}
- auto Reg = IntrinsicID == Intrinsic::amdgcn_dispatch_ptr ?
- SIRegisterInfo::DISPATCH_PTR : SIRegisterInfo::QUEUE_PTR;
- return CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass,
- TRI->getPreloadedValue(MF, Reg), VT);
+ auto RegID = IntrinsicID == Intrinsic::amdgcn_dispatch_ptr ?
+ AMDGPUFunctionArgInfo::DISPATCH_PTR : AMDGPUFunctionArgInfo::QUEUE_PTR;
+ return getPreloadedValue(DAG, *MFI, VT, RegID);
}
case Intrinsic::amdgcn_implicitarg_ptr: {
- unsigned offset = getImplicitParameterOffset(MFI, FIRST_IMPLICIT);
- return lowerKernArgParameterPtr(DAG, DL, DAG.getEntryNode(), offset);
+ if (MFI->isEntryFunction())
+ return getImplicitArgPtr(DAG, DL);
+ return getPreloadedValue(DAG, *MFI, VT,
+ AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
}
case Intrinsic::amdgcn_kernarg_segment_ptr: {
- unsigned Reg
- = TRI->getPreloadedValue(MF, SIRegisterInfo::KERNARG_SEGMENT_PTR);
- return CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass, Reg, VT);
+ return getPreloadedValue(DAG, *MFI, VT,
+ AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
}
case Intrinsic::amdgcn_dispatch_id: {
- unsigned Reg = TRI->getPreloadedValue(MF, SIRegisterInfo::DISPATCH_ID);
- return CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass, Reg, VT);
+ return getPreloadedValue(DAG, *MFI, VT, AMDGPUFunctionArgInfo::DISPATCH_ID);
}
case Intrinsic::amdgcn_rcp:
return DAG.getNode(AMDGPUISD::RCP, DL, VT, Op.getOperand(1));
@@ -3125,28 +4057,32 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
SI::KernelInputOffsets::LOCAL_SIZE_Z);
case Intrinsic::amdgcn_workgroup_id_x:
case Intrinsic::r600_read_tgid_x:
- return CreateLiveInRegister(DAG, &AMDGPU::SReg_32_XM0RegClass,
- TRI->getPreloadedValue(MF, SIRegisterInfo::WORKGROUP_ID_X), VT);
+ return getPreloadedValue(DAG, *MFI, VT,
+ AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
case Intrinsic::amdgcn_workgroup_id_y:
case Intrinsic::r600_read_tgid_y:
- return CreateLiveInRegister(DAG, &AMDGPU::SReg_32_XM0RegClass,
- TRI->getPreloadedValue(MF, SIRegisterInfo::WORKGROUP_ID_Y), VT);
+ return getPreloadedValue(DAG, *MFI, VT,
+ AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
case Intrinsic::amdgcn_workgroup_id_z:
case Intrinsic::r600_read_tgid_z:
- return CreateLiveInRegister(DAG, &AMDGPU::SReg_32_XM0RegClass,
- TRI->getPreloadedValue(MF, SIRegisterInfo::WORKGROUP_ID_Z), VT);
- case Intrinsic::amdgcn_workitem_id_x:
+ return getPreloadedValue(DAG, *MFI, VT,
+ AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
+ case Intrinsic::amdgcn_workitem_id_x: {
case Intrinsic::r600_read_tidig_x:
- return CreateLiveInRegister(DAG, &AMDGPU::VGPR_32RegClass,
- TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_X), VT);
+ return loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32,
+ SDLoc(DAG.getEntryNode()),
+ MFI->getArgInfo().WorkItemIDX);
+ }
case Intrinsic::amdgcn_workitem_id_y:
case Intrinsic::r600_read_tidig_y:
- return CreateLiveInRegister(DAG, &AMDGPU::VGPR_32RegClass,
- TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_Y), VT);
+ return loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32,
+ SDLoc(DAG.getEntryNode()),
+ MFI->getArgInfo().WorkItemIDY);
case Intrinsic::amdgcn_workitem_id_z:
case Intrinsic::r600_read_tidig_z:
- return CreateLiveInRegister(DAG, &AMDGPU::VGPR_32RegClass,
- TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_Z), VT);
+ return loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32,
+ SDLoc(DAG.getEntryNode()),
+ MFI->getArgInfo().WorkItemIDZ);
case AMDGPUIntrinsic::SI_load_const: {
SDValue Ops[] = {
Op.getOperand(1),
@@ -3193,7 +4129,7 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
return SDValue();
DiagnosticInfoUnsupported BadIntrin(
- *MF.getFunction(), "intrinsic not supported on subtarget",
+ MF.getFunction(), "intrinsic not supported on subtarget",
DL.getDebugLoc());
DAG.getContext()->diagnose(BadIntrin);
return DAG.getUNDEF(VT);
@@ -3224,7 +4160,7 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
// 3rd parameter required to be a constant.
const ConstantSDNode *Param = dyn_cast<ConstantSDNode>(Op.getOperand(3));
if (!Param)
- return DAG.getUNDEF(VT);
+ return DAG.getMergeValues({ DAG.getUNDEF(VT), DAG.getUNDEF(MVT::i1) }, DL);
// Translate to the operands expected by the machine instruction. The
// first parameter must be the same as the first instruction.
@@ -3292,6 +4228,26 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
Op.getOperand(1), Op.getOperand(2));
return DAG.getNode(ISD::BITCAST, DL, VT, Node);
}
+ case Intrinsic::amdgcn_wqm: {
+ SDValue Src = Op.getOperand(1);
+ return SDValue(DAG.getMachineNode(AMDGPU::WQM, DL, Src.getValueType(), Src),
+ 0);
+ }
+ case Intrinsic::amdgcn_wwm: {
+ SDValue Src = Op.getOperand(1);
+ return SDValue(DAG.getMachineNode(AMDGPU::WWM, DL, Src.getValueType(), Src),
+ 0);
+ }
+ case Intrinsic::amdgcn_image_getlod:
+ case Intrinsic::amdgcn_image_getresinfo: {
+ unsigned Idx = (IntrinsicID == Intrinsic::amdgcn_image_getresinfo) ? 3 : 4;
+
+ // Replace dmask with everything disabled with undef.
+ const ConstantSDNode *DMask = dyn_cast<ConstantSDNode>(Op.getOperand(Idx));
+ if (!DMask || DMask->isNullValue())
+ return DAG.getUNDEF(Op.getValueType());
+ return SDValue();
+ }
default:
return Op;
}
@@ -3365,6 +4321,95 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
return DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
Op->getVTList(), Ops, VT, MMO);
}
+ case Intrinsic::amdgcn_buffer_atomic_swap:
+ case Intrinsic::amdgcn_buffer_atomic_add:
+ case Intrinsic::amdgcn_buffer_atomic_sub:
+ case Intrinsic::amdgcn_buffer_atomic_smin:
+ case Intrinsic::amdgcn_buffer_atomic_umin:
+ case Intrinsic::amdgcn_buffer_atomic_smax:
+ case Intrinsic::amdgcn_buffer_atomic_umax:
+ case Intrinsic::amdgcn_buffer_atomic_and:
+ case Intrinsic::amdgcn_buffer_atomic_or:
+ case Intrinsic::amdgcn_buffer_atomic_xor: {
+ SDValue Ops[] = {
+ Op.getOperand(0), // Chain
+ Op.getOperand(2), // vdata
+ Op.getOperand(3), // rsrc
+ Op.getOperand(4), // vindex
+ Op.getOperand(5), // offset
+ Op.getOperand(6) // slc
+ };
+ EVT VT = Op.getOperand(3).getValueType();
+ MachineMemOperand *MMO = MF.getMachineMemOperand(
+ MachinePointerInfo(),
+ MachineMemOperand::MOLoad |
+ MachineMemOperand::MOStore |
+ MachineMemOperand::MODereferenceable |
+ MachineMemOperand::MOVolatile,
+ VT.getStoreSize(), 4);
+ unsigned Opcode = 0;
+
+ switch (IntrID) {
+ case Intrinsic::amdgcn_buffer_atomic_swap:
+ Opcode = AMDGPUISD::BUFFER_ATOMIC_SWAP;
+ break;
+ case Intrinsic::amdgcn_buffer_atomic_add:
+ Opcode = AMDGPUISD::BUFFER_ATOMIC_ADD;
+ break;
+ case Intrinsic::amdgcn_buffer_atomic_sub:
+ Opcode = AMDGPUISD::BUFFER_ATOMIC_SUB;
+ break;
+ case Intrinsic::amdgcn_buffer_atomic_smin:
+ Opcode = AMDGPUISD::BUFFER_ATOMIC_SMIN;
+ break;
+ case Intrinsic::amdgcn_buffer_atomic_umin:
+ Opcode = AMDGPUISD::BUFFER_ATOMIC_UMIN;
+ break;
+ case Intrinsic::amdgcn_buffer_atomic_smax:
+ Opcode = AMDGPUISD::BUFFER_ATOMIC_SMAX;
+ break;
+ case Intrinsic::amdgcn_buffer_atomic_umax:
+ Opcode = AMDGPUISD::BUFFER_ATOMIC_UMAX;
+ break;
+ case Intrinsic::amdgcn_buffer_atomic_and:
+ Opcode = AMDGPUISD::BUFFER_ATOMIC_AND;
+ break;
+ case Intrinsic::amdgcn_buffer_atomic_or:
+ Opcode = AMDGPUISD::BUFFER_ATOMIC_OR;
+ break;
+ case Intrinsic::amdgcn_buffer_atomic_xor:
+ Opcode = AMDGPUISD::BUFFER_ATOMIC_XOR;
+ break;
+ default:
+ llvm_unreachable("unhandled atomic opcode");
+ }
+
+ return DAG.getMemIntrinsicNode(Opcode, DL, Op->getVTList(), Ops, VT, MMO);
+ }
+
+ case Intrinsic::amdgcn_buffer_atomic_cmpswap: {
+ SDValue Ops[] = {
+ Op.getOperand(0), // Chain
+ Op.getOperand(2), // src
+ Op.getOperand(3), // cmp
+ Op.getOperand(4), // rsrc
+ Op.getOperand(5), // vindex
+ Op.getOperand(6), // offset
+ Op.getOperand(7) // slc
+ };
+ EVT VT = Op.getOperand(4).getValueType();
+ MachineMemOperand *MMO = MF.getMachineMemOperand(
+ MachinePointerInfo(),
+ MachineMemOperand::MOLoad |
+ MachineMemOperand::MOStore |
+ MachineMemOperand::MODereferenceable |
+ MachineMemOperand::MOVolatile,
+ VT.getStoreSize(), 4);
+
+ return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
+ Op->getVTList(), Ops, VT, MMO);
+ }
+
// Basic sample.
case Intrinsic::amdgcn_image_sample:
case Intrinsic::amdgcn_image_sample_cl:
@@ -3411,9 +4456,7 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
case Intrinsic::amdgcn_image_sample_c_b_cl_o:
case Intrinsic::amdgcn_image_sample_c_lz_o:
case Intrinsic::amdgcn_image_sample_c_cd_o:
- case Intrinsic::amdgcn_image_sample_c_cd_cl_o:
-
- case Intrinsic::amdgcn_image_getlod: {
+ case Intrinsic::amdgcn_image_sample_c_cd_cl_o: {
// Replace dmask with everything disabled with undef.
const ConstantSDNode *DMask = dyn_cast<ConstantSDNode>(Op.getOperand(5));
if (!DMask || DMask->isNullValue()) {
@@ -3516,7 +4559,7 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
case Intrinsic::amdgcn_s_barrier: {
if (getTargetMachine().getOptLevel() > CodeGenOpt::None) {
const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
- unsigned WGSize = ST.getFlatWorkGroupSizes(*MF.getFunction()).second;
+ unsigned WGSize = ST.getFlatWorkGroupSizes(MF.getFunction()).second;
if (WGSize <= ST.getWavefrontSize())
return SDValue(DAG.getMachineNode(AMDGPU::WAVE_BARRIER, DL, MVT::Other,
Op.getOperand(0)), 0);
@@ -3592,6 +4635,30 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
Op->getVTList(), Ops, VT, MMO);
}
+ case Intrinsic::amdgcn_buffer_store:
+ case Intrinsic::amdgcn_buffer_store_format: {
+ SDValue Ops[] = {
+ Chain,
+ Op.getOperand(2), // vdata
+ Op.getOperand(3), // rsrc
+ Op.getOperand(4), // vindex
+ Op.getOperand(5), // offset
+ Op.getOperand(6), // glc
+ Op.getOperand(7) // slc
+ };
+ EVT VT = Op.getOperand(3).getValueType();
+ MachineMemOperand *MMO = MF.getMachineMemOperand(
+ MachinePointerInfo(),
+ MachineMemOperand::MOStore |
+ MachineMemOperand::MODereferenceable,
+ VT.getStoreSize(), 4);
+
+ unsigned Opcode = IntrinsicID == Intrinsic::amdgcn_buffer_store ?
+ AMDGPUISD::BUFFER_STORE :
+ AMDGPUISD::BUFFER_STORE_FORMAT;
+ return DAG.getMemIntrinsicNode(Opcode, DL, Op->getVTList(), Ops, VT, MMO);
+ }
+
default:
return Op;
}
@@ -3604,6 +4671,9 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
EVT MemVT = Load->getMemoryVT();
if (ExtType == ISD::NON_EXTLOAD && MemVT.getSizeInBits() < 32) {
+ if (MemVT == MVT::i16 && isTypeLegal(MVT::i16))
+ return SDValue();
+
// FIXME: Copied from PPC
// First, load into 32 bits, then truncate to 1 bit.
@@ -4187,32 +5257,6 @@ SDValue SITargetLowering::performUCharToFloatCombine(SDNode *N,
return SDValue();
}
-/// \brief Return true if the given offset Size in bytes can be folded into
-/// the immediate offsets of a memory instruction for the given address space.
-static bool canFoldOffset(unsigned OffsetSize, unsigned AS,
- const SISubtarget &STI) {
- auto AMDGPUASI = STI.getAMDGPUAS();
- if (AS == AMDGPUASI.GLOBAL_ADDRESS) {
- // MUBUF instructions a 12-bit offset in bytes.
- return isUInt<12>(OffsetSize);
- }
- if (AS == AMDGPUASI.CONSTANT_ADDRESS) {
- // SMRD instructions have an 8-bit offset in dwords on SI and
- // a 20-bit offset in bytes on VI.
- if (STI.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS)
- return isUInt<20>(OffsetSize);
- else
- return (OffsetSize % 4 == 0) && isUInt<8>(OffsetSize / 4);
- }
- if (AS == AMDGPUASI.LOCAL_ADDRESS ||
- AS == AMDGPUASI.REGION_ADDRESS) {
- // The single offset versions have a 16-bit offset in bytes.
- return isUInt<16>(OffsetSize);
- }
- // Indirect register addressing does not use any offsets.
- return false;
-}
-
// (shl (add x, c1), c2) -> add (shl x, c2), (shl c1, c2)
// This is a variant of
@@ -4229,11 +5273,15 @@ static bool canFoldOffset(unsigned OffsetSize, unsigned AS,
//
SDValue SITargetLowering::performSHLPtrCombine(SDNode *N,
unsigned AddrSpace,
+ EVT MemVT,
DAGCombinerInfo &DCI) const {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
- if (N0.getOpcode() != ISD::ADD)
+  // We only do this when the add has multiple uses, where it is profitable;
+  // for the single-use case, defer to the standard combine.
+ if ((N0.getOpcode() != ISD::ADD && N0.getOpcode() != ISD::OR) ||
+ N0->hasOneUse())
return SDValue();
const ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(N1);
@@ -4247,7 +5295,12 @@ SDValue SITargetLowering::performSHLPtrCombine(SDNode *N,
// If the resulting offset is too large, we can't fold it into the addressing
// mode offset.
APInt Offset = CAdd->getAPIntValue() << CN1->getAPIntValue();
- if (!canFoldOffset(Offset.getZExtValue(), AddrSpace, *getSubtarget()))
+ Type *Ty = MemVT.getTypeForEVT(*DCI.DAG.getContext());
+
+ AddrMode AM;
+ AM.HasBaseReg = true;
+ AM.BaseOffs = Offset.getSExtValue();
+ if (!isLegalAddressingMode(DCI.DAG.getDataLayout(), AM, Ty, AddrSpace))
return SDValue();
SelectionDAG &DAG = DCI.DAG;
@@ -4257,7 +5310,12 @@ SDValue SITargetLowering::performSHLPtrCombine(SDNode *N,
SDValue ShlX = DAG.getNode(ISD::SHL, SL, VT, N0.getOperand(0), N1);
SDValue COffset = DAG.getConstant(Offset, SL, MVT::i32);
- return DAG.getNode(ISD::ADD, SL, VT, ShlX, COffset);
+ SDNodeFlags Flags;
+ Flags.setNoUnsignedWrap(N->getFlags().hasNoUnsignedWrap() &&
+ (N0.getOpcode() == ISD::OR ||
+ N0->getFlags().hasNoUnsignedWrap()));
+
+ return DAG.getNode(ISD::ADD, SL, VT, ShlX, COffset, Flags);
}
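The rewritten combine now folds the shifted constant into the addressing-mode offset only when isLegalAddressingMode accepts it; the algebra it relies on is the distributive identity below, which holds in wrap-around (modular) unsigned arithmetic. A quick host-side check, illustrative only:

#include <cassert>
#include <cstdint>

int main() {
  // (x + c1) << c2  ==  (x << c2) + (c1 << c2) in modular 64-bit arithmetic,
  // which is what lets the shifted constant become an immediate offset.
  uint64_t x = 0xDEADBEEFull, c1 = 16, c2 = 3;
  assert(((x + c1) << c2) == ((x << c2) + (c1 << c2)));
  return 0;
}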
SDValue SITargetLowering::performMemSDNodeCombine(MemSDNode *N,
@@ -4267,9 +5325,9 @@ SDValue SITargetLowering::performMemSDNodeCombine(MemSDNode *N,
SDLoc SL(N);
// TODO: We could also do this for multiplies.
- unsigned AS = N->getAddressSpace();
- if (Ptr.getOpcode() == ISD::SHL && AS != AMDGPUASI.PRIVATE_ADDRESS) {
- SDValue NewPtr = performSHLPtrCombine(Ptr.getNode(), AS, DCI);
+ if (Ptr.getOpcode() == ISD::SHL) {
+ SDValue NewPtr = performSHLPtrCombine(Ptr.getNode(), N->getAddressSpace(),
+ N->getMemoryVT(), DCI);
if (NewPtr) {
SmallVector<SDValue, 8> NewOps(N->op_begin(), N->op_end());
@@ -4818,15 +5876,27 @@ SDValue SITargetLowering::performIntMed3ImmCombine(
return DAG.getNode(ISD::TRUNCATE, SL, VT, Med3);
}
+static ConstantFPSDNode *getSplatConstantFP(SDValue Op) {
+ if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Op))
+ return C;
+
+ if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Op)) {
+ if (ConstantFPSDNode *C = BV->getConstantFPSplatNode())
+ return C;
+ }
+
+ return nullptr;
+}
+
SDValue SITargetLowering::performFPMed3ImmCombine(SelectionDAG &DAG,
const SDLoc &SL,
SDValue Op0,
SDValue Op1) const {
- ConstantFPSDNode *K1 = dyn_cast<ConstantFPSDNode>(Op1);
+ ConstantFPSDNode *K1 = getSplatConstantFP(Op1);
if (!K1)
return SDValue();
- ConstantFPSDNode *K0 = dyn_cast<ConstantFPSDNode>(Op0.getOperand(1));
+ ConstantFPSDNode *K0 = getSplatConstantFP(Op0.getOperand(1));
if (!K0)
return SDValue();
@@ -4836,7 +5906,7 @@ SDValue SITargetLowering::performFPMed3ImmCombine(SelectionDAG &DAG,
return SDValue();
// TODO: Check IEEE bit enabled?
- EVT VT = K0->getValueType(0);
+ EVT VT = Op0.getValueType();
if (Subtarget->enableDX10Clamp()) {
// If dx10_clamp is enabled, NaNs clamp to 0.0. This is the same as the
// hardware fmed3 behavior converting to a min.
@@ -4845,19 +5915,21 @@ SDValue SITargetLowering::performFPMed3ImmCombine(SelectionDAG &DAG,
return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Op0.getOperand(0));
}
- // med3 for f16 is only available on gfx9+.
- if (VT == MVT::f64 || (VT == MVT::f16 && !Subtarget->hasMed3_16()))
- return SDValue();
+ // med3 for f16 is only available on gfx9+, and not available for v2f16.
+ if (VT == MVT::f32 || (VT == MVT::f16 && Subtarget->hasMed3_16())) {
+ // This isn't safe with signaling NaNs because in IEEE mode, min/max on a
+ // signaling NaN gives a quiet NaN. The quiet NaN input to the min would
+ // then give the other result, which is different from med3 with a NaN
+ // input.
+ SDValue Var = Op0.getOperand(0);
+ if (!isKnownNeverSNan(DAG, Var))
+ return SDValue();
- // This isn't safe with signaling NaNs because in IEEE mode, min/max on a
- // signaling NaN gives a quiet NaN. The quiet NaN input to the min would then
- // give the other result, which is different from med3 with a NaN input.
- SDValue Var = Op0.getOperand(0);
- if (!isKnownNeverSNan(DAG, Var))
- return SDValue();
+ return DAG.getNode(AMDGPUISD::FMED3, SL, K0->getValueType(0),
+ Var, SDValue(K0, 0), SDValue(K1, 0));
+ }
- return DAG.getNode(AMDGPUISD::FMED3, SL, K0->getValueType(0),
- Var, SDValue(K0, 0), SDValue(K1, 0));
+ return SDValue();
}
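For readers unfamiliar with v_med3: the pattern matched above, min(max(x, K0), K1), is a clamp, and for K0 <= K1 with non-NaN inputs a clamp is exactly the median of the three values, which is why it can be emitted as a single FMED3. A host-side sketch, illustrative only:

#include <algorithm>
#include <cassert>

// Median of three values, the operation v_med3_f32 performs.
static float med3(float A, float B, float C) {
  return std::max(std::min(A, B), std::min(std::max(A, B), C));
}

int main() {
  const float K0 = 0.0f, K1 = 1.0f;            // K0 <= K1, both non-NaN
  for (float X : {-2.0f, 0.25f, 3.0f})
    assert(med3(X, K0, K1) == std::min(std::max(X, K0), K1));
  return 0;
}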
SDValue SITargetLowering::performMinMaxCombine(SDNode *N,
@@ -4918,7 +5990,8 @@ SDValue SITargetLowering::performMinMaxCombine(SDNode *N,
(Opc == AMDGPUISD::FMIN_LEGACY &&
Op0.getOpcode() == AMDGPUISD::FMAX_LEGACY)) &&
(VT == MVT::f32 || VT == MVT::f64 ||
- (VT == MVT::f16 && Subtarget->has16BitInsts())) &&
+ (VT == MVT::f16 && Subtarget->has16BitInsts()) ||
+ (VT == MVT::v2f16 && Subtarget->hasVOP3PInsts())) &&
Op0.hasOneUse()) {
if (SDValue Res = performFPMed3ImmCombine(DAG, SDLoc(N), Op0, Op1))
return Res;
@@ -4994,7 +6067,7 @@ SDValue SITargetLowering::performExtractVectorEltCombine(
SDNode *N, DAGCombinerInfo &DCI) const {
SDValue Vec = N->getOperand(0);
- SelectionDAG &DAG= DCI.DAG;
+ SelectionDAG &DAG = DCI.DAG;
if (Vec.getOpcode() == ISD::FNEG && allUsesHaveSourceMods(N)) {
SDLoc SL(N);
EVT EltVT = N->getValueType(0);
@@ -5007,6 +6080,47 @@ SDValue SITargetLowering::performExtractVectorEltCombine(
return SDValue();
}
+static bool convertBuildVectorCastElt(SelectionDAG &DAG,
+ SDValue &Lo, SDValue &Hi) {
+ if (Hi.getOpcode() == ISD::BITCAST &&
+ Hi.getOperand(0).getValueType() == MVT::f16 &&
+ (isa<ConstantSDNode>(Lo) || Lo.isUndef())) {
+ Lo = DAG.getNode(ISD::BITCAST, SDLoc(Lo), MVT::f16, Lo);
+ Hi = Hi.getOperand(0);
+ return true;
+ }
+
+ return false;
+}
+
+SDValue SITargetLowering::performBuildVectorCombine(
+ SDNode *N, DAGCombinerInfo &DCI) const {
+ SDLoc SL(N);
+
+ if (!isTypeLegal(MVT::v2i16))
+ return SDValue();
+ SelectionDAG &DAG = DCI.DAG;
+ EVT VT = N->getValueType(0);
+
+ if (VT == MVT::v2i16) {
+ SDValue Lo = N->getOperand(0);
+ SDValue Hi = N->getOperand(1);
+
+ // v2i16 build_vector (const|undef), (bitcast f16:$x)
+    // -> bitcast (v2f16 build_vector const|undef, $x)
+ if (convertBuildVectorCastElt(DAG, Lo, Hi)) {
+ SDValue NewVec = DAG.getBuildVector(MVT::v2f16, SL, { Lo, Hi });
+ return DAG.getNode(ISD::BITCAST, SL, VT, NewVec);
+ }
+
+ if (convertBuildVectorCastElt(DAG, Hi, Lo)) {
+ SDValue NewVec = DAG.getBuildVector(MVT::v2f16, SL, { Hi, Lo });
+ return DAG.getNode(ISD::BITCAST, SL, VT, NewVec);
+ }
+ }
+
+ return SDValue();
+}
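The build_vector combine above is a pure re-labelling: packing two 16-bit lanes and bitcasting the 32-bit result gives the same bits whether the lanes are typed i16 or f16, so rebuilding as v2f16 and bitcasting the whole vector is free. A host-side sketch of that bit-level claim (illustrative only; the memcpy packing assumes a little-endian host):

#include <cassert>
#include <cstdint>
#include <cstring>

int main() {
  uint16_t Lo = 0x3C00, Hi = 0xC000;                // raw f16 bit patterns
  uint32_t PackedAsInts = uint32_t(Lo) | (uint32_t(Hi) << 16);
  uint16_t Halves[2] = {Lo, Hi};
  uint32_t PackedViaBitcast;
  std::memcpy(&PackedViaBitcast, Halves, sizeof(PackedViaBitcast));
  assert(PackedAsInts == PackedViaBitcast);         // same bits either way
  return 0;
}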
unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG,
const SDNode *N0,
@@ -5030,18 +6144,57 @@ unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG,
return 0;
}
+static SDValue getMad64_32(SelectionDAG &DAG, const SDLoc &SL,
+ EVT VT,
+ SDValue N0, SDValue N1, SDValue N2,
+ bool Signed) {
+ unsigned MadOpc = Signed ? AMDGPUISD::MAD_I64_I32 : AMDGPUISD::MAD_U64_U32;
+ SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i1);
+ SDValue Mad = DAG.getNode(MadOpc, SL, VTs, N0, N1, N2);
+ return DAG.getNode(ISD::TRUNCATE, SL, VT, Mad);
+}
+
SDValue SITargetLowering::performAddCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
SelectionDAG &DAG = DCI.DAG;
EVT VT = N->getValueType(0);
-
- if (VT != MVT::i32)
- return SDValue();
-
SDLoc SL(N);
SDValue LHS = N->getOperand(0);
SDValue RHS = N->getOperand(1);
+ if ((LHS.getOpcode() == ISD::MUL || RHS.getOpcode() == ISD::MUL)
+ && Subtarget->hasMad64_32() &&
+ !VT.isVector() && VT.getScalarSizeInBits() > 32 &&
+ VT.getScalarSizeInBits() <= 64) {
+ if (LHS.getOpcode() != ISD::MUL)
+ std::swap(LHS, RHS);
+
+ SDValue MulLHS = LHS.getOperand(0);
+ SDValue MulRHS = LHS.getOperand(1);
+ SDValue AddRHS = RHS;
+
+ // TODO: Maybe restrict if SGPR inputs.
+ if (numBitsUnsigned(MulLHS, DAG) <= 32 &&
+ numBitsUnsigned(MulRHS, DAG) <= 32) {
+ MulLHS = DAG.getZExtOrTrunc(MulLHS, SL, MVT::i32);
+ MulRHS = DAG.getZExtOrTrunc(MulRHS, SL, MVT::i32);
+ AddRHS = DAG.getZExtOrTrunc(AddRHS, SL, MVT::i64);
+ return getMad64_32(DAG, SL, VT, MulLHS, MulRHS, AddRHS, false);
+ }
+
+ if (numBitsSigned(MulLHS, DAG) < 32 && numBitsSigned(MulRHS, DAG) < 32) {
+ MulLHS = DAG.getSExtOrTrunc(MulLHS, SL, MVT::i32);
+ MulRHS = DAG.getSExtOrTrunc(MulRHS, SL, MVT::i32);
+ AddRHS = DAG.getSExtOrTrunc(AddRHS, SL, MVT::i64);
+ return getMad64_32(DAG, SL, VT, MulLHS, MulRHS, AddRHS, true);
+ }
+
+ return SDValue();
+ }
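A small host-side check of the property the new mad combine relies on (illustrative only, not part of the patch): when both multiplicands already fit in 32 bits, the full-width a * b + c equals the 32x32->64 multiply-add that v_mad_u64_u32 computes; the signed path is the same argument with sign extension and the stricter "< 32 bits" bound.

#include <cassert>
#include <cstdint>

int main() {
  uint64_t A = 0x12345678u, B = 0x9ABCDEF0u, C = 0x0011223344556677ull;
  uint64_t Wide = A * B + C;                               // generic i64 mad
  uint64_t Mad = uint64_t(uint32_t(A)) * uint32_t(B) + C;  // 32x32->64 form
  assert(Wide == Mad);                                     // A, B fit in 32 bits
  return 0;
}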
+
+ if (VT != MVT::i32)
+ return SDValue();
+
// add x, zext (setcc) => addcarry x, 0, setcc
// add x, sext (setcc) => subcarry x, 0, setcc
unsigned Opc = LHS.getOpcode();
@@ -5428,6 +6581,8 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
}
case ISD::EXTRACT_VECTOR_ELT:
return performExtractVectorEltCombine(N, DCI);
+ case ISD::BUILD_VECTOR:
+ return performBuildVectorCombine(N, DCI);
}
return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
}
@@ -5444,13 +6599,19 @@ static unsigned SubIdx2Lane(unsigned Idx) {
}
/// \brief Adjust the writemask of MIMG instructions
-void SITargetLowering::adjustWritemask(MachineSDNode *&Node,
- SelectionDAG &DAG) const {
- SDNode *Users[4] = { };
+SDNode *SITargetLowering::adjustWritemask(MachineSDNode *&Node,
+ SelectionDAG &DAG) const {
+ SDNode *Users[4] = { nullptr };
unsigned Lane = 0;
unsigned DmaskIdx = (Node->getNumOperands() - Node->getNumValues() == 9) ? 2 : 3;
unsigned OldDmask = Node->getConstantOperandVal(DmaskIdx);
unsigned NewDmask = 0;
+ bool HasChain = Node->getNumValues() > 1;
+
+ if (OldDmask == 0) {
+    // These are folded out, but in case it happens, don't assert.
+ return Node;
+ }
// Try to figure out the used register components
for (SDNode::use_iterator I = Node->use_begin(), E = Node->use_end();
@@ -5463,9 +6624,9 @@ void SITargetLowering::adjustWritemask(MachineSDNode *&Node,
// Abort if we can't understand the usage
if (!I->isMachineOpcode() ||
I->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG)
- return;
+ return Node;
- // Lane means which subreg of %VGPRa_VGPRb_VGPRc_VGPRd is used.
+ // Lane means which subreg of %vgpra_vgprb_vgprc_vgprd is used.
// Note that subregs are packed, i.e. Lane==0 is the first bit set
// in OldDmask, so it can be any of X,Y,Z,W; Lane==1 is the second bit
// set, etc.
@@ -5474,14 +6635,13 @@ void SITargetLowering::adjustWritemask(MachineSDNode *&Node,
// Set which texture component corresponds to the lane.
unsigned Comp;
for (unsigned i = 0, Dmask = OldDmask; i <= Lane; i++) {
- assert(Dmask);
Comp = countTrailingZeros(Dmask);
Dmask &= ~(1 << Comp);
}
// Abort if we have more than one user per component
if (Users[Lane])
- return;
+ return Node;
Users[Lane] = *I;
NewDmask |= 1 << Comp;
@@ -5489,25 +6649,47 @@ void SITargetLowering::adjustWritemask(MachineSDNode *&Node,
// Abort if there's no change
if (NewDmask == OldDmask)
- return;
+ return Node;
+
+ unsigned BitsSet = countPopulation(NewDmask);
+
+ const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
+ int NewOpcode = AMDGPU::getMaskedMIMGOp(*TII,
+ Node->getMachineOpcode(), BitsSet);
+ assert(NewOpcode != -1 &&
+ NewOpcode != static_cast<int>(Node->getMachineOpcode()) &&
+ "failed to find equivalent MIMG op");
// Adjust the writemask in the node
- std::vector<SDValue> Ops;
+ SmallVector<SDValue, 12> Ops;
Ops.insert(Ops.end(), Node->op_begin(), Node->op_begin() + DmaskIdx);
Ops.push_back(DAG.getTargetConstant(NewDmask, SDLoc(Node), MVT::i32));
Ops.insert(Ops.end(), Node->op_begin() + DmaskIdx + 1, Node->op_end());
- Node = (MachineSDNode*)DAG.UpdateNodeOperands(Node, Ops);
-
- // If we only got one lane, replace it with a copy
- // (if NewDmask has only one bit set...)
- if (NewDmask && (NewDmask & (NewDmask-1)) == 0) {
- SDValue RC = DAG.getTargetConstant(AMDGPU::VGPR_32RegClassID, SDLoc(),
- MVT::i32);
- SDNode *Copy = DAG.getMachineNode(TargetOpcode::COPY_TO_REGCLASS,
- SDLoc(), Users[Lane]->getValueType(0),
- SDValue(Node, 0), RC);
+
+ MVT SVT = Node->getValueType(0).getVectorElementType().getSimpleVT();
+
+ MVT ResultVT = BitsSet == 1 ?
+ SVT : MVT::getVectorVT(SVT, BitsSet == 3 ? 4 : BitsSet);
+ SDVTList NewVTList = HasChain ?
+ DAG.getVTList(ResultVT, MVT::Other) : DAG.getVTList(ResultVT);
+
+ MachineSDNode *NewNode = DAG.getMachineNode(NewOpcode, SDLoc(Node),
+ NewVTList, Ops);
+
+ if (HasChain) {
+ // Update chain.
+ NewNode->setMemRefs(Node->memoperands_begin(), Node->memoperands_end());
+ DAG.ReplaceAllUsesOfValueWith(SDValue(Node, 1), SDValue(NewNode, 1));
+ }
+
+ if (BitsSet == 1) {
+ assert(Node->hasNUsesOfValue(1, 0));
+ SDNode *Copy = DAG.getMachineNode(TargetOpcode::COPY,
+ SDLoc(Node), Users[Lane]->getValueType(0),
+ SDValue(NewNode, 0));
DAG.ReplaceAllUsesWith(Users[Lane], Copy);
- return;
+ return nullptr;
}
// Update the users of the node with the new indices
@@ -5517,7 +6699,7 @@ void SITargetLowering::adjustWritemask(MachineSDNode *&Node,
continue;
SDValue Op = DAG.getTargetConstant(Idx, SDLoc(User), MVT::i32);
- DAG.UpdateNodeOperands(User, User->getOperand(0), Op);
+ DAG.UpdateNodeOperands(User, SDValue(NewNode, 0), Op);
switch (Idx) {
default: break;
@@ -5526,6 +6708,9 @@ void SITargetLowering::adjustWritemask(MachineSDNode *&Node,
case AMDGPU::sub2: Idx = AMDGPU::sub3; break;
}
}
+
+ DAG.RemoveDeadNode(Node);
+ return nullptr;
}
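In the rewritten adjustWritemask, Lane refers to the N-th set bit of the old dmask, and BitsSet (the popcount of the new dmask) selects both the replacement MIMG opcode and the result width, with a 3-bit mask rounded up to a 4-element result. A host-side sketch of the lane-to-component bookkeeping (illustrative only; countTrailingZeros is spelled out as a loop):

#include <cassert>

// Which texture component (0..3) does the Lane-th set bit of Dmask select?
static unsigned componentForLane(unsigned Dmask, unsigned Lane) {
  unsigned Comp = 0;
  for (unsigned I = 0; I <= Lane; ++I) {
    Comp = 0;
    while (!(Dmask & (1u << Comp)))
      ++Comp;                      // countTrailingZeros(Dmask)
    Dmask &= ~(1u << Comp);        // consume this bit
  }
  return Comp;
}

int main() {
  // dmask 0b1010 selects components Y and W: lane 0 -> Y (1), lane 1 -> W (3).
  assert(componentForLane(0b1010, 0) == 1);
  assert(componentForLane(0b1010, 1) == 3);
  return 0;
}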
static bool isFrameIndexOp(SDValue Op) {
@@ -5579,25 +6764,80 @@ SDNode *SITargetLowering::legalizeTargetIndependentNode(SDNode *Node,
Node->getOperand(i)), 0));
}
- DAG.UpdateNodeOperands(Node, Ops);
- return Node;
+ return DAG.UpdateNodeOperands(Node, Ops);
}
/// \brief Fold the instructions after selecting them.
+/// Returns null if users were already updated.
SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node,
SelectionDAG &DAG) const {
const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
unsigned Opcode = Node->getMachineOpcode();
if (TII->isMIMG(Opcode) && !TII->get(Opcode).mayStore() &&
- !TII->isGather4(Opcode))
- adjustWritemask(Node, DAG);
+ !TII->isGather4(Opcode)) {
+ return adjustWritemask(Node, DAG);
+ }
if (Opcode == AMDGPU::INSERT_SUBREG ||
Opcode == AMDGPU::REG_SEQUENCE) {
legalizeTargetIndependentNode(Node, DAG);
return Node;
}
+
+ switch (Opcode) {
+ case AMDGPU::V_DIV_SCALE_F32:
+ case AMDGPU::V_DIV_SCALE_F64: {
+ // Satisfy the operand register constraint when one of the inputs is
+ // undefined. Ordinarily each undef value will have its own implicit_def of
+ // a vreg, so force these to use a single register.
+ SDValue Src0 = Node->getOperand(0);
+ SDValue Src1 = Node->getOperand(1);
+ SDValue Src2 = Node->getOperand(2);
+
+ if ((Src0.isMachineOpcode() &&
+ Src0.getMachineOpcode() != AMDGPU::IMPLICIT_DEF) &&
+ (Src0 == Src1 || Src0 == Src2))
+ break;
+
+ MVT VT = Src0.getValueType().getSimpleVT();
+ const TargetRegisterClass *RC = getRegClassFor(VT);
+
+ MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
+ SDValue UndefReg = DAG.getRegister(MRI.createVirtualRegister(RC), VT);
+
+ SDValue ImpDef = DAG.getCopyToReg(DAG.getEntryNode(), SDLoc(Node),
+ UndefReg, Src0, SDValue());
+
+ // src0 must be the same register as src1 or src2, even if the value is
+ // undefined, so make sure we don't violate this constraint.
+ if (Src0.isMachineOpcode() &&
+ Src0.getMachineOpcode() == AMDGPU::IMPLICIT_DEF) {
+ if (Src1.isMachineOpcode() &&
+ Src1.getMachineOpcode() != AMDGPU::IMPLICIT_DEF)
+ Src0 = Src1;
+ else if (Src2.isMachineOpcode() &&
+ Src2.getMachineOpcode() != AMDGPU::IMPLICIT_DEF)
+ Src0 = Src2;
+ else {
+ assert(Src1.getMachineOpcode() == AMDGPU::IMPLICIT_DEF);
+ Src0 = UndefReg;
+ Src1 = UndefReg;
+ }
+ } else
+ break;
+
+ SmallVector<SDValue, 4> Ops = { Src0, Src1, Src2 };
+ for (unsigned I = 3, N = Node->getNumOperands(); I != N; ++I)
+ Ops.push_back(Node->getOperand(I));
+
+ Ops.push_back(ImpDef.getValue(1));
+ return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
+ }
+ default:
+ break;
+ }
+
return Node;
}
@@ -5615,31 +6855,6 @@ void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
return;
}
- if (TII->isMIMG(MI)) {
- unsigned VReg = MI.getOperand(0).getReg();
- const TargetRegisterClass *RC = MRI.getRegClass(VReg);
- // TODO: Need mapping tables to handle other cases (register classes).
- if (RC != &AMDGPU::VReg_128RegClass)
- return;
-
- unsigned DmaskIdx = MI.getNumOperands() == 12 ? 3 : 4;
- unsigned Writemask = MI.getOperand(DmaskIdx).getImm();
- unsigned BitsSet = 0;
- for (unsigned i = 0; i < 4; ++i)
- BitsSet += Writemask & (1 << i) ? 1 : 0;
- switch (BitsSet) {
- default: return;
- case 1: RC = &AMDGPU::VGPR_32RegClass; break;
- case 2: RC = &AMDGPU::VReg_64RegClass; break;
- case 3: RC = &AMDGPU::VReg_96RegClass; break;
- }
-
- unsigned NewOpcode = TII->getMaskedMIMGOp(MI.getOpcode(), BitsSet);
- MI.setDesc(TII->get(NewOpcode));
- MRI.setRegClass(VReg, RC);
- return;
- }
-
// Replace unused atomics with the no return version.
int NoRetAtomicOp = AMDGPU::getAtomicNoRetOp(MI.getOpcode());
if (NoRetAtomicOp != -1) {
@@ -5870,3 +7085,21 @@ void SITargetLowering::finalizeLowering(MachineFunction &MF) const {
TargetLoweringBase::finalizeLowering(MF);
}
+
+void SITargetLowering::computeKnownBitsForFrameIndex(const SDValue Op,
+ KnownBits &Known,
+ const APInt &DemandedElts,
+ const SelectionDAG &DAG,
+ unsigned Depth) const {
+ TargetLowering::computeKnownBitsForFrameIndex(Op, Known, DemandedElts,
+ DAG, Depth);
+
+ if (getSubtarget()->enableHugePrivateBuffer())
+ return;
+
+ // Technically it may be possible to have a dispatch with a single workitem
+ // that uses the full private memory size, but that's not really useful. We
+ // can't use vaddr in MUBUF instructions if we don't know the address
+ // calculation won't overflow, so assume the sign bit is never set.
+ Known.Zero.setHighBits(AssumeFrameIndexHighZeroBits);
+}
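A minimal sketch of what the known-bits claim above buys (illustrative only; the bit count is a stand-in for AssumeFrameIndexHighZeroBits): once the high bits of a frame-index address are known zero, the value is provably small and non-negative, so MUBUF address arithmetic on it cannot overflow into the sign bit.

#include <cassert>
#include <cstdint>

int main() {
  const unsigned HighZeroBits = 16;               // stand-in for the real constant
  const uint32_t KnownZeroMask = ~0u << (32 - HighZeroBits);
  const uint32_t FrameIndexAddr = 0x0000FFFFu;    // any value within the low bits
  assert((FrameIndexAddr & KnownZeroMask) == 0);  // high bits really are zero
  assert(int32_t(FrameIndexAddr) >= 0);           // hence the sign bit is clear
  return 0;
}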
diff --git a/lib/Target/AMDGPU/SIISelLowering.h b/lib/Target/AMDGPU/SIISelLowering.h
index e6bb3d6cd419..b48e67f7563a 100644
--- a/lib/Target/AMDGPU/SIISelLowering.h
+++ b/lib/Target/AMDGPU/SIISelLowering.h
@@ -16,6 +16,7 @@
#define LLVM_LIB_TARGET_AMDGPU_SIISELLOWERING_H
#include "AMDGPUISelLowering.h"
+#include "AMDGPUArgumentUsageInfo.h"
#include "SIInstrInfo.h"
namespace llvm {
@@ -23,6 +24,7 @@ namespace llvm {
class SITargetLowering final : public AMDGPUTargetLowering {
SDValue lowerKernArgParameterPtr(SelectionDAG &DAG, const SDLoc &SL,
SDValue Chain, uint64_t Offset) const;
+ SDValue getImplicitArgPtr(SelectionDAG &DAG, const SDLoc &SL) const;
SDValue lowerKernargMemParameter(SelectionDAG &DAG, EVT VT, EVT MemVT,
const SDLoc &SL, SDValue Chain,
uint64_t Offset, bool Signed,
@@ -31,6 +33,10 @@ class SITargetLowering final : public AMDGPUTargetLowering {
SDValue lowerStackParameter(SelectionDAG &DAG, CCValAssign &VA,
const SDLoc &SL, SDValue Chain,
const ISD::InputArg &Arg) const;
+ SDValue getPreloadedValue(SelectionDAG &DAG,
+ const SIMachineFunctionInfo &MFI,
+ EVT VT,
+ AMDGPUFunctionArgInfo::PreloadedValue) const;
SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op,
SelectionDAG &DAG) const override;
@@ -76,12 +82,13 @@ class SITargetLowering final : public AMDGPUTargetLowering {
SDValue lowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerTRAP(SDValue Op, SelectionDAG &DAG) const;
- void adjustWritemask(MachineSDNode *&N, SelectionDAG &DAG) const;
+ SDNode *adjustWritemask(MachineSDNode *&N, SelectionDAG &DAG) const;
SDValue performUCharToFloatCombine(SDNode *N,
DAGCombinerInfo &DCI) const;
SDValue performSHLPtrCombine(SDNode *N,
unsigned AS,
+ EVT MemVT,
DAGCombinerInfo &DCI) const;
SDValue performMemSDNodeCombine(MemSDNode *N, DAGCombinerInfo &DCI) const;
@@ -105,6 +112,7 @@ class SITargetLowering final : public AMDGPUTargetLowering {
SDValue performFMed3Combine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performCvtPkRTZCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performExtractVectorEltCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+ SDValue performBuildVectorCombine(SDNode *N, DAGCombinerInfo &DCI) const;
unsigned getFusedOpcode(const SelectionDAG &DAG,
const SDNode *N0, const SDNode *N1) const;
@@ -117,6 +125,7 @@ class SITargetLowering final : public AMDGPUTargetLowering {
SDValue performCvtF32UByteNCombine(SDNode *N, DAGCombinerInfo &DCI) const;
bool isLegalFlatAddressingMode(const AddrMode &AM) const;
+ bool isLegalGlobalAddressingMode(const AddrMode &AM) const;
bool isLegalMUBUFAddressingMode(const AddrMode &AM) const;
unsigned isCFIntrinsic(const SDNode *Intr) const;
@@ -140,10 +149,10 @@ public:
const SISubtarget *getSubtarget() const;
- bool isShuffleMaskLegal(const SmallVectorImpl<int> &/*Mask*/,
- EVT /*VT*/) const override;
+ bool isShuffleMaskLegal(ArrayRef<int> /*Mask*/, EVT /*VT*/) const override;
bool getTgtMemIntrinsic(IntrinsicInfo &, const CallInst &,
+ MachineFunction &MF,
unsigned IntrinsicID) const override;
bool getAddrModeArguments(IntrinsicInst * /*I*/,
@@ -151,7 +160,8 @@ public:
Type *&/*AccessTy*/) const override;
bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty,
- unsigned AS) const override;
+ unsigned AS,
+ Instruction *I = nullptr) const override;
bool canMergeStoresTo(unsigned AS, EVT MemVT,
const SelectionDAG &DAG) const override;
@@ -181,6 +191,12 @@ public:
bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override;
+ bool supportSplitCSR(MachineFunction *MF) const override;
+ void initializeSplitCSR(MachineBasicBlock *Entry) const override;
+ void insertCopiesSplitCSR(
+ MachineBasicBlock *Entry,
+ const SmallVectorImpl<MachineBasicBlock *> &Exits) const override;
+
SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv,
bool isVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins,
@@ -197,6 +213,32 @@ public:
const SmallVectorImpl<SDValue> &OutVals, const SDLoc &DL,
SelectionDAG &DAG) const override;
+ void passSpecialInputs(
+ CallLoweringInfo &CLI,
+ const SIMachineFunctionInfo &Info,
+ SmallVectorImpl<std::pair<unsigned, SDValue>> &RegsToPass,
+ SmallVectorImpl<SDValue> &MemOpChains,
+ SDValue Chain,
+ SDValue StackPtr) const;
+
+ SDValue LowerCallResult(SDValue Chain, SDValue InFlag,
+ CallingConv::ID CallConv, bool isVarArg,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ const SDLoc &DL, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals, bool isThisReturn,
+ SDValue ThisVal) const;
+
+ bool mayBeEmittedAsTailCall(const CallInst *) const override;
+
+ bool isEligibleForTailCallOptimization(
+ SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ const SmallVectorImpl<SDValue> &OutVals,
+ const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const;
+
+ SDValue LowerCall(CallLoweringInfo &CLI,
+ SmallVectorImpl<SDValue> &InVals) const override;
+
unsigned getRegisterByName(const char* RegName, EVT VT,
SelectionDAG &DAG) const override;
@@ -206,6 +248,8 @@ public:
MachineBasicBlock *
EmitInstrWithCustomInserter(MachineInstr &MI,
MachineBasicBlock *BB) const override;
+
+ bool hasBitPreservingFPLogic(EVT VT) const override;
bool enableAggressiveFMAFusion(EVT VT) const override;
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context,
EVT VT) const override;
@@ -234,6 +278,12 @@ public:
SDValue V) const;
void finalizeLowering(MachineFunction &MF) const override;
+
+ void computeKnownBitsForFrameIndex(const SDValue Op,
+ KnownBits &Known,
+ const APInt &DemandedElts,
+ const SelectionDAG &DAG,
+ unsigned Depth = 0) const override;
};
} // End namespace llvm
diff --git a/lib/Target/AMDGPU/SIInsertSkips.cpp b/lib/Target/AMDGPU/SIInsertSkips.cpp
index ba346d2fad02..a2f844d7854e 100644
--- a/lib/Target/AMDGPU/SIInsertSkips.cpp
+++ b/lib/Target/AMDGPU/SIInsertSkips.cpp
@@ -132,6 +132,16 @@ bool SIInsertSkips::shouldSkip(const MachineBasicBlock &From,
I->getOpcode() == AMDGPU::S_CBRANCH_VCCZ)
return true;
+  // The V_READFIRSTLANE/V_READLANE destination register may be used as an
+  // operand by some SALU instruction. If the exec mask is zero, the vector
+  // instruction defining that register is not executed, so the scalar
+  // instruction would operate on undefined data. For
+  // V_READFIRSTLANE/V_READLANE we should therefore avoid predicated execution.
+ if ((I->getOpcode() == AMDGPU::V_READFIRSTLANE_B32) ||
+ (I->getOpcode() == AMDGPU::V_READLANE_B32)) {
+ return true;
+ }
+
if (I->isInlineAsm()) {
const MCAsmInfo *MAI = MF->getTarget().getMCAsmInfo();
const char *AsmStr = I->getOperand(0).getSymbolName();
@@ -156,7 +166,7 @@ bool SIInsertSkips::skipIfDead(MachineInstr &MI, MachineBasicBlock &NextBB) {
MachineBasicBlock &MBB = *MI.getParent();
MachineFunction *MF = MBB.getParent();
- if (MF->getFunction()->getCallingConv() != CallingConv::AMDGPU_PS ||
+ if (MF->getFunction().getCallingConv() != CallingConv::AMDGPU_PS ||
!shouldSkip(MBB, MBB.getParent()->back()))
return false;
@@ -190,25 +200,101 @@ bool SIInsertSkips::skipIfDead(MachineInstr &MI, MachineBasicBlock &NextBB) {
void SIInsertSkips::kill(MachineInstr &MI) {
MachineBasicBlock &MBB = *MI.getParent();
DebugLoc DL = MI.getDebugLoc();
- const MachineOperand &Op = MI.getOperand(0);
-
-#ifndef NDEBUG
- CallingConv::ID CallConv = MBB.getParent()->getFunction()->getCallingConv();
- // Kill is only allowed in pixel / geometry shaders.
- assert(CallConv == CallingConv::AMDGPU_PS ||
- CallConv == CallingConv::AMDGPU_GS);
-#endif
- // Clear this thread from the exec mask if the operand is negative.
- if (Op.isImm()) {
- // Constant operand: Set exec mask to 0 or do nothing
- if (Op.getImm() & 0x80000000) {
- BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC)
- .addImm(0);
+
+ switch (MI.getOpcode()) {
+ case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR: {
+ unsigned Opcode = 0;
+
+ // The opcodes are inverted because the inline immediate has to be
+ // the first operand, e.g. from "x < imm" to "imm > x"
+ switch (MI.getOperand(2).getImm()) {
+ case ISD::SETOEQ:
+ case ISD::SETEQ:
+ Opcode = AMDGPU::V_CMPX_EQ_F32_e32;
+ break;
+ case ISD::SETOGT:
+ case ISD::SETGT:
+ Opcode = AMDGPU::V_CMPX_LT_F32_e32;
+ break;
+ case ISD::SETOGE:
+ case ISD::SETGE:
+ Opcode = AMDGPU::V_CMPX_LE_F32_e32;
+ break;
+ case ISD::SETOLT:
+ case ISD::SETLT:
+ Opcode = AMDGPU::V_CMPX_GT_F32_e32;
+ break;
+ case ISD::SETOLE:
+ case ISD::SETLE:
+ Opcode = AMDGPU::V_CMPX_GE_F32_e32;
+ break;
+ case ISD::SETONE:
+ case ISD::SETNE:
+ Opcode = AMDGPU::V_CMPX_LG_F32_e32;
+ break;
+ case ISD::SETO:
+ Opcode = AMDGPU::V_CMPX_O_F32_e32;
+ break;
+ case ISD::SETUO:
+ Opcode = AMDGPU::V_CMPX_U_F32_e32;
+ break;
+ case ISD::SETUEQ:
+ Opcode = AMDGPU::V_CMPX_NLG_F32_e32;
+ break;
+ case ISD::SETUGT:
+ Opcode = AMDGPU::V_CMPX_NGE_F32_e32;
+ break;
+ case ISD::SETUGE:
+ Opcode = AMDGPU::V_CMPX_NGT_F32_e32;
+ break;
+ case ISD::SETULT:
+ Opcode = AMDGPU::V_CMPX_NLE_F32_e32;
+ break;
+ case ISD::SETULE:
+ Opcode = AMDGPU::V_CMPX_NLT_F32_e32;
+ break;
+ case ISD::SETUNE:
+ Opcode = AMDGPU::V_CMPX_NEQ_F32_e32;
+ break;
+ default:
+ llvm_unreachable("invalid ISD:SET cond code");
}
- } else {
- BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CMPX_LE_F32_e32))
- .addImm(0)
+
+ // TODO: Allow this:
+ if (!MI.getOperand(0).isReg() ||
+ !TRI->isVGPR(MBB.getParent()->getRegInfo(),
+ MI.getOperand(0).getReg()))
+ llvm_unreachable("SI_KILL operand should be a VGPR");
+
+ BuildMI(MBB, &MI, DL, TII->get(Opcode))
+ .add(MI.getOperand(1))
+ .add(MI.getOperand(0));
+ break;
+ }
+ case AMDGPU::SI_KILL_I1_TERMINATOR: {
+ const MachineOperand &Op = MI.getOperand(0);
+ int64_t KillVal = MI.getOperand(1).getImm();
+ assert(KillVal == 0 || KillVal == -1);
+
+ // Kill all threads if Op0 is an immediate and equal to the Kill value.
+ if (Op.isImm()) {
+ int64_t Imm = Op.getImm();
+ assert(Imm == 0 || Imm == -1);
+
+ if (Imm == KillVal)
+ BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC)
+ .addImm(0);
+ break;
+ }
+
+ unsigned Opcode = KillVal ? AMDGPU::S_ANDN2_B64 : AMDGPU::S_AND_B64;
+ BuildMI(MBB, &MI, DL, TII->get(Opcode), AMDGPU::EXEC)
+ .addReg(AMDGPU::EXEC)
.add(Op);
+ break;
+ }
+ default:
+ llvm_unreachable("invalid opcode, expected SI_KILL_*_TERMINATOR");
}
}
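The SI_KILL_I1_TERMINATOR lowering above updates the exec mask so that lanes whose condition matches KillVal stop executing: S_ANDN2_B64 when killing on true, S_AND_B64 when killing on false. A host-side sketch of that mask arithmetic (illustrative only):

#include <cassert>
#include <cstdint>

static uint64_t killLanes(uint64_t Exec, uint64_t Cond, bool KillIfTrue) {
  return KillIfTrue ? (Exec & ~Cond)    // s_andn2_b64 exec, exec, cond
                    : (Exec & Cond);    // s_and_b64   exec, exec, cond
}

int main() {
  const uint64_t Exec = 0xFF, Cond = 0x0F;   // lanes 0-3 have a true condition
  assert(killLanes(Exec, Cond, /*KillIfTrue=*/true)  == 0xF0);
  assert(killLanes(Exec, Cond, /*KillIfTrue=*/false) == 0x0F);
  return 0;
}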
@@ -301,7 +387,8 @@ bool SIInsertSkips::runOnMachineFunction(MachineFunction &MF) {
}
break;
- case AMDGPU::SI_KILL_TERMINATOR:
+ case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
+ case AMDGPU::SI_KILL_I1_TERMINATOR:
MadeChange = true;
kill(MI);
diff --git a/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index 0f009a48754a..6bbe5979316d 100644
--- a/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -1,4 +1,4 @@
-//===-- SIInsertWaitcnts.cpp - Insert Wait Instructions --------------------===/
+//===- SIInsertWaitcnts.cpp - Insert Wait Instructions --------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -21,12 +21,34 @@
#include "SIDefines.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
+#include "SIRegisterInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/DebugLoc.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <cstring>
+#include <memory>
+#include <utility>
+#include <vector>
#define DEBUG_TYPE "si-insert-waitcnts"
@@ -42,7 +64,7 @@ namespace {
enum InstCounterType { VM_CNT = 0, LGKM_CNT, EXP_CNT, NUM_INST_CNTS };
-typedef std::pair<signed, signed> RegInterval;
+using RegInterval = std::pair<signed, signed>;
struct {
int32_t VmcntMax;
@@ -101,6 +123,15 @@ enum RegisterMapping {
// "s_waitcnt 0" before use.
class BlockWaitcntBrackets {
public:
+ BlockWaitcntBrackets() {
+ for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
+ T = (enum InstCounterType)(T + 1)) {
+ memset(VgprScores[T], 0, sizeof(VgprScores[T]));
+ }
+ }
+
+ ~BlockWaitcntBrackets() = default;
+
static int32_t getWaitCountMax(InstCounterType T) {
switch (T) {
case VM_CNT:
@@ -113,14 +144,14 @@ public:
break;
}
return 0;
- };
+ }
void setScoreLB(InstCounterType T, int32_t Val) {
assert(T < NUM_INST_CNTS);
if (T >= NUM_INST_CNTS)
return;
ScoreLBs[T] = Val;
- };
+ }
void setScoreUB(InstCounterType T, int32_t Val) {
assert(T < NUM_INST_CNTS);
@@ -132,21 +163,21 @@ public:
if (ScoreLBs[T] < UB)
ScoreLBs[T] = UB;
}
- };
+ }
int32_t getScoreLB(InstCounterType T) {
assert(T < NUM_INST_CNTS);
if (T >= NUM_INST_CNTS)
return 0;
return ScoreLBs[T];
- };
+ }
int32_t getScoreUB(InstCounterType T) {
assert(T < NUM_INST_CNTS);
if (T >= NUM_INST_CNTS)
return 0;
return ScoreUBs[T];
- };
+ }
// Mapping from event to counter.
InstCounterType eventCounter(WaitEventType E) {
@@ -218,26 +249,18 @@ public:
void setEventUB(enum WaitEventType W, int32_t Val) { EventUBs[W] = Val; }
int32_t getMaxVGPR() const { return VgprUB; }
int32_t getMaxSGPR() const { return SgprUB; }
+
int32_t getEventUB(enum WaitEventType W) const {
assert(W < NUM_WAIT_EVENTS);
return EventUBs[W];
}
+
bool counterOutOfOrder(InstCounterType T);
unsigned int updateByWait(InstCounterType T, int ScoreToWait);
void updateByEvent(const SIInstrInfo *TII, const SIRegisterInfo *TRI,
const MachineRegisterInfo *MRI, WaitEventType E,
MachineInstr &MI);
- BlockWaitcntBrackets()
- : WaitAtBeginning(false), RevisitLoop(false), ValidLoop(false), MixedExpTypes(false),
- LoopRegion(NULL), PostOrder(0), Waitcnt(NULL), VgprUB(0), SgprUB(0) {
- for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
- T = (enum InstCounterType)(T + 1)) {
- memset(VgprScores[T], 0, sizeof(VgprScores[T]));
- }
- }
- ~BlockWaitcntBrackets(){};
-
bool hasPendingSMEM() const {
return (EventUBs[SMEM_ACCESS] > ScoreLBs[LGKM_CNT] &&
EventUBs[SMEM_ACCESS] <= ScoreUBs[LGKM_CNT]);
@@ -266,7 +289,7 @@ public:
int32_t getPostOrder() const { return PostOrder; }
void setWaitcnt(MachineInstr *WaitcntIn) { Waitcnt = WaitcntIn; }
- void clearWaitcnt() { Waitcnt = NULL; }
+ void clearWaitcnt() { Waitcnt = nullptr; }
MachineInstr *getWaitcnt() const { return Waitcnt; }
bool mixedExpTypes() const { return MixedExpTypes; }
@@ -278,13 +301,11 @@ public:
void dump() { print(dbgs()); }
private:
- bool WaitAtBeginning;
- bool RevisitLoop;
- bool ValidLoop;
- bool MixedExpTypes;
- MachineLoop *LoopRegion;
- int32_t PostOrder;
- MachineInstr *Waitcnt;
+ bool WaitAtBeginning = false;
+ bool RevisitLoop = false;
+ bool MixedExpTypes = false;
+ int32_t PostOrder = 0;
+ MachineInstr *Waitcnt = nullptr;
int32_t ScoreLBs[NUM_INST_CNTS] = {0};
int32_t ScoreUBs[NUM_INST_CNTS] = {0};
int32_t EventUBs[NUM_WAIT_EVENTS] = {0};
@@ -292,8 +313,8 @@ private:
int32_t LastFlat[NUM_INST_CNTS] = {0};
// wait_cnt scores for every vgpr.
// Keep track of the VgprUB and SgprUB to make merge at join efficient.
- int32_t VgprUB;
- int32_t SgprUB;
+ int32_t VgprUB = 0;
+ int32_t SgprUB = 0;
int32_t VgprScores[NUM_INST_CNTS][NUM_ALL_VGPRS];
// Wait cnt scores for every sgpr, only lgkmcnt is relevant.
int32_t SgprScores[SQ_MAX_PGM_SGPRS] = {0};
@@ -306,38 +327,36 @@ private:
// at the end of the loop footer.
class LoopWaitcntData {
public:
+ LoopWaitcntData() = default;
+ ~LoopWaitcntData() = default;
+
void incIterCnt() { IterCnt++; }
void resetIterCnt() { IterCnt = 0; }
int32_t getIterCnt() { return IterCnt; }
- LoopWaitcntData() : LfWaitcnt(NULL), IterCnt(0) {}
- ~LoopWaitcntData(){};
-
void setWaitcnt(MachineInstr *WaitcntIn) { LfWaitcnt = WaitcntIn; }
MachineInstr *getWaitcnt() const { return LfWaitcnt; }
void print() {
DEBUG(dbgs() << " iteration " << IterCnt << '\n';);
- return;
}
private:
  // s_waitcnt added at the end of the loop footer to stabilize the wait
  // scores at the end of the loop.
- MachineInstr *LfWaitcnt;
+ MachineInstr *LfWaitcnt = nullptr;
// Number of iterations the loop has been visited, not including the initial
// walk over.
- int32_t IterCnt;
+ int32_t IterCnt = 0;
};
class SIInsertWaitcnts : public MachineFunctionPass {
-
private:
- const SISubtarget *ST;
- const SIInstrInfo *TII;
- const SIRegisterInfo *TRI;
- const MachineRegisterInfo *MRI;
- const MachineLoopInfo *MLI;
+ const SISubtarget *ST = nullptr;
+ const SIInstrInfo *TII = nullptr;
+ const SIRegisterInfo *TRI = nullptr;
+ const MachineRegisterInfo *MRI = nullptr;
+ const MachineLoopInfo *MLI = nullptr;
AMDGPU::IsaInfo::IsaVersion IV;
AMDGPUAS AMDGPUASI;
@@ -357,9 +376,7 @@ private:
public:
static char ID;
- SIInsertWaitcnts()
- : MachineFunctionPass(ID), ST(nullptr), TII(nullptr), TRI(nullptr),
- MRI(nullptr), MLI(nullptr) {}
+ SIInsertWaitcnts() : MachineFunctionPass(ID) {}
bool runOnMachineFunction(MachineFunction &MF) override;
@@ -376,9 +393,11 @@ public:
void addKillWaitBracket(BlockWaitcntBrackets *Bracket) {
// The waitcnt information is copied because it changes as the block is
// traversed.
- KillWaitBrackets.push_back(make_unique<BlockWaitcntBrackets>(*Bracket));
+ KillWaitBrackets.push_back(
+ llvm::make_unique<BlockWaitcntBrackets>(*Bracket));
}
+ bool mayAccessLDSThroughFlat(const MachineInstr &MI) const;
MachineInstr *generateSWaitCntInstBefore(MachineInstr &MI,
BlockWaitcntBrackets *ScoreBrackets);
void updateEventWaitCntAfter(MachineInstr &Inst,
@@ -389,7 +408,7 @@ public:
void insertWaitcntBeforeCF(MachineBasicBlock &Block, MachineInstr *Inst);
};
-} // End anonymous namespace.
+} // end anonymous namespace
RegInterval BlockWaitcntBrackets::getRegInterval(const MachineInstr *MI,
const SIInstrInfo *TII,
@@ -567,13 +586,13 @@ void BlockWaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
}
#if 0 // TODO: check if this is handled by MUBUF code above.
} else if (Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORD ||
- Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORDX2 ||
- Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORDX4) {
+ Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORDX2 ||
+ Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORDX4) {
MachineOperand *MO = TII->getNamedOperand(Inst, AMDGPU::OpName::data);
unsigned OpNo;//TODO: find the OpNo for this operand;
RegInterval Interval = getRegInterval(&Inst, TII, MRI, TRI, OpNo, false);
for (signed RegNo = Interval.first; RegNo < Interval.second;
- ++RegNo) {
+ ++RegNo) {
setRegScore(RegNo + NUM_ALL_VGPRS, t, CurrScore);
}
#endif
@@ -642,7 +661,6 @@ void BlockWaitcntBrackets::print(raw_ostream &OS) {
OS << '\n';
}
OS << '\n';
- return;
}
unsigned int BlockWaitcntBrackets::updateByWait(InstCounterType T,
@@ -860,7 +878,7 @@ MachineInstr *SIInsertWaitcnts::generateSWaitCntInstBefore(
switch (src_type) {
case SCMEM_LDS:
if (group_is_multi_wave ||
- context->OptFlagIsOn(OPT_R1100_LDSMEM_FENCE_CHICKEN_BIT)) {
+ context->OptFlagIsOn(OPT_R1100_LDSMEM_FENCE_CHICKEN_BIT)) {
EmitSwaitcnt |= ScoreBrackets->updateByWait(LGKM_CNT,
ScoreBrackets->getScoreUB(LGKM_CNT));
// LDS may have to wait for VM_CNT after buffer load to LDS
@@ -874,9 +892,9 @@ MachineInstr *SIInsertWaitcnts::generateSWaitCntInstBefore(
case SCMEM_GDS:
if (group_is_multi_wave || fence_is_global) {
EmitSwaitcnt |= ScoreBrackets->updateByWait(EXP_CNT,
- ScoreBrackets->getScoreUB(EXP_CNT));
+ ScoreBrackets->getScoreUB(EXP_CNT));
EmitSwaitcnt |= ScoreBrackets->updateByWait(LGKM_CNT,
- ScoreBrackets->getScoreUB(LGKM_CNT));
+ ScoreBrackets->getScoreUB(LGKM_CNT));
}
break;
@@ -886,9 +904,9 @@ MachineInstr *SIInsertWaitcnts::generateSWaitCntInstBefore(
case SCMEM_SCATTER:
if (group_is_multi_wave || fence_is_global) {
EmitSwaitcnt |= ScoreBrackets->updateByWait(EXP_CNT,
- ScoreBrackets->getScoreUB(EXP_CNT));
+ ScoreBrackets->getScoreUB(EXP_CNT));
EmitSwaitcnt |= ScoreBrackets->updateByWait(VM_CNT,
- ScoreBrackets->getScoreUB(VM_CNT));
+ ScoreBrackets->getScoreUB(VM_CNT));
}
break;
@@ -927,13 +945,14 @@ MachineInstr *SIInsertWaitcnts::generateSWaitCntInstBefore(
// before the call.
if (MI.getOpcode() == SC_CALL) {
if (ScoreBrackets->getScoreUB(EXP_CNT) >
- ScoreBrackets->getScoreLB(EXP_CNT)) {
+ ScoreBrackets->getScoreLB(EXP_CNT)) {
ScoreBrackets->setScoreLB(EXP_CNT, ScoreBrackets->getScoreUB(EXP_CNT));
EmitSwaitcnt |= CNT_MASK(EXP_CNT);
}
}
#endif
+ // FIXME: Should not be relying on memoperands.
// Look at the source operands of every instruction to see if
// any of them results from a previous memory operation that affects
// its current usage. If so, an s_waitcnt instruction needs to be
@@ -949,6 +968,7 @@ MachineInstr *SIInsertWaitcnts::generateSWaitCntInstBefore(
EmitSwaitcnt |= ScoreBrackets->updateByWait(
VM_CNT, ScoreBrackets->getRegScore(RegNo, VM_CNT));
}
+
for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
const MachineOperand &Op = MI.getOperand(I);
const MachineRegisterInfo &MRIA = *MRI;
@@ -973,6 +993,7 @@ MachineInstr *SIInsertWaitcnts::generateSWaitCntInstBefore(
// 2) If a destination operand that was used by a recent export/store ins,
// add s_waitcnt on exp_cnt to guarantee the WAR order.
if (MI.mayStore()) {
+ // FIXME: Should not be relying on memoperands.
for (const MachineMemOperand *Memop : MI.memoperands()) {
unsigned AS = Memop->getAddrSpace();
if (AS != AMDGPUASI.LOCAL_ADDRESS)
@@ -1094,7 +1115,8 @@ MachineInstr *SIInsertWaitcnts::generateSWaitCntInstBefore(
BlockWaitcntBracketsMap[TBB].get();
if (!ScoreBracket) {
assert(BlockVisitedSet.find(TBB) == BlockVisitedSet.end());
- BlockWaitcntBracketsMap[TBB] = make_unique<BlockWaitcntBrackets>();
+ BlockWaitcntBracketsMap[TBB] =
+ llvm::make_unique<BlockWaitcntBrackets>();
ScoreBracket = BlockWaitcntBracketsMap[TBB].get();
}
ScoreBracket->setRevisitLoop(true);
@@ -1141,8 +1163,21 @@ void SIInsertWaitcnts::insertWaitcntBeforeCF(MachineBasicBlock &MBB,
} else {
MBB.push_back(Waitcnt);
}
+}
+
+// This is a flat memory operation. Conservatively return true if it may
+// access LDS: either it carries an LDS- or flat-addressed memory operand,
+// or it has no memory operands at all.
+bool SIInsertWaitcnts::mayAccessLDSThroughFlat(const MachineInstr &MI) const {
+ if (MI.memoperands_empty())
+ return true;
- return;
+ for (const MachineMemOperand *Memop : MI.memoperands()) {
+ unsigned AS = Memop->getAddrSpace();
+ if (AS == AMDGPUASI.LOCAL_ADDRESS || AS == AMDGPUASI.FLAT_ADDRESS)
+ return true;
+ }
+
+ return false;
}
void SIInsertWaitcnts::updateEventWaitCntAfter(
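A minimal standalone sketch of the check that the new mayAccessLDSThroughFlat helper above performs, using an illustrative AddressSpace enum rather than the real AMDGPUAS values: a flat access with no memory operands is conservatively assumed to touch LDS, otherwise any LDS- or flat-addressed operand makes the answer true.

    #include <cassert>
    #include <vector>

    // Illustrative stand-ins for the target's address-space numbers; the real
    // values come from AMDGPUAS and are not hard-coded here.
    enum class AddressSpace { Global, Local, Flat, Private, Constant };

    // With no memory operands we must assume the worst; otherwise look for an
    // LDS (Local) or flat-addressed operand.
    bool mayAccessLDSThroughFlat(const std::vector<AddressSpace> &MemOperandAS) {
      if (MemOperandAS.empty())
        return true;                       // no information -> may hit LDS
      for (AddressSpace AS : MemOperandAS)
        if (AS == AddressSpace::Local || AS == AddressSpace::Flat)
          return true;
      return false;
    }

    int main() {
      assert(mayAccessLDSThroughFlat({}));                      // unknown
      assert(mayAccessLDSThroughFlat({AddressSpace::Flat}));    // flat pointer
      assert(!mayAccessLDSThroughFlat({AddressSpace::Global})); // global only
      return 0;
    }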
@@ -1151,10 +1186,8 @@ void SIInsertWaitcnts::updateEventWaitCntAfter(
// instruction, update the upper-bound of the appropriate counter's
// bracket and the destination operand scores.
// TODO: Use the (TSFlags & SIInstrFlags::LGKM_CNT) property everywhere.
- uint64_t TSFlags = Inst.getDesc().TSFlags;
- if (TII->isDS(Inst) && (TSFlags & SIInstrFlags::LGKM_CNT)) {
- if (TII->getNamedOperand(Inst, AMDGPU::OpName::gds) &&
- TII->getNamedOperand(Inst, AMDGPU::OpName::gds)->getImm() != 0) {
+ if (TII->isDS(Inst) && TII->usesLGKM_CNT(Inst)) {
+ if (TII->hasModifiersSet(Inst, AMDGPU::OpName::gds)) {
ScoreBrackets->updateByEvent(TII, TRI, MRI, GDS_ACCESS, Inst);
ScoreBrackets->updateByEvent(TII, TRI, MRI, GDS_GPR_LOCK, Inst);
} else {
@@ -1162,23 +1195,18 @@ void SIInsertWaitcnts::updateEventWaitCntAfter(
}
} else if (TII->isFLAT(Inst)) {
assert(Inst.mayLoad() || Inst.mayStore());
- ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_ACCESS, Inst);
- ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst);
- // This is a flat memory operation. Check to see if it has memory
- // tokens for both LDS and Memory, and if so mark it as a flat.
- bool FoundLDSMem = false;
- for (const MachineMemOperand *Memop : Inst.memoperands()) {
- unsigned AS = Memop->getAddrSpace();
- if (AS == AMDGPUASI.LOCAL_ADDRESS || AS == AMDGPUASI.FLAT_ADDRESS)
- FoundLDSMem = true;
- }
+ if (TII->usesVM_CNT(Inst))
+ ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_ACCESS, Inst);
+
+ if (TII->usesLGKM_CNT(Inst)) {
+ ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst);
- // This is a flat memory operation, so note it - it will require
- // that both the VM and LGKM be flushed to zero if it is pending when
- // a VM or LGKM dependency occurs.
- if (FoundLDSMem) {
- ScoreBrackets->setPendingFlat();
+ // This is a flat memory operation, so note it - it will require
+ // that both the VM and LGKM be flushed to zero if it is pending when
+ // a VM or LGKM dependency occurs.
+ if (mayAccessLDSThroughFlat(Inst))
+ ScoreBrackets->setPendingFlat();
}
} else if (SIInstrInfo::isVMEM(Inst) &&
// TODO: get a better carve out.
@@ -1241,7 +1269,7 @@ void SIInsertWaitcnts::mergeInputScoreBrackets(MachineBasicBlock &Block) {
BlockWaitcntBracketsMap[pred].get();
bool Visited = BlockVisitedSet.find(pred) != BlockVisitedSet.end();
if (!Visited || PredScoreBrackets->getWaitAtBeginning()) {
- break;
+ continue;
}
for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
T = (enum InstCounterType)(T + 1)) {
@@ -1280,7 +1308,7 @@ void SIInsertWaitcnts::mergeInputScoreBrackets(MachineBasicBlock &Block) {
BlockWaitcntBracketsMap[Pred].get();
bool Visited = BlockVisitedSet.find(Pred) != BlockVisitedSet.end();
if (!Visited || PredScoreBrackets->getWaitAtBeginning()) {
- break;
+ continue;
}
int GDSSpan = PredScoreBrackets->getEventUB(GDS_GPR_LOCK) -
@@ -1327,7 +1355,7 @@ void SIInsertWaitcnts::mergeInputScoreBrackets(MachineBasicBlock &Block) {
// Set the register scoreboard.
for (MachineBasicBlock *Pred : Block.predecessors()) {
if (BlockVisitedSet.find(Pred) == BlockVisitedSet.end()) {
- break;
+ continue;
}
BlockWaitcntBrackets *PredScoreBrackets =
@@ -1441,7 +1469,7 @@ void SIInsertWaitcnts::mergeInputScoreBrackets(MachineBasicBlock &Block) {
// the delayed nature of these operations.
for (MachineBasicBlock *Pred : Block.predecessors()) {
if (BlockVisitedSet.find(Pred) == BlockVisitedSet.end()) {
- break;
+ continue;
}
BlockWaitcntBrackets *PredScoreBrackets =
@@ -1494,8 +1522,6 @@ void SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
ScoreBrackets->dump();
});
- bool InsertNOP = false;
-
// Walk over the instructions.
for (MachineBasicBlock::iterator Iter = Block.begin(), E = Block.end();
Iter != E;) {
@@ -1555,7 +1581,7 @@ void SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
if (RequireCheckResourceType(Inst, context)) {
// Force the score to as if an S_WAITCNT vmcnt(0) is emitted.
ScoreBrackets->setScoreLB(VM_CNT,
- ScoreBrackets->getScoreUB(VM_CNT));
+ ScoreBrackets->getScoreUB(VM_CNT));
}
#endif
@@ -1596,58 +1622,6 @@ void SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
VCCZBugHandledSet.insert(&Inst);
}
- if (ST->getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
-
- // This avoids a s_nop after a waitcnt has just been inserted.
- if (!SWaitInst && InsertNOP) {
- BuildMI(Block, Inst, DebugLoc(), TII->get(AMDGPU::S_NOP)).addImm(0);
- }
- InsertNOP = false;
-
- // Any occurrence of consecutive VMEM or SMEM instructions forms a VMEM
- // or SMEM clause, respectively.
- //
- // The temporary workaround is to break the clauses with S_NOP.
- //
- // The proper solution would be to allocate registers such that all source
- // and destination registers don't overlap, e.g. this is illegal:
- // r0 = load r2
- // r2 = load r0
- bool IsSMEM = false;
- bool IsVMEM = false;
- if (TII->isSMRD(Inst))
- IsSMEM = true;
- else if (TII->usesVM_CNT(Inst))
- IsVMEM = true;
-
- ++Iter;
- if (Iter == E)
- break;
-
- MachineInstr &Next = *Iter;
-
- // TODO: How about consecutive SMEM instructions?
- // The comments above says break the clause but the code does not.
- // if ((TII->isSMRD(next) && isSMEM) ||
- if (!IsSMEM && TII->usesVM_CNT(Next) && IsVMEM &&
- // TODO: Enable this check when hasSoftClause is upstreamed.
- // ST->hasSoftClauses() &&
- ST->isXNACKEnabled()) {
- // Insert a NOP to break the clause.
- InsertNOP = true;
- continue;
- }
-
- // There must be "S_NOP 0" between an instruction writing M0 and
- // S_SENDMSG.
- if ((Next.getOpcode() == AMDGPU::S_SENDMSG ||
- Next.getOpcode() == AMDGPU::S_SENDMSGHALT) &&
- Inst.definesRegister(AMDGPU::M0))
- InsertNOP = true;
-
- continue;
- }
-
++Iter;
}
@@ -1752,13 +1726,13 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
BlockWaitcntBrackets *ScoreBrackets = BlockWaitcntBracketsMap[&MBB].get();
if (!ScoreBrackets) {
- BlockWaitcntBracketsMap[&MBB] = make_unique<BlockWaitcntBrackets>();
+ BlockWaitcntBracketsMap[&MBB] = llvm::make_unique<BlockWaitcntBrackets>();
ScoreBrackets = BlockWaitcntBracketsMap[&MBB].get();
}
ScoreBrackets->setPostOrder(MBB.getNumber());
MachineLoop *ContainingLoop = MLI->getLoopFor(&MBB);
if (ContainingLoop && LoopWaitcntDataMap[ContainingLoop] == nullptr)
- LoopWaitcntDataMap[ContainingLoop] = make_unique<LoopWaitcntData>();
+ LoopWaitcntDataMap[ContainingLoop] = llvm::make_unique<LoopWaitcntData>();
// If we are walking into the block from before the loop, then guarantee
// at least 1 re-walk over the loop to propagate the information, even if
@@ -1819,12 +1793,10 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); BI != BE;
++BI) {
-
MachineBasicBlock &MBB = *BI;
for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;
++I) {
-
if (!HaveScalarStores && TII->isScalarStore(*I))
HaveScalarStores = true;
@@ -1847,7 +1819,6 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); I != E;
++I) {
-
if (I->getOpcode() == AMDGPU::S_DCACHE_WB)
SeenDCacheWB = true;
else if (TII->isScalarStore(*I))
diff --git a/lib/Target/AMDGPU/SIInsertWaits.cpp b/lib/Target/AMDGPU/SIInsertWaits.cpp
index bc86515d8b1f..b074b95c2d3c 100644
--- a/lib/Target/AMDGPU/SIInsertWaits.cpp
+++ b/lib/Target/AMDGPU/SIInsertWaits.cpp
@@ -1,4 +1,4 @@
-//===-- SILowerControlFlow.cpp - Use predicates for control flow ----------===//
+//===- SILowerControlFlow.cpp - Use predicates for control flow -----------===//
//
// The LLVM Compiler Infrastructure
//
@@ -33,15 +33,14 @@
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/IR/DebugLoc.h"
+#include "llvm/MC/MCInstrDesc.h"
#include "llvm/Pass.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetRegisterInfo.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <cstring>
-#include <new>
#include <utility>
#define DEBUG_TYPE "si-insert-waits"
@@ -51,23 +50,23 @@ using namespace llvm;
namespace {
/// \brief One variable for each of the hardware counters
-typedef union {
+using Counters = union {
struct {
unsigned VM;
unsigned EXP;
unsigned LGKM;
} Named;
unsigned Array[3];
-} Counters;
+};
-typedef enum {
+using InstType = enum {
OTHER,
SMEM,
VMEM
-} InstType;
+};
-typedef Counters RegCounters[512];
-typedef std::pair<unsigned, unsigned> RegInterval;
+using RegCounters = Counters[512];
+using RegInterval = std::pair<unsigned, unsigned>;
class SIInsertWaits : public MachineFunctionPass {
private:
@@ -409,7 +408,6 @@ bool SIInsertWaits::insertWait(MachineBasicBlock &MBB,
// Adjust the value to the real hardware possibilities.
Counts.Array[i] = std::min(Value, HardwareLimits.Array[i]);
-
} else
Counts.Array[i] = 0;
@@ -568,12 +566,10 @@ bool SIInsertWaits::runOnMachineFunction(MachineFunction &MF) {
for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
BI != BE; ++BI) {
-
MachineBasicBlock &MBB = *BI;
for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
I != E; ++I) {
-
if (!HaveScalarStores && TII->isScalarStore(*I))
HaveScalarStores = true;
@@ -671,7 +667,6 @@ bool SIInsertWaits::runOnMachineFunction(MachineFunction &MF) {
for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end();
I != E; ++I) {
-
if (I->getOpcode() == AMDGPU::S_DCACHE_WB)
SeenDCacheWB = true;
else if (TII->isScalarStore(*I))
diff --git a/lib/Target/AMDGPU/SIInstrFormats.td b/lib/Target/AMDGPU/SIInstrFormats.td
index 02c9b4b1f0ee..25917cc06e6a 100644
--- a/lib/Target/AMDGPU/SIInstrFormats.td
+++ b/lib/Target/AMDGPU/SIInstrFormats.td
@@ -11,9 +11,18 @@
//
//===----------------------------------------------------------------------===//
+def isGCN : Predicate<"Subtarget->getGeneration() "
+ ">= SISubtarget::SOUTHERN_ISLANDS">,
+ AssemblerPredicate<"FeatureGCN">;
+def isSI : Predicate<"Subtarget->getGeneration() "
+ "== SISubtarget::SOUTHERN_ISLANDS">,
+ AssemblerPredicate<"FeatureSouthernIslands">;
+
+
class InstSI <dag outs, dag ins, string asm = "",
list<dag> pattern = []> :
AMDGPUInst<outs, ins, asm, pattern>, PredicateControl {
+ let SubtargetPredicate = isGCN;
// Low bits - basic encoding information.
field bit SALU = 0;
@@ -45,7 +54,7 @@ class InstSI <dag outs, dag ins, string asm = "",
field bit FLAT = 0;
field bit DS = 0;
- // Pseudo instruction formats.
+ // Pseudo instruction formats.
field bit VGPRSpill = 0;
field bit SGPRSpill = 0;
@@ -79,10 +88,36 @@ class InstSI <dag outs, dag ins, string asm = "",
// is unable to infer the encoding from the operands.
field bit VOPAsmPrefer32Bit = 0;
+ // This bit indicates that this is a VOP3 opcode which supports op_sel
+ // modifier (gfx9 only).
+ field bit VOP3_OPSEL = 0;
+
+ // Is it possible for this instruction to be atomic?
+ field bit maybeAtomic = 0;
+
+ // This bit indicates that this is a VI instruction which is renamed
+ // in GFX9. Required for correct mapping from pseudo to MC.
+ field bit renamedInGFX9 = 0;
+
// This bit indicates that this has a floating point result type, so
// the clamp modifier has floating point semantics.
field bit FPClamp = 0;
+ // This bit indicates that instruction may support integer clamping
+ // which depends on GPU features.
+ field bit IntClamp = 0;
+
+ // This field indicates that the clamp applies to the low component
+ // of a packed output register.
+ field bit ClampLo = 0;
+
+ // This field indicates that the clamp applies to the high component
+ // of a packed output register.
+ field bit ClampHi = 0;
+
+ // This bit indicates that this is a packed VOP3P instruction
+ field bit IsPacked = 0;
+
// These need to be kept in sync with the enum in SIInstrFlags.
let TSFlags{0} = SALU;
let TSFlags{1} = VALU;
@@ -126,7 +161,17 @@ class InstSI <dag outs, dag ins, string asm = "",
let TSFlags{39} = ScalarStore;
let TSFlags{40} = FixedSize;
let TSFlags{41} = VOPAsmPrefer32Bit;
- let TSFlags{42} = FPClamp;
+ let TSFlags{42} = VOP3_OPSEL;
+
+ let TSFlags{43} = maybeAtomic;
+ let TSFlags{44} = renamedInGFX9;
+
+ let TSFlags{45} = FPClamp;
+ let TSFlags{46} = IntClamp;
+ let TSFlags{47} = ClampLo;
+ let TSFlags{48} = ClampHi;
+
+ let TSFlags{49} = IsPacked;
let SchedRW = [Write32Bit];
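The new TableGen fields above are only names for individual TSFlags bits; on the C++ side each property is a single mask test against the 64-bit instruction descriptor flags. A self-contained sketch of that pattern, with an illustrative enum that mirrors the bit positions assigned above (the real masks live in SIInstrFlags and must stay in sync with the .td file):

    #include <cassert>
    #include <cstdint>

    // Illustrative masks following the bit layout above, not the real enum.
    namespace FakeSIInstrFlags {
    constexpr uint64_t VOP3_OPSEL    = UINT64_C(1) << 42;
    constexpr uint64_t maybeAtomic   = UINT64_C(1) << 43;
    constexpr uint64_t renamedInGFX9 = UINT64_C(1) << 44;
    constexpr uint64_t FPClamp       = UINT64_C(1) << 45;
    constexpr uint64_t IntClamp      = UINT64_C(1) << 46;
    constexpr uint64_t ClampLo       = UINT64_C(1) << 47;
    constexpr uint64_t ClampHi       = UINT64_C(1) << 48;
    constexpr uint64_t IsPacked      = UINT64_C(1) << 49;
    } // namespace FakeSIInstrFlags

    // Testing a property is one mask, the same shape as the hasIntClamp and
    // getClampMask helpers added to SIInstrInfo.h later in this patch.
    bool hasIntClamp(uint64_t TSFlags) {
      return (TSFlags & FakeSIInstrFlags::IntClamp) != 0;
    }

    int main() {
      uint64_t Flags = FakeSIInstrFlags::IntClamp | FakeSIInstrFlags::IsPacked;
      assert(hasIntClamp(Flags));
      assert((Flags & FakeSIInstrFlags::FPClamp) == 0);
      return 0;
    }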
diff --git a/lib/Target/AMDGPU/SIInstrInfo.cpp b/lib/Target/AMDGPU/SIInstrInfo.cpp
index a7e0feb10b9f..61967605432e 100644
--- a/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -1,4 +1,4 @@
-//===-- SIInstrInfo.cpp - SI Instruction Information ---------------------===//
+//===- SIInstrInfo.cpp - SI Instruction Information ----------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -13,19 +13,52 @@
//===----------------------------------------------------------------------===//
#include "SIInstrInfo.h"
-#include "AMDGPUTargetMachine.h"
+#include "AMDGPU.h"
+#include "AMDGPUSubtarget.h"
#include "GCNHazardRecognizer.h"
#include "SIDefines.h"
#include "SIMachineFunctionInfo.h"
+#include "SIRegisterInfo.h"
+#include "Utils/AMDGPUBaseInfo.h"
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/iterator_range.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/MemoryLocation.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineInstrBundle.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/MachineValueType.h"
#include "llvm/CodeGen/RegisterScavenging.h"
#include "llvm/CodeGen/ScheduleDAG.h"
+#include "llvm/CodeGen/SelectionDAGNodes.h"
+#include "llvm/CodeGen/TargetOpcodes.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Function.h"
+#include "llvm/IR/InlineAsm.h"
+#include "llvm/IR/LLVMContext.h"
#include "llvm/MC/MCInstrDesc.h"
-#include "llvm/Support/Debug.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Target/TargetMachine.h"
+#include <cassert>
+#include <cstdint>
+#include <iterator>
+#include <utility>
using namespace llvm;
@@ -305,26 +338,77 @@ bool SIInstrInfo::getMemOpBaseRegImmOfs(MachineInstr &LdSt, unsigned &BaseReg,
}
if (isFLAT(LdSt)) {
- const MachineOperand *AddrReg = getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
- BaseReg = AddrReg->getReg();
- Offset = 0;
+ const MachineOperand *VAddr = getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
+ if (VAddr) {
+ // Can't analyze 2 offsets.
+ if (getNamedOperand(LdSt, AMDGPU::OpName::saddr))
+ return false;
+
+ BaseReg = VAddr->getReg();
+ } else {
+ // scratch instructions have either vaddr or saddr.
+ BaseReg = getNamedOperand(LdSt, AMDGPU::OpName::saddr)->getReg();
+ }
+
+ Offset = getNamedOperand(LdSt, AMDGPU::OpName::offset)->getImm();
return true;
}
return false;
}
+static bool memOpsHaveSameBasePtr(const MachineInstr &MI1, unsigned BaseReg1,
+ const MachineInstr &MI2, unsigned BaseReg2) {
+ if (BaseReg1 == BaseReg2)
+ return true;
+
+ if (!MI1.hasOneMemOperand() || !MI2.hasOneMemOperand())
+ return false;
+
+ auto MO1 = *MI1.memoperands_begin();
+ auto MO2 = *MI2.memoperands_begin();
+ if (MO1->getAddrSpace() != MO2->getAddrSpace())
+ return false;
+
+ auto Base1 = MO1->getValue();
+ auto Base2 = MO2->getValue();
+ if (!Base1 || !Base2)
+ return false;
+ const MachineFunction &MF = *MI1.getParent()->getParent();
+ const DataLayout &DL = MF.getFunction().getParent()->getDataLayout();
+ Base1 = GetUnderlyingObject(Base1, DL);
+ Base2 = GetUnderlyingObject(Base2, DL);
+
+ if (isa<UndefValue>(Base1) || isa<UndefValue>(Base2))
+ return false;
+
+ return Base1 == Base2;
+}
+
bool SIInstrInfo::shouldClusterMemOps(MachineInstr &FirstLdSt,
+ unsigned BaseReg1,
MachineInstr &SecondLdSt,
+ unsigned BaseReg2,
unsigned NumLoads) const {
+ if (!memOpsHaveSameBasePtr(FirstLdSt, BaseReg1, SecondLdSt, BaseReg2))
+ return false;
+
const MachineOperand *FirstDst = nullptr;
const MachineOperand *SecondDst = nullptr;
if ((isMUBUF(FirstLdSt) && isMUBUF(SecondLdSt)) ||
(isMTBUF(FirstLdSt) && isMTBUF(SecondLdSt)) ||
(isFLAT(FirstLdSt) && isFLAT(SecondLdSt))) {
+ const unsigned MaxGlobalLoadCluster = 6;
+ if (NumLoads > MaxGlobalLoadCluster)
+ return false;
+
FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::vdata);
+ if (!FirstDst)
+ FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::vdst);
SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::vdata);
+ if (!SecondDst)
+ SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::vdst);
} else if (isSMRD(FirstLdSt) && isSMRD(SecondLdSt)) {
FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::sdst);
SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::sdst);
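A standalone model of the same-base test that the new memOpsHaveSameBasePtr helper feeds into shouldClusterMemOps, assuming a toy memory operand that records only its address space and the id of its underlying IR object (names and types below are illustrative, not LLVM's): two accesses share a base if their base registers match, or if each has exactly one memory operand in the same address space whose underlying objects are the same known, non-undef value.

    #include <cassert>
    #include <optional>

    // Toy memory operand: address space plus the id of the underlying IR
    // object; nullopt stands in for an unknown or undef base value.
    struct MemOp {
      unsigned AddrSpace;
      std::optional<int> UnderlyingObject;
    };

    bool memOpsHaveSameBase(unsigned BaseReg1, const std::optional<MemOp> &MO1,
                            unsigned BaseReg2, const std::optional<MemOp> &MO2) {
      if (BaseReg1 == BaseReg2)
        return true;                        // trivially the same base register
      if (!MO1 || !MO2)
        return false;                       // need exactly one memop per side
      if (MO1->AddrSpace != MO2->AddrSpace)
        return false;
      if (!MO1->UnderlyingObject || !MO2->UnderlyingObject)
        return false;                       // unknown or undef base
      return *MO1->UnderlyingObject == *MO2->UnderlyingObject;
    }

    int main() {
      MemOp A{1, 7}, B{1, 7}, C{1, 9}, U{1, std::nullopt};
      assert(memOpsHaveSameBase(10, A, 10, C));  // same base register
      assert(memOpsHaveSameBase(10, A, 11, B));  // same underlying object
      assert(!memOpsHaveSameBase(10, A, 11, C)); // different objects
      assert(!memOpsHaveSameBase(10, A, 11, U)); // unknown base
      return 0;
    }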
@@ -358,10 +442,10 @@ static void reportIllegalCopy(const SIInstrInfo *TII, MachineBasicBlock &MBB,
const DebugLoc &DL, unsigned DestReg,
unsigned SrcReg, bool KillSrc) {
MachineFunction *MF = MBB.getParent();
- DiagnosticInfoUnsupported IllegalCopy(*MF->getFunction(),
+ DiagnosticInfoUnsupported IllegalCopy(MF->getFunction(),
"illegal SGPR to VGPR copy",
DL, DS_Error);
- LLVMContext &C = MF->getFunction()->getContext();
+ LLVMContext &C = MF->getFunction().getContext();
C.diagnose(IllegalCopy);
BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_ILLEGAL_COPY), DestReg)
@@ -452,7 +536,6 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
}
}
-
ArrayRef<int16_t> SubIndices = RI.getRegSplitParts(RC, EltSize);
bool Forward = RI.getHWRegIndex(DestReg) <= RI.getHWRegIndex(SrcReg);
@@ -566,15 +649,18 @@ void SIInstrInfo::insertVectorSelect(MachineBasicBlock &MBB,
"Not a VGPR32 reg");
if (Cond.size() == 1) {
+ unsigned SReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
+ BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
+ .add(Cond[0]);
BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
.addReg(FalseReg)
.addReg(TrueReg)
- .add(Cond[0]);
+ .addReg(SReg);
} else if (Cond.size() == 2) {
assert(Cond[0].isImm() && "Cond[0] is not an immediate");
switch (Cond[0].getImm()) {
case SIInstrInfo::SCC_TRUE: {
- unsigned SReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
+ unsigned SReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), SReg)
.addImm(-1)
.addImm(0);
@@ -585,7 +671,7 @@ void SIInstrInfo::insertVectorSelect(MachineBasicBlock &MBB,
break;
}
case SIInstrInfo::SCC_FALSE: {
- unsigned SReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
+ unsigned SReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), SReg)
.addImm(0)
.addImm(-1);
@@ -598,23 +684,29 @@ void SIInstrInfo::insertVectorSelect(MachineBasicBlock &MBB,
case SIInstrInfo::VCCNZ: {
MachineOperand RegOp = Cond[1];
RegOp.setImplicit(false);
+ unsigned SReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
+ BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
+ .add(RegOp);
BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
.addReg(FalseReg)
.addReg(TrueReg)
- .add(RegOp);
+ .addReg(SReg);
break;
}
case SIInstrInfo::VCCZ: {
MachineOperand RegOp = Cond[1];
RegOp.setImplicit(false);
+ unsigned SReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
+ BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
+ .add(RegOp);
BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
.addReg(TrueReg)
.addReg(FalseReg)
- .add(RegOp);
+ .addReg(SReg);
break;
}
case SIInstrInfo::EXECNZ: {
- unsigned SReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
+ unsigned SReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
unsigned SReg2 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
BuildMI(MBB, I, DL, get(AMDGPU::S_OR_SAVEEXEC_B64), SReg2)
.addImm(0);
@@ -628,7 +720,7 @@ void SIInstrInfo::insertVectorSelect(MachineBasicBlock &MBB,
break;
}
case SIInstrInfo::EXECZ: {
- unsigned SReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
+ unsigned SReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
unsigned SReg2 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
BuildMI(MBB, I, DL, get(AMDGPU::S_OR_SAVEEXEC_B64), SReg2)
.addImm(0);
@@ -735,6 +827,10 @@ void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
MachineFrameInfo &FrameInfo = MF->getFrameInfo();
DebugLoc DL = MBB.findDebugLoc(MI);
+ assert(SrcReg != MFI->getStackPtrOffsetReg() &&
+ SrcReg != MFI->getFrameOffsetReg() &&
+ SrcReg != MFI->getScratchWaveOffsetReg());
+
unsigned Size = FrameInfo.getObjectSize(FrameIndex);
unsigned Align = FrameInfo.getObjectAlignment(FrameIndex);
MachinePointerInfo PtrInfo
@@ -768,6 +864,7 @@ void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
// needing them, and need to ensure that the reserved registers are
// correctly handled.
+ FrameInfo.setStackID(FrameIndex, 1);
if (ST.hasScalarStores()) {
// m0 is used for offset to scalar stores if used to spill.
Spill.addReg(AMDGPU::M0, RegState::ImplicitDefine | RegState::Dead);
@@ -776,8 +873,8 @@ void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
return;
}
- if (!ST.isVGPRSpillingEnabled(*MF->getFunction())) {
- LLVMContext &Ctx = MF->getFunction()->getContext();
+ if (!ST.isVGPRSpillingEnabled(MF->getFunction())) {
+ LLVMContext &Ctx = MF->getFunction().getContext();
Ctx.emitError("SIInstrInfo::storeRegToStackSlot - Do not know how to"
" spill register");
BuildMI(MBB, MI, DL, get(AMDGPU::KILL))
@@ -863,6 +960,7 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
MRI.constrainRegClass(DestReg, &AMDGPU::SReg_32_XM0RegClass);
}
+ FrameInfo.setStackID(FrameIndex, 1);
MachineInstrBuilder Spill = BuildMI(MBB, MI, DL, OpDesc, DestReg)
.addFrameIndex(FrameIndex) // addr
.addMemOperand(MMO)
@@ -877,8 +975,8 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
return;
}
- if (!ST.isVGPRSpillingEnabled(*MF->getFunction())) {
- LLVMContext &Ctx = MF->getFunction()->getContext();
+ if (!ST.isVGPRSpillingEnabled(MF->getFunction())) {
+ LLVMContext &Ctx = MF->getFunction().getContext();
Ctx.emitError("SIInstrInfo::loadRegFromStackSlot - Do not know how to"
" restore register");
BuildMI(MBB, MI, DL, get(AMDGPU::IMPLICIT_DEF), DestReg);
@@ -904,7 +1002,6 @@ unsigned SIInstrInfo::calculateLDSSpillAddress(
MachineFunction *MF = MBB.getParent();
SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
const SISubtarget &ST = MF->getSubtarget<SISubtarget>();
- const SIRegisterInfo *TRI = ST.getRegisterInfo();
DebugLoc DL = MBB.findDebugLoc(MI);
unsigned WorkGroupSize = MFI->getMaxFlatWorkGroupSize();
unsigned WavefrontSize = ST.getWavefrontSize();
@@ -920,17 +1017,16 @@ unsigned SIInstrInfo::calculateLDSSpillAddress(
if (TIDReg == AMDGPU::NoRegister)
return TIDReg;
- if (!AMDGPU::isShader(MF->getFunction()->getCallingConv()) &&
+ if (!AMDGPU::isShader(MF->getFunction().getCallingConv()) &&
WorkGroupSize > WavefrontSize) {
-
unsigned TIDIGXReg
- = TRI->getPreloadedValue(*MF, SIRegisterInfo::WORKGROUP_ID_X);
+ = MFI->getPreloadedReg(AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
unsigned TIDIGYReg
- = TRI->getPreloadedValue(*MF, SIRegisterInfo::WORKGROUP_ID_Y);
+ = MFI->getPreloadedReg(AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
unsigned TIDIGZReg
- = TRI->getPreloadedValue(*MF, SIRegisterInfo::WORKGROUP_ID_Z);
+ = MFI->getPreloadedReg(AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
unsigned InputPtrReg =
- TRI->getPreloadedValue(*MF, SIRegisterInfo::KERNARG_SEGMENT_PTR);
+ MFI->getPreloadedReg(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
for (unsigned Reg : {TIDIGXReg, TIDIGYReg, TIDIGZReg}) {
if (!Entry.isLiveIn(Reg))
Entry.addLiveIn(Reg);
@@ -961,9 +1057,9 @@ unsigned SIInstrInfo::calculateLDSSpillAddress(
.addReg(TIDIGYReg)
.addReg(TIDReg);
// (NGROUPS.Z * TIDIG.Y + (NGROUPS.X * NGROUPS.Y * TIDIG.X)) + TIDIG.Z
- BuildMI(Entry, Insert, DL, get(AMDGPU::V_ADD_I32_e32), TIDReg)
- .addReg(TIDReg)
- .addReg(TIDIGZReg);
+ getAddNoCarry(Entry, Insert, DL, TIDReg)
+ .addReg(TIDReg)
+ .addReg(TIDIGZReg);
} else {
// Get the wave id
BuildMI(Entry, Insert, DL, get(AMDGPU::V_MBCNT_LO_U32_B32_e64),
@@ -986,9 +1082,9 @@ unsigned SIInstrInfo::calculateLDSSpillAddress(
// Add FrameIndex to LDS offset
unsigned LDSOffset = MFI->getLDSSize() + (FrameOffset * WorkGroupSize);
- BuildMI(MBB, MI, DL, get(AMDGPU::V_ADD_I32_e32), TmpReg)
- .addImm(LDSOffset)
- .addReg(TIDReg);
+ getAddNoCarry(MBB, MI, DL, TmpReg)
+ .addImm(LDSOffset)
+ .addReg(TIDReg);
return TmpReg;
}
@@ -1042,24 +1138,24 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
DebugLoc DL = MBB.findDebugLoc(MI);
switch (MI.getOpcode()) {
default: return AMDGPUInstrInfo::expandPostRAPseudo(MI);
- case AMDGPU::S_MOV_B64_term: {
+ case AMDGPU::S_MOV_B64_term:
// This is only a terminator to get the correct spill code placement during
// register allocation.
MI.setDesc(get(AMDGPU::S_MOV_B64));
break;
- }
- case AMDGPU::S_XOR_B64_term: {
+
+ case AMDGPU::S_XOR_B64_term:
// This is only a terminator to get the correct spill code placement during
// register allocation.
MI.setDesc(get(AMDGPU::S_XOR_B64));
break;
- }
- case AMDGPU::S_ANDN2_B64_term: {
+
+ case AMDGPU::S_ANDN2_B64_term:
// This is only a terminator to get the correct spill code placement during
// register allocation.
MI.setDesc(get(AMDGPU::S_ANDN2_B64));
break;
- }
+
case AMDGPU::V_MOV_B64_PSEUDO: {
unsigned Dst = MI.getOperand(0).getReg();
unsigned DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
@@ -1088,6 +1184,28 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
MI.eraseFromParent();
break;
}
+ case AMDGPU::V_SET_INACTIVE_B32: {
+ BuildMI(MBB, MI, DL, get(AMDGPU::S_NOT_B64), AMDGPU::EXEC)
+ .addReg(AMDGPU::EXEC);
+ BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), MI.getOperand(0).getReg())
+ .add(MI.getOperand(2));
+ BuildMI(MBB, MI, DL, get(AMDGPU::S_NOT_B64), AMDGPU::EXEC)
+ .addReg(AMDGPU::EXEC);
+ MI.eraseFromParent();
+ break;
+ }
+ case AMDGPU::V_SET_INACTIVE_B64: {
+ BuildMI(MBB, MI, DL, get(AMDGPU::S_NOT_B64), AMDGPU::EXEC)
+ .addReg(AMDGPU::EXEC);
+ MachineInstr *Copy = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_PSEUDO),
+ MI.getOperand(0).getReg())
+ .add(MI.getOperand(2));
+ expandPostRAPseudo(*Copy);
+ BuildMI(MBB, MI, DL, get(AMDGPU::S_NOT_B64), AMDGPU::EXEC)
+ .addReg(AMDGPU::EXEC);
+ MI.eraseFromParent();
+ break;
+ }
case AMDGPU::V_MOVRELD_B32_V1:
case AMDGPU::V_MOVRELD_B32_V2:
case AMDGPU::V_MOVRELD_B32_V4:
@@ -1140,11 +1258,17 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
MIB.add(MI.getOperand(2));
Bundler.append(MIB);
- llvm::finalizeBundle(MBB, Bundler.begin());
+ finalizeBundle(MBB, Bundler.begin());
MI.eraseFromParent();
break;
}
+ case AMDGPU::EXIT_WWM: {
+ // This only gets its own opcode so that SIFixWWMLiveness can tell when WWM
+ // is exited.
+ MI.setDesc(get(AMDGPU::S_MOV_B64));
+ break;
+ }
}
return true;
}
@@ -1232,7 +1356,6 @@ MachineInstr *SIInstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
return nullptr;
}
-
if (CommutedMI) {
swapSourceModifiers(MI, Src0, AMDGPU::OpName::src0_modifiers,
Src1, AMDGPU::OpName::src1_modifiers);
@@ -1542,7 +1665,6 @@ unsigned SIInstrInfo::insertBranch(MachineBasicBlock &MBB,
ArrayRef<MachineOperand> Cond,
const DebugLoc &DL,
int *BytesAdded) const {
-
if (!FBB && Cond.empty()) {
BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH))
.addMBB(TBB);
@@ -1760,6 +1882,23 @@ bool SIInstrInfo::isFoldableCopy(const MachineInstr &MI) const {
}
}
+unsigned SIInstrInfo::getAddressSpaceForPseudoSourceKind(
+ PseudoSourceValue::PSVKind Kind) const {
+ switch(Kind) {
+ case PseudoSourceValue::Stack:
+ case PseudoSourceValue::FixedStack:
+ return AMDGPUASI.PRIVATE_ADDRESS;
+ case PseudoSourceValue::ConstantPool:
+ case PseudoSourceValue::GOT:
+ case PseudoSourceValue::JumpTable:
+ case PseudoSourceValue::GlobalValueCallEntry:
+ case PseudoSourceValue::ExternalSymbolCallEntry:
+ case PseudoSourceValue::TargetCustom:
+ return AMDGPUASI.CONSTANT_ADDRESS;
+ }
+ return AMDGPUASI.FLAT_ADDRESS;
+}
+
static void removeModOperands(MachineInstr &MI) {
unsigned Opc = MI.getOpcode();
int Src0ModIdx = AMDGPU::getNamedOperandIdx(Opc,
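The new getAddressSpaceForPseudoSourceKind override above is a pure classification: stack-like pseudo sources map to the private (scratch) address space, constant-like ones to the constant address space, and everything else falls back to flat. A standalone model of that mapping, with illustrative enumerators in place of the real PseudoSourceValue kinds and AMDGPUAS numbers:

    #include <cassert>

    // Illustrative stand-ins for PseudoSourceValue::PSVKind and the AMDGPUAS
    // address-space values; the real ones come from the target headers.
    enum class PSVKind { Stack, FixedStack, ConstantPool, GOT, JumpTable, Other };
    enum class AddrSpace { Flat, Private, Constant };

    AddrSpace addressSpaceForPSV(PSVKind Kind) {
      switch (Kind) {
      case PSVKind::Stack:
      case PSVKind::FixedStack:
        return AddrSpace::Private;   // stack objects and spills live in scratch
      case PSVKind::ConstantPool:
      case PSVKind::GOT:
      case PSVKind::JumpTable:
        return AddrSpace::Constant;  // read-only program data
      default:
        return AddrSpace::Flat;      // conservative fallback
      }
    }

    int main() {
      assert(addressSpaceForPSV(PSVKind::Stack) == AddrSpace::Private);
      assert(addressSpaceForPSV(PSVKind::GOT) == AddrSpace::Constant);
      assert(addressSpaceForPSV(PSVKind::Other) == AddrSpace::Flat);
      return 0;
    }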
@@ -1779,28 +1918,29 @@ bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
if (!MRI->hasOneNonDBGUse(Reg))
return false;
+ switch (DefMI.getOpcode()) {
+ default:
+ return false;
+ case AMDGPU::S_MOV_B64:
+ // TODO: We could fold 64-bit immediates, but this gets complicated
+ // when there are sub-registers.
+ return false;
+
+ case AMDGPU::V_MOV_B32_e32:
+ case AMDGPU::S_MOV_B32:
+ break;
+ }
+
+ const MachineOperand *ImmOp = getNamedOperand(DefMI, AMDGPU::OpName::src0);
+ assert(ImmOp);
+ // FIXME: We could handle FrameIndex values here.
+ if (!ImmOp->isImm())
+ return false;
+
unsigned Opc = UseMI.getOpcode();
if (Opc == AMDGPU::COPY) {
bool isVGPRCopy = RI.isVGPR(*MRI, UseMI.getOperand(0).getReg());
- switch (DefMI.getOpcode()) {
- default:
- return false;
- case AMDGPU::S_MOV_B64:
- // TODO: We could fold 64-bit immediates, but this get compilicated
- // when there are sub-registers.
- return false;
-
- case AMDGPU::V_MOV_B32_e32:
- case AMDGPU::S_MOV_B32:
- break;
- }
unsigned NewOpc = isVGPRCopy ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32;
- const MachineOperand *ImmOp = getNamedOperand(DefMI, AMDGPU::OpName::src0);
- assert(ImmOp);
- // FIXME: We could handle FrameIndex values here.
- if (!ImmOp->isImm()) {
- return false;
- }
UseMI.setDesc(get(NewOpc));
UseMI.getOperand(1).ChangeToImmediate(ImmOp->getImm());
UseMI.addImplicitDefUseOperands(*UseMI.getParent()->getParent());
@@ -1814,15 +1954,13 @@ bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
if (hasAnyModifiersSet(UseMI))
return false;
- const MachineOperand &ImmOp = DefMI.getOperand(1);
-
// If this is a free constant, there's no reason to do this.
// TODO: We could fold this here instead of letting SIFoldOperands do it
// later.
MachineOperand *Src0 = getNamedOperand(UseMI, AMDGPU::OpName::src0);
// Any src operand can be used for the legality check.
- if (isInlineConstant(UseMI, *Src0, ImmOp))
+ if (isInlineConstant(UseMI, *Src0, *ImmOp))
return false;
bool IsF32 = Opc == AMDGPU::V_MAD_F32 || Opc == AMDGPU::V_MAC_F32_e64;
@@ -1840,7 +1978,7 @@ bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
// We need to swap operands 0 and 1 since madmk constant is at operand 1.
- const int64_t Imm = DefMI.getOperand(1).getImm();
+ const int64_t Imm = ImmOp->getImm();
// FIXME: This would be a lot easier if we could return a new instruction
// instead of having to modify in place.
@@ -1885,7 +2023,7 @@ bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
if (!Src1->isReg() || RI.isSGPRClass(MRI->getRegClass(Src1->getReg())))
return false;
- const int64_t Imm = DefMI.getOperand(1).getImm();
+ const int64_t Imm = ImmOp->getImm();
// FIXME: This would be a lot easier if we could return a new instruction
// instead of having to modify in place.
@@ -1985,7 +2123,7 @@ bool SIInstrInfo::areMemAccessesTriviallyDisjoint(MachineInstr &MIa,
if (isDS(MIb))
return checkInstOffsetsDoNotOverlap(MIa, MIb);
- return !isFLAT(MIb);
+ return !isFLAT(MIb) || isSegmentSpecificFLAT(MIb);
}
if (isMUBUF(MIa) || isMTBUF(MIa)) {
@@ -2012,6 +2150,18 @@ bool SIInstrInfo::areMemAccessesTriviallyDisjoint(MachineInstr &MIa,
return false;
}
+static int64_t getFoldableImm(const MachineOperand* MO) {
+ if (!MO->isReg())
+ return false;
+ const MachineFunction *MF = MO->getParent()->getParent()->getParent();
+ const MachineRegisterInfo &MRI = MF->getRegInfo();
+ auto Def = MRI.getUniqueVRegDef(MO->getReg());
+ if (Def && Def->getOpcode() == AMDGPU::V_MOV_B32_e32 &&
+ Def->getOperand(1).isImm())
+ return Def->getOperand(1).getImm();
+ return AMDGPU::NoRegister;
+}
+
MachineInstr *SIInstrInfo::convertToThreeAddress(MachineFunction::iterator &MBB,
MachineInstr &MI,
LiveVariables *LV) const {
@@ -2032,8 +2182,12 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineFunction::iterator &MBB,
int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
AMDGPU::OpName::src0);
const MachineOperand *Src0 = &MI.getOperand(Src0Idx);
+ if (!Src0->isReg() && !Src0->isImm())
+ return nullptr;
+
if (Src0->isImm() && !isInlineConstant(MI, Src0Idx, *Src0))
return nullptr;
+
break;
}
}
@@ -2049,6 +2203,37 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineFunction::iterator &MBB,
const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp);
const MachineOperand *Omod = getNamedOperand(MI, AMDGPU::OpName::omod);
+ if (!Src0Mods && !Src1Mods && !Clamp && !Omod &&
+ // If we have an SGPR input, we will violate the constant bus restriction.
+ (!Src0->isReg() || !RI.isSGPRReg(MBB->getParent()->getRegInfo(), Src0->getReg()))) {
+ if (auto Imm = getFoldableImm(Src2)) {
+ return BuildMI(*MBB, MI, MI.getDebugLoc(),
+ get(IsF16 ? AMDGPU::V_MADAK_F16 : AMDGPU::V_MADAK_F32))
+ .add(*Dst)
+ .add(*Src0)
+ .add(*Src1)
+ .addImm(Imm);
+ }
+ if (auto Imm = getFoldableImm(Src1)) {
+ return BuildMI(*MBB, MI, MI.getDebugLoc(),
+ get(IsF16 ? AMDGPU::V_MADMK_F16 : AMDGPU::V_MADMK_F32))
+ .add(*Dst)
+ .add(*Src0)
+ .addImm(Imm)
+ .add(*Src2);
+ }
+ if (auto Imm = getFoldableImm(Src0)) {
+ if (isOperandLegal(MI, AMDGPU::getNamedOperandIdx(AMDGPU::V_MADMK_F32,
+ AMDGPU::OpName::src0), Src1))
+ return BuildMI(*MBB, MI, MI.getDebugLoc(),
+ get(IsF16 ? AMDGPU::V_MADMK_F16 : AMDGPU::V_MADMK_F32))
+ .add(*Dst)
+ .add(*Src1)
+ .addImm(Imm)
+ .add(*Src2);
+ }
+ }
+
return BuildMI(*MBB, MI, MI.getDebugLoc(),
get(IsF16 ? AMDGPU::V_MAD_F16 : AMDGPU::V_MAD_F32))
.add(*Dst)
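A sketch of the selection order the MAD-folding block above implements, under simplifying assumptions (no source modifiers and no SGPR constant-bus concern); names below are illustrative, not LLVM API. A foldable literal in src2 selects V_MADAK (src0 * src1 + K); otherwise a literal in src1 or src0 selects V_MADMK (src * K + src2, swapping sources as needed); otherwise the generic V_MAD form is kept.

    #include <cassert>
    #include <cstdint>

    // Illustrative operand model: either a register or a known 32-bit literal.
    struct Operand {
      bool IsImm = false;
      uint32_t Imm = 0;
    };

    enum class MadForm { MAD, MADAK, MADMK };

    // Mirrors the order above: prefer MADAK (literal in src2), then MADMK with
    // the literal taken from src1, then from src0 (which requires swapping so
    // the remaining register ends up in src0).
    MadForm pickMadForm(const Operand &Src0, const Operand &Src1,
                        const Operand &Src2, bool HasModifiers) {
      if (HasModifiers)
        return MadForm::MAD;     // clamp/omod/modifiers block the literal forms
      if (Src2.IsImm)
        return MadForm::MADAK;   // v_madak: src0 * src1 + K
      if (Src1.IsImm || Src0.IsImm)
        return MadForm::MADMK;   // v_madmk: src * K + src2
      return MadForm::MAD;
    }

    int main() {
      Operand R{false, 0}, K{true, 0x3f800000}; // K is 1.0f as raw bits
      assert(pickMadForm(R, R, K, false) == MadForm::MADAK);
      assert(pickMadForm(R, K, R, false) == MadForm::MADMK);
      assert(pickMadForm(R, R, R, false) == MadForm::MAD);
      assert(pickMadForm(R, R, K, true)  == MadForm::MAD);
      return 0;
    }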
@@ -2133,10 +2318,9 @@ bool SIInstrInfo::isInlineConstant(const MachineOperand &MO,
case AMDGPU::OPERAND_REG_IMM_INT64:
case AMDGPU::OPERAND_REG_IMM_FP64:
case AMDGPU::OPERAND_REG_INLINE_C_INT64:
- case AMDGPU::OPERAND_REG_INLINE_C_FP64: {
+ case AMDGPU::OPERAND_REG_INLINE_C_FP64:
return AMDGPU::isInlinableLiteral64(MO.getImm(),
ST.hasInv2PiInlineImm());
- }
case AMDGPU::OPERAND_REG_IMM_INT16:
case AMDGPU::OPERAND_REG_IMM_FP16:
case AMDGPU::OPERAND_REG_INLINE_C_INT16:
@@ -2439,7 +2623,6 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
// Verify SDWA
if (isSDWA(MI)) {
-
if (!ST.hasSDWA()) {
ErrInfo = "SDWA is not supported on this target";
return false;
@@ -2504,6 +2687,28 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
}
}
}
+
+ const MachineOperand *DstUnused = getNamedOperand(MI, AMDGPU::OpName::dst_unused);
+ if (DstUnused && DstUnused->isImm() &&
+ DstUnused->getImm() == AMDGPU::SDWA::UNUSED_PRESERVE) {
+ const MachineOperand &Dst = MI.getOperand(DstIdx);
+ if (!Dst.isReg() || !Dst.isTied()) {
+ ErrInfo = "Dst register should have tied register";
+ return false;
+ }
+
+ const MachineOperand &TiedMO =
+ MI.getOperand(MI.findTiedOperandIdx(DstIdx));
+ if (!TiedMO.isReg() || !TiedMO.isImplicit() || !TiedMO.isUse()) {
+ ErrInfo =
+ "Dst register should be tied to implicit use of preserved register";
+ return false;
+ } else if (TargetRegisterInfo::isPhysicalRegister(TiedMO.getReg()) &&
+ Dst.getReg() != TiedMO.getReg()) {
+ ErrInfo = "Dst register should use same physical register as preserved";
+ return false;
+ }
+ }
}
// Verify VOP*
@@ -2648,21 +2853,30 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
return true;
}
-unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) {
+unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) const {
switch (MI.getOpcode()) {
default: return AMDGPU::INSTRUCTION_LIST_END;
case AMDGPU::REG_SEQUENCE: return AMDGPU::REG_SEQUENCE;
case AMDGPU::COPY: return AMDGPU::COPY;
case AMDGPU::PHI: return AMDGPU::PHI;
case AMDGPU::INSERT_SUBREG: return AMDGPU::INSERT_SUBREG;
+ case AMDGPU::WQM: return AMDGPU::WQM;
+ case AMDGPU::WWM: return AMDGPU::WWM;
case AMDGPU::S_MOV_B32:
return MI.getOperand(1).isReg() ?
AMDGPU::COPY : AMDGPU::V_MOV_B32_e32;
case AMDGPU::S_ADD_I32:
- case AMDGPU::S_ADD_U32: return AMDGPU::V_ADD_I32_e32;
- case AMDGPU::S_ADDC_U32: return AMDGPU::V_ADDC_U32_e32;
+ return ST.hasAddNoCarry() ? AMDGPU::V_ADD_U32_e64 : AMDGPU::V_ADD_I32_e32;
+ case AMDGPU::S_ADDC_U32:
+ return AMDGPU::V_ADDC_U32_e32;
case AMDGPU::S_SUB_I32:
- case AMDGPU::S_SUB_U32: return AMDGPU::V_SUB_I32_e32;
+ return ST.hasAddNoCarry() ? AMDGPU::V_SUB_U32_e64 : AMDGPU::V_SUB_I32_e32;
+ // FIXME: These are not consistently handled, and selected when the carry is
+ // used.
+ case AMDGPU::S_ADD_U32:
+ return AMDGPU::V_ADD_I32_e32;
+ case AMDGPU::S_SUB_U32:
+ return AMDGPU::V_SUB_I32_e32;
case AMDGPU::S_SUBB_U32: return AMDGPU::V_SUBB_U32_e32;
case AMDGPU::S_MUL_I32: return AMDGPU::V_MUL_LO_I32;
case AMDGPU::S_AND_B32: return AMDGPU::V_AND_B32_e64;
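A standalone sketch of the subtarget-dependent mapping getVALUOp now applies to scalar add/sub, returning opcode names as strings purely for illustration: on targets with hasAddNoCarry the carry-less V_ADD_U32_e64 / V_SUB_U32_e64 forms are chosen, while the unsigned S_*_U32 variants (which are selected when the carry output is actually used) keep mapping to the carry-writing VALU instructions.

    #include <cassert>
    #include <string>

    // Illustrative stand-ins; the real values are AMDGPU opcode enumerators.
    enum class ScalarOp { S_ADD_I32, S_SUB_I32, S_ADD_U32, S_SUB_U32 };

    std::string getVALUOpFor(ScalarOp Op, bool HasAddNoCarry) {
      switch (Op) {
      case ScalarOp::S_ADD_I32:
        return HasAddNoCarry ? "V_ADD_U32_e64" : "V_ADD_I32_e32";
      case ScalarOp::S_SUB_I32:
        return HasAddNoCarry ? "V_SUB_U32_e64" : "V_SUB_I32_e32";
      // The unsigned forms are selected when the carry out is used, so they
      // always map to the carry-writing instructions.
      case ScalarOp::S_ADD_U32:
        return "V_ADD_I32_e32";
      case ScalarOp::S_SUB_U32:
        return "V_SUB_I32_e32";
      }
      return "";
    }

    int main() {
      assert(getVALUOpFor(ScalarOp::S_ADD_I32, true) == "V_ADD_U32_e64");
      assert(getVALUOpFor(ScalarOp::S_ADD_I32, false) == "V_ADD_I32_e32");
      assert(getVALUOpFor(ScalarOp::S_ADD_U32, true) == "V_ADD_I32_e32");
      return 0;
    }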
@@ -2709,10 +2923,6 @@ unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) {
}
}
-bool SIInstrInfo::isSALUOpSupportedOnVALU(const MachineInstr &MI) const {
- return getVALUOp(MI) != AMDGPU::INSTRUCTION_LIST_END;
-}
-
const TargetRegisterClass *SIInstrInfo::getOpRegClass(const MachineInstr &MI,
unsigned OpNo) const {
const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
@@ -3090,7 +3300,6 @@ void SIInstrInfo::legalizeGenericOperand(MachineBasicBlock &InsertMBB,
MachineOperand &Op,
MachineRegisterInfo &MRI,
const DebugLoc &DL) const {
-
unsigned OpReg = Op.getReg();
unsigned OpSubReg = Op.getSubReg();
@@ -3235,7 +3444,7 @@ void SIInstrInfo::legalizeOperands(MachineInstr &MI) const {
// scratch memory access. In both cases, the legalization never involves
// conversion to the addr64 form.
if (isMIMG(MI) ||
- (AMDGPU::isShader(MF.getFunction()->getCallingConv()) &&
+ (AMDGPU::isShader(MF.getFunction().getCallingConv()) &&
(isMUBUF(MI) || isMTBUF(MI)))) {
MachineOperand *SRsrc = getNamedOperand(MI, AMDGPU::OpName::srsrc);
if (SRsrc && !RI.isSGPRClass(MRI.getRegClass(SRsrc->getReg()))) {
@@ -3423,6 +3632,19 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const {
switch (Opcode) {
default:
break;
+ case AMDGPU::S_ADD_U64_PSEUDO:
+ case AMDGPU::S_SUB_U64_PSEUDO:
+ splitScalar64BitAddSub(Worklist, Inst);
+ Inst.eraseFromParent();
+ continue;
+ case AMDGPU::S_ADD_I32:
+ case AMDGPU::S_SUB_I32:
+ // FIXME: The u32 versions currently selected use the carry.
+ if (moveScalarAddSub(Worklist, Inst))
+ continue;
+
+ // Default handling
+ break;
case AMDGPU::S_AND_B64:
splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::V_AND_B32_e64);
Inst.eraseFromParent();
@@ -3448,11 +3670,10 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const {
Inst.eraseFromParent();
continue;
- case AMDGPU::S_BFE_I64: {
+ case AMDGPU::S_BFE_I64:
splitScalar64BitBFE(Worklist, Inst);
Inst.eraseFromParent();
continue;
- }
case AMDGPU::S_LSHL_B32:
if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
@@ -3511,10 +3732,78 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const {
case AMDGPU::S_PACK_LL_B32_B16:
case AMDGPU::S_PACK_LH_B32_B16:
- case AMDGPU::S_PACK_HH_B32_B16: {
+ case AMDGPU::S_PACK_HH_B32_B16:
movePackToVALU(Worklist, MRI, Inst);
Inst.eraseFromParent();
continue;
+
+ case AMDGPU::S_XNOR_B32:
+ lowerScalarXnor(Worklist, Inst);
+ Inst.eraseFromParent();
+ continue;
+
+ case AMDGPU::S_XNOR_B64:
+ splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XNOR_B32);
+ Inst.eraseFromParent();
+ continue;
+
+ case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR: {
+ unsigned VDst = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ const MachineOperand *VAddr = getNamedOperand(Inst, AMDGPU::OpName::soff);
+ auto Add = MRI.getUniqueVRegDef(VAddr->getReg());
+ unsigned Offset = 0;
+
+ // FIXME: This isn't safe because the addressing mode doesn't work
+ // correctly if vaddr is negative.
+ //
+ // FIXME: Handle v_add_u32 and VOP3 form. Also don't rely on immediate
+ // being in src0.
+ //
+ // FIXME: Should probably be done somewhere else, maybe SIFoldOperands.
+ //
+ // See if we can extract an immediate offset by recognizing one of these:
+ // V_ADD_I32_e32 dst, imm, src1
+ // V_ADD_I32_e32 dst, (S_MOV_B32 imm), src1
+ // V_ADD will be removed by "Remove dead machine instructions".
+ if (Add && Add->getOpcode() == AMDGPU::V_ADD_I32_e32) {
+ const MachineOperand *Src =
+ getNamedOperand(*Add, AMDGPU::OpName::src0);
+
+ if (Src->isReg()) {
+ auto Mov = MRI.getUniqueVRegDef(Src->getReg());
+ if (Mov && Mov->getOpcode() == AMDGPU::S_MOV_B32)
+ Src = &Mov->getOperand(1);
+ }
+
+ if (Src) {
+ if (Src->isImm())
+ Offset = Src->getImm();
+ else if (Src->isCImm())
+ Offset = Src->getCImm()->getZExtValue();
+ }
+
+ if (Offset && isLegalMUBUFImmOffset(Offset))
+ VAddr = getNamedOperand(*Add, AMDGPU::OpName::src1);
+ else
+ Offset = 0;
+ }
+
+ BuildMI(*MBB, Inst, Inst.getDebugLoc(),
+ get(AMDGPU::BUFFER_LOAD_DWORD_OFFEN), VDst)
+ .add(*VAddr) // vaddr
+ .add(*getNamedOperand(Inst, AMDGPU::OpName::sbase)) // srsrc
+ .addImm(0) // soffset
+ .addImm(Offset) // offset
+ .addImm(getNamedOperand(Inst, AMDGPU::OpName::glc)->getImm())
+ .addImm(0) // slc
+ .addImm(0) // tfe
+ .setMemRefs(Inst.memoperands_begin(), Inst.memoperands_end());
+
+ MRI.replaceRegWith(getNamedOperand(Inst, AMDGPU::OpName::sdst)->getReg(),
+ VDst);
+ addUsersToMoveToVALUWorklist(VDst, MRI, Worklist);
+ Inst.eraseFromParent();
+ continue;
}
}
@@ -3610,6 +3899,41 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const {
}
}
+// Add/sub require special handling to deal with carry outs.
+bool SIInstrInfo::moveScalarAddSub(SetVectorType &Worklist,
+ MachineInstr &Inst) const {
+ if (ST.hasAddNoCarry()) {
+ // Assume there is no user of scc since we don't select this in that case.
+ // Since scc isn't used, it doesn't really matter if the i32 or u32 variant
+ // is used.
+
+ MachineBasicBlock &MBB = *Inst.getParent();
+ MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+
+ unsigned OldDstReg = Inst.getOperand(0).getReg();
+ unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+
+ unsigned Opc = Inst.getOpcode();
+ assert(Opc == AMDGPU::S_ADD_I32 || Opc == AMDGPU::S_SUB_I32);
+
+ unsigned NewOpc = Opc == AMDGPU::S_ADD_I32 ?
+ AMDGPU::V_ADD_U32_e64 : AMDGPU::V_SUB_U32_e64;
+
+ assert(Inst.getOperand(3).getReg() == AMDGPU::SCC);
+ Inst.RemoveOperand(3);
+
+ Inst.setDesc(get(NewOpc));
+ Inst.addImplicitDefUseOperands(*MBB.getParent());
+ MRI.replaceRegWith(OldDstReg, ResultReg);
+ legalizeOperands(Inst);
+
+ addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
+ return true;
+ }
+
+ return false;
+}
+
void SIInstrInfo::lowerScalarAbs(SetVectorType &Worklist,
MachineInstr &Inst) const {
MachineBasicBlock &MBB = *Inst.getParent();
@@ -3622,7 +3946,10 @@ void SIInstrInfo::lowerScalarAbs(SetVectorType &Worklist,
unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
- BuildMI(MBB, MII, DL, get(AMDGPU::V_SUB_I32_e32), TmpReg)
+ unsigned SubOp = ST.hasAddNoCarry() ?
+ AMDGPU::V_SUB_U32_e32 : AMDGPU::V_SUB_I32_e32;
+
+ BuildMI(MBB, MII, DL, get(SubOp), TmpReg)
.addImm(0)
.addReg(Src.getReg());
@@ -3634,6 +3961,33 @@ void SIInstrInfo::lowerScalarAbs(SetVectorType &Worklist,
addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
}
+void SIInstrInfo::lowerScalarXnor(SetVectorType &Worklist,
+ MachineInstr &Inst) const {
+ MachineBasicBlock &MBB = *Inst.getParent();
+ MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+ MachineBasicBlock::iterator MII = Inst;
+ const DebugLoc &DL = Inst.getDebugLoc();
+
+ MachineOperand &Dest = Inst.getOperand(0);
+ MachineOperand &Src0 = Inst.getOperand(1);
+ MachineOperand &Src1 = Inst.getOperand(2);
+
+ legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src0, MRI, DL);
+ legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src1, MRI, DL);
+
+ unsigned Xor = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ BuildMI(MBB, MII, DL, get(AMDGPU::V_XOR_B32_e64), Xor)
+ .add(Src0)
+ .add(Src1);
+
+ unsigned Not = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ BuildMI(MBB, MII, DL, get(AMDGPU::V_NOT_B32_e64), Not)
+ .addReg(Xor);
+
+ MRI.replaceRegWith(Dest.getReg(), Not);
+ addUsersToMoveToVALUWorklist(Not, MRI, Worklist);
+}
+
void SIInstrInfo::splitScalar64BitUnaryOp(
SetVectorType &Worklist, MachineInstr &Inst,
unsigned Opcode) const {
@@ -3685,6 +4039,74 @@ void SIInstrInfo::splitScalar64BitUnaryOp(
addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
}
+void SIInstrInfo::splitScalar64BitAddSub(
+ SetVectorType &Worklist, MachineInstr &Inst) const {
+ bool IsAdd = (Inst.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);
+
+ MachineBasicBlock &MBB = *Inst.getParent();
+ MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+
+ unsigned FullDestReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
+ unsigned DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ unsigned DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+
+ unsigned CarryReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
+ unsigned DeadCarryReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
+
+ MachineOperand &Dest = Inst.getOperand(0);
+ MachineOperand &Src0 = Inst.getOperand(1);
+ MachineOperand &Src1 = Inst.getOperand(2);
+ const DebugLoc &DL = Inst.getDebugLoc();
+ MachineBasicBlock::iterator MII = Inst;
+
+ const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0.getReg());
+ const TargetRegisterClass *Src1RC = MRI.getRegClass(Src1.getReg());
+ const TargetRegisterClass *Src0SubRC = RI.getSubRegClass(Src0RC, AMDGPU::sub0);
+ const TargetRegisterClass *Src1SubRC = RI.getSubRegClass(Src1RC, AMDGPU::sub0);
+
+ MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
+ AMDGPU::sub0, Src0SubRC);
+ MachineOperand SrcReg1Sub0 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
+ AMDGPU::sub0, Src1SubRC);
+
+
+ MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
+ AMDGPU::sub1, Src0SubRC);
+ MachineOperand SrcReg1Sub1 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
+ AMDGPU::sub1, Src1SubRC);
+
+ unsigned LoOpc = IsAdd ? AMDGPU::V_ADD_I32_e64 : AMDGPU::V_SUB_I32_e64;
+ MachineInstr *LoHalf =
+ BuildMI(MBB, MII, DL, get(LoOpc), DestSub0)
+ .addReg(CarryReg, RegState::Define)
+ .add(SrcReg0Sub0)
+ .add(SrcReg1Sub0);
+
+ unsigned HiOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
+ MachineInstr *HiHalf =
+ BuildMI(MBB, MII, DL, get(HiOpc), DestSub1)
+ .addReg(DeadCarryReg, RegState::Define | RegState::Dead)
+ .add(SrcReg0Sub1)
+ .add(SrcReg1Sub1)
+ .addReg(CarryReg, RegState::Kill);
+
+ BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
+ .addReg(DestSub0)
+ .addImm(AMDGPU::sub0)
+ .addReg(DestSub1)
+ .addImm(AMDGPU::sub1);
+
+ MRI.replaceRegWith(Dest.getReg(), FullDestReg);
+
+ // Try to legalize the operands in case we need to swap the order to keep it
+ // valid.
+ legalizeOperands(*LoHalf);
+ legalizeOperands(*HiHalf);
+
+ // Move all users of this moved value.
+ addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
+}
+
void SIInstrInfo::splitScalar64BitBinaryOp(
SetVectorType &Worklist, MachineInstr &Inst,
unsigned Opcode) const {
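The arithmetic behind splitScalar64BitAddSub, shown as a standalone model: the 64-bit operation becomes a 32-bit low half that produces a carry, followed by a 32-bit high half that consumes it, which is exactly what the V_ADD_I32_e64 / V_ADDC_U32_e64 (or V_SUB / V_SUBB) pair above does per lane before the halves are reassembled with REG_SEQUENCE.

    #include <cassert>
    #include <cstdint>

    // Model of the lowering: lo-half add producing a carry, hi-half add
    // consuming it; subtract works the same way with a borrow.
    uint64_t add64ViaHalves(uint64_t A, uint64_t B) {
      uint32_t ALo = uint32_t(A), AHi = uint32_t(A >> 32);
      uint32_t BLo = uint32_t(B), BHi = uint32_t(B >> 32);

      uint32_t Lo = ALo + BLo;            // V_ADD_I32_e64: writes the carry
      uint32_t Carry = Lo < ALo ? 1 : 0;  // carry out of the low half
      uint32_t Hi = AHi + BHi + Carry;    // V_ADDC_U32_e64: reads the carry

      return (uint64_t(Hi) << 32) | Lo;   // REG_SEQUENCE of the two halves
    }

    int main() {
      assert(add64ViaHalves(0xffffffffULL, 1) == 0x100000000ULL);
      assert(add64ViaHalves(0x100000000ULL, 0x200000005ULL) == 0x300000005ULL);
      return 0;
    }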
@@ -3936,8 +4358,8 @@ void SIInstrInfo::addSCCDefUsersToVALUWorklist(
// This assumes that all the users of SCC are in the same block
// as the SCC def.
for (MachineInstr &MI :
- llvm::make_range(MachineBasicBlock::iterator(SCCDefInst),
- SCCDefInst.getParent()->end())) {
+ make_range(MachineBasicBlock::iterator(SCCDefInst),
+ SCCDefInst.getParent()->end())) {
// Exit if we find another SCC def.
if (MI.findRegisterDefOperandIdx(AMDGPU::SCC) != -1)
return;
@@ -3959,6 +4381,8 @@ const TargetRegisterClass *SIInstrInfo::getDestEquivalentVGPRClass(
case AMDGPU::PHI:
case AMDGPU::REG_SEQUENCE:
case AMDGPU::INSERT_SUBREG:
+ case AMDGPU::WQM:
+ case AMDGPU::WWM:
if (RI.hasVGPRs(NewDstRC))
return nullptr;
@@ -4123,7 +4547,6 @@ unsigned SIInstrInfo::isSGPRStackAccess(const MachineInstr &MI,
unsigned SIInstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
int &FrameIndex) const {
-
if (!MI.mayLoad())
return AMDGPU::NoRegister;
@@ -4150,6 +4573,18 @@ unsigned SIInstrInfo::isStoreToStackSlot(const MachineInstr &MI,
return AMDGPU::NoRegister;
}
+unsigned SIInstrInfo::getInstBundleSize(const MachineInstr &MI) const {
+ unsigned Size = 0;
+ MachineBasicBlock::const_instr_iterator I = MI.getIterator();
+ MachineBasicBlock::const_instr_iterator E = MI.getParent()->instr_end();
+ while (++I != E && I->isInsideBundle()) {
+ assert(!I->isBundle() && "No nested bundle!");
+ Size += getInstSizeInBytes(*I);
+ }
+
+ return Size;
+}
+
unsigned SIInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
unsigned Opc = MI.getOpcode();
const MCInstrDesc &Desc = getMCOpcodeFromPseudo(Opc);
@@ -4193,9 +4628,10 @@ unsigned SIInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
case TargetOpcode::IMPLICIT_DEF:
case TargetOpcode::KILL:
case TargetOpcode::DBG_VALUE:
- case TargetOpcode::BUNDLE:
case TargetOpcode::EH_LABEL:
return 0;
+ case TargetOpcode::BUNDLE:
+ return getInstBundleSize(MI);
case TargetOpcode::INLINEASM: {
const MachineFunction *MF = MI.getParent()->getParent();
const char *AsmStr = MI.getOperand(0).getSymbolName();
@@ -4350,10 +4786,34 @@ SIInstrInfo::getAddNoCarry(MachineBasicBlock &MBB,
MachineBasicBlock::iterator I,
const DebugLoc &DL,
unsigned DestReg) const {
- MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+ if (ST.hasAddNoCarry())
+ return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_U32_e64), DestReg);
+ MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
unsigned UnusedCarry = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
+ MRI.setRegAllocationHint(UnusedCarry, 0, AMDGPU::VCC);
return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_I32_e64), DestReg)
.addReg(UnusedCarry, RegState::Define | RegState::Dead);
}
+
+bool SIInstrInfo::isKillTerminator(unsigned Opcode) {
+ switch (Opcode) {
+ case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
+ case AMDGPU::SI_KILL_I1_TERMINATOR:
+ return true;
+ default:
+ return false;
+ }
+}
+
+const MCInstrDesc &SIInstrInfo::getKillTerminatorFromPseudo(unsigned Opcode) const {
+ switch (Opcode) {
+ case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
+ return get(AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR);
+ case AMDGPU::SI_KILL_I1_PSEUDO:
+ return get(AMDGPU::SI_KILL_I1_TERMINATOR);
+ default:
+ llvm_unreachable("invalid opcode, expected SI_KILL_*_PSEUDO");
+ }
+}
diff --git a/lib/Target/AMDGPU/SIInstrInfo.h b/lib/Target/AMDGPU/SIInstrInfo.h
index 3dd5bc89e6c7..24ee843e6ade 100644
--- a/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/lib/Target/AMDGPU/SIInstrInfo.h
@@ -1,4 +1,4 @@
-//===-- SIInstrInfo.h - SI Instruction Info Interface -----------*- C++ -*-===//
+//===- SIInstrInfo.h - SI Instruction Info Interface ------------*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
@@ -12,17 +12,33 @@
//
//===----------------------------------------------------------------------===//
-
#ifndef LLVM_LIB_TARGET_AMDGPU_SIINSTRINFO_H
#define LLVM_LIB_TARGET_AMDGPU_SIINSTRINFO_H
#include "AMDGPUInstrInfo.h"
#include "SIDefines.h"
#include "SIRegisterInfo.h"
+#include "Utils/AMDGPUBaseInfo.h"
+#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/SetVector.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/MC/MCInstrDesc.h"
+#include "llvm/Support/Compiler.h"
+#include <cassert>
+#include <cstdint>
namespace llvm {
+class APInt;
+class MachineRegisterInfo;
+class RegScavenger;
+class SISubtarget;
+class TargetRegisterClass;
+
class SIInstrInfo final : public AMDGPUInstrInfo {
private:
const SIRegisterInfo RI;
@@ -39,11 +55,12 @@ private:
EXECZ = 3
};
- typedef SmallSetVector<MachineInstr *, 32> SetVectorType;
+ using SetVectorType = SmallSetVector<MachineInstr *, 32>;
static unsigned getBranchOpcode(BranchPredicate Cond);
static BranchPredicate getBranchPredicate(unsigned Opcode);
+public:
unsigned buildExtractSubReg(MachineBasicBlock::iterator MI,
MachineRegisterInfo &MRI,
MachineOperand &SuperReg,
@@ -56,15 +73,24 @@ private:
const TargetRegisterClass *SuperRC,
unsigned SubIdx,
const TargetRegisterClass *SubRC) const;
-
+private:
void swapOperands(MachineInstr &Inst) const;
+ bool moveScalarAddSub(SetVectorType &Worklist,
+ MachineInstr &Inst) const;
+
void lowerScalarAbs(SetVectorType &Worklist,
MachineInstr &Inst) const;
+ void lowerScalarXnor(SetVectorType &Worklist,
+ MachineInstr &Inst) const;
+
void splitScalar64BitUnaryOp(SetVectorType &Worklist,
MachineInstr &Inst, unsigned Opcode) const;
+ void splitScalar64BitAddSub(SetVectorType &Worklist,
+ MachineInstr &Inst) const;
+
void splitScalar64BitBinaryOp(SetVectorType &Worklist,
MachineInstr &Inst, unsigned Opcode) const;
@@ -76,9 +102,8 @@ private:
MachineRegisterInfo &MRI,
MachineInstr &Inst) const;
- void addUsersToMoveToVALUWorklist(
- unsigned Reg, MachineRegisterInfo &MRI,
- SetVectorType &Worklist) const;
+ void addUsersToMoveToVALUWorklist(unsigned Reg, MachineRegisterInfo &MRI,
+ SetVectorType &Worklist) const;
void
addSCCDefUsersToVALUWorklist(MachineInstr &SCCDefInst,
@@ -101,7 +126,6 @@ protected:
unsigned OpIdx1) const override;
public:
-
enum TargetOperandFlags {
MO_MASK = 0x7,
@@ -120,7 +144,7 @@ public:
MO_REL32_HI = 5
};
- explicit SIInstrInfo(const SISubtarget &);
+ explicit SIInstrInfo(const SISubtarget &ST);
const SIRegisterInfo &getRegisterInfo() const {
return RI;
@@ -137,7 +161,8 @@ public:
int64_t &Offset,
const TargetRegisterInfo *TRI) const final;
- bool shouldClusterMemOps(MachineInstr &FirstLdSt, MachineInstr &SecondLdSt,
+ bool shouldClusterMemOps(MachineInstr &FirstLdSt, unsigned BaseReg1,
+ MachineInstr &SecondLdSt, unsigned BaseReg2,
unsigned NumLoads) const final;
void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
@@ -159,7 +184,7 @@ public:
unsigned insertNE(MachineBasicBlock *MBB,
MachineBasicBlock::iterator I, const DebugLoc &DL,
- unsigned SrcReg, int Value) const;
+ unsigned SrcReg, int Value) const;
unsigned insertEQ(MachineBasicBlock *MBB,
MachineBasicBlock::iterator I, const DebugLoc &DL,
@@ -228,7 +253,6 @@ public:
bool reverseBranchCondition(
SmallVectorImpl<MachineOperand> &Cond) const override;
-
bool canInsertSelect(const MachineBasicBlock &MBB,
ArrayRef<MachineOperand> Cond,
unsigned TrueReg, unsigned FalseReg,
@@ -245,6 +269,9 @@ public:
unsigned DstReg, ArrayRef<MachineOperand> Cond,
unsigned TrueReg, unsigned FalseReg) const;
+ unsigned getAddressSpaceForPseudoSourceKind(
+ PseudoSourceValue::PSVKind Kind) const override;
+
bool
areMemAccessesTriviallyDisjoint(MachineInstr &MIa, MachineInstr &MIb,
AliasAnalysis *AA = nullptr) const override;
@@ -392,6 +419,19 @@ public:
return get(Opcode).TSFlags & SIInstrFlags::SMRD;
}
+ bool isBufferSMRD(const MachineInstr &MI) const {
+ if (!isSMRD(MI))
+ return false;
+
+ // Check that it is using a buffer resource.
+ int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::sbase);
+ if (Idx == -1) // e.g. s_memtime
+ return false;
+
+ const auto RCID = MI.getDesc().OpInfo[Idx].RegClass;
+ return RCID == AMDGPU::SReg_128RegClassID;
+ }
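+  // The SReg_128 check above works because s_buffer_load_* takes a 128-bit
+  // resource descriptor in sbase, while plain s_load_* takes a 64-bit base
+  // address, so the register class of sbase is enough to tell them apart.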
+
static bool isDS(const MachineInstr &MI) {
return MI.getDesc().TSFlags & SIInstrFlags::DS;
}
@@ -420,6 +460,14 @@ public:
return MI.getDesc().TSFlags & SIInstrFlags::FLAT;
}
+ // Is a FLAT encoded instruction which accesses a specific segment,
+ // i.e. global_* or scratch_*.
+ static bool isSegmentSpecificFLAT(const MachineInstr &MI) {
+ auto Flags = MI.getDesc().TSFlags;
+ return (Flags & SIInstrFlags::FLAT) && !(Flags & SIInstrFlags::LGKM_CNT);
+ }
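+  // The check above relies on FLATInstructions.td clearing LGKM_CNT for the
+  // global_*/scratch_* variants (they only wait on vmcnt), while plain flat_*
+  // may also hit LDS and keeps LGKM_CNT set; so e.g. a global_load_dword is
+  // segment-specific FLAT and a flat_load_dword is not.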
+
+ // Any FLAT encoded instruction, including global_* and scratch_*.
bool isFLAT(uint16_t Opcode) const {
return get(Opcode).TSFlags & SIInstrFlags::FLAT;
}
@@ -496,6 +544,10 @@ public:
return MI.getDesc().TSFlags & SIInstrFlags::VM_CNT;
}
+ static bool usesLGKM_CNT(const MachineInstr &MI) {
+ return MI.getDesc().TSFlags & SIInstrFlags::LGKM_CNT;
+ }
+
static bool sopkIsZext(const MachineInstr &MI) {
return MI.getDesc().TSFlags & SIInstrFlags::SOPK_ZEXT;
}
@@ -523,11 +575,23 @@ public:
}
static bool hasFPClamp(const MachineInstr &MI) {
- return MI.getDesc().TSFlags & SIInstrFlags::HasFPClamp;
+ return MI.getDesc().TSFlags & SIInstrFlags::FPClamp;
}
bool hasFPClamp(uint16_t Opcode) const {
- return get(Opcode).TSFlags & SIInstrFlags::HasFPClamp;
+ return get(Opcode).TSFlags & SIInstrFlags::FPClamp;
+ }
+
+ static bool hasIntClamp(const MachineInstr &MI) {
+ return MI.getDesc().TSFlags & SIInstrFlags::IntClamp;
+ }
+
+ uint64_t getClampMask(const MachineInstr &MI) const {
+ const uint64_t ClampFlags = SIInstrFlags::FPClamp |
+ SIInstrFlags::IntClamp |
+ SIInstrFlags::ClampLo |
+ SIInstrFlags::ClampHi;
+ return MI.getDesc().TSFlags & ClampFlags;
}
bool isVGPRCopy(const MachineInstr &MI) const {
@@ -630,9 +694,7 @@ public:
bool verifyInstruction(const MachineInstr &MI,
StringRef &ErrInfo) const override;
- static unsigned getVALUOp(const MachineInstr &MI);
-
- bool isSALUOpSupportedOnVALU(const MachineInstr &MI) const;
+ unsigned getVALUOp(const MachineInstr &MI) const;
/// \brief Return the correct register class for \p OpNo. For target-specific
/// instructions, this will return the register class that has been defined
@@ -774,6 +836,7 @@ public:
unsigned isStoreToStackSlot(const MachineInstr &MI,
int &FrameIndex) const override;
+ unsigned getInstBundleSize(const MachineInstr &MI) const;
unsigned getInstSizeInBytes(const MachineInstr &MI) const override;
bool mayAccessFlatAddressSpace(const MachineInstr &MI) const;
@@ -812,9 +875,17 @@ public:
MachineBasicBlock::iterator I,
const DebugLoc &DL,
unsigned DestReg) const;
+
+ static bool isKillTerminator(unsigned Opcode);
+ const MCInstrDesc &getKillTerminatorFromPseudo(unsigned Opcode) const;
+
+ static bool isLegalMUBUFImmOffset(unsigned Imm) {
+ return isUInt<12>(Imm);
+ }
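+  // isUInt<12>(Imm) above is equivalent to Imm < 4096, i.e. the unsigned
+  // 12-bit immediate offset range (0..4095) accepted by MUBUF instructions.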
};
namespace AMDGPU {
+
LLVM_READONLY
int getVOPe64(uint16_t Opcode);
@@ -855,7 +926,8 @@ namespace AMDGPU {
TF_LONG_BRANCH_FORWARD = 1 << 0,
TF_LONG_BRANCH_BACKWARD = 1 << 1
};
-} // End namespace AMDGPU
+
+} // end namespace AMDGPU
namespace SI {
namespace KernelInputOffsets {
@@ -873,9 +945,9 @@ enum Offsets {
LOCAL_SIZE_Z = 32
};
-} // End namespace KernelInputOffsets
-} // End namespace SI
+} // end namespace KernelInputOffsets
+} // end namespace SI
-} // End namespace llvm
+} // end namespace llvm
-#endif
+#endif // LLVM_LIB_TARGET_AMDGPU_SIINSTRINFO_H
diff --git a/lib/Target/AMDGPU/SIInstrInfo.td b/lib/Target/AMDGPU/SIInstrInfo.td
index 088173680fa8..fc2d35d873aa 100644
--- a/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/lib/Target/AMDGPU/SIInstrInfo.td
@@ -11,6 +11,9 @@ def isCI : Predicate<"Subtarget->getGeneration() "
def isCIOnly : Predicate<"Subtarget->getGeneration() =="
"SISubtarget::SEA_ISLANDS">,
AssemblerPredicate <"FeatureSeaIslands">;
+def isVIOnly : Predicate<"Subtarget->getGeneration() =="
+ "SISubtarget::VOLCANIC_ISLANDS">,
+ AssemblerPredicate <"FeatureVolcanicIslands">;
def DisableInst : Predicate <"false">, AssemblerPredicate<"FeatureDisable">;
@@ -22,6 +25,7 @@ def SIEncodingFamily {
int VI = 1;
int SDWA = 2;
int SDWA9 = 3;
+ int GFX9 = 4;
}
//===----------------------------------------------------------------------===//
@@ -89,6 +93,53 @@ def SIbuffer_load : SDNode <"AMDGPUISD::BUFFER_LOAD", SDTBufferLoad,
def SIbuffer_load_format : SDNode <"AMDGPUISD::BUFFER_LOAD_FORMAT", SDTBufferLoad,
[SDNPMemOperand, SDNPHasChain, SDNPMayLoad]>;
+def SDTBufferStore : SDTypeProfile<0, 6,
+ [ // vdata
+ SDTCisVT<1, v4i32>, // rsrc
+ SDTCisVT<2, i32>, // vindex
+ SDTCisVT<3, i32>, // offset
+ SDTCisVT<4, i1>, // glc
+ SDTCisVT<5, i1>]>; // slc
+
+def SIbuffer_store : SDNode <"AMDGPUISD::BUFFER_STORE", SDTBufferStore,
+ [SDNPMemOperand, SDNPHasChain, SDNPMayStore]>;
+def SIbuffer_store_format : SDNode <"AMDGPUISD::BUFFER_STORE_FORMAT", SDTBufferStore,
+ [SDNPMemOperand, SDNPHasChain, SDNPMayStore]>;
+
+class SDBufferAtomic<string opcode> : SDNode <opcode,
+ SDTypeProfile<1, 5,
+ [SDTCisVT<0, i32>, // dst
+ SDTCisVT<1, i32>, // vdata
+ SDTCisVT<2, v4i32>, // rsrc
+ SDTCisVT<3, i32>, // vindex
+ SDTCisVT<4, i32>, // offset
+ SDTCisVT<5, i1>]>, // slc
+ [SDNPMemOperand, SDNPHasChain, SDNPMayLoad, SDNPMayStore]
+>;
+
+def SIbuffer_atomic_swap : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_SWAP">;
+def SIbuffer_atomic_add : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_ADD">;
+def SIbuffer_atomic_sub : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_SUB">;
+def SIbuffer_atomic_smin : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_SMIN">;
+def SIbuffer_atomic_umin : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_UMIN">;
+def SIbuffer_atomic_smax : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_SMAX">;
+def SIbuffer_atomic_umax : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_UMAX">;
+def SIbuffer_atomic_and : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_AND">;
+def SIbuffer_atomic_or : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_OR">;
+def SIbuffer_atomic_xor : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_XOR">;
+
+def SIbuffer_atomic_cmpswap : SDNode <"AMDGPUISD::BUFFER_ATOMIC_CMPSWAP",
+ SDTypeProfile<1, 6,
+ [SDTCisVT<0, i32>, // dst
+ SDTCisVT<1, i32>, // src
+ SDTCisVT<2, i32>, // cmp
+ SDTCisVT<3, v4i32>, // rsrc
+ SDTCisVT<4, i32>, // vindex
+ SDTCisVT<5, i32>, // offset
+ SDTCisVT<6, i1>]>, // slc
+ [SDNPMemOperand, SDNPHasChain, SDNPMayLoad, SDNPMayStore]
+>;
+
class SDSample<string opcode> : SDNode <opcode,
SDTypeProfile<1, 4, [SDTCisVT<0, v4f32>, SDTCisVT<2, v8i32>,
SDTCisVT<3, v4i32>, SDTCisVT<4, i32>]>
@@ -110,81 +161,113 @@ def SIpc_add_rel_offset : SDNode<"AMDGPUISD::PC_ADD_REL_OFFSET",
defm atomic_inc_global : global_binary_atomic_op<SIatomic_inc>;
defm atomic_dec_global : global_binary_atomic_op<SIatomic_dec>;
+def atomic_inc_local : local_binary_atomic_op<SIatomic_inc>;
+def atomic_dec_local : local_binary_atomic_op<SIatomic_dec>;
+
//===----------------------------------------------------------------------===//
-// SDNodes and PatFrag for local loads and stores to enable s_mov_b32 m0, -1
-// to be glued to the memory instructions.
+// SDNodes and PatFrags for loads/stores with a glue input.
+// These are used for local loads and stores so that s_mov_b32 m0, -1
+// can be glued to the memory instructions.
+//
+// These mirror the regular load/store PatFrags and rely on special
+// processing during Select() to add the glued copy.
+//
//===----------------------------------------------------------------------===//
-def SIld_local : SDNode <"ISD::LOAD", SDTLoad,
+def AMDGPUld_glue : SDNode <"ISD::LOAD", SDTLoad,
[SDNPHasChain, SDNPMayLoad, SDNPMemOperand, SDNPInGlue]
>;
-def si_ld_local : PatFrag <(ops node:$ptr), (SIld_local node:$ptr), [{
- return cast<LoadSDNode>(N)->getAddressSpace() == AMDGPUASI.LOCAL_ADDRESS;
+def unindexedload_glue : PatFrag <(ops node:$ptr), (AMDGPUld_glue node:$ptr), [{
+ return cast<LoadSDNode>(N)->getAddressingMode() == ISD::UNINDEXED;
}]>;
-def si_load_local : PatFrag <(ops node:$ptr), (si_ld_local node:$ptr), [{
- return cast<LoadSDNode>(N)->getAddressingMode() == ISD::UNINDEXED &&
- cast<LoadSDNode>(N)->getExtensionType() == ISD::NON_EXTLOAD;
+def load_glue : PatFrag <(ops node:$ptr), (unindexedload_glue node:$ptr), [{
+ return cast<LoadSDNode>(N)->getExtensionType() == ISD::NON_EXTLOAD;
}]>;
-def si_load_local_align8 : Aligned8Bytes <
- (ops node:$ptr), (si_load_local node:$ptr)
->;
+def extload_glue : PatFrag<(ops node:$ptr), (unindexedload_glue node:$ptr), [{
+ return cast<LoadSDNode>(N)->getExtensionType() == ISD::EXTLOAD;
+}]>;
-def si_sextload_local : PatFrag <(ops node:$ptr), (si_ld_local node:$ptr), [{
+def sextload_glue : PatFrag<(ops node:$ptr), (unindexedload_glue node:$ptr), [{
return cast<LoadSDNode>(N)->getExtensionType() == ISD::SEXTLOAD;
}]>;
-def si_az_extload_local : AZExtLoadBase <si_ld_local>;
-multiclass SIExtLoadLocal <PatFrag ld_node> {
+def zextload_glue : PatFrag<(ops node:$ptr), (unindexedload_glue node:$ptr), [{
+ return cast<LoadSDNode>(N)->getExtensionType() == ISD::ZEXTLOAD;
+}]>;
- def _i8 : PatFrag <(ops node:$ptr), (ld_node node:$ptr),
- [{return cast<LoadSDNode>(N)->getMemoryVT() == MVT::i8;}]
- >;
+def az_extload_glue : AZExtLoadBase <unindexedload_glue>;
- def _i16 : PatFrag <(ops node:$ptr), (ld_node node:$ptr),
- [{return cast<LoadSDNode>(N)->getMemoryVT() == MVT::i16;}]
- >;
-}
+def az_extloadi8_glue : PatFrag<(ops node:$ptr), (az_extload_glue node:$ptr), [{
+ return cast<LoadSDNode>(N)->getMemoryVT() == MVT::i8;
+}]>;
-defm si_sextload_local : SIExtLoadLocal <si_sextload_local>;
-defm si_az_extload_local : SIExtLoadLocal <si_az_extload_local>;
+def az_extloadi16_glue : PatFrag<(ops node:$ptr), (az_extload_glue node:$ptr), [{
+ return cast<LoadSDNode>(N)->getMemoryVT() == MVT::i16;
+}]>;
+
+def sextloadi8_glue : PatFrag<(ops node:$ptr), (sextload_glue node:$ptr), [{
+ return cast<LoadSDNode>(N)->getMemoryVT() == MVT::i8;
+}]>;
-def SIst_local : SDNode <"ISD::STORE", SDTStore,
+def sextloadi16_glue : PatFrag<(ops node:$ptr), (sextload_glue node:$ptr), [{
+ return cast<LoadSDNode>(N)->getMemoryVT() == MVT::i16;
+}]>;
+
+def load_glue_align8 : Aligned8Bytes <
+ (ops node:$ptr), (load_glue node:$ptr)
+>;
+
+
+def load_local_m0 : LoadFrag<load_glue>, LocalAddress;
+def sextloadi8_local_m0 : LoadFrag<sextloadi8_glue>, LocalAddress;
+def sextloadi16_local_m0 : LoadFrag<sextloadi16_glue>, LocalAddress;
+def az_extloadi8_local_m0 : LoadFrag<az_extloadi8_glue>, LocalAddress;
+def az_extloadi16_local_m0 : LoadFrag<az_extloadi16_glue>, LocalAddress;
+def load_align8_local_m0 : LoadFrag <load_glue_align8>, LocalAddress;
+
+
+def AMDGPUst_glue : SDNode <"ISD::STORE", SDTStore,
[SDNPHasChain, SDNPMayStore, SDNPMemOperand, SDNPInGlue]
>;
-def si_st_local : PatFrag <
- (ops node:$val, node:$ptr), (SIst_local node:$val, node:$ptr), [{
- return cast<StoreSDNode>(N)->getAddressSpace() == AMDGPUASI.LOCAL_ADDRESS;
+def unindexedstore_glue : PatFrag<(ops node:$val, node:$ptr),
+ (AMDGPUst_glue node:$val, node:$ptr), [{
+ return cast<StoreSDNode>(N)->getAddressingMode() == ISD::UNINDEXED;
}]>;
-def si_store_local : PatFrag <
- (ops node:$val, node:$ptr), (si_st_local node:$val, node:$ptr), [{
- return cast<StoreSDNode>(N)->getAddressingMode() == ISD::UNINDEXED &&
- !cast<StoreSDNode>(N)->isTruncatingStore();
+def store_glue : PatFrag<(ops node:$val, node:$ptr),
+ (unindexedstore_glue node:$val, node:$ptr), [{
+ return !cast<StoreSDNode>(N)->isTruncatingStore();
}]>;
-def si_store_local_align8 : Aligned8Bytes <
- (ops node:$val, node:$ptr), (si_store_local node:$val, node:$ptr)
->;
-
-def si_truncstore_local : PatFrag <
- (ops node:$val, node:$ptr), (si_st_local node:$val, node:$ptr), [{
+def truncstore_glue : PatFrag<(ops node:$val, node:$ptr),
+ (unindexedstore_glue node:$val, node:$ptr), [{
return cast<StoreSDNode>(N)->isTruncatingStore();
}]>;
-def si_truncstore_local_i8 : PatFrag <
- (ops node:$val, node:$ptr), (si_truncstore_local node:$val, node:$ptr), [{
+def truncstorei8_glue : PatFrag<(ops node:$val, node:$ptr),
+ (truncstore_glue node:$val, node:$ptr), [{
return cast<StoreSDNode>(N)->getMemoryVT() == MVT::i8;
}]>;
-def si_truncstore_local_i16 : PatFrag <
- (ops node:$val, node:$ptr), (si_truncstore_local node:$val, node:$ptr), [{
+def truncstorei16_glue : PatFrag<(ops node:$val, node:$ptr),
+ (truncstore_glue node:$val, node:$ptr), [{
return cast<StoreSDNode>(N)->getMemoryVT() == MVT::i16;
}]>;
+def store_glue_align8 : Aligned8Bytes <
+ (ops node:$value, node:$ptr), (store_glue node:$value, node:$ptr)
+>;
+
+def store_local_m0 : StoreFrag<store_glue>, LocalAddress;
+def truncstorei8_local_m0 : StoreFrag<truncstorei8_glue>, LocalAddress;
+def truncstorei16_local_m0 : StoreFrag<truncstorei16_glue>, LocalAddress;
+
+def store_align8_local_m0 : StoreFrag<store_glue_align8>, LocalAddress;
+
def si_setcc_uniform : PatFrag <
(ops node:$lhs, node:$rhs, node:$cond),
(setcc node:$lhs, node:$rhs, node:$cond), [{
@@ -199,16 +282,6 @@ def si_setcc_uniform : PatFrag <
return true;
}]>;
-def si_uniform_br : PatFrag <
- (ops node:$cond, node:$bb), (brcond node:$cond, node:$bb), [{
- return isUniformBr(N);
-}]>;
-
-def si_uniform_br_scc : PatFrag <
- (ops node:$cond, node:$bb), (si_uniform_br node:$cond, node:$bb), [{
- return isCBranchSCC(N);
-}]>;
-
def lshr_rev : PatFrag <
(ops node:$src1, node:$src0),
(srl $src0, $src1)
@@ -231,27 +304,28 @@ multiclass SIAtomicM0Glue2 <string op_name, bit is_amdgpu = 0> {
[SDNPHasChain, SDNPMayStore, SDNPMayLoad, SDNPMemOperand, SDNPInGlue]
>;
- def _local : local_binary_atomic_op <!cast<SDNode>(NAME#"_glue")>;
+ def _local_m0 : local_binary_atomic_op <!cast<SDNode>(NAME#"_glue")>;
}
-defm si_atomic_load_add : SIAtomicM0Glue2 <"LOAD_ADD">;
-defm si_atomic_load_sub : SIAtomicM0Glue2 <"LOAD_SUB">;
-defm si_atomic_inc : SIAtomicM0Glue2 <"INC", 1>;
-defm si_atomic_dec : SIAtomicM0Glue2 <"DEC", 1>;
-defm si_atomic_load_and : SIAtomicM0Glue2 <"LOAD_AND">;
-defm si_atomic_load_min : SIAtomicM0Glue2 <"LOAD_MIN">;
-defm si_atomic_load_max : SIAtomicM0Glue2 <"LOAD_MAX">;
-defm si_atomic_load_or : SIAtomicM0Glue2 <"LOAD_OR">;
-defm si_atomic_load_xor : SIAtomicM0Glue2 <"LOAD_XOR">;
-defm si_atomic_load_umin : SIAtomicM0Glue2 <"LOAD_UMIN">;
-defm si_atomic_load_umax : SIAtomicM0Glue2 <"LOAD_UMAX">;
-defm si_atomic_swap : SIAtomicM0Glue2 <"SWAP">;
+defm atomic_load_add : SIAtomicM0Glue2 <"LOAD_ADD">;
+defm atomic_load_sub : SIAtomicM0Glue2 <"LOAD_SUB">;
+defm atomic_inc : SIAtomicM0Glue2 <"INC", 1>;
+defm atomic_dec : SIAtomicM0Glue2 <"DEC", 1>;
+defm atomic_load_and : SIAtomicM0Glue2 <"LOAD_AND">;
+defm atomic_load_min : SIAtomicM0Glue2 <"LOAD_MIN">;
+defm atomic_load_max : SIAtomicM0Glue2 <"LOAD_MAX">;
+defm atomic_load_or : SIAtomicM0Glue2 <"LOAD_OR">;
+defm atomic_load_xor : SIAtomicM0Glue2 <"LOAD_XOR">;
+defm atomic_load_umin : SIAtomicM0Glue2 <"LOAD_UMIN">;
+defm atomic_load_umax : SIAtomicM0Glue2 <"LOAD_UMAX">;
+defm atomic_swap : SIAtomicM0Glue2 <"SWAP">;
-def si_atomic_cmp_swap_glue : SDNode <"ISD::ATOMIC_CMP_SWAP", SDTAtomic3,
+def atomic_cmp_swap_glue : SDNode <"ISD::ATOMIC_CMP_SWAP", SDTAtomic3,
[SDNPHasChain, SDNPMayStore, SDNPMayLoad, SDNPMemOperand, SDNPInGlue]
>;
-defm si_atomic_cmp_swap : AtomicCmpSwapLocal <si_atomic_cmp_swap_glue>;
+def atomic_cmp_swap_local_m0 : AtomicCmpSwapLocal<atomic_cmp_swap_glue>;
+
def as_i1imm : SDNodeXForm<imm, [{
return CurDAG->getTargetConstant(N->getZExtValue(), SDLoc(N), MVT::i1);
@@ -273,6 +347,10 @@ def as_i64imm: SDNodeXForm<imm, [{
return CurDAG->getTargetConstant(N->getSExtValue(), SDLoc(N), MVT::i64);
}]>;
+def cond_as_i32imm: SDNodeXForm<cond, [{
+ return CurDAG->getTargetConstant(N->get(), SDLoc(N), MVT::i32);
+}]>;
+
// Copied from the AArch64 backend:
def bitcast_fpimm_to_i32 : SDNodeXForm<fpimm, [{
return CurDAG->getTargetConstant(
@@ -556,6 +634,7 @@ def gds : NamedOperandBit<"GDS", NamedMatchClass<"GDS">>;
def omod : NamedOperandU32<"OModSI", NamedMatchClass<"OModSI">>;
def clampmod : NamedOperandBit<"ClampSI", NamedMatchClass<"ClampSI">>;
+def highmod : NamedOperandBit<"High", NamedMatchClass<"High">>;
def GLC : NamedOperandBit<"GLC", NamedMatchClass<"GLC">>;
def slc : NamedOperandBit<"SLC", NamedMatchClass<"SLC">>;
@@ -659,6 +738,15 @@ class IntInputMods <IntInputModsMatchClass matchClass> : InputMods <matchClass>
def Int32InputMods : IntInputMods<Int32InputModsMatchClass>;
def Int64InputMods : IntInputMods<Int64InputModsMatchClass>;
+class OpSelModsMatchClass : AsmOperandClass {
+ let Name = "OpSelMods";
+ let ParserMethod = "parseRegOrImm";
+ let PredicateMethod = "isRegOrImm";
+}
+
+def IntOpSelModsMatchClass : OpSelModsMatchClass;
+def IntOpSelMods : InputMods<IntOpSelModsMatchClass>;
+
def FPRegSDWAInputModsMatchClass : AsmOperandClass {
let Name = "SDWARegWithFPInputMods";
let ParserMethod = "parseRegWithFPInputMods";
@@ -750,6 +838,16 @@ def VOP3OMods : ComplexPattern<untyped, 3, "SelectVOP3OMods">;
def VOP3PMods : ComplexPattern<untyped, 2, "SelectVOP3PMods">;
def VOP3PMods0 : ComplexPattern<untyped, 3, "SelectVOP3PMods0">;
+def VOP3OpSel : ComplexPattern<untyped, 2, "SelectVOP3OpSel">;
+def VOP3OpSel0 : ComplexPattern<untyped, 3, "SelectVOP3OpSel0">;
+
+def VOP3OpSelMods : ComplexPattern<untyped, 2, "SelectVOP3OpSelMods">;
+def VOP3OpSelMods0 : ComplexPattern<untyped, 3, "SelectVOP3OpSelMods0">;
+
+def VOP3PMadMixMods : ComplexPattern<untyped, 2, "SelectVOP3PMadMixMods">;
+
+
+def Hi16Elt : ComplexPattern<untyped, 1, "SelectHi16Elt">;
//===----------------------------------------------------------------------===//
// SI assembler operands
@@ -771,6 +869,7 @@ def SRCMODS {
int NEG_HI = ABS;
int OP_SEL_0 = 4;
int OP_SEL_1 = 8;
+ int DST_OP_SEL = 8;
}
def DSTCLAMP {
@@ -827,7 +926,7 @@ class EXP_Helper<bit done, SDPatternOperator node = null_frag> : EXPCommon<
// Split EXP instruction into EXP and EXP_DONE so we can set
// mayLoad for done=1.
multiclass EXP_m<bit done, SDPatternOperator node> {
- let mayLoad = done in {
+ let mayLoad = done, DisableWQM = 1 in {
let isPseudo = 1, isCodeGenOnly = 1 in {
def "" : EXP_Helper<done, node>,
SIMCInstr <"exp"#!if(done, "_done", ""), SIEncodingFamily.NONE>;
@@ -943,7 +1042,7 @@ class getVOP3SrcForVT<ValueType VT> {
VCSrc_f64,
VCSrc_b64),
!if(!eq(VT.Value, i1.Value),
- SCSrc_b64,
+ SCSrc_i1,
!if(isFP,
!if(!eq(VT.Value, f16.Value),
VCSrc_f16,
@@ -1020,6 +1119,10 @@ class getSrcMod <ValueType VT> {
);
}
+class getOpSelMod <ValueType VT> {
+ Operand ret = !if(!eq(VT.Value, f16.Value), FP16InputMods, IntOpSelMods);
+}
+
// Return type of input modifiers operand for specified input operand for DPP
class getSrcModExt <ValueType VT> {
bit isFP = !if(!eq(VT.Value, f16.Value), 1,
@@ -1048,7 +1151,7 @@ class getIns32 <RegisterOperand Src0RC, RegisterClass Src1RC, int NumSrcArgs> {
// Returns the input arguments for VOP3 instructions for the given SrcVT.
class getIns64 <RegisterOperand Src0RC, RegisterOperand Src1RC,
RegisterOperand Src2RC, int NumSrcArgs,
- bit HasModifiers, bit HasOMod,
+ bit HasIntClamp, bit HasModifiers, bit HasOMod,
Operand Src0Mod, Operand Src1Mod, Operand Src2Mod> {
dag ret =
@@ -1063,7 +1166,9 @@ class getIns64 <RegisterOperand Src0RC, RegisterOperand Src1RC,
clampmod:$clamp, omod:$omod)
/* else */,
// VOP1 without modifiers
- (ins Src0RC:$src0)
+ !if (!eq(HasIntClamp, 1),
+ (ins Src0RC:$src0, clampmod:$clamp),
+ (ins Src0RC:$src0))
/* endif */ ),
!if (!eq(NumSrcArgs, 2),
!if (!eq(HasModifiers, 1),
@@ -1077,7 +1182,10 @@ class getIns64 <RegisterOperand Src0RC, RegisterOperand Src1RC,
clampmod:$clamp))
/* else */,
// VOP2 without modifiers
- (ins Src0RC:$src0, Src1RC:$src1)
+ !if (!eq(HasIntClamp, 1),
+ (ins Src0RC:$src0, Src1RC:$src1, clampmod:$clamp),
+ (ins Src0RC:$src0, Src1RC:$src1))
+
/* endif */ )
/* NumSrcArgs == 3 */,
!if (!eq(HasModifiers, 1),
@@ -1093,7 +1201,9 @@ class getIns64 <RegisterOperand Src0RC, RegisterOperand Src1RC,
clampmod:$clamp))
/* else */,
// VOP3 without modifiers
- (ins Src0RC:$src0, Src1RC:$src1, Src2RC:$src2)
+ !if (!eq(HasIntClamp, 1),
+ (ins Src0RC:$src0, Src1RC:$src1, Src2RC:$src2, clampmod:$clamp),
+ (ins Src0RC:$src0, Src1RC:$src1, Src2RC:$src2))
/* endif */ ))));
}
@@ -1133,8 +1243,40 @@ class getInsVOP3P <RegisterOperand Src0RC, RegisterOperand Src1RC,
);
}
-class getInsDPP <RegisterClass Src0RC, RegisterClass Src1RC, int NumSrcArgs,
- bit HasModifiers, Operand Src0Mod, Operand Src1Mod> {
+class getInsVOP3OpSel <RegisterOperand Src0RC,
+ RegisterOperand Src1RC,
+ RegisterOperand Src2RC,
+ int NumSrcArgs,
+ bit HasClamp,
+ Operand Src0Mod,
+ Operand Src1Mod,
+ Operand Src2Mod> {
+ dag ret = !if (!eq(NumSrcArgs, 2),
+ !if (HasClamp,
+ (ins Src0Mod:$src0_modifiers, Src0RC:$src0,
+ Src1Mod:$src1_modifiers, Src1RC:$src1,
+ clampmod:$clamp,
+ op_sel:$op_sel),
+ (ins Src0Mod:$src0_modifiers, Src0RC:$src0,
+ Src1Mod:$src1_modifiers, Src1RC:$src1,
+ op_sel:$op_sel)),
+ // else NumSrcArgs == 3
+ !if (HasClamp,
+ (ins Src0Mod:$src0_modifiers, Src0RC:$src0,
+ Src1Mod:$src1_modifiers, Src1RC:$src1,
+ Src2Mod:$src2_modifiers, Src2RC:$src2,
+ clampmod:$clamp,
+ op_sel:$op_sel),
+ (ins Src0Mod:$src0_modifiers, Src0RC:$src0,
+ Src1Mod:$src1_modifiers, Src1RC:$src1,
+ Src2Mod:$src2_modifiers, Src2RC:$src2,
+ op_sel:$op_sel))
+ );
+}
+
+class getInsDPP <RegisterOperand DstRC, RegisterClass Src0RC, RegisterClass Src1RC,
+ int NumSrcArgs, bit HasModifiers,
+ Operand Src0Mod, Operand Src1Mod> {
dag ret = !if (!eq(NumSrcArgs, 0),
// VOP1 without input operands (V_NOP)
@@ -1143,26 +1285,29 @@ class getInsDPP <RegisterClass Src0RC, RegisterClass Src1RC, int NumSrcArgs,
!if (!eq(NumSrcArgs, 1),
!if (!eq(HasModifiers, 1),
// VOP1_DPP with modifiers
- (ins Src0Mod:$src0_modifiers, Src0RC:$src0,
- dpp_ctrl:$dpp_ctrl, row_mask:$row_mask,
+ (ins DstRC:$old, Src0Mod:$src0_modifiers,
+ Src0RC:$src0, dpp_ctrl:$dpp_ctrl, row_mask:$row_mask,
bank_mask:$bank_mask, bound_ctrl:$bound_ctrl)
/* else */,
// VOP1_DPP without modifiers
- (ins Src0RC:$src0, dpp_ctrl:$dpp_ctrl, row_mask:$row_mask,
- bank_mask:$bank_mask, bound_ctrl:$bound_ctrl)
+ (ins DstRC:$old, Src0RC:$src0,
+ dpp_ctrl:$dpp_ctrl, row_mask:$row_mask,
+ bank_mask:$bank_mask, bound_ctrl:$bound_ctrl)
/* endif */)
/* NumSrcArgs == 2 */,
!if (!eq(HasModifiers, 1),
// VOP2_DPP with modifiers
- (ins Src0Mod:$src0_modifiers, Src0RC:$src0,
+ (ins DstRC:$old,
+ Src0Mod:$src0_modifiers, Src0RC:$src0,
Src1Mod:$src1_modifiers, Src1RC:$src1,
dpp_ctrl:$dpp_ctrl, row_mask:$row_mask,
bank_mask:$bank_mask, bound_ctrl:$bound_ctrl)
/* else */,
// VOP2_DPP without modifiers
- (ins Src0RC:$src0, Src1RC:$src1, dpp_ctrl:$dpp_ctrl,
- row_mask:$row_mask, bank_mask:$bank_mask,
- bound_ctrl:$bound_ctrl)
+ (ins DstRC:$old,
+ Src0RC:$src0, Src1RC:$src1, dpp_ctrl:$dpp_ctrl,
+ row_mask:$row_mask, bank_mask:$bank_mask,
+ bound_ctrl:$bound_ctrl)
/* endif */)));
}
@@ -1246,7 +1391,7 @@ class getAsm32 <bit HasDst, int NumSrcArgs, ValueType DstVT = i32> {
// Returns the assembly string for the inputs and outputs of a VOP3
// instruction.
-class getAsm64 <bit HasDst, int NumSrcArgs, bit HasModifiers,
+class getAsm64 <bit HasDst, int NumSrcArgs, bit HasIntClamp, bit HasModifiers,
bit HasOMod, ValueType DstVT = i32> {
string dst = !if(!eq(DstVT.Size, 1), "$sdst", "$vdst"); // use $sdst for VOPC
string src0 = !if(!eq(NumSrcArgs, 1), "$src0_modifiers", "$src0_modifiers,");
@@ -1254,9 +1399,10 @@ class getAsm64 <bit HasDst, int NumSrcArgs, bit HasModifiers,
!if(!eq(NumSrcArgs, 2), " $src1_modifiers",
" $src1_modifiers,"));
string src2 = !if(!eq(NumSrcArgs, 3), " $src2_modifiers", "");
+ string iclamp = !if(HasIntClamp, "$clamp", "");
string ret =
!if(!eq(HasModifiers, 0),
- getAsm32<HasDst, NumSrcArgs, DstVT>.ret,
+ getAsm32<HasDst, NumSrcArgs, DstVT>.ret # iclamp,
dst#", "#src0#src1#src2#"$clamp"#!if(HasOMod, "$omod", ""));
}
@@ -1279,6 +1425,34 @@ class getAsmVOP3P <bit HasDst, int NumSrcArgs, bit HasModifiers,
string ret = dst#", "#src0#src1#src2#"$op_sel$op_sel_hi"#mods#clamp;
}
+class getAsmVOP3OpSel <int NumSrcArgs,
+ bit HasClamp,
+ bit Src0HasMods,
+ bit Src1HasMods,
+ bit Src2HasMods> {
+ string dst = " $vdst";
+
+ string isrc0 = !if(!eq(NumSrcArgs, 1), "$src0", "$src0,");
+ string isrc1 = !if(!eq(NumSrcArgs, 1), "",
+ !if(!eq(NumSrcArgs, 2), " $src1",
+ " $src1,"));
+ string isrc2 = !if(!eq(NumSrcArgs, 3), " $src2", "");
+
+ string fsrc0 = !if(!eq(NumSrcArgs, 1), "$src0_modifiers", "$src0_modifiers,");
+ string fsrc1 = !if(!eq(NumSrcArgs, 1), "",
+ !if(!eq(NumSrcArgs, 2), " $src1_modifiers",
+ " $src1_modifiers,"));
+ string fsrc2 = !if(!eq(NumSrcArgs, 3), " $src2_modifiers", "");
+
+ string src0 = !if(Src0HasMods, fsrc0, isrc0);
+ string src1 = !if(Src1HasMods, fsrc1, isrc1);
+ string src2 = !if(Src2HasMods, fsrc2, isrc2);
+
+ string clamp = !if(HasClamp, "$clamp", "");
+
+ string ret = dst#", "#src0#src1#src2#"$op_sel"#clamp;
+}
+
class getAsmDPP <bit HasDst, int NumSrcArgs, bit HasModifiers, ValueType DstVT = i32> {
string dst = !if(HasDst,
!if(!eq(DstVT.Size, 1),
@@ -1433,6 +1607,10 @@ class VOPProfile <list<ValueType> _ArgVT> {
field bit HasClamp = HasModifiers;
field bit HasSDWAClamp = EmitDst;
field bit HasFPClamp = BitAnd<isFloatType<DstVT>.ret, HasClamp>.ret;
+ field bit HasIntClamp = !if(isFloatType<DstVT>.ret, 0, HasClamp);
+ field bit HasClampLo = HasClamp;
+ field bit HasClampHi = BitAnd<isPackedType<DstVT>.ret, HasClamp>.ret;
+ field bit HasHigh = 0;
field bit IsPacked = isPackedType<Src0VT>.ret;
field bit HasOpSel = IsPacked;
@@ -1457,13 +1635,18 @@ class VOPProfile <list<ValueType> _ArgVT> {
field dag Ins32 = getIns32<Src0RC32, Src1RC32, NumSrcArgs>.ret;
field dag Ins64 = getIns64<Src0RC64, Src1RC64, Src2RC64, NumSrcArgs,
- HasModifiers, HasOMod, Src0Mod, Src1Mod,
+ HasIntClamp, HasModifiers, HasOMod, Src0Mod, Src1Mod,
Src2Mod>.ret;
field dag InsVOP3P = getInsVOP3P<Src0RC64, Src1RC64, Src2RC64,
NumSrcArgs, HasClamp,
Src0PackedMod, Src1PackedMod, Src2PackedMod>.ret;
-
- field dag InsDPP = getInsDPP<Src0DPP, Src1DPP, NumSrcArgs,
+ field dag InsVOP3OpSel = getInsVOP3OpSel<Src0RC64, Src1RC64, Src2RC64,
+ NumSrcArgs,
+ HasClamp,
+ getOpSelMod<Src0VT>.ret,
+ getOpSelMod<Src1VT>.ret,
+ getOpSelMod<Src2VT>.ret>.ret;
+ field dag InsDPP = getInsDPP<DstRCDPP, Src0DPP, Src1DPP, NumSrcArgs,
HasModifiers, Src0ModDPP, Src1ModDPP>.ret;
field dag InsSDWA = getInsSDWA<Src0SDWA, Src1SDWA, NumSrcArgs,
HasSDWAOMod, Src0ModSDWA, Src1ModSDWA,
@@ -1471,8 +1654,13 @@ class VOPProfile <list<ValueType> _ArgVT> {
field string Asm32 = getAsm32<HasDst, NumSrcArgs, DstVT>.ret;
- field string Asm64 = getAsm64<HasDst, NumSrcArgs, HasModifiers, HasOMod, DstVT>.ret;
+ field string Asm64 = getAsm64<HasDst, NumSrcArgs, HasIntClamp, HasModifiers, HasOMod, DstVT>.ret;
field string AsmVOP3P = getAsmVOP3P<HasDst, NumSrcArgs, HasModifiers, HasClamp, DstVT>.ret;
+ field string AsmVOP3OpSel = getAsmVOP3OpSel<NumSrcArgs,
+ HasClamp,
+ HasSrc0FloatMods,
+ HasSrc1FloatMods,
+ HasSrc2FloatMods>.ret;
field string AsmDPP = getAsmDPP<HasDst, NumSrcArgs, HasModifiers, DstVT>.ret;
field string AsmSDWA = getAsmSDWA<HasDst, NumSrcArgs, DstVT>.ret;
field string AsmSDWA9 = getAsmSDWA9<HasDst, HasSDWAOMod, NumSrcArgs, DstVT>.ret;
@@ -1495,6 +1683,8 @@ def VOP_I16_I16_I16 : VOPProfile <[i16, i16, i16, untyped]>;
def VOP_I16_I16_I16_I16 : VOPProfile <[i16, i16, i16, i16, untyped]>;
def VOP_F16_F16_F16_F16 : VOPProfile <[f16, f16, f16, f16, untyped]>;
+def VOP_I32_I16_I16_I32 : VOPProfile <[i32, i16, i16, i32, untyped]>;
+
def VOP_V2F16_V2F16_V2F16 : VOPProfile <[v2f16, v2f16, v2f16, untyped]>;
def VOP_V2I16_V2I16_V2I16 : VOPProfile <[v2i16, v2i16, v2i16, untyped]>;
def VOP_B32_F16_F16 : VOPProfile <[i32, f16, f16, untyped]>;
@@ -1527,6 +1717,7 @@ def VOP_I32_F32_F32 : VOPProfile <[i32, f32, f32, untyped]>;
def VOP_I32_F32_I32 : VOPProfile <[i32, f32, i32, untyped]>;
def VOP_I32_I32_I32 : VOPProfile <[i32, i32, i32, untyped]>;
def VOP_V2F16_F32_F32 : VOPProfile <[v2f16, f32, f32, untyped]>;
+def VOP_F32_F16_F16_F16 : VOPProfile <[f32, f16, f16, f16]>;
def VOP_I64_I64_I32 : VOPProfile <[i64, i64, i32, untyped]>;
def VOP_I64_I32_I64 : VOPProfile <[i64, i32, i64, untyped]>;
@@ -1632,7 +1823,31 @@ def getBasicFromSDWAOp : InstrMapping {
let ValueCols = [["Default"]];
}
-def getMaskedMIMGOp : InstrMapping {
+def getMaskedMIMGOp1 : InstrMapping {
+ let FilterClass = "MIMG_Mask";
+ let RowFields = ["Op"];
+ let ColFields = ["Channels"];
+ let KeyCol = ["1"];
+ let ValueCols = [["2"], ["3"], ["4"] ];
+}
+
+def getMaskedMIMGOp2 : InstrMapping {
+ let FilterClass = "MIMG_Mask";
+ let RowFields = ["Op"];
+ let ColFields = ["Channels"];
+ let KeyCol = ["2"];
+ let ValueCols = [["1"], ["3"], ["4"] ];
+}
+
+def getMaskedMIMGOp3 : InstrMapping {
+ let FilterClass = "MIMG_Mask";
+ let RowFields = ["Op"];
+ let ColFields = ["Channels"];
+ let KeyCol = ["3"];
+ let ValueCols = [["1"], ["2"], ["4"] ];
+}
+
+def getMaskedMIMGOp4 : InstrMapping {
let FilterClass = "MIMG_Mask";
let RowFields = ["Op"];
let ColFields = ["Channels"];
@@ -1666,7 +1881,8 @@ def getMCOpcodeGen : InstrMapping {
let ValueCols = [[!cast<string>(SIEncodingFamily.SI)],
[!cast<string>(SIEncodingFamily.VI)],
[!cast<string>(SIEncodingFamily.SDWA)],
- [!cast<string>(SIEncodingFamily.SDWA9)]];
+ [!cast<string>(SIEncodingFamily.SDWA9)],
+ [!cast<string>(SIEncodingFamily.GFX9)]];
}
// Get equivalent SOPK instruction.
@@ -1705,7 +1921,6 @@ def getAtomicNoRetOp : InstrMapping {
}
include "SIInstructions.td"
-include "CIInstructions.td"
include "DSInstructions.td"
include "MIMGInstructions.td"
diff --git a/lib/Target/AMDGPU/SIInstructions.td b/lib/Target/AMDGPU/SIInstructions.td
index ba69e42d9125..9740a18b7248 100644
--- a/lib/Target/AMDGPU/SIInstructions.td
+++ b/lib/Target/AMDGPU/SIInstructions.td
@@ -11,13 +11,6 @@
// that are not yet supported remain commented out.
//===----------------------------------------------------------------------===//
-def isGCN : Predicate<"Subtarget->getGeneration() "
- ">= SISubtarget::SOUTHERN_ISLANDS">,
- AssemblerPredicate<"FeatureGCN">;
-def isSI : Predicate<"Subtarget->getGeneration() "
- "== SISubtarget::SOUTHERN_ISLANDS">,
- AssemblerPredicate<"FeatureSouthernIslands">;
-
def has16BankLDS : Predicate<"Subtarget->getLDSBankCount() == 16">;
def has32BankLDS : Predicate<"Subtarget->getLDSBankCount() == 32">;
def HasVGPRIndexMode : Predicate<"Subtarget->hasVGPRIndexMode()">,
@@ -25,14 +18,17 @@ def HasVGPRIndexMode : Predicate<"Subtarget->hasVGPRIndexMode()">,
def HasMovrel : Predicate<"Subtarget->hasMovrel()">,
AssemblerPredicate<"FeatureMovrel">;
+class GCNPat<dag pattern, dag result> : AMDGPUPat<pattern, result> {
+ let SubtargetPredicate = isGCN;
+}
+
+
include "VOPInstructions.td"
include "SOPInstructions.td"
include "SMInstructions.td"
include "FLATInstructions.td"
include "BUFInstructions.td"
-let SubtargetPredicate = isGCN in {
-
//===----------------------------------------------------------------------===//
// EXP Instructions
//===----------------------------------------------------------------------===//
@@ -99,6 +95,7 @@ def ATOMIC_FENCE : SPseudoInstSI<
[(atomic_fence (i32 imm:$ordering), (i32 imm:$scope))],
"ATOMIC_FENCE $ordering, $scope"> {
let hasSideEffects = 1;
+ let maybeAtomic = 1;
}
let hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC] in {
@@ -111,12 +108,67 @@ def V_CNDMASK_B64_PSEUDO : VOP3Common <(outs VReg_64:$vdst),
let usesCustomInserter = 1;
}
-// 64-bit vector move instruction. This is mainly used by the SIFoldOperands
-// pass to enable folding of inline immediates.
+// 64-bit vector move instruction. This is mainly used by the
+// SIFoldOperands pass to enable folding of inline immediates.
def V_MOV_B64_PSEUDO : VPseudoInstSI <(outs VReg_64:$vdst),
(ins VSrc_b64:$src0)>;
+
+// Pseudoinstruction for @llvm.amdgcn.wqm. It is turned into a copy after the
+// WQM pass processes it.
+def WQM : PseudoInstSI <(outs unknown:$vdst), (ins unknown:$src0)>;
+
+// Pseudoinstruction for @llvm.amdgcn.wwm. It is turned into a copy post-RA, so
+// that the @earlyclobber is respected. The @earlyclobber is to make sure that
+// the instruction that defines $src0 (which is run in WWM) doesn't
+// accidentally clobber inactive channels of $vdst.
+let Constraints = "@earlyclobber $vdst" in {
+def WWM : PseudoInstSI <(outs unknown:$vdst), (ins unknown:$src0)>;
+}
+
} // End let hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC]
+def EXIT_WWM : SPseudoInstSI <(outs SReg_64:$sdst), (ins SReg_64:$src0)> {
+ let hasSideEffects = 0;
+ let mayLoad = 0;
+ let mayStore = 0;
+}
+
+// Invert the exec mask and overwrite the inactive lanes of dst with the
+// $inactive operand, restoring exec after we're done.
+def V_SET_INACTIVE_B32 : VPseudoInstSI <(outs VGPR_32:$vdst),
+ (ins VGPR_32: $src, VSrc_b32:$inactive),
+ [(set i32:$vdst, (int_amdgcn_set_inactive i32:$src, i32:$inactive))]> {
+ let Constraints = "$src = $vdst";
+}
+
+def V_SET_INACTIVE_B64 : VPseudoInstSI <(outs VReg_64:$vdst),
+ (ins VReg_64: $src, VSrc_b64:$inactive),
+ [(set i64:$vdst, (int_amdgcn_set_inactive i64:$src, i64:$inactive))]> {
+ let Constraints = "$src = $vdst";
+}
+
+
+let usesCustomInserter = 1, Defs = [SCC] in {
+def S_ADD_U64_PSEUDO : SPseudoInstSI <
+ (outs SReg_64:$vdst), (ins SSrc_b64:$src0, SSrc_b64:$src1),
+ [(set SReg_64:$vdst, (add i64:$src0, i64:$src1))]
+>;
+
+def S_SUB_U64_PSEUDO : SPseudoInstSI <
+ (outs SReg_64:$vdst), (ins SSrc_b64:$src0, SSrc_b64:$src1),
+ [(set SReg_64:$vdst, (sub i64:$src0, i64:$src1))]
+>;
+
+def S_ADD_U64_CO_PSEUDO : SPseudoInstSI <
+ (outs SReg_64:$vdst, VOPDstS64:$sdst), (ins SSrc_b64:$src0, SSrc_b64:$src1)
+>;
+
+def S_SUB_U64_CO_PSEUDO : SPseudoInstSI <
+ (outs SReg_64:$vdst, VOPDstS64:$sdst), (ins SSrc_b64:$src0, SSrc_b64:$src1)
+>;
+
+} // End usesCustomInserter = 1, Defs = [SCC]
+
let usesCustomInserter = 1, SALU = 1 in {
def GET_GROUPSTATICSIZE : PseudoInstSI <(outs SReg_32:$sdst), (ins),
[(set SReg_32:$sdst, (int_amdgcn_groupstaticsize))]>;
@@ -174,12 +226,14 @@ def SI_MASK_BRANCH : VPseudoInstSI <
let isTerminator = 1 in {
+let OtherPredicates = [EnableLateCFGStructurize] in {
def SI_NON_UNIFORM_BRCOND_PSEUDO : CFPseudoInstSI <
(outs),
(ins SReg_64:$vcc, brtarget:$target),
[(brcond i1:$vcc, bb:$target)]> {
let Size = 12;
}
+}
def SI_IF: CFPseudoInstSI <
(outs SReg_64:$dst), (ins SReg_64:$vcc, brtarget:$target),
@@ -243,18 +297,21 @@ def SI_ELSE_BREAK : CFPseudoInstSI <
}
let Uses = [EXEC], Defs = [EXEC,VCC] in {
-def SI_KILL : PseudoInstSI <
- (outs), (ins VSrc_b32:$src),
- [(AMDGPUkill i32:$src)]> {
- let isConvergent = 1;
- let usesCustomInserter = 1;
-}
-def SI_KILL_TERMINATOR : SPseudoInstSI <
- (outs), (ins VSrc_b32:$src)> {
- let isTerminator = 1;
+multiclass PseudoInstKill <dag ins> {
+ def _PSEUDO : PseudoInstSI <(outs), ins> {
+ let isConvergent = 1;
+ let usesCustomInserter = 1;
+ }
+
+ def _TERMINATOR : SPseudoInstSI <(outs), ins> {
+ let isTerminator = 1;
+ }
}
+defm SI_KILL_I1 : PseudoInstKill <(ins SSrc_b64:$src, i1imm:$killvalue)>;
+defm SI_KILL_F32_COND_IMM : PseudoInstKill <(ins VSrc_b32:$src0, i32imm:$src1, i32imm:$cond)>;
+
def SI_ILLEGAL_COPY : SPseudoInstSI <
(outs unknown:$dst), (ins unknown:$src),
[], " ; illegal copy $src to $dst">;
@@ -316,6 +373,82 @@ def SI_RETURN_TO_EPILOG : SPseudoInstSI <
let DisableWQM = 1;
}
+// Return for returning function calls.
+def SI_RETURN : SPseudoInstSI <
+ (outs), (ins), [],
+ "; return"> {
+ let isTerminator = 1;
+ let isBarrier = 1;
+ let isReturn = 1;
+ let SchedRW = [WriteBranch];
+}
+
+// Call pseudo without an output register.
+//
+// This version is only needed so we can fill in the output register in
+// the custom inserter.
+def SI_CALL_ISEL : SPseudoInstSI <
+ (outs), (ins SSrc_b64:$src0), [(AMDGPUcall i64:$src0)]> {
+ let Size = 4;
+ let isCall = 1;
+ let SchedRW = [WriteBranch];
+ let usesCustomInserter = 1;
+}
+
+// Wrapper around s_swappc_b64 with extra $callee parameter to track
+// the called function after regalloc.
+def SI_CALL : SPseudoInstSI <
+ (outs SReg_64:$dst), (ins SSrc_b64:$src0, unknown:$callee)> {
+ let Size = 4;
+ let isCall = 1;
+ let UseNamedOperandTable = 1;
+ let SchedRW = [WriteBranch];
+}
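+// Rough sketch of the final code: the pseudo above is eventually emitted as
+// an s_swappc_b64 that jumps through $src0 and writes the return address
+// into $dst; $callee is bookkeeping only and never appears in the encoding.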
+
+// Tail call handling pseudo
+def SI_TCRETURN_ISEL : SPseudoInstSI<(outs),
+ (ins SSrc_b64:$src0, i32imm:$fpdiff),
+ [(AMDGPUtc_return i64:$src0, i32:$fpdiff)]> {
+ let isCall = 1;
+ let isTerminator = 1;
+ let isReturn = 1;
+ let isBarrier = 1;
+ let SchedRW = [WriteBranch];
+ let usesCustomInserter = 1;
+}
+
+def SI_TCRETURN : SPseudoInstSI <
+ (outs),
+ (ins SSrc_b64:$src0, unknown:$callee, i32imm:$fpdiff)> {
+ let Size = 4;
+ let isCall = 1;
+ let isTerminator = 1;
+ let isReturn = 1;
+ let isBarrier = 1;
+ let UseNamedOperandTable = 1;
+ let SchedRW = [WriteBranch];
+}
+
+
+def ADJCALLSTACKUP : SPseudoInstSI<
+ (outs), (ins i32imm:$amt0, i32imm:$amt1),
+ [(callseq_start timm:$amt0, timm:$amt1)],
+ "; adjcallstackup $amt0 $amt1"> {
+ let Size = 8; // Worst case. (s_add_u32 + constant)
+ let FixedSize = 1;
+ let hasSideEffects = 1;
+ let usesCustomInserter = 1;
+}
+
+def ADJCALLSTACKDOWN : SPseudoInstSI<
+ (outs), (ins i32imm:$amt1, i32imm:$amt2),
+ [(callseq_end timm:$amt1, timm:$amt2)],
+ "; adjcallstackdown $amt1"> {
+ let Size = 8; // Worst case. (s_add_u32 + constant)
+ let hasSideEffects = 1;
+ let usesCustomInserter = 1;
+}
+
let Defs = [M0, EXEC],
UseNamedOperandTable = 1 in {
@@ -416,39 +549,63 @@ def SI_PC_ADD_REL_OFFSET : SPseudoInstSI <
let Defs = [SCC];
}
-} // End SubtargetPredicate = isGCN
-
-let Predicates = [isGCN] in {
-def : Pat <
+def : GCNPat <
(AMDGPUinit_exec i64:$src),
(SI_INIT_EXEC (as_i64imm $src))
>;
-def : Pat <
+def : GCNPat <
(AMDGPUinit_exec_from_input i32:$input, i32:$shift),
(SI_INIT_EXEC_FROM_INPUT (i32 $input), (as_i32imm $shift))
>;
-def : Pat<
+def : GCNPat<
(AMDGPUtrap timm:$trapid),
(S_TRAP $trapid)
>;
-def : Pat<
+def : GCNPat<
(AMDGPUelse i64:$src, bb:$target),
(SI_ELSE $src, $target, 0)
>;
-def : Pat <
+def : GCNPat <
(int_AMDGPU_kilp),
- (SI_KILL (i32 0xbf800000))
+ (SI_KILL_I1_PSEUDO (i1 0), 0)
+>;
+
+def : Pat <
+ // -1.0 as i32 (LowerINTRINSIC_VOID converts all other constants to -1.0)
+ (AMDGPUkill (i32 -1082130432)),
+ (SI_KILL_I1_PSEUDO (i1 0), 0)
+>;
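+// -1082130432 above is simply the IEEE-754 bit pattern of -1.0f (0xbf800000),
+// the same constant the old SI_KILL pattern used.  A tiny C++ check, assuming
+// only the standard library:
+//   #include <cassert>
+//   #include <cstdint>
+//   #include <cstring>
+//   int main() {
+//     float f = -1.0f;
+//     int32_t bits;
+//     std::memcpy(&bits, &f, sizeof bits);  // bit-reinterpret -1.0f
+//     assert(bits == -1082130432);          // == 0xbf800000
+//   }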
+
+def : Pat <
+ (int_amdgcn_kill i1:$src),
+ (SI_KILL_I1_PSEUDO $src, 0)
+>;
+
+def : Pat <
+ (int_amdgcn_kill (i1 (not i1:$src))),
+ (SI_KILL_I1_PSEUDO $src, -1)
+>;
+
+def : Pat <
+ (AMDGPUkill i32:$src),
+ (SI_KILL_F32_COND_IMM_PSEUDO $src, 0, 3) // 3 means SETOGE
+>;
+
+def : Pat <
+ (int_amdgcn_kill (i1 (setcc f32:$src, InlineFPImm<f32>:$imm, cond:$cond))),
+ (SI_KILL_F32_COND_IMM_PSEUDO $src, (bitcast_fpimm_to_i32 $imm), (cond_as_i32imm $cond))
>;
+// TODO: we could add more variants for other types of conditionals
//===----------------------------------------------------------------------===//
// VOP1 Patterns
//===----------------------------------------------------------------------===//
-let Predicates = [UnsafeFPMath] in {
+let SubtargetPredicate = isGCN, OtherPredicates = [UnsafeFPMath] in {
//def : RcpPat<V_RCP_F64_e32, f64>;
//defm : RsqPat<V_RSQ_F64_e32, f64>;
@@ -458,70 +615,70 @@ def : RsqPat<V_RSQ_F32_e32, f32>;
def : RsqPat<V_RSQ_F64_e32, f64>;
// Convert (x - floor(x)) to fract(x)
-def : Pat <
+def : GCNPat <
(f32 (fsub (f32 (VOP3Mods f32:$x, i32:$mods)),
(f32 (ffloor (f32 (VOP3Mods f32:$x, i32:$mods)))))),
(V_FRACT_F32_e64 $mods, $x, DSTCLAMP.NONE, DSTOMOD.NONE)
>;
// Convert (x + (-floor(x))) to fract(x)
-def : Pat <
+def : GCNPat <
(f64 (fadd (f64 (VOP3Mods f64:$x, i32:$mods)),
(f64 (fneg (f64 (ffloor (f64 (VOP3Mods f64:$x, i32:$mods)))))))),
(V_FRACT_F64_e64 $mods, $x, DSTCLAMP.NONE, DSTOMOD.NONE)
>;
-} // End Predicates = [UnsafeFPMath]
+} // End SubtargetPredicate = isGCN, OtherPredicates = [UnsafeFPMath]
// f16_to_fp patterns
-def : Pat <
+def : GCNPat <
(f32 (f16_to_fp i32:$src0)),
(V_CVT_F32_F16_e64 SRCMODS.NONE, $src0, DSTCLAMP.NONE, DSTOMOD.NONE)
>;
-def : Pat <
+def : GCNPat <
(f32 (f16_to_fp (and_oneuse i32:$src0, 0x7fff))),
(V_CVT_F32_F16_e64 SRCMODS.ABS, $src0, DSTCLAMP.NONE, DSTOMOD.NONE)
>;
-def : Pat <
+def : GCNPat <
(f32 (f16_to_fp (or_oneuse i32:$src0, 0x8000))),
(V_CVT_F32_F16_e64 SRCMODS.NEG_ABS, $src0, DSTCLAMP.NONE, DSTOMOD.NONE)
>;
-def : Pat <
+def : GCNPat <
(f32 (f16_to_fp (xor_oneuse i32:$src0, 0x8000))),
(V_CVT_F32_F16_e64 SRCMODS.NEG, $src0, DSTCLAMP.NONE, DSTOMOD.NONE)
>;
-def : Pat <
+def : GCNPat <
(f64 (fpextend f16:$src)),
(V_CVT_F64_F32_e32 (V_CVT_F32_F16_e32 $src))
>;
// fp_to_fp16 patterns
-def : Pat <
+def : GCNPat <
(i32 (AMDGPUfp_to_f16 (f32 (VOP3Mods f32:$src0, i32:$src0_modifiers)))),
(V_CVT_F16_F32_e64 $src0_modifiers, f32:$src0, DSTCLAMP.NONE, DSTOMOD.NONE)
>;
-def : Pat <
+def : GCNPat <
(i32 (fp_to_sint f16:$src)),
(V_CVT_I32_F32_e32 (V_CVT_F32_F16_e32 $src))
>;
-def : Pat <
+def : GCNPat <
(i32 (fp_to_uint f16:$src)),
(V_CVT_U32_F32_e32 (V_CVT_F32_F16_e32 $src))
>;
-def : Pat <
+def : GCNPat <
(f16 (sint_to_fp i32:$src)),
(V_CVT_F16_F32_e32 (V_CVT_F32_I32_e32 $src))
>;
-def : Pat <
+def : GCNPat <
(f16 (uint_to_fp i32:$src)),
(V_CVT_F16_F32_e32 (V_CVT_F32_U32_e32 $src))
>;
@@ -531,7 +688,7 @@ def : Pat <
//===----------------------------------------------------------------------===//
multiclass FMADPat <ValueType vt, Instruction inst> {
- def : Pat <
+ def : GCNPat <
(vt (fmad (VOP3NoMods vt:$src0),
(VOP3NoMods vt:$src1),
(VOP3NoMods vt:$src2))),
@@ -543,7 +700,7 @@ multiclass FMADPat <ValueType vt, Instruction inst> {
defm : FMADPat <f16, V_MAC_F16_e64>;
defm : FMADPat <f32, V_MAC_F32_e64>;
-class FMADModsPat<Instruction inst, SDPatternOperator mad_opr> : Pat<
+class FMADModsPat<Instruction inst, SDPatternOperator mad_opr> : GCNPat<
(f32 (mad_opr (VOP3Mods f32:$src0, i32:$src0_mod),
(VOP3Mods f32:$src1, i32:$src1_mod),
(VOP3Mods f32:$src2, i32:$src2_mod))),
@@ -554,7 +711,7 @@ class FMADModsPat<Instruction inst, SDPatternOperator mad_opr> : Pat<
def : FMADModsPat<V_MAD_F32, AMDGPUfmad_ftz>;
multiclass SelectPat <ValueType vt, Instruction inst> {
- def : Pat <
+ def : GCNPat <
(vt (select i1:$src0, vt:$src1, vt:$src2)),
(inst $src2, $src1, $src0)
>;
@@ -565,7 +722,7 @@ defm : SelectPat <i32, V_CNDMASK_B32_e64>;
defm : SelectPat <f16, V_CNDMASK_B32_e64>;
defm : SelectPat <f32, V_CNDMASK_B32_e64>;
-def : Pat <
+def : GCNPat <
(i32 (add (i32 (ctpop i32:$popcnt)), i32:$val)),
(V_BCNT_U32_B32_e64 $popcnt, $val)
>;
@@ -638,6 +795,8 @@ foreach Index = 0-15 in {
>;
}
+let SubtargetPredicate = isGCN in {
+
// FIXME: Why do only some of these type combinations for SReg and
// VReg?
// 16-bit bitcast
@@ -698,6 +857,8 @@ def : BitConvert <v8f32, v8i32, VReg_256>;
def : BitConvert <v16i32, v16f32, VReg_512>;
def : BitConvert <v16f32, v16i32, VReg_512>;
+} // End SubtargetPredicate = isGCN
+
/********** =================== **********/
/********** Src & Dst modifiers **********/
/********** =================== **********/
@@ -705,7 +866,7 @@ def : BitConvert <v16f32, v16i32, VReg_512>;
// If denormals are not enabled, it only impacts the compare of the
// inputs. The output result is not flushed.
-class ClampPat<Instruction inst, ValueType vt> : Pat <
+class ClampPat<Instruction inst, ValueType vt> : GCNPat <
(vt (AMDGPUclamp (VOP3Mods vt:$src0, i32:$src0_modifiers))),
(inst i32:$src0_modifiers, vt:$src0,
i32:$src0_modifiers, vt:$src0, DSTCLAMP.ENABLE, DSTOMOD.NONE)
@@ -715,19 +876,25 @@ def : ClampPat<V_MAX_F32_e64, f32>;
def : ClampPat<V_MAX_F64, f64>;
def : ClampPat<V_MAX_F16_e64, f16>;
+def : GCNPat <
+ (v2f16 (AMDGPUclamp (VOP3PMods v2f16:$src0, i32:$src0_modifiers))),
+ (V_PK_MAX_F16 $src0_modifiers, $src0,
+ $src0_modifiers, $src0, DSTCLAMP.ENABLE)
+>;
+
/********** ================================ **********/
/********** Floating point absolute/negative **********/
/********** ================================ **********/
// Prevent expanding both fneg and fabs.
-def : Pat <
+def : GCNPat <
(fneg (fabs f32:$src)),
(S_OR_B32 $src, (S_MOV_B32(i32 0x80000000))) // Set sign bit
>;
// FIXME: Should use S_OR_B32
-def : Pat <
+def : GCNPat <
(fneg (fabs f64:$src)),
(REG_SEQUENCE VReg_64,
(i32 (EXTRACT_SUBREG f64:$src, sub0)),
@@ -737,17 +904,17 @@ def : Pat <
sub1)
>;
-def : Pat <
+def : GCNPat <
(fabs f32:$src),
(V_AND_B32_e64 $src, (V_MOV_B32_e32 (i32 0x7fffffff)))
>;
-def : Pat <
+def : GCNPat <
(fneg f32:$src),
(V_XOR_B32_e32 $src, (V_MOV_B32_e32 (i32 0x80000000)))
>;
-def : Pat <
+def : GCNPat <
(fabs f64:$src),
(REG_SEQUENCE VReg_64,
(i32 (EXTRACT_SUBREG f64:$src, sub0)),
@@ -757,7 +924,7 @@ def : Pat <
sub1)
>;
-def : Pat <
+def : GCNPat <
(fneg f64:$src),
(REG_SEQUENCE VReg_64,
(i32 (EXTRACT_SUBREG f64:$src, sub0)),
@@ -767,18 +934,18 @@ def : Pat <
sub1)
>;
-def : Pat <
+def : GCNPat <
(fcopysign f16:$src0, f16:$src1),
(V_BFI_B32 (S_MOV_B32 (i32 0x00007fff)), $src0, $src1)
>;
-def : Pat <
+def : GCNPat <
(fcopysign f32:$src0, f16:$src1),
(V_BFI_B32 (S_MOV_B32 (i32 0x7fffffff)), $src0,
(V_LSHLREV_B32_e64 (i32 16), $src1))
>;
-def : Pat <
+def : GCNPat <
(fcopysign f64:$src0, f16:$src1),
(REG_SEQUENCE SReg_64,
(i32 (EXTRACT_SUBREG $src0, sub0)), sub0,
@@ -786,39 +953,39 @@ def : Pat <
(V_LSHLREV_B32_e64 (i32 16), $src1)), sub1)
>;
-def : Pat <
+def : GCNPat <
(fcopysign f16:$src0, f32:$src1),
(V_BFI_B32 (S_MOV_B32 (i32 0x00007fff)), $src0,
(V_LSHRREV_B32_e64 (i32 16), $src1))
>;
-def : Pat <
+def : GCNPat <
(fcopysign f16:$src0, f64:$src1),
(V_BFI_B32 (S_MOV_B32 (i32 0x00007fff)), $src0,
(V_LSHRREV_B32_e64 (i32 16), (EXTRACT_SUBREG $src1, sub1)))
>;
-def : Pat <
+def : GCNPat <
(fneg f16:$src),
(V_XOR_B32_e32 $src, (V_MOV_B32_e32 (i32 0x00008000)))
>;
-def : Pat <
+def : GCNPat <
(fabs f16:$src),
(V_AND_B32_e64 $src, (V_MOV_B32_e32 (i32 0x00007fff)))
>;
-def : Pat <
+def : GCNPat <
(fneg (fabs f16:$src)),
(S_OR_B32 $src, (S_MOV_B32 (i32 0x00008000))) // Set sign bit
>;
-def : Pat <
+def : GCNPat <
(fneg v2f16:$src),
(V_XOR_B32_e64 (S_MOV_B32 (i32 0x80008000)), $src)
>;
-def : Pat <
+def : GCNPat <
(fabs v2f16:$src),
(V_AND_B32_e64 (S_MOV_B32 (i32 0x7fff7fff)), $src)
>;
@@ -827,7 +994,7 @@ def : Pat <
//
// fabs is not reported as free because there is modifier for it in
// VOP3P instructions, so it is turned into the bit op.
-def : Pat <
+def : GCNPat <
(fneg (v2f16 (bitconvert (and_oneuse i32:$src, 0x7fff7fff)))),
(S_OR_B32 (S_MOV_B32 (i32 0x80008000)), $src) // Set sign bit
>;
@@ -836,17 +1003,17 @@ def : Pat <
/********** Immediate Patterns **********/
/********** ================== **********/
-def : Pat <
+def : GCNPat <
(VGPRImm<(i32 imm)>:$imm),
(V_MOV_B32_e32 imm:$imm)
>;
-def : Pat <
+def : GCNPat <
(VGPRImm<(f32 fpimm)>:$imm),
(V_MOV_B32_e32 (f32 (bitcast_fpimm_to_i32 $imm)))
>;
-def : Pat <
+def : GCNPat <
(i32 imm:$imm),
(S_MOV_B32 imm:$imm)
>;
@@ -854,27 +1021,27 @@ def : Pat <
// FIXME: Workaround for ordering issue with peephole optimizer where
// a register class copy interferes with immediate folding. Should
// use s_mov_b32, which can be shrunk to s_movk_i32
-def : Pat <
+def : GCNPat <
(VGPRImm<(f16 fpimm)>:$imm),
(V_MOV_B32_e32 (f16 (bitcast_fpimm_to_i32 $imm)))
>;
-def : Pat <
+def : GCNPat <
(f32 fpimm:$imm),
(S_MOV_B32 (f32 (bitcast_fpimm_to_i32 $imm)))
>;
-def : Pat <
+def : GCNPat <
(f16 fpimm:$imm),
(S_MOV_B32 (i32 (bitcast_fpimm_to_i32 $imm)))
>;
-def : Pat <
+def : GCNPat <
(i32 frameindex:$fi),
(V_MOV_B32_e32 (i32 (frameindex_to_targetframeindex $fi)))
>;
-def : Pat <
+def : GCNPat <
(i64 InlineImm<i64>:$imm),
(S_MOV_B64 InlineImm<i64>:$imm)
>;
@@ -882,12 +1049,12 @@ def : Pat <
// XXX - Should this use a s_cmp to set SCC?
// Set to sign-extended 64-bit value (true = -1, false = 0)
-def : Pat <
+def : GCNPat <
(i1 imm:$imm),
(S_MOV_B64 (i64 (as_i64imm $imm)))
>;
-def : Pat <
+def : GCNPat <
(f64 InlineFPImm<f64>:$imm),
(S_MOV_B64 (f64 (bitcast_fpimm_to_i64 InlineFPImm<f64>:$imm)))
>;
@@ -896,14 +1063,16 @@ def : Pat <
/********** Intrinsic Patterns **********/
/********** ================== **********/
+let SubtargetPredicate = isGCN in {
def : POW_Common <V_LOG_F32_e32, V_EXP_F32_e32, V_MUL_LEGACY_F32_e32>;
+}
-def : Pat <
+def : GCNPat <
(i32 (sext i1:$src0)),
(V_CNDMASK_B32_e64 (i32 0), (i32 -1), $src0)
>;
-class Ext32Pat <SDNode ext> : Pat <
+class Ext32Pat <SDNode ext> : GCNPat <
(i32 (ext i1:$src0)),
(V_CNDMASK_B32_e64 (i32 0), (i32 1), $src0)
>;
@@ -912,7 +1081,7 @@ def : Ext32Pat <zext>;
def : Ext32Pat <anyext>;
// The multiplication scales from [0,1] to the unsigned integer range
-def : Pat <
+def : GCNPat <
(AMDGPUurecip i32:$src0),
(V_CVT_U32_F32_e32
(V_MUL_F32_e32 (i32 CONST.FP_UINT_MAX_PLUS_1),
@@ -923,17 +1092,21 @@ def : Pat <
// VOP3 Patterns
//===----------------------------------------------------------------------===//
-def : IMad24Pat<V_MAD_I32_I24>;
-def : UMad24Pat<V_MAD_U32_U24>;
+let SubtargetPredicate = isGCN in {
+
+def : IMad24Pat<V_MAD_I32_I24, 1>;
+def : UMad24Pat<V_MAD_U32_U24, 1>;
defm : BFIPatterns <V_BFI_B32, S_MOV_B32, SReg_64>;
def : ROTRPattern <V_ALIGNBIT_B32>;
-def : Pat<(i32 (trunc (srl i64:$src0, (and i32:$src1, (i32 31))))),
+}
+
+def : GCNPat<(i32 (trunc (srl i64:$src0, (and i32:$src1, (i32 31))))),
(V_ALIGNBIT_B32 (i32 (EXTRACT_SUBREG (i64 $src0), sub1)),
(i32 (EXTRACT_SUBREG (i64 $src0), sub0)), $src1)>;
-def : Pat<(i32 (trunc (srl i64:$src0, (i32 ShiftAmt32Imm:$src1)))),
+def : GCNPat<(i32 (trunc (srl i64:$src0, (i32 ShiftAmt32Imm:$src1)))),
(V_ALIGNBIT_B32 (i32 (EXTRACT_SUBREG (i64 $src0), sub1)),
(i32 (EXTRACT_SUBREG (i64 $src0), sub0)), $src1)>;
@@ -943,13 +1116,13 @@ def : Pat<(i32 (trunc (srl i64:$src0, (i32 ShiftAmt32Imm:$src1)))),
multiclass SI_INDIRECT_Pattern <ValueType vt, ValueType eltvt, string VecSize> {
// Extract with offset
- def : Pat<
+ def : GCNPat<
(eltvt (extractelt vt:$src, (MOVRELOffset i32:$idx, (i32 imm:$offset)))),
(!cast<Instruction>("SI_INDIRECT_SRC_"#VecSize) $src, $idx, imm:$offset)
>;
// Insert with offset
- def : Pat<
+ def : GCNPat<
(insertelt vt:$src, eltvt:$val, (MOVRELOffset i32:$idx, (i32 imm:$offset))),
(!cast<Instruction>("SI_INDIRECT_DST_"#VecSize) $src, $idx, imm:$offset, $val)
>;
@@ -969,70 +1142,70 @@ defm : SI_INDIRECT_Pattern <v16i32, i32, "V16">;
// SAD Patterns
//===----------------------------------------------------------------------===//
-def : Pat <
+def : GCNPat <
(add (sub_oneuse (umax i32:$src0, i32:$src1),
(umin i32:$src0, i32:$src1)),
i32:$src2),
- (V_SAD_U32 $src0, $src1, $src2)
+ (V_SAD_U32 $src0, $src1, $src2, (i1 0))
>;
-def : Pat <
+def : GCNPat <
(add (select_oneuse (i1 (setugt i32:$src0, i32:$src1)),
(sub i32:$src0, i32:$src1),
(sub i32:$src1, i32:$src0)),
i32:$src2),
- (V_SAD_U32 $src0, $src1, $src2)
+ (V_SAD_U32 $src0, $src1, $src2, (i1 0))
>;
//===----------------------------------------------------------------------===//
// Conversion Patterns
//===----------------------------------------------------------------------===//
-def : Pat<(i32 (sext_inreg i32:$src, i1)),
+def : GCNPat<(i32 (sext_inreg i32:$src, i1)),
(S_BFE_I32 i32:$src, (i32 65536))>; // 0 | 1 << 16
// Handle sext_inreg in i64
-def : Pat <
+def : GCNPat <
(i64 (sext_inreg i64:$src, i1)),
(S_BFE_I64 i64:$src, (i32 0x10000)) // 0 | 1 << 16
>;
-def : Pat <
+def : GCNPat <
(i16 (sext_inreg i16:$src, i1)),
(S_BFE_I32 $src, (i32 0x00010000)) // 0 | 1 << 16
>;
-def : Pat <
+def : GCNPat <
(i16 (sext_inreg i16:$src, i8)),
(S_BFE_I32 $src, (i32 0x80000)) // 0 | 8 << 16
>;
-def : Pat <
+def : GCNPat <
(i64 (sext_inreg i64:$src, i8)),
(S_BFE_I64 i64:$src, (i32 0x80000)) // 0 | 8 << 16
>;
-def : Pat <
+def : GCNPat <
(i64 (sext_inreg i64:$src, i16)),
(S_BFE_I64 i64:$src, (i32 0x100000)) // 0 | 16 << 16
>;
-def : Pat <
+def : GCNPat <
(i64 (sext_inreg i64:$src, i32)),
(S_BFE_I64 i64:$src, (i32 0x200000)) // 0 | 32 << 16
>;
-def : Pat <
+def : GCNPat <
(i64 (zext i32:$src)),
(REG_SEQUENCE SReg_64, $src, sub0, (S_MOV_B32 (i32 0)), sub1)
>;
-def : Pat <
+def : GCNPat <
(i64 (anyext i32:$src)),
(REG_SEQUENCE SReg_64, $src, sub0, (i32 (IMPLICIT_DEF)), sub1)
>;
-class ZExt_i64_i1_Pat <SDNode ext> : Pat <
+class ZExt_i64_i1_Pat <SDNode ext> : GCNPat <
(i64 (ext i1:$src)),
(REG_SEQUENCE VReg_64,
(V_CNDMASK_B32_e64 (i32 0), (i32 1), $src), sub0,
@@ -1045,20 +1218,20 @@ def : ZExt_i64_i1_Pat<anyext>;
// FIXME: We need to use COPY_TO_REGCLASS to work-around the fact that
// REG_SEQUENCE patterns don't support instructions with multiple outputs.
-def : Pat <
+def : GCNPat <
(i64 (sext i32:$src)),
(REG_SEQUENCE SReg_64, $src, sub0,
(i32 (COPY_TO_REGCLASS (S_ASHR_I32 $src, (i32 31)), SReg_32_XM0)), sub1)
>;
-def : Pat <
+def : GCNPat <
(i64 (sext i1:$src)),
(REG_SEQUENCE VReg_64,
(V_CNDMASK_B32_e64 (i32 0), (i32 -1), $src), sub0,
(V_CNDMASK_B32_e64 (i32 0), (i32 -1), $src), sub1)
>;
-class FPToI1Pat<Instruction Inst, int KOne, ValueType kone_type, ValueType vt, SDPatternOperator fp_to_int> : Pat <
+class FPToI1Pat<Instruction Inst, int KOne, ValueType kone_type, ValueType vt, SDPatternOperator fp_to_int> : GCNPat <
(i1 (fp_to_int (vt (VOP3Mods vt:$src0, i32:$src0_modifiers)))),
(i1 (Inst 0, (kone_type KOne), $src0_modifiers, $src0, DSTCLAMP.NONE))
>;
@@ -1074,37 +1247,37 @@ def : FPToI1Pat<V_CMP_EQ_F64_e64, CONST.FP64_NEG_ONE, i64, f64, fp_to_sint>;
// 64-bit comparisons. When legalizing SGPR copies, instructions
// resulting in the copies from SCC to these instructions will be
// moved to the VALU.
-def : Pat <
+def : GCNPat <
(i1 (and i1:$src0, i1:$src1)),
(S_AND_B64 $src0, $src1)
>;
-def : Pat <
+def : GCNPat <
(i1 (or i1:$src0, i1:$src1)),
(S_OR_B64 $src0, $src1)
>;
-def : Pat <
+def : GCNPat <
(i1 (xor i1:$src0, i1:$src1)),
(S_XOR_B64 $src0, $src1)
>;
-def : Pat <
+def : GCNPat <
(f32 (sint_to_fp i1:$src)),
(V_CNDMASK_B32_e64 (i32 0), (i32 CONST.FP32_NEG_ONE), $src)
>;
-def : Pat <
+def : GCNPat <
(f32 (uint_to_fp i1:$src)),
(V_CNDMASK_B32_e64 (i32 0), (i32 CONST.FP32_ONE), $src)
>;
-def : Pat <
+def : GCNPat <
(f64 (sint_to_fp i1:$src)),
(V_CVT_F64_I32_e32 (V_CNDMASK_B32_e64 (i32 0), (i32 -1), $src))
>;
-def : Pat <
+def : GCNPat <
(f64 (uint_to_fp i1:$src)),
(V_CVT_F64_U32_e32 (V_CNDMASK_B32_e64 (i32 0), (i32 1), $src))
>;
@@ -1112,79 +1285,95 @@ def : Pat <
//===----------------------------------------------------------------------===//
// Miscellaneous Patterns
//===----------------------------------------------------------------------===//
-def : Pat <
+def : GCNPat <
(i32 (AMDGPUfp16_zext f16:$src)),
(COPY $src)
>;
-def : Pat <
+def : GCNPat <
(i32 (trunc i64:$a)),
(EXTRACT_SUBREG $a, sub0)
>;
-def : Pat <
+def : GCNPat <
(i1 (trunc i32:$a)),
(V_CMP_EQ_U32_e64 (S_AND_B32 (i32 1), $a), (i32 1))
>;
-def : Pat <
+def : GCNPat <
(i1 (trunc i16:$a)),
(V_CMP_EQ_U32_e64 (S_AND_B32 (i32 1), $a), (i32 1))
>;
-def : Pat <
+def : GCNPat <
(i1 (trunc i64:$a)),
(V_CMP_EQ_U32_e64 (S_AND_B32 (i32 1),
(i32 (EXTRACT_SUBREG $a, sub0))), (i32 1))
>;
-def : Pat <
+def : GCNPat <
(i32 (bswap i32:$a)),
(V_BFI_B32 (S_MOV_B32 (i32 0x00ff00ff)),
(V_ALIGNBIT_B32 $a, $a, (i32 24)),
(V_ALIGNBIT_B32 $a, $a, (i32 8)))
>;
-multiclass BFMPatterns <ValueType vt, InstSI BFM, InstSI MOV> {
- def : Pat <
- (vt (shl (vt (add (vt (shl 1, vt:$a)), -1)), vt:$b)),
- (BFM $a, $b)
- >;
+let OtherPredicates = [NoFP16Denormals] in {
+def : GCNPat<
+ (fcanonicalize (f16 (VOP3Mods f16:$src, i32:$src_mods))),
+ (V_MUL_F16_e64 0, (i32 CONST.FP16_ONE), $src_mods, $src, 0, 0)
+>;
- def : Pat <
- (vt (add (vt (shl 1, vt:$a)), -1)),
- (BFM $a, (MOV (i32 0)))
- >;
+def : GCNPat<
+ (fcanonicalize (v2f16 (VOP3PMods v2f16:$src, i32:$src_mods))),
+ (V_PK_MUL_F16 0, (i32 CONST.V2FP16_ONE), $src_mods, $src, DSTCLAMP.NONE)
+>;
}
-defm : BFMPatterns <i32, S_BFM_B32, S_MOV_B32>;
-// FIXME: defm : BFMPatterns <i64, S_BFM_B64, S_MOV_B64>;
-defm : BFEPattern <V_BFE_U32, V_BFE_I32, S_MOV_B32>;
-
-def : Pat<
+let OtherPredicates = [FP16Denormals] in {
+def : GCNPat<
(fcanonicalize (f16 (VOP3Mods f16:$src, i32:$src_mods))),
- (V_MUL_F16_e64 0, (i32 CONST.FP16_ONE), $src_mods, $src, 0, 0)
+ (V_MAX_F16_e64 $src_mods, $src, $src_mods, $src, 0, 0)
>;
-def : Pat<
+def : GCNPat<
+ (fcanonicalize (v2f16 (VOP3PMods v2f16:$src, i32:$src_mods))),
+ (V_PK_MAX_F16 $src_mods, $src, $src_mods, $src, DSTCLAMP.NONE)
+>;
+}
+
+let OtherPredicates = [NoFP32Denormals] in {
+def : GCNPat<
(fcanonicalize (f32 (VOP3Mods f32:$src, i32:$src_mods))),
(V_MUL_F32_e64 0, (i32 CONST.FP32_ONE), $src_mods, $src, 0, 0)
>;
+}
+
+let OtherPredicates = [FP32Denormals] in {
+def : GCNPat<
+ (fcanonicalize (f32 (VOP3Mods f32:$src, i32:$src_mods))),
+ (V_MAX_F32_e64 $src_mods, $src, $src_mods, $src, 0, 0)
+>;
+}
-def : Pat<
+let OtherPredicates = [NoFP64Denormals] in {
+def : GCNPat<
(fcanonicalize (f64 (VOP3Mods f64:$src, i32:$src_mods))),
(V_MUL_F64 0, CONST.FP64_ONE, $src_mods, $src, 0, 0)
>;
+}
-def : Pat<
- (fcanonicalize (v2f16 (VOP3PMods v2f16:$src, i32:$src_mods))),
- (V_PK_MUL_F16 SRCMODS.OP_SEL_1, (i32 CONST.V2FP16_ONE), $src_mods, $src, DSTCLAMP.NONE)
+let OtherPredicates = [FP64Denormals] in {
+def : GCNPat<
+ (fcanonicalize (f64 (VOP3Mods f64:$src, i32:$src_mods))),
+ (V_MAX_F64 $src_mods, $src, $src_mods, $src, 0, 0)
>;
+}
// Allow integer inputs
-class ExpPattern<SDPatternOperator node, ValueType vt, Instruction Inst> : Pat<
+class ExpPattern<SDPatternOperator node, ValueType vt, Instruction Inst> : GCNPat<
(node (i8 timm:$tgt), (i8 timm:$en), vt:$src0, vt:$src1, vt:$src2, vt:$src3, (i1 timm:$compr), (i1 timm:$vm)),
(Inst i8:$tgt, vt:$src0, vt:$src1, vt:$src2, vt:$src3, i1:$vm, i1:$compr, i8:$en)
>;
@@ -1192,36 +1381,43 @@ class ExpPattern<SDPatternOperator node, ValueType vt, Instruction Inst> : Pat<
def : ExpPattern<AMDGPUexport, i32, EXP>;
def : ExpPattern<AMDGPUexport_done, i32, EXP_DONE>;
-def : Pat <
+def : GCNPat <
(v2i16 (build_vector i16:$src0, i16:$src1)),
(v2i16 (S_PACK_LL_B32_B16 $src0, $src1))
>;
+// COPY_TO_REGCLASS is a workaround for a TableGen bug triggered by
+// S_LSHL_B32 having multiple outputs due to its implicit scc def.
+def : GCNPat <
+ (v2i16 (build_vector (i16 0), i16:$src1)),
+ (v2i16 (COPY_TO_REGCLASS (S_LSHL_B32 i16:$src1, (i16 16)), SReg_32_XM0))
+>;
+
// With multiple uses of the shift, this will duplicate the shift and
// increase register pressure.
-def : Pat <
+def : GCNPat <
(v2i16 (build_vector i16:$src0, (i16 (trunc (srl_oneuse i32:$src1, (i32 16)))))),
(v2i16 (S_PACK_LH_B32_B16 i16:$src0, i32:$src1))
>;
-def : Pat <
+def : GCNPat <
(v2i16 (build_vector (i16 (trunc (srl_oneuse i32:$src0, (i32 16)))),
(i16 (trunc (srl_oneuse i32:$src1, (i32 16)))))),
(v2i16 (S_PACK_HH_B32_B16 $src0, $src1))
>;
// TODO: Should source modifiers be matched to v_pack_b32_f16?
-def : Pat <
+def : GCNPat <
(v2f16 (build_vector f16:$src0, f16:$src1)),
(v2f16 (S_PACK_LL_B32_B16 $src0, $src1))
>;
-// def : Pat <
+// def : GCNPat <
// (v2f16 (scalar_to_vector f16:$src0)),
// (COPY $src0)
// >;
-// def : Pat <
+// def : GCNPat <
// (v2i16 (scalar_to_vector i16:$src0)),
// (COPY $src0)
// >;
@@ -1230,7 +1426,7 @@ def : Pat <
// Fract Patterns
//===----------------------------------------------------------------------===//
-let Predicates = [isSI] in {
+let SubtargetPredicate = isSI in {
// V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x)) is
// used instead. However, SI doesn't have V_FLOOR_F64, so the most efficient
@@ -1239,7 +1435,7 @@ let Predicates = [isSI] in {
// fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
// Convert floor(x) to (x - fract(x))
-def : Pat <
+def : GCNPat <
(f64 (ffloor (f64 (VOP3Mods f64:$x, i32:$mods)))),
(V_ADD_F64
$mods,
@@ -1257,7 +1453,7 @@ def : Pat <
DSTCLAMP.NONE, DSTOMOD.NONE)
>;
-} // End Predicates = [isSI]
+} // End SubtargetPredicate = isSI
//============================================================================//
// Miscellaneous Optimization Patterns
@@ -1266,20 +1462,41 @@ def : Pat <
// Undo sub x, c -> add x, -c canonicalization since c is more likely
// an inline immediate than -c.
// TODO: Also do for 64-bit.
-def : Pat<
+def : GCNPat<
(add i32:$src0, (i32 NegSubInlineConst32:$src1)),
(S_SUB_I32 $src0, NegSubInlineConst32:$src1)
>;
+
+multiclass BFMPatterns <ValueType vt, InstSI BFM, InstSI MOV> {
+ def : GCNPat <
+ (vt (shl (vt (add (vt (shl 1, vt:$a)), -1)), vt:$b)),
+ (BFM $a, $b)
+ >;
+
+ def : GCNPat <
+ (vt (add (vt (shl 1, vt:$a)), -1)),
+ (BFM $a, (MOV (i32 0)))
+ >;
+}
+
+let SubtargetPredicate = isGCN in {
+
+defm : BFMPatterns <i32, S_BFM_B32, S_MOV_B32>;
+// FIXME: defm : BFMPatterns <i64, S_BFM_B64, S_MOV_B64>;
+
+defm : BFEPattern <V_BFE_U32, V_BFE_I32, S_MOV_B32>;
def : SHA256MaPattern <V_BFI_B32, V_XOR_B32_e64>;
def : IntMed3Pat<V_MED3_I32, smax, smax_oneuse, smin_oneuse>;
def : IntMed3Pat<V_MED3_U32, umax, umax_oneuse, umin_oneuse>;
+}
+
// This matches 16 permutations of
// max(min(x, y), min(max(x, y), z))
class FPMed3Pat<ValueType vt,
- Instruction med3Inst> : Pat<
+ Instruction med3Inst> : GCNPat<
(fmaxnum (fminnum_oneuse (VOP3Mods_nnan vt:$src0, i32:$src0_mods),
(VOP3Mods_nnan vt:$src1, i32:$src1_mods)),
(fminnum_oneuse (fmaxnum_oneuse (VOP3Mods_nnan vt:$src0, i32:$src0_mods),
@@ -1288,20 +1505,30 @@ class FPMed3Pat<ValueType vt,
(med3Inst $src0_mods, $src0, $src1_mods, $src1, $src2_mods, $src2, DSTCLAMP.NONE, DSTOMOD.NONE)
>;
-def : FPMed3Pat<f32, V_MED3_F32>;
-
-let Predicates = [isGFX9] in {
-def : FPMed3Pat<f16, V_MED3_F16>;
-def : IntMed3Pat<V_MED3_I16, smax, smax_oneuse, smin_oneuse, i16>;
-def : IntMed3Pat<V_MED3_U16, umax, umax_oneuse, umin_oneuse, i16>;
-} // End Predicates = [isGFX9]
+class FP16Med3Pat<ValueType vt,
+ Instruction med3Inst> : GCNPat<
+ (fmaxnum (fminnum_oneuse (VOP3Mods_nnan vt:$src0, i32:$src0_mods),
+ (VOP3Mods_nnan vt:$src1, i32:$src1_mods)),
+ (fminnum_oneuse (fmaxnum_oneuse (VOP3Mods_nnan vt:$src0, i32:$src0_mods),
+ (VOP3Mods_nnan vt:$src1, i32:$src1_mods)),
+ (vt (VOP3Mods_nnan vt:$src2, i32:$src2_mods)))),
+ (med3Inst $src0_mods, $src0, $src1_mods, $src1, $src2_mods, $src2, DSTCLAMP.NONE)
+>;
-//============================================================================//
-// Assembler aliases
-//============================================================================//
+class Int16Med3Pat<Instruction med3Inst,
+ SDPatternOperator max,
+ SDPatternOperator max_oneuse,
+ SDPatternOperator min_oneuse,
+ ValueType vt = i32> : GCNPat<
+ (max (min_oneuse vt:$src0, vt:$src1),
+ (min_oneuse (max_oneuse vt:$src0, vt:$src1), vt:$src2)),
+ (med3Inst SRCMODS.NONE, $src0, SRCMODS.NONE, $src1, SRCMODS.NONE, $src2, DSTCLAMP.NONE)
+>;
-def : MnemonicAlias<"v_add_u32", "v_add_i32">;
-def : MnemonicAlias<"v_sub_u32", "v_sub_i32">;
-def : MnemonicAlias<"v_subrev_u32", "v_subrev_i32">;
+def : FPMed3Pat<f32, V_MED3_F32>;
-} // End isGCN predicate
+let OtherPredicates = [isGFX9] in {
+def : FP16Med3Pat<f16, V_MED3_F16>;
+def : Int16Med3Pat<V_MED3_I16, smax, smax_oneuse, smin_oneuse, i16>;
+def : Int16Med3Pat<V_MED3_U16, umax, umax_oneuse, umin_oneuse, i16>;
+} // End OtherPredicates = [isGFX9]
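
As a side note on the med3 patterns above: the dag they match is the standard median-of-three identity. A minimal host-side C++ check of that identity (illustration only, not part of the patch; it ignores source modifiers and NaN handling):

#include <algorithm>
#include <cassert>

// med3(x, y, z) == max(min(x, y), min(max(x, y), z)), the expression matched
// by FPMed3Pat / Int16Med3Pat above.
static float med3(float x, float y, float z) {
  return std::max(std::min(x, y), std::min(std::max(x, y), z));
}

int main() {
  assert(med3(1.0f, 3.0f, 2.0f) == 2.0f); // middle value, any operand order
  assert(med3(3.0f, 1.0f, 2.0f) == 2.0f);
  assert(med3(2.0f, 3.0f, 1.0f) == 2.0f);
  return 0;
}
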
diff --git a/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
index c6ad61a325cc..84cd47a101a8 100644
--- a/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
+++ b/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
@@ -1,4 +1,4 @@
-//===-- SILoadStoreOptimizer.cpp ------------------------------------------===//
+//===- SILoadStoreOptimizer.cpp -------------------------------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -14,6 +14,12 @@
// ==>
// ds_read2_b32 v[0:1], v2, offset0:4 offset1:8
//
+// The same is done for certain SMEM and VMEM opcodes, e.g.:
+// s_buffer_load_dword s4, s[0:3], 4
+// s_buffer_load_dword s5, s[0:3], 8
+// ==>
+// s_buffer_load_dwordx2 s[4:5], s[0:3], 4
+//
//
// Future improvements:
//
@@ -56,8 +62,9 @@
#include "llvm/Support/Debug.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetMachine.h"
+#include <algorithm>
#include <cassert>
+#include <cstdlib>
#include <iterator>
#include <utility>
@@ -68,31 +75,56 @@ using namespace llvm;
namespace {
class SILoadStoreOptimizer : public MachineFunctionPass {
-
- typedef struct {
+ enum InstClassEnum {
+ DS_READ_WRITE,
+ S_BUFFER_LOAD_IMM,
+ BUFFER_LOAD_OFFEN,
+ BUFFER_LOAD_OFFSET,
+ BUFFER_STORE_OFFEN,
+ BUFFER_STORE_OFFSET,
+ };
+
+ struct CombineInfo {
MachineBasicBlock::iterator I;
MachineBasicBlock::iterator Paired;
unsigned EltSize;
unsigned Offset0;
unsigned Offset1;
unsigned BaseOff;
+ InstClassEnum InstClass;
+ bool GLC0;
+ bool GLC1;
+ bool SLC0;
+ bool SLC1;
bool UseST64;
+ bool IsX2;
SmallVector<MachineInstr*, 8> InstsToMove;
- } CombineInfo;
+ };
private:
+ const SISubtarget *STM = nullptr;
const SIInstrInfo *TII = nullptr;
const SIRegisterInfo *TRI = nullptr;
MachineRegisterInfo *MRI = nullptr;
AliasAnalysis *AA = nullptr;
+ unsigned CreatedX2;
static bool offsetsCanBeCombined(CombineInfo &CI);
- bool findMatchingDSInst(CombineInfo &CI);
+ bool findMatchingInst(CombineInfo &CI);
+ unsigned read2Opcode(unsigned EltSize) const;
+ unsigned read2ST64Opcode(unsigned EltSize) const;
MachineBasicBlock::iterator mergeRead2Pair(CombineInfo &CI);
+ unsigned write2Opcode(unsigned EltSize) const;
+ unsigned write2ST64Opcode(unsigned EltSize) const;
MachineBasicBlock::iterator mergeWrite2Pair(CombineInfo &CI);
+ MachineBasicBlock::iterator mergeSBufferLoadImmPair(CombineInfo &CI);
+ MachineBasicBlock::iterator mergeBufferLoadPair(CombineInfo &CI);
+ unsigned promoteBufferStoreOpcode(const MachineInstr &I, bool &IsX2,
+ bool &IsOffen) const;
+ MachineBasicBlock::iterator mergeBufferStorePair(CombineInfo &CI);
public:
static char ID;
@@ -141,36 +173,35 @@ static void moveInstsAfter(MachineBasicBlock::iterator I,
}
}
-static void addDefsToList(const MachineInstr &MI,
- SmallVectorImpl<const MachineOperand *> &Defs) {
- for (const MachineOperand &Def : MI.defs()) {
- Defs.push_back(&Def);
- }
+static void addDefsToList(const MachineInstr &MI, DenseSet<unsigned> &Defs) {
+ // XXX: Should this be looking for implicit defs?
+ for (const MachineOperand &Def : MI.defs())
+ Defs.insert(Def.getReg());
}
static bool memAccessesCanBeReordered(MachineBasicBlock::iterator A,
MachineBasicBlock::iterator B,
const SIInstrInfo *TII,
AliasAnalysis * AA) {
- return (TII->areMemAccessesTriviallyDisjoint(*A, *B, AA) ||
- // RAW or WAR - cannot reorder
- // WAW - cannot reorder
- // RAR - safe to reorder
- !(A->mayStore() || B->mayStore()));
+ // RAW or WAR - cannot reorder
+ // WAW - cannot reorder
+ // RAR - safe to reorder
+ return !(A->mayStore() || B->mayStore()) ||
+ TII->areMemAccessesTriviallyDisjoint(*A, *B, AA);
}
// Add MI and its defs to the lists if MI reads one of the defs that are
// already in the list. Returns true in that case.
static bool
addToListsIfDependent(MachineInstr &MI,
- SmallVectorImpl<const MachineOperand *> &Defs,
+ DenseSet<unsigned> &Defs,
SmallVectorImpl<MachineInstr*> &Insts) {
- for (const MachineOperand *Def : Defs) {
- bool ReadDef = MI.readsVirtualRegister(Def->getReg());
- // If ReadDef is true, then there is a use of Def between I
- // and the instruction that I will potentially be merged with. We
- // will need to move this instruction after the merged instructions.
- if (ReadDef) {
+ for (MachineOperand &Use : MI.operands()) {
+ // If one of the defs is read, then there is a use of Def between I and the
+ // instruction that I will potentially be merged with. We will need to move
+ // this instruction after the merged instructions.
+
+ if (Use.isReg() && Use.readsReg() && Defs.count(Use.getReg())) {
Insts.push_back(&MI);
addDefsToList(MI, Defs);
return true;
@@ -211,6 +242,15 @@ bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI) {
CI.UseST64 = false;
CI.BaseOff = 0;
+ // Handle SMEM and VMEM instructions.
+ if (CI.InstClass != DS_READ_WRITE) {
+ unsigned Diff = CI.IsX2 ? 2 : 1;
+ return (EltOffset0 + Diff == EltOffset1 ||
+ EltOffset1 + Diff == EltOffset0) &&
+ CI.GLC0 == CI.GLC1 &&
+ (CI.InstClass == S_BUFFER_LOAD_IMM || CI.SLC0 == CI.SLC1);
+ }
+
// If the offset in elements doesn't fit in 8-bits, we might be able to use
// the stride 64 versions.
if ((EltOffset0 % 64 == 0) && (EltOffset1 % 64) == 0 &&
@@ -248,30 +288,70 @@ bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI) {
return false;
}
-bool SILoadStoreOptimizer::findMatchingDSInst(CombineInfo &CI) {
- MachineBasicBlock::iterator E = CI.I->getParent()->end();
+bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI) {
+ MachineBasicBlock *MBB = CI.I->getParent();
+ MachineBasicBlock::iterator E = MBB->end();
MachineBasicBlock::iterator MBBI = CI.I;
+
+ unsigned AddrOpName[3] = {0};
+ int AddrIdx[3];
+ const MachineOperand *AddrReg[3];
+ unsigned NumAddresses = 0;
+
+ switch (CI.InstClass) {
+ case DS_READ_WRITE:
+ AddrOpName[NumAddresses++] = AMDGPU::OpName::addr;
+ break;
+ case S_BUFFER_LOAD_IMM:
+ AddrOpName[NumAddresses++] = AMDGPU::OpName::sbase;
+ break;
+ case BUFFER_LOAD_OFFEN:
+ case BUFFER_STORE_OFFEN:
+ AddrOpName[NumAddresses++] = AMDGPU::OpName::srsrc;
+ AddrOpName[NumAddresses++] = AMDGPU::OpName::vaddr;
+ AddrOpName[NumAddresses++] = AMDGPU::OpName::soffset;
+ break;
+ case BUFFER_LOAD_OFFSET:
+ case BUFFER_STORE_OFFSET:
+ AddrOpName[NumAddresses++] = AMDGPU::OpName::srsrc;
+ AddrOpName[NumAddresses++] = AMDGPU::OpName::soffset;
+ break;
+ }
+
+ for (unsigned i = 0; i < NumAddresses; i++) {
+ AddrIdx[i] = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AddrOpName[i]);
+ AddrReg[i] = &CI.I->getOperand(AddrIdx[i]);
+
+    // We only ever merge operations with the same base address register, so
+    // don't bother scanning forward if there are no other uses.
+ if (AddrReg[i]->isReg() &&
+ (TargetRegisterInfo::isPhysicalRegister(AddrReg[i]->getReg()) ||
+ MRI->hasOneNonDBGUse(AddrReg[i]->getReg())))
+ return false;
+ }
+
++MBBI;
- SmallVector<const MachineOperand *, 8> DefsToMove;
+ DenseSet<unsigned> DefsToMove;
addDefsToList(*CI.I, DefsToMove);
for ( ; MBBI != E; ++MBBI) {
if (MBBI->getOpcode() != CI.I->getOpcode()) {
-
// This is not a matching DS instruction, but we can keep looking as
// long as one of these conditions are met:
// 1. It is safe to move I down past MBBI.
// 2. It is safe to move MBBI down past the instruction that I will
// be merged into.
- if (MBBI->hasUnmodeledSideEffects())
+ if (MBBI->hasUnmodeledSideEffects()) {
// We can't re-order this instruction with respect to other memory
- // opeations, so we fail both conditions mentioned above.
+ // operations, so we fail both conditions mentioned above.
return false;
+ }
if (MBBI->mayLoadOrStore() &&
- !memAccessesCanBeReordered(*CI.I, *MBBI, TII, AA)) {
+ (!memAccessesCanBeReordered(*CI.I, *MBBI, TII, AA) ||
+ !canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, TII, AA))) {
// We fail condition #1, but we may still be able to satisfy condition
// #2. Add this instruction to the move list and then we will check
// if condition #2 holds once we have selected the matching instruction.
@@ -300,21 +380,47 @@ bool SILoadStoreOptimizer::findMatchingDSInst(CombineInfo &CI) {
if (addToListsIfDependent(*MBBI, DefsToMove, CI.InstsToMove))
continue;
- int AddrIdx = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(),
- AMDGPU::OpName::addr);
- const MachineOperand &AddrReg0 = CI.I->getOperand(AddrIdx);
- const MachineOperand &AddrReg1 = MBBI->getOperand(AddrIdx);
+ bool Match = true;
+ for (unsigned i = 0; i < NumAddresses; i++) {
+ const MachineOperand &AddrRegNext = MBBI->getOperand(AddrIdx[i]);
+
+ if (AddrReg[i]->isImm() || AddrRegNext.isImm()) {
+ if (AddrReg[i]->isImm() != AddrRegNext.isImm() ||
+ AddrReg[i]->getImm() != AddrRegNext.getImm()) {
+ Match = false;
+ break;
+ }
+ continue;
+ }
+
+ // Check same base pointer. Be careful of subregisters, which can occur with
+ // vectors of pointers.
+ if (AddrReg[i]->getReg() != AddrRegNext.getReg() ||
+ AddrReg[i]->getSubReg() != AddrRegNext.getSubReg()) {
+ Match = false;
+ break;
+ }
+ }
- // Check same base pointer. Be careful of subregisters, which can occur with
- // vectors of pointers.
- if (AddrReg0.getReg() == AddrReg1.getReg() &&
- AddrReg0.getSubReg() == AddrReg1.getSubReg()) {
+ if (Match) {
int OffsetIdx = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(),
AMDGPU::OpName::offset);
- CI.Offset0 = CI.I->getOperand(OffsetIdx).getImm() & 0xffff;
- CI.Offset1 = MBBI->getOperand(OffsetIdx).getImm() & 0xffff;
+ CI.Offset0 = CI.I->getOperand(OffsetIdx).getImm();
+ CI.Offset1 = MBBI->getOperand(OffsetIdx).getImm();
CI.Paired = MBBI;
+ if (CI.InstClass == DS_READ_WRITE) {
+ CI.Offset0 &= 0xffff;
+ CI.Offset1 &= 0xffff;
+ } else {
+ CI.GLC0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::glc)->getImm();
+ CI.GLC1 = TII->getNamedOperand(*MBBI, AMDGPU::OpName::glc)->getImm();
+ if (CI.InstClass != S_BUFFER_LOAD_IMM) {
+ CI.SLC0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::slc)->getImm();
+ CI.SLC1 = TII->getNamedOperand(*MBBI, AMDGPU::OpName::slc)->getImm();
+ }
+ }
+
// Check both offsets fit in the reduced range.
// We also need to go through the list of instructions that we plan to
// move and make sure they are all safe to move down past the merged
@@ -336,6 +442,20 @@ bool SILoadStoreOptimizer::findMatchingDSInst(CombineInfo &CI) {
return false;
}
+unsigned SILoadStoreOptimizer::read2Opcode(unsigned EltSize) const {
+ if (STM->ldsRequiresM0Init())
+ return (EltSize == 4) ? AMDGPU::DS_READ2_B32 : AMDGPU::DS_READ2_B64;
+ return (EltSize == 4) ? AMDGPU::DS_READ2_B32_gfx9 : AMDGPU::DS_READ2_B64_gfx9;
+}
+
+unsigned SILoadStoreOptimizer::read2ST64Opcode(unsigned EltSize) const {
+ if (STM->ldsRequiresM0Init())
+ return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32 : AMDGPU::DS_READ2ST64_B64;
+
+ return (EltSize == 4) ?
+ AMDGPU::DS_READ2ST64_B32_gfx9 : AMDGPU::DS_READ2ST64_B64_gfx9;
+}
+
MachineBasicBlock::iterator SILoadStoreOptimizer::mergeRead2Pair(
CombineInfo &CI) {
MachineBasicBlock *MBB = CI.I->getParent();
@@ -349,12 +469,8 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeRead2Pair(
unsigned NewOffset0 = CI.Offset0;
unsigned NewOffset1 = CI.Offset1;
- unsigned Opc = (CI.EltSize == 4) ? AMDGPU::DS_READ2_B32
- : AMDGPU::DS_READ2_B64;
-
- if (CI.UseST64)
- Opc = (CI.EltSize == 4) ? AMDGPU::DS_READ2ST64_B32
- : AMDGPU::DS_READ2ST64_B64;
+ unsigned Opc = CI.UseST64 ?
+ read2ST64Opcode(CI.EltSize) : read2Opcode(CI.EltSize);
unsigned SubRegIdx0 = (CI.EltSize == 4) ? AMDGPU::sub0 : AMDGPU::sub0_sub1;
unsigned SubRegIdx1 = (CI.EltSize == 4) ? AMDGPU::sub1 : AMDGPU::sub2_sub3;
@@ -382,9 +498,12 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeRead2Pair(
if (CI.BaseOff) {
BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
BaseRegFlags = RegState::Kill;
- BuildMI(*MBB, CI.Paired, DL, TII->get(AMDGPU::V_ADD_I32_e32), BaseReg)
- .addImm(CI.BaseOff)
- .addReg(AddrReg->getReg());
+
+ unsigned AddOpc = STM->hasAddNoCarry() ?
+ AMDGPU::V_ADD_U32_e32 : AMDGPU::V_ADD_I32_e32;
+ BuildMI(*MBB, CI.Paired, DL, TII->get(AddOpc), BaseReg)
+ .addImm(CI.BaseOff)
+ .addReg(AddrReg->getReg());
}
MachineInstrBuilder Read2 =
@@ -417,6 +536,20 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeRead2Pair(
return Next;
}
+unsigned SILoadStoreOptimizer::write2Opcode(unsigned EltSize) const {
+ if (STM->ldsRequiresM0Init())
+ return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32 : AMDGPU::DS_WRITE2_B64;
+ return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32_gfx9 : AMDGPU::DS_WRITE2_B64_gfx9;
+}
+
+unsigned SILoadStoreOptimizer::write2ST64Opcode(unsigned EltSize) const {
+ if (STM->ldsRequiresM0Init())
+ return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32 : AMDGPU::DS_WRITE2ST64_B64;
+
+ return (EltSize == 4) ?
+ AMDGPU::DS_WRITE2ST64_B32_gfx9 : AMDGPU::DS_WRITE2ST64_B64_gfx9;
+}
+
MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair(
CombineInfo &CI) {
MachineBasicBlock *MBB = CI.I->getParent();
@@ -430,12 +563,8 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair(
unsigned NewOffset0 = CI.Offset0;
unsigned NewOffset1 = CI.Offset1;
- unsigned Opc = (CI.EltSize == 4) ? AMDGPU::DS_WRITE2_B32
- : AMDGPU::DS_WRITE2_B64;
-
- if (CI.UseST64)
- Opc = (CI.EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32
- : AMDGPU::DS_WRITE2ST64_B64;
+ unsigned Opc = CI.UseST64 ?
+ write2ST64Opcode(CI.EltSize) : write2Opcode(CI.EltSize);
if (NewOffset0 > NewOffset1) {
// Canonicalize the merged instruction so the smaller offset comes first.
@@ -455,9 +584,12 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair(
if (CI.BaseOff) {
BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
BaseRegFlags = RegState::Kill;
- BuildMI(*MBB, CI.Paired, DL, TII->get(AMDGPU::V_ADD_I32_e32), BaseReg)
- .addImm(CI.BaseOff)
- .addReg(Addr->getReg());
+
+ unsigned AddOpc = STM->hasAddNoCarry() ?
+ AMDGPU::V_ADD_U32_e32 : AMDGPU::V_ADD_I32_e32;
+ BuildMI(*MBB, CI.Paired, DL, TII->get(AddOpc), BaseReg)
+ .addImm(CI.BaseOff)
+ .addReg(Addr->getReg());
}
MachineInstrBuilder Write2 =
@@ -480,6 +612,194 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair(
return Next;
}
+MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSBufferLoadImmPair(
+ CombineInfo &CI) {
+ MachineBasicBlock *MBB = CI.I->getParent();
+ DebugLoc DL = CI.I->getDebugLoc();
+ unsigned Opcode = CI.IsX2 ? AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM :
+ AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM;
+
+ const TargetRegisterClass *SuperRC =
+ CI.IsX2 ? &AMDGPU::SReg_128RegClass : &AMDGPU::SReg_64_XEXECRegClass;
+ unsigned DestReg = MRI->createVirtualRegister(SuperRC);
+ unsigned MergedOffset = std::min(CI.Offset0, CI.Offset1);
+
+ BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode), DestReg)
+ .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase))
+ .addImm(MergedOffset) // offset
+ .addImm(CI.GLC0) // glc
+ .setMemRefs(CI.I->mergeMemRefsWith(*CI.Paired));
+
+ unsigned SubRegIdx0 = CI.IsX2 ? AMDGPU::sub0_sub1 : AMDGPU::sub0;
+ unsigned SubRegIdx1 = CI.IsX2 ? AMDGPU::sub2_sub3 : AMDGPU::sub1;
+
+ // Handle descending offsets
+ if (CI.Offset0 > CI.Offset1)
+ std::swap(SubRegIdx0, SubRegIdx1);
+
+ // Copy to the old destination registers.
+ const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
+ const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::sdst);
+ const auto *Dest1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::sdst);
+
+ BuildMI(*MBB, CI.Paired, DL, CopyDesc)
+ .add(*Dest0) // Copy to same destination including flags and sub reg.
+ .addReg(DestReg, 0, SubRegIdx0);
+ MachineInstr *Copy1 = BuildMI(*MBB, CI.Paired, DL, CopyDesc)
+ .add(*Dest1)
+ .addReg(DestReg, RegState::Kill, SubRegIdx1);
+
+ moveInstsAfter(Copy1, CI.InstsToMove);
+
+ MachineBasicBlock::iterator Next = std::next(CI.I);
+ CI.I->eraseFromParent();
+ CI.Paired->eraseFromParent();
+ return Next;
+}
+
+MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair(
+ CombineInfo &CI) {
+ MachineBasicBlock *MBB = CI.I->getParent();
+ DebugLoc DL = CI.I->getDebugLoc();
+ unsigned Opcode;
+
+ if (CI.InstClass == BUFFER_LOAD_OFFEN) {
+ Opcode = CI.IsX2 ? AMDGPU::BUFFER_LOAD_DWORDX4_OFFEN :
+ AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN;
+ } else {
+ Opcode = CI.IsX2 ? AMDGPU::BUFFER_LOAD_DWORDX4_OFFSET :
+ AMDGPU::BUFFER_LOAD_DWORDX2_OFFSET;
+ }
+
+ const TargetRegisterClass *SuperRC =
+ CI.IsX2 ? &AMDGPU::VReg_128RegClass : &AMDGPU::VReg_64RegClass;
+ unsigned DestReg = MRI->createVirtualRegister(SuperRC);
+ unsigned MergedOffset = std::min(CI.Offset0, CI.Offset1);
+
+ auto MIB = BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode), DestReg);
+
+ if (CI.InstClass == BUFFER_LOAD_OFFEN)
+ MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
+
+ MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
+ .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
+ .addImm(MergedOffset) // offset
+ .addImm(CI.GLC0) // glc
+ .addImm(CI.SLC0) // slc
+ .addImm(0) // tfe
+ .setMemRefs(CI.I->mergeMemRefsWith(*CI.Paired));
+
+ unsigned SubRegIdx0 = CI.IsX2 ? AMDGPU::sub0_sub1 : AMDGPU::sub0;
+ unsigned SubRegIdx1 = CI.IsX2 ? AMDGPU::sub2_sub3 : AMDGPU::sub1;
+
+ // Handle descending offsets
+ if (CI.Offset0 > CI.Offset1)
+ std::swap(SubRegIdx0, SubRegIdx1);
+
+ // Copy to the old destination registers.
+ const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
+ const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
+ const auto *Dest1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::vdata);
+
+ BuildMI(*MBB, CI.Paired, DL, CopyDesc)
+ .add(*Dest0) // Copy to same destination including flags and sub reg.
+ .addReg(DestReg, 0, SubRegIdx0);
+ MachineInstr *Copy1 = BuildMI(*MBB, CI.Paired, DL, CopyDesc)
+ .add(*Dest1)
+ .addReg(DestReg, RegState::Kill, SubRegIdx1);
+
+ moveInstsAfter(Copy1, CI.InstsToMove);
+
+ MachineBasicBlock::iterator Next = std::next(CI.I);
+ CI.I->eraseFromParent();
+ CI.Paired->eraseFromParent();
+ return Next;
+}
+
+unsigned SILoadStoreOptimizer::promoteBufferStoreOpcode(
+ const MachineInstr &I, bool &IsX2, bool &IsOffen) const {
+ IsX2 = false;
+ IsOffen = false;
+
+ switch (I.getOpcode()) {
+ case AMDGPU::BUFFER_STORE_DWORD_OFFEN:
+ IsOffen = true;
+ return AMDGPU::BUFFER_STORE_DWORDX2_OFFEN;
+ case AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact:
+ IsOffen = true;
+ return AMDGPU::BUFFER_STORE_DWORDX2_OFFEN_exact;
+ case AMDGPU::BUFFER_STORE_DWORDX2_OFFEN:
+ IsX2 = true;
+ IsOffen = true;
+ return AMDGPU::BUFFER_STORE_DWORDX4_OFFEN;
+ case AMDGPU::BUFFER_STORE_DWORDX2_OFFEN_exact:
+ IsX2 = true;
+ IsOffen = true;
+ return AMDGPU::BUFFER_STORE_DWORDX4_OFFEN_exact;
+ case AMDGPU::BUFFER_STORE_DWORD_OFFSET:
+ return AMDGPU::BUFFER_STORE_DWORDX2_OFFSET;
+ case AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact:
+ return AMDGPU::BUFFER_STORE_DWORDX2_OFFSET_exact;
+ case AMDGPU::BUFFER_STORE_DWORDX2_OFFSET:
+ IsX2 = true;
+ return AMDGPU::BUFFER_STORE_DWORDX4_OFFSET;
+ case AMDGPU::BUFFER_STORE_DWORDX2_OFFSET_exact:
+ IsX2 = true;
+ return AMDGPU::BUFFER_STORE_DWORDX4_OFFSET_exact;
+ }
+ return 0;
+}
+
+MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferStorePair(
+ CombineInfo &CI) {
+ MachineBasicBlock *MBB = CI.I->getParent();
+ DebugLoc DL = CI.I->getDebugLoc();
+ bool Unused1, Unused2;
+ unsigned Opcode = promoteBufferStoreOpcode(*CI.I, Unused1, Unused2);
+
+ unsigned SubRegIdx0 = CI.IsX2 ? AMDGPU::sub0_sub1 : AMDGPU::sub0;
+ unsigned SubRegIdx1 = CI.IsX2 ? AMDGPU::sub2_sub3 : AMDGPU::sub1;
+
+ // Handle descending offsets
+ if (CI.Offset0 > CI.Offset1)
+ std::swap(SubRegIdx0, SubRegIdx1);
+
+ // Copy to the new source register.
+ const TargetRegisterClass *SuperRC =
+ CI.IsX2 ? &AMDGPU::VReg_128RegClass : &AMDGPU::VReg_64RegClass;
+ unsigned SrcReg = MRI->createVirtualRegister(SuperRC);
+
+ const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
+ const auto *Src1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::vdata);
+
+ BuildMI(*MBB, CI.Paired, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
+ .add(*Src0)
+ .addImm(SubRegIdx0)
+ .add(*Src1)
+ .addImm(SubRegIdx1);
+
+ auto MIB = BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode))
+ .addReg(SrcReg, RegState::Kill);
+
+ if (CI.InstClass == BUFFER_STORE_OFFEN)
+ MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
+
+ MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
+ .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
+ .addImm(std::min(CI.Offset0, CI.Offset1)) // offset
+ .addImm(CI.GLC0) // glc
+ .addImm(CI.SLC0) // slc
+ .addImm(0) // tfe
+ .setMemRefs(CI.I->mergeMemRefsWith(*CI.Paired));
+
+ moveInstsAfter(MIB, CI.InstsToMove);
+
+ MachineBasicBlock::iterator Next = std::next(CI.I);
+ CI.I->eraseFromParent();
+ CI.Paired->eraseFromParent();
+ return Next;
+}
+
// Scan through looking for adjacent LDS operations with constant offsets from
// the same base register. We rely on the scheduler to do the hard work of
// clustering nearby loads, and assume these are all adjacent.
@@ -498,9 +818,14 @@ bool SILoadStoreOptimizer::optimizeBlock(MachineBasicBlock &MBB) {
CombineInfo CI;
CI.I = I;
unsigned Opc = MI.getOpcode();
- if (Opc == AMDGPU::DS_READ_B32 || Opc == AMDGPU::DS_READ_B64) {
- CI.EltSize = (Opc == AMDGPU::DS_READ_B64) ? 8 : 4;
- if (findMatchingDSInst(CI)) {
+ if (Opc == AMDGPU::DS_READ_B32 || Opc == AMDGPU::DS_READ_B64 ||
+ Opc == AMDGPU::DS_READ_B32_gfx9 || Opc == AMDGPU::DS_READ_B64_gfx9) {
+
+ CI.InstClass = DS_READ_WRITE;
+ CI.EltSize =
+ (Opc == AMDGPU::DS_READ_B64 || Opc == AMDGPU::DS_READ_B64_gfx9) ? 8 : 4;
+
+ if (findMatchingInst(CI)) {
Modified = true;
I = mergeRead2Pair(CI);
} else {
@@ -508,9 +833,14 @@ bool SILoadStoreOptimizer::optimizeBlock(MachineBasicBlock &MBB) {
}
continue;
- } else if (Opc == AMDGPU::DS_WRITE_B32 || Opc == AMDGPU::DS_WRITE_B64) {
- CI.EltSize = (Opc == AMDGPU::DS_WRITE_B64) ? 8 : 4;
- if (findMatchingDSInst(CI)) {
+ } else if (Opc == AMDGPU::DS_WRITE_B32 || Opc == AMDGPU::DS_WRITE_B64 ||
+ Opc == AMDGPU::DS_WRITE_B32_gfx9 ||
+ Opc == AMDGPU::DS_WRITE_B64_gfx9) {
+ CI.InstClass = DS_READ_WRITE;
+ CI.EltSize
+ = (Opc == AMDGPU::DS_WRITE_B64 || Opc == AMDGPU::DS_WRITE_B64_gfx9) ? 8 : 4;
+
+ if (findMatchingInst(CI)) {
Modified = true;
I = mergeWrite2Pair(CI);
} else {
@@ -519,6 +849,62 @@ bool SILoadStoreOptimizer::optimizeBlock(MachineBasicBlock &MBB) {
continue;
}
+ if (STM->hasSBufferLoadStoreAtomicDwordxN() &&
+ (Opc == AMDGPU::S_BUFFER_LOAD_DWORD_IMM ||
+ Opc == AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM)) {
+ // EltSize is in units of the offset encoding.
+ CI.InstClass = S_BUFFER_LOAD_IMM;
+ CI.EltSize = AMDGPU::getSMRDEncodedOffset(*STM, 4);
+ CI.IsX2 = Opc == AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM;
+ if (findMatchingInst(CI)) {
+ Modified = true;
+ I = mergeSBufferLoadImmPair(CI);
+ if (!CI.IsX2)
+ CreatedX2++;
+ } else {
+ ++I;
+ }
+ continue;
+ }
+ if (Opc == AMDGPU::BUFFER_LOAD_DWORD_OFFEN ||
+ Opc == AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN ||
+ Opc == AMDGPU::BUFFER_LOAD_DWORD_OFFSET ||
+ Opc == AMDGPU::BUFFER_LOAD_DWORDX2_OFFSET) {
+ if (Opc == AMDGPU::BUFFER_LOAD_DWORD_OFFEN ||
+ Opc == AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN)
+ CI.InstClass = BUFFER_LOAD_OFFEN;
+ else
+ CI.InstClass = BUFFER_LOAD_OFFSET;
+
+ CI.EltSize = 4;
+ CI.IsX2 = Opc == AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN ||
+ Opc == AMDGPU::BUFFER_LOAD_DWORDX2_OFFSET;
+ if (findMatchingInst(CI)) {
+ Modified = true;
+ I = mergeBufferLoadPair(CI);
+ if (!CI.IsX2)
+ CreatedX2++;
+ } else {
+ ++I;
+ }
+ continue;
+ }
+
+ bool StoreIsX2, IsOffen;
+ if (promoteBufferStoreOpcode(*I, StoreIsX2, IsOffen)) {
+ CI.InstClass = IsOffen ? BUFFER_STORE_OFFEN : BUFFER_STORE_OFFSET;
+ CI.EltSize = 4;
+ CI.IsX2 = StoreIsX2;
+ if (findMatchingInst(CI)) {
+ Modified = true;
+ I = mergeBufferStorePair(CI);
+ if (!CI.IsX2)
+ CreatedX2++;
+ } else {
+ ++I;
+ }
+ continue;
+ }
++I;
}
@@ -527,25 +913,33 @@ bool SILoadStoreOptimizer::optimizeBlock(MachineBasicBlock &MBB) {
}
bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) {
- if (skipFunction(*MF.getFunction()))
+ if (skipFunction(MF.getFunction()))
return false;
- const SISubtarget &STM = MF.getSubtarget<SISubtarget>();
- if (!STM.loadStoreOptEnabled())
+ STM = &MF.getSubtarget<SISubtarget>();
+ if (!STM->loadStoreOptEnabled())
return false;
- TII = STM.getInstrInfo();
+ TII = STM->getInstrInfo();
TRI = &TII->getRegisterInfo();
MRI = &MF.getRegInfo();
AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
+ assert(MRI->isSSA() && "Must be run on SSA");
+
DEBUG(dbgs() << "Running SILoadStoreOptimizer\n");
bool Modified = false;
- for (MachineBasicBlock &MBB : MF)
+ for (MachineBasicBlock &MBB : MF) {
+ CreatedX2 = 0;
Modified |= optimizeBlock(MBB);
+ // Run again to convert x2 to x4.
+ if (CreatedX2 >= 1)
+ Modified |= optimizeBlock(MBB);
+ }
+
return Modified;
}
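
For readers skimming the pass changes above: the new SMEM/VMEM branch of offsetsCanBeCombined boils down to "offsets adjacent in element units, cache bits matching". A simplified standalone restatement (names are illustrative, not the pass's API):

#include <cstdlib>

enum class MergeKind { SBufferLoadImm, BufferLoadStore };

// Mirrors the non-DS branch added to offsetsCanBeCombined: an x2 access
// spans two offset units, glc must match, and slc must also match for the
// MUBUF buffer classes (S_BUFFER_LOAD_IMM has no slc bit to compare).
bool canPairOffsets(MergeKind K, bool IsX2,
                    unsigned EltOffset0, unsigned EltOffset1,
                    bool GLC0, bool GLC1, bool SLC0, bool SLC1) {
  unsigned Diff = IsX2 ? 2 : 1;
  bool Adjacent = EltOffset0 + Diff == EltOffset1 ||
                  EltOffset1 + Diff == EltOffset0;
  bool CacheBitsMatch =
      GLC0 == GLC1 && (K == MergeKind::SBufferLoadImm || SLC0 == SLC1);
  return Adjacent && CacheBitsMatch;
}

The adjacent s_buffer_load_dword pair shown in the file header comment satisfies this rule, and the second optimizeBlock pass over each block then gives freshly created x2 loads a chance to pair again into x4.
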
diff --git a/lib/Target/AMDGPU/SILowerControlFlow.cpp b/lib/Target/AMDGPU/SILowerControlFlow.cpp
index 5f1c7f1fc42f..a9af83323976 100644
--- a/lib/Target/AMDGPU/SILowerControlFlow.cpp
+++ b/lib/Target/AMDGPU/SILowerControlFlow.cpp
@@ -21,31 +21,31 @@
/// EXEC to update the predicates.
///
/// For example:
-/// %VCC = V_CMP_GT_F32 %VGPR1, %VGPR2
-/// %SGPR0 = SI_IF %VCC
-/// %VGPR0 = V_ADD_F32 %VGPR0, %VGPR0
-/// %SGPR0 = SI_ELSE %SGPR0
-/// %VGPR0 = V_SUB_F32 %VGPR0, %VGPR0
-/// SI_END_CF %SGPR0
+/// %vcc = V_CMP_GT_F32 %vgpr1, %vgpr2
+/// %sgpr0 = SI_IF %vcc
+/// %vgpr0 = V_ADD_F32 %vgpr0, %vgpr0
+/// %sgpr0 = SI_ELSE %sgpr0
+/// %vgpr0 = V_SUB_F32 %vgpr0, %vgpr0
+/// SI_END_CF %sgpr0
///
/// becomes:
///
-/// %SGPR0 = S_AND_SAVEEXEC_B64 %VCC // Save and update the exec mask
-/// %SGPR0 = S_XOR_B64 %SGPR0, %EXEC // Clear live bits from saved exec mask
+/// %sgpr0 = S_AND_SAVEEXEC_B64 %vcc // Save and update the exec mask
+/// %sgpr0 = S_XOR_B64 %sgpr0, %exec // Clear live bits from saved exec mask
/// S_CBRANCH_EXECZ label0 // This instruction is an optional
/// // optimization which allows us to
/// // branch if all the bits of
/// // EXEC are zero.
-/// %VGPR0 = V_ADD_F32 %VGPR0, %VGPR0 // Do the IF block of the branch
+/// %vgpr0 = V_ADD_F32 %vgpr0, %vgpr0 // Do the IF block of the branch
///
/// label0:
-/// %SGPR0 = S_OR_SAVEEXEC_B64 %EXEC // Restore the exec mask for the Then block
-/// %EXEC = S_XOR_B64 %SGPR0, %EXEC // Clear live bits from saved exec mask
+/// %sgpr0 = S_OR_SAVEEXEC_B64 %exec // Restore the exec mask for the Then block
+/// %exec = S_XOR_B64 %sgpr0, %exec // Clear live bits from saved exec mask
/// S_BRANCH_EXECZ label1 // Use our branch optimization
/// // instruction again.
-/// %VGPR0 = V_SUB_F32 %VGPR0, %VGPR // Do the THEN block
+/// %vgpr0 = V_SUB_F32 %vgpr0, %vgpr // Do the THEN block
/// label1:
-/// %EXEC = S_OR_B64 %EXEC, %SGPR0 // Re-enable saved exec mask bits
+/// %exec = S_OR_B64 %exec, %sgpr0 // Re-enable saved exec mask bits
//===----------------------------------------------------------------------===//
#include "AMDGPU.h"
@@ -53,7 +53,7 @@
#include "SIInstrInfo.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
-#include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
@@ -63,9 +63,9 @@
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/SlotIndexes.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/Pass.h"
-#include "llvm/Target/TargetRegisterInfo.h"
#include <cassert>
#include <iterator>
@@ -134,6 +134,39 @@ static void setImpSCCDefDead(MachineInstr &MI, bool IsDead) {
char &llvm::SILowerControlFlowID = SILowerControlFlow::ID;
+static bool isSimpleIf(const MachineInstr &MI, const MachineRegisterInfo *MRI,
+ const SIInstrInfo *TII) {
+ unsigned SaveExecReg = MI.getOperand(0).getReg();
+ auto U = MRI->use_instr_nodbg_begin(SaveExecReg);
+
+ if (U == MRI->use_instr_nodbg_end() ||
+ std::next(U) != MRI->use_instr_nodbg_end() ||
+ U->getOpcode() != AMDGPU::SI_END_CF)
+ return false;
+
+ // Check for SI_KILL_*_TERMINATOR on path from if to endif.
+  // If there is any such terminator, the simplification is not safe.
+ auto SMBB = MI.getParent();
+ auto EMBB = U->getParent();
+ DenseSet<const MachineBasicBlock*> Visited;
+ SmallVector<MachineBasicBlock*, 4> Worklist(SMBB->succ_begin(),
+ SMBB->succ_end());
+
+ while (!Worklist.empty()) {
+ MachineBasicBlock *MBB = Worklist.pop_back_val();
+
+ if (MBB == EMBB || !Visited.insert(MBB).second)
+ continue;
+    for (auto &Term : MBB->terminators())
+ if (TII->isKillTerminator(Term.getOpcode()))
+ return false;
+
+ Worklist.append(MBB->succ_begin(), MBB->succ_end());
+ }
+
+ return true;
+}
+
void SILowerControlFlow::emitIf(MachineInstr &MI) {
MachineBasicBlock &MBB = *MI.getParent();
const DebugLoc &DL = MI.getDebugLoc();
@@ -149,9 +182,15 @@ void SILowerControlFlow::emitIf(MachineInstr &MI) {
MachineOperand &ImpDefSCC = MI.getOperand(4);
assert(ImpDefSCC.getReg() == AMDGPU::SCC && ImpDefSCC.isDef());
+ // If there is only one use of save exec register and that use is SI_END_CF,
+ // we can optimize SI_IF by returning the full saved exec mask instead of
+ // just cleared bits.
+ bool SimpleIf = isSimpleIf(MI, MRI, TII);
+
// Add an implicit def of exec to discourage scheduling VALU after this which
// will interfere with trying to form s_and_saveexec_b64 later.
- unsigned CopyReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
+ unsigned CopyReg = SimpleIf ? SaveExecReg
+ : MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
MachineInstr *CopyExec =
BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), CopyReg)
.addReg(AMDGPU::EXEC)
@@ -166,11 +205,14 @@ void SILowerControlFlow::emitIf(MachineInstr &MI) {
.addReg(Cond.getReg());
setImpSCCDefDead(*And, true);
- MachineInstr *Xor =
- BuildMI(MBB, I, DL, TII->get(AMDGPU::S_XOR_B64), SaveExecReg)
- .addReg(Tmp)
- .addReg(CopyReg);
- setImpSCCDefDead(*Xor, ImpDefSCC.isDead());
+ MachineInstr *Xor = nullptr;
+ if (!SimpleIf) {
+ Xor =
+ BuildMI(MBB, I, DL, TII->get(AMDGPU::S_XOR_B64), SaveExecReg)
+ .addReg(Tmp)
+ .addReg(CopyReg);
+ setImpSCCDefDead(*Xor, ImpDefSCC.isDead());
+ }
// Use a copy that is a terminator to get correct spill code placement it with
// fast regalloc.
@@ -194,7 +236,8 @@ void SILowerControlFlow::emitIf(MachineInstr &MI) {
// register.
LIS->ReplaceMachineInstrInMaps(MI, *And);
- LIS->InsertMachineInstrInMaps(*Xor);
+ if (!SimpleIf)
+ LIS->InsertMachineInstrInMaps(*Xor);
LIS->InsertMachineInstrInMaps(*SetExec);
LIS->InsertMachineInstrInMaps(*NewBr);
@@ -207,7 +250,8 @@ void SILowerControlFlow::emitIf(MachineInstr &MI) {
LIS->removeInterval(SaveExecReg);
LIS->createAndComputeVirtRegInterval(SaveExecReg);
LIS->createAndComputeVirtRegInterval(Tmp);
- LIS->createAndComputeVirtRegInterval(CopyReg);
+ if (!SimpleIf)
+ LIS->createAndComputeVirtRegInterval(CopyReg);
}
void SILowerControlFlow::emitElse(MachineInstr &MI) {
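
A small host-side illustration of why the SimpleIf form above is sound (assumed mask values, just demonstrating the algebra): the generic lowering saves only the bits cleared from exec and SI_END_CF ORs them back, while the simple form saves the full incoming exec mask; both restore the same value because the lanes still enabled at the end-cf point are a subset of the saved mask.

#include <cassert>
#include <cstdint>

int main() {
  uint64_t Exec = 0xF0F0F0F0F0F0F0F0ULL; // lanes live before the branch
  uint64_t Cond = 0x00FF00FF00FF00FFULL; // lanes entering the "then" block

  uint64_t ExecThen = Exec & Cond;       // exec inside the then block

  // Generic SI_IF: save exec & ~cond (the cleared bits), OR back at end_cf.
  uint64_t SavedCleared = Exec & ~Cond;
  assert((ExecThen | SavedCleared) == Exec);

  // SimpleIf: save the full incoming exec mask and OR it back at end_cf.
  uint64_t SavedFull = Exec;
  assert((ExecThen | SavedFull) == Exec);
  return 0;
}
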
diff --git a/lib/Target/AMDGPU/SILowerI1Copies.cpp b/lib/Target/AMDGPU/SILowerI1Copies.cpp
index ba616ada0c9c..da57b90dd8c4 100644
--- a/lib/Target/AMDGPU/SILowerI1Copies.cpp
+++ b/lib/Target/AMDGPU/SILowerI1Copies.cpp
@@ -17,7 +17,7 @@
#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
-#include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
@@ -121,11 +121,14 @@ bool SILowerI1Copies::runOnMachineFunction(MachineFunction &MF) {
}
}
+  unsigned TmpSrc = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
+ BuildMI(MBB, &MI, DL, TII->get(AMDGPU::COPY), TmpSrc)
+ .add(Src);
BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64))
.add(Dst)
.addImm(0)
.addImm(-1)
- .add(Src);
+ .addReg(TmpSrc);
MI.eraseFromParent();
} else if (TRI->getCommonSubClass(DstRC, &AMDGPU::SGPR_64RegClass) &&
SrcRC == &AMDGPU::VReg_1RegClass) {
diff --git a/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
index a7c8166ff6d2..6013ebc81d9f 100644
--- a/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ b/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -1,4 +1,4 @@
-//===-- SIMachineFunctionInfo.cpp -------- SI Machine Function Info -------===//
+//===- SIMachineFunctionInfo.cpp - SI Machine Function Info ---------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -8,13 +8,19 @@
//===----------------------------------------------------------------------===//
#include "SIMachineFunctionInfo.h"
+#include "AMDGPUArgumentUsageInfo.h"
#include "AMDGPUSubtarget.h"
-#include "SIInstrInfo.h"
+#include "SIRegisterInfo.h"
+#include "Utils/AMDGPUBaseInfo.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
-#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/CallingConv.h"
#include "llvm/IR/Function.h"
-#include "llvm/IR/LLVMContext.h"
+#include <cassert>
+#include <vector>
#define MAX_LANES 64
@@ -22,44 +28,8 @@ using namespace llvm;
SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
: AMDGPUMachineFunction(MF),
- TIDReg(AMDGPU::NoRegister),
- ScratchRSrcReg(AMDGPU::PRIVATE_RSRC_REG),
- ScratchWaveOffsetReg(AMDGPU::SCRATCH_WAVE_OFFSET_REG),
- FrameOffsetReg(AMDGPU::FP_REG),
- StackPtrOffsetReg(AMDGPU::SP_REG),
- PrivateSegmentBufferUserSGPR(AMDGPU::NoRegister),
- DispatchPtrUserSGPR(AMDGPU::NoRegister),
- QueuePtrUserSGPR(AMDGPU::NoRegister),
- KernargSegmentPtrUserSGPR(AMDGPU::NoRegister),
- DispatchIDUserSGPR(AMDGPU::NoRegister),
- FlatScratchInitUserSGPR(AMDGPU::NoRegister),
- PrivateSegmentSizeUserSGPR(AMDGPU::NoRegister),
- GridWorkGroupCountXUserSGPR(AMDGPU::NoRegister),
- GridWorkGroupCountYUserSGPR(AMDGPU::NoRegister),
- GridWorkGroupCountZUserSGPR(AMDGPU::NoRegister),
- WorkGroupIDXSystemSGPR(AMDGPU::NoRegister),
- WorkGroupIDYSystemSGPR(AMDGPU::NoRegister),
- WorkGroupIDZSystemSGPR(AMDGPU::NoRegister),
- WorkGroupInfoSystemSGPR(AMDGPU::NoRegister),
- PrivateSegmentWaveByteOffsetSystemSGPR(AMDGPU::NoRegister),
- WorkItemIDXVGPR(AMDGPU::NoRegister),
- WorkItemIDYVGPR(AMDGPU::NoRegister),
- WorkItemIDZVGPR(AMDGPU::NoRegister),
- PSInputAddr(0),
- PSInputEnable(0),
- ReturnsVoid(true),
- FlatWorkGroupSizes(0, 0),
- WavesPerEU(0, 0),
- DebuggerWorkGroupIDStackObjectIndices({{0, 0, 0}}),
- DebuggerWorkItemIDStackObjectIndices({{0, 0, 0}}),
- LDSWaveSpillSize(0),
- NumUserSGPRs(0),
- NumSystemSGPRs(0),
- HasSpilledSGPRs(false),
- HasSpilledVGPRs(false),
- HasNonSpillStackObjects(false),
- NumSpilledSGPRs(0),
- NumSpilledVGPRs(0),
+ BufferPSV(*(MF.getSubtarget().getInstrInfo())),
+ ImagePSV(*(MF.getSubtarget().getInstrInfo())),
PrivateSegmentBuffer(false),
DispatchPtr(false),
QueuePtr(false),
@@ -77,11 +47,13 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
WorkItemIDX(false),
WorkItemIDY(false),
WorkItemIDZ(false),
- ImplicitBufferPtr(false) {
+ ImplicitBufferPtr(false),
+ ImplicitArgPtr(false),
+ GITPtrHigh(0xffffffff) {
const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
- const Function *F = MF.getFunction();
- FlatWorkGroupSizes = ST.getFlatWorkGroupSizes(*F);
- WavesPerEU = ST.getWavesPerEU(*F);
+ const Function &F = MF.getFunction();
+ FlatWorkGroupSizes = ST.getFlatWorkGroupSizes(F);
+ WavesPerEU = ST.getWavesPerEU(F);
if (!isEntryFunction()) {
// Non-entry functions have no special inputs for now, other registers
@@ -91,17 +63,26 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
FrameOffsetReg = AMDGPU::SGPR5;
StackPtrOffsetReg = AMDGPU::SGPR32;
- // FIXME: Not really a system SGPR.
- PrivateSegmentWaveByteOffsetSystemSGPR = ScratchWaveOffsetReg;
+ ArgInfo.PrivateSegmentBuffer =
+ ArgDescriptor::createRegister(ScratchRSrcReg);
+ ArgInfo.PrivateSegmentWaveByteOffset =
+ ArgDescriptor::createRegister(ScratchWaveOffsetReg);
+
+ if (F.hasFnAttribute("amdgpu-implicitarg-ptr"))
+ ImplicitArgPtr = true;
+ } else {
+ if (F.hasFnAttribute("amdgpu-implicitarg-ptr"))
+ KernargSegmentPtr = true;
}
- CallingConv::ID CC = F->getCallingConv();
+ CallingConv::ID CC = F.getCallingConv();
if (CC == CallingConv::AMDGPU_KERNEL || CC == CallingConv::SPIR_KERNEL) {
- KernargSegmentPtr = !F->arg_empty();
+ if (!F.arg_empty())
+ KernargSegmentPtr = true;
WorkGroupIDX = true;
WorkItemIDX = true;
} else if (CC == CallingConv::AMDGPU_PS) {
- PSInputAddr = AMDGPU::getInitialPSInputAddr(*F);
+ PSInputAddr = AMDGPU::getInitialPSInputAddr(F);
}
if (ST.debuggerEmitPrologue()) {
@@ -113,27 +94,27 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
WorkItemIDY = true;
WorkItemIDZ = true;
} else {
- if (F->hasFnAttribute("amdgpu-work-group-id-x"))
+ if (F.hasFnAttribute("amdgpu-work-group-id-x"))
WorkGroupIDX = true;
- if (F->hasFnAttribute("amdgpu-work-group-id-y"))
+ if (F.hasFnAttribute("amdgpu-work-group-id-y"))
WorkGroupIDY = true;
- if (F->hasFnAttribute("amdgpu-work-group-id-z"))
+ if (F.hasFnAttribute("amdgpu-work-group-id-z"))
WorkGroupIDZ = true;
- if (F->hasFnAttribute("amdgpu-work-item-id-x"))
+ if (F.hasFnAttribute("amdgpu-work-item-id-x"))
WorkItemIDX = true;
- if (F->hasFnAttribute("amdgpu-work-item-id-y"))
+ if (F.hasFnAttribute("amdgpu-work-item-id-y"))
WorkItemIDY = true;
- if (F->hasFnAttribute("amdgpu-work-item-id-z"))
+ if (F.hasFnAttribute("amdgpu-work-item-id-z"))
WorkItemIDZ = true;
}
const MachineFrameInfo &FrameInfo = MF.getFrameInfo();
- bool MaySpill = ST.isVGPRSpillingEnabled(*F);
+ bool MaySpill = ST.isVGPRSpillingEnabled(F);
bool HasStackObjects = FrameInfo.hasStackObjects();
if (isEntryFunction()) {
@@ -145,10 +126,11 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
if (HasStackObjects || MaySpill) {
PrivateSegmentWaveByteOffset = true;
- // HS and GS always have the scratch wave offset in SGPR5 on GFX9.
- if (ST.getGeneration() >= AMDGPUSubtarget::GFX9 &&
- (CC == CallingConv::AMDGPU_HS || CC == CallingConv::AMDGPU_GS))
- PrivateSegmentWaveByteOffsetSystemSGPR = AMDGPU::SGPR5;
+ // HS and GS always have the scratch wave offset in SGPR5 on GFX9.
+ if (ST.getGeneration() >= AMDGPUSubtarget::GFX9 &&
+ (CC == CallingConv::AMDGPU_HS || CC == CallingConv::AMDGPU_GS))
+ ArgInfo.PrivateSegmentWaveByteOffset
+ = ArgDescriptor::createRegister(AMDGPU::SGPR5);
}
}
@@ -157,78 +139,94 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
if (HasStackObjects || MaySpill)
PrivateSegmentBuffer = true;
- if (F->hasFnAttribute("amdgpu-dispatch-ptr"))
+ if (F.hasFnAttribute("amdgpu-dispatch-ptr"))
DispatchPtr = true;
- if (F->hasFnAttribute("amdgpu-queue-ptr"))
+ if (F.hasFnAttribute("amdgpu-queue-ptr"))
QueuePtr = true;
- if (F->hasFnAttribute("amdgpu-dispatch-id"))
+ if (F.hasFnAttribute("amdgpu-dispatch-id"))
DispatchID = true;
} else if (ST.isMesaGfxShader(MF)) {
if (HasStackObjects || MaySpill)
ImplicitBufferPtr = true;
}
- if (F->hasFnAttribute("amdgpu-kernarg-segment-ptr"))
+ if (F.hasFnAttribute("amdgpu-kernarg-segment-ptr"))
KernargSegmentPtr = true;
if (ST.hasFlatAddressSpace() && isEntryFunction() && IsCOV2) {
// TODO: This could be refined a lot. The attribute is a poor way of
// detecting calls that may require it before argument lowering.
- if (HasStackObjects || F->hasFnAttribute("amdgpu-flat-scratch"))
+ if (HasStackObjects || F.hasFnAttribute("amdgpu-flat-scratch"))
FlatScratchInit = true;
}
+
+ Attribute A = F.getFnAttribute("amdgpu-git-ptr-high");
+ StringRef S = A.getValueAsString();
+ if (!S.empty())
+ S.consumeInteger(0, GITPtrHigh);
}
unsigned SIMachineFunctionInfo::addPrivateSegmentBuffer(
const SIRegisterInfo &TRI) {
- PrivateSegmentBufferUserSGPR = TRI.getMatchingSuperReg(
- getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_128RegClass);
+ ArgInfo.PrivateSegmentBuffer =
+ ArgDescriptor::createRegister(TRI.getMatchingSuperReg(
+ getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_128RegClass));
NumUserSGPRs += 4;
- return PrivateSegmentBufferUserSGPR;
+ return ArgInfo.PrivateSegmentBuffer.getRegister();
}
unsigned SIMachineFunctionInfo::addDispatchPtr(const SIRegisterInfo &TRI) {
- DispatchPtrUserSGPR = TRI.getMatchingSuperReg(
- getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass);
+ ArgInfo.DispatchPtr = ArgDescriptor::createRegister(TRI.getMatchingSuperReg(
+ getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass));
NumUserSGPRs += 2;
- return DispatchPtrUserSGPR;
+ return ArgInfo.DispatchPtr.getRegister();
}
unsigned SIMachineFunctionInfo::addQueuePtr(const SIRegisterInfo &TRI) {
- QueuePtrUserSGPR = TRI.getMatchingSuperReg(
- getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass);
+ ArgInfo.QueuePtr = ArgDescriptor::createRegister(TRI.getMatchingSuperReg(
+ getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass));
NumUserSGPRs += 2;
- return QueuePtrUserSGPR;
+ return ArgInfo.QueuePtr.getRegister();
}
unsigned SIMachineFunctionInfo::addKernargSegmentPtr(const SIRegisterInfo &TRI) {
- KernargSegmentPtrUserSGPR = TRI.getMatchingSuperReg(
- getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass);
+ ArgInfo.KernargSegmentPtr
+ = ArgDescriptor::createRegister(TRI.getMatchingSuperReg(
+ getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass));
NumUserSGPRs += 2;
- return KernargSegmentPtrUserSGPR;
+ return ArgInfo.KernargSegmentPtr.getRegister();
}
unsigned SIMachineFunctionInfo::addDispatchID(const SIRegisterInfo &TRI) {
- DispatchIDUserSGPR = TRI.getMatchingSuperReg(
- getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass);
+ ArgInfo.DispatchID = ArgDescriptor::createRegister(TRI.getMatchingSuperReg(
+ getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass));
NumUserSGPRs += 2;
- return DispatchIDUserSGPR;
+ return ArgInfo.DispatchID.getRegister();
}
unsigned SIMachineFunctionInfo::addFlatScratchInit(const SIRegisterInfo &TRI) {
- FlatScratchInitUserSGPR = TRI.getMatchingSuperReg(
- getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass);
+ ArgInfo.FlatScratchInit = ArgDescriptor::createRegister(TRI.getMatchingSuperReg(
+ getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass));
NumUserSGPRs += 2;
- return FlatScratchInitUserSGPR;
+ return ArgInfo.FlatScratchInit.getRegister();
}
unsigned SIMachineFunctionInfo::addImplicitBufferPtr(const SIRegisterInfo &TRI) {
- ImplicitBufferPtrUserSGPR = TRI.getMatchingSuperReg(
- getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass);
+ ArgInfo.ImplicitBufferPtr = ArgDescriptor::createRegister(TRI.getMatchingSuperReg(
+ getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass));
NumUserSGPRs += 2;
- return ImplicitBufferPtrUserSGPR;
+ return ArgInfo.ImplicitBufferPtr.getRegister();
+}
+
+static bool isCalleeSavedReg(const MCPhysReg *CSRegs, MCPhysReg Reg) {
+ for (unsigned I = 0; CSRegs[I]; ++I) {
+ if (CSRegs[I] == Reg)
+ return true;
+ }
+
+ return false;
}
/// Reserve a slice of a VGPR to support spilling for FrameIndex \p FI.
@@ -252,6 +250,8 @@ bool SIMachineFunctionInfo::allocateSGPRSpillToVGPR(MachineFunction &MF,
int NumLanes = Size / 4;
+ const MCPhysReg *CSRegs = TRI->getCalleeSavedRegs(&MF);
+
// Make sure to handle the case where a wide SGPR spill may span between two
// VGPRs.
for (int I = 0; I < NumLanes; ++I, ++NumVGPRSpillLanes) {
@@ -261,21 +261,28 @@ bool SIMachineFunctionInfo::allocateSGPRSpillToVGPR(MachineFunction &MF,
if (VGPRIndex == 0) {
LaneVGPR = TRI->findUnusedRegister(MRI, &AMDGPU::VGPR_32RegClass, MF);
if (LaneVGPR == AMDGPU::NoRegister) {
- // We have no VGPRs left for spilling SGPRs. Reset because we won't
+ // We have no VGPRs left for spilling SGPRs. Reset because we will not
// partially spill the SGPR to VGPRs.
SGPRToVGPRSpills.erase(FI);
NumVGPRSpillLanes -= I;
return false;
}
- SpillVGPRs.push_back(LaneVGPR);
+ Optional<int> CSRSpillFI;
+ if (FrameInfo.hasCalls() && CSRegs && isCalleeSavedReg(CSRegs, LaneVGPR)) {
+ // TODO: Should this be a CreateSpillStackObject? This is technically a
+ // weird CSR spill.
+ CSRSpillFI = FrameInfo.CreateStackObject(4, 4, false);
+ }
+
+ SpillVGPRs.push_back(SGPRSpillVGPRCSR(LaneVGPR, CSRSpillFI));
      // Add this register as live-in to all blocks to avoid machine verifier
// complaining about use of an undefined physical register.
for (MachineBasicBlock &BB : MF)
BB.addLiveIn(LaneVGPR);
} else {
- LaneVGPR = SpillVGPRs.back();
+ LaneVGPR = SpillVGPRs.back().VGPR;
}
SpillLanes.push_back(SpilledReg(LaneVGPR, VGPRIndex));
diff --git a/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/lib/Target/AMDGPU/SIMachineFunctionInfo.h
index 4c7f38a09a48..5dde72910ee3 100644
--- a/lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ b/lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -1,4 +1,4 @@
-//===- SIMachineFunctionInfo.h - SIMachineFunctionInfo interface -*- C++ -*-==//
+//==- SIMachineFunctionInfo.h - SIMachineFunctionInfo interface --*- C++ -*-==//
//
// The LLVM Compiler Infrastructure
//
@@ -14,23 +14,32 @@
#ifndef LLVM_LIB_TARGET_AMDGPU_SIMACHINEFUNCTIONINFO_H
#define LLVM_LIB_TARGET_AMDGPU_SIMACHINEFUNCTIONINFO_H
+#include "AMDGPUArgumentUsageInfo.h"
#include "AMDGPUMachineFunction.h"
-#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIRegisterInfo.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/PseudoSourceValue.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/Support/ErrorHandling.h"
#include <array>
#include <cassert>
-#include <map>
#include <utility>
+#include <vector>
namespace llvm {
+class MachineFrameInfo;
+class MachineFunction;
+class TargetRegisterClass;
+
class AMDGPUImagePseudoSourceValue : public PseudoSourceValue {
public:
- explicit AMDGPUImagePseudoSourceValue() :
- PseudoSourceValue(PseudoSourceValue::TargetCustom) { }
+ explicit AMDGPUImagePseudoSourceValue(const TargetInstrInfo &TII) :
+ PseudoSourceValue(PseudoSourceValue::TargetCustom, TII) { }
bool isConstant(const MachineFrameInfo *) const override {
// This should probably be true for most images, but we will start by being
@@ -44,7 +53,7 @@ public:
return false;
}
- bool mayAlias(const MachineFrameInfo*) const override {
+ bool mayAlias(const MachineFrameInfo *) const override {
// FIXME: If we ever change image intrinsics to accept fat pointers, then
// this could be true for some cases.
return false;
@@ -53,8 +62,8 @@ public:
class AMDGPUBufferPseudoSourceValue : public PseudoSourceValue {
public:
- explicit AMDGPUBufferPseudoSourceValue() :
- PseudoSourceValue(PseudoSourceValue::TargetCustom) { }
+ explicit AMDGPUBufferPseudoSourceValue(const TargetInstrInfo &TII) :
+ PseudoSourceValue(PseudoSourceValue::TargetCustom, TII) { }
bool isConstant(const MachineFrameInfo *) const override {
// This should probably be true for most images, but we will start by being
@@ -68,7 +77,7 @@ public:
return false;
}
- bool mayAlias(const MachineFrameInfo*) const override {
+ bool mayAlias(const MachineFrameInfo *) const override {
// FIXME: If we ever change image intrinsics to accept fat pointers, then
// this could be true for some cases.
return false;
@@ -78,86 +87,68 @@ public:
/// This class keeps track of the SPI_SP_INPUT_ADDR config register, which
/// tells the hardware which interpolation parameters to load.
class SIMachineFunctionInfo final : public AMDGPUMachineFunction {
- // FIXME: This should be removed and getPreloadedValue moved here.
- friend class SIRegisterInfo;
-
- unsigned TIDReg;
+ unsigned TIDReg = AMDGPU::NoRegister;
// Registers that may be reserved for spilling purposes. These may be the same
// as the input registers.
- unsigned ScratchRSrcReg;
- unsigned ScratchWaveOffsetReg;
+ unsigned ScratchRSrcReg = AMDGPU::PRIVATE_RSRC_REG;
+ unsigned ScratchWaveOffsetReg = AMDGPU::SCRATCH_WAVE_OFFSET_REG;
// This is the current function's incremented size from the kernel's scratch
// wave offset register. For an entry function, this is exactly the same as
// the ScratchWaveOffsetReg.
- unsigned FrameOffsetReg;
+ unsigned FrameOffsetReg = AMDGPU::FP_REG;
// Top of the stack SGPR offset derived from the ScratchWaveOffsetReg.
- unsigned StackPtrOffsetReg;
-
- // Input registers for non-HSA ABI
- unsigned ImplicitBufferPtrUserSGPR;
-
- // Input registers setup for the HSA ABI.
- // User SGPRs in allocation order.
- unsigned PrivateSegmentBufferUserSGPR;
- unsigned DispatchPtrUserSGPR;
- unsigned QueuePtrUserSGPR;
- unsigned KernargSegmentPtrUserSGPR;
- unsigned DispatchIDUserSGPR;
- unsigned FlatScratchInitUserSGPR;
- unsigned PrivateSegmentSizeUserSGPR;
- unsigned GridWorkGroupCountXUserSGPR;
- unsigned GridWorkGroupCountYUserSGPR;
- unsigned GridWorkGroupCountZUserSGPR;
-
- // System SGPRs in allocation order.
- unsigned WorkGroupIDXSystemSGPR;
- unsigned WorkGroupIDYSystemSGPR;
- unsigned WorkGroupIDZSystemSGPR;
- unsigned WorkGroupInfoSystemSGPR;
- unsigned PrivateSegmentWaveByteOffsetSystemSGPR;
-
- // VGPR inputs. These are always v0, v1 and v2 for entry functions.
- unsigned WorkItemIDXVGPR;
- unsigned WorkItemIDYVGPR;
- unsigned WorkItemIDZVGPR;
+ unsigned StackPtrOffsetReg = AMDGPU::SP_REG;
- // Graphics info.
- unsigned PSInputAddr;
- unsigned PSInputEnable;
+ AMDGPUFunctionArgInfo ArgInfo;
- bool ReturnsVoid;
+ // Graphics info.
+ unsigned PSInputAddr = 0;
+ unsigned PSInputEnable = 0;
+
+ /// Number of bytes of arguments this function has on the stack. If the callee
+ /// is expected to restore the argument stack this should be a multiple of 16,
+ /// all usable during a tail call.
+ ///
+ /// The alternative would forbid tail call optimisation in some cases: if we
+ /// want to transfer control from a function with 8-bytes of stack-argument
+ /// space to a function with 16-bytes then misalignment of this value would
+ /// make a stack adjustment necessary, which could not be undone by the
+ /// callee.
+ unsigned BytesInStackArgArea = 0;
+
+ bool ReturnsVoid = true;
// A pair of default/requested minimum/maximum flat work group sizes.
// Minimum - first, maximum - second.
- std::pair<unsigned, unsigned> FlatWorkGroupSizes;
+ std::pair<unsigned, unsigned> FlatWorkGroupSizes = {0, 0};
// A pair of default/requested minimum/maximum number of waves per execution
// unit. Minimum - first, maximum - second.
- std::pair<unsigned, unsigned> WavesPerEU;
+ std::pair<unsigned, unsigned> WavesPerEU = {0, 0};
// Stack object indices for work group IDs.
- std::array<int, 3> DebuggerWorkGroupIDStackObjectIndices;
+ std::array<int, 3> DebuggerWorkGroupIDStackObjectIndices = {{0, 0, 0}};
+
// Stack object indices for work item IDs.
- std::array<int, 3> DebuggerWorkItemIDStackObjectIndices;
+ std::array<int, 3> DebuggerWorkItemIDStackObjectIndices = {{0, 0, 0}};
AMDGPUBufferPseudoSourceValue BufferPSV;
AMDGPUImagePseudoSourceValue ImagePSV;
private:
- unsigned LDSWaveSpillSize;
- unsigned ScratchOffsetReg;
- unsigned NumUserSGPRs;
- unsigned NumSystemSGPRs;
+ unsigned LDSWaveSpillSize = 0;
+ unsigned NumUserSGPRs = 0;
+ unsigned NumSystemSGPRs = 0;
- bool HasSpilledSGPRs;
- bool HasSpilledVGPRs;
- bool HasNonSpillStackObjects;
+ bool HasSpilledSGPRs = false;
+ bool HasSpilledVGPRs = false;
+ bool HasNonSpillStackObjects = false;
- unsigned NumSpilledSGPRs;
- unsigned NumSpilledVGPRs;
+ unsigned NumSpilledSGPRs = 0;
+ unsigned NumSpilledVGPRs = 0;
// Feature bits required for inputs passed in user SGPRs.
bool PrivateSegmentBuffer : 1;
@@ -186,6 +177,15 @@ private:
// Other shaders indirect 64-bits at sgpr[0:1]
bool ImplicitBufferPtr : 1;
+ // Pointer to where the ABI inserts special kernel arguments separate from the
+ // user arguments. This is an offset from the KernargSegmentPtr.
+ bool ImplicitArgPtr : 1;
+
+ // The hard-wired high half of the address of the global information table
+ // for AMDPAL OS type. 0xffffffff represents no hard-wired high half, since
+ // current hardware only allows a 16 bit value.
+ unsigned GITPtrHigh;
+
MCPhysReg getNextUserSGPR() const {
assert(NumSystemSGPRs == 0 && "System SGPRs must be added after user SGPRs");
return AMDGPU::SGPR0 + NumUserSGPRs;
@@ -201,24 +201,34 @@ public:
int Lane = -1;
SpilledReg() = default;
- SpilledReg(unsigned R, int L) : VGPR (R), Lane (L) { }
+ SpilledReg(unsigned R, int L) : VGPR (R), Lane (L) {}
bool hasLane() { return Lane != -1;}
bool hasReg() { return VGPR != AMDGPU::NoRegister;}
};
+ struct SGPRSpillVGPRCSR {
+ // VGPR used for SGPR spills
+ unsigned VGPR;
+
+ // If the VGPR is a CSR, the stack slot used to save/restore it in the
+ // prolog/epilog.
+ Optional<int> FI;
+
+ SGPRSpillVGPRCSR(unsigned V, Optional<int> F) : VGPR(V), FI(F) {}
+ };
+
private:
// SGPR->VGPR spilling support.
- typedef std::pair<unsigned, unsigned> SpillRegMask;
+ using SpillRegMask = std::pair<unsigned, unsigned>;
// Track VGPR + wave index for each subregister of the SGPR spilled to
// frameindex key.
DenseMap<int, std::vector<SpilledReg>> SGPRToVGPRSpills;
unsigned NumVGPRSpillLanes = 0;
- SmallVector<unsigned, 2> SpillVGPRs;
+ SmallVector<SGPRSpillVGPRCSR, 2> SpillVGPRs;
public:
-
SIMachineFunctionInfo(const MachineFunction &MF);
ArrayRef<SpilledReg> getSGPRToVGPRSpills(int FrameIndex) const {
@@ -227,13 +237,25 @@ public:
ArrayRef<SpilledReg>() : makeArrayRef(I->second);
}
+ ArrayRef<SGPRSpillVGPRCSR> getSGPRSpillVGPRs() const {
+ return SpillVGPRs;
+ }
+
bool allocateSGPRSpillToVGPR(MachineFunction &MF, int FI);
void removeSGPRToVGPRFrameIndices(MachineFrameInfo &MFI);
- bool hasCalculatedTID() const { return TIDReg != AMDGPU::NoRegister; };
- unsigned getTIDReg() const { return TIDReg; };
+ bool hasCalculatedTID() const { return TIDReg != AMDGPU::NoRegister; }
+ unsigned getTIDReg() const { return TIDReg; }
void setTIDReg(unsigned Reg) { TIDReg = Reg; }
+ unsigned getBytesInStackArgArea() const {
+ return BytesInStackArgArea;
+ }
+
+ void setBytesInStackArgArea(unsigned Bytes) {
+ BytesInStackArgArea = Bytes;
+ }
+
// Add user SGPRs.
unsigned addPrivateSegmentBuffer(const SIRegisterInfo &TRI);
unsigned addDispatchPtr(const SIRegisterInfo &TRI);
@@ -245,37 +267,51 @@ public:
// Add system SGPRs.
unsigned addWorkGroupIDX() {
- WorkGroupIDXSystemSGPR = getNextSystemSGPR();
+ ArgInfo.WorkGroupIDX = ArgDescriptor::createRegister(getNextSystemSGPR());
NumSystemSGPRs += 1;
- return WorkGroupIDXSystemSGPR;
+ return ArgInfo.WorkGroupIDX.getRegister();
}
unsigned addWorkGroupIDY() {
- WorkGroupIDYSystemSGPR = getNextSystemSGPR();
+ ArgInfo.WorkGroupIDY = ArgDescriptor::createRegister(getNextSystemSGPR());
NumSystemSGPRs += 1;
- return WorkGroupIDYSystemSGPR;
+ return ArgInfo.WorkGroupIDY.getRegister();
}
unsigned addWorkGroupIDZ() {
- WorkGroupIDZSystemSGPR = getNextSystemSGPR();
+ ArgInfo.WorkGroupIDZ = ArgDescriptor::createRegister(getNextSystemSGPR());
NumSystemSGPRs += 1;
- return WorkGroupIDZSystemSGPR;
+ return ArgInfo.WorkGroupIDZ.getRegister();
}
unsigned addWorkGroupInfo() {
- WorkGroupInfoSystemSGPR = getNextSystemSGPR();
+ ArgInfo.WorkGroupInfo = ArgDescriptor::createRegister(getNextSystemSGPR());
NumSystemSGPRs += 1;
- return WorkGroupInfoSystemSGPR;
+ return ArgInfo.WorkGroupInfo.getRegister();
+ }
+
+ // Add special VGPR inputs
+ void setWorkItemIDX(ArgDescriptor Arg) {
+ ArgInfo.WorkItemIDX = Arg;
+ }
+
+ void setWorkItemIDY(ArgDescriptor Arg) {
+ ArgInfo.WorkItemIDY = Arg;
+ }
+
+ void setWorkItemIDZ(ArgDescriptor Arg) {
+ ArgInfo.WorkItemIDZ = Arg;
}
unsigned addPrivateSegmentWaveByteOffset() {
- PrivateSegmentWaveByteOffsetSystemSGPR = getNextSystemSGPR();
+ ArgInfo.PrivateSegmentWaveByteOffset
+ = ArgDescriptor::createRegister(getNextSystemSGPR());
NumSystemSGPRs += 1;
- return PrivateSegmentWaveByteOffsetSystemSGPR;
+ return ArgInfo.PrivateSegmentWaveByteOffset.getRegister();
}
void setPrivateSegmentWaveByteOffset(unsigned Reg) {
- PrivateSegmentWaveByteOffsetSystemSGPR = Reg;
+ ArgInfo.PrivateSegmentWaveByteOffset = ArgDescriptor::createRegister(Reg);
}
bool hasPrivateSegmentBuffer() const {
@@ -346,10 +382,35 @@ public:
return WorkItemIDZ;
}
+ bool hasImplicitArgPtr() const {
+ return ImplicitArgPtr;
+ }
+
bool hasImplicitBufferPtr() const {
return ImplicitBufferPtr;
}
+ AMDGPUFunctionArgInfo &getArgInfo() {
+ return ArgInfo;
+ }
+
+ const AMDGPUFunctionArgInfo &getArgInfo() const {
+ return ArgInfo;
+ }
+
+ std::pair<const ArgDescriptor *, const TargetRegisterClass *>
+ getPreloadedValue(AMDGPUFunctionArgInfo::PreloadedValue Value) const {
+ return ArgInfo.getPreloadedValue(Value);
+ }
+
+ unsigned getPreloadedReg(AMDGPUFunctionArgInfo::PreloadedValue Value) const {
+ return ArgInfo.getPreloadedValue(Value).first->getRegister();
+ }
+
+ unsigned getGITPtrHigh() const {
+ return GITPtrHigh;
+ }
+
unsigned getNumUserSGPRs() const {
return NumUserSGPRs;
}
@@ -359,7 +420,7 @@ public:
}
unsigned getPrivateSegmentWaveByteOffsetSystemSGPR() const {
- return PrivateSegmentWaveByteOffsetSystemSGPR;
+ return ArgInfo.PrivateSegmentWaveByteOffset.getRegister();
}
/// \brief Returns the physical register reserved for use as the resource
@@ -401,11 +462,11 @@ public:
}
unsigned getQueuePtrUserSGPR() const {
- return QueuePtrUserSGPR;
+ return ArgInfo.QueuePtr.getRegister();
}
unsigned getImplicitBufferPtrUserSGPR() const {
- return ImplicitBufferPtrUserSGPR;
+ return ArgInfo.ImplicitBufferPtr.getRegister();
}
bool hasSpilledSGPRs() const {
@@ -537,13 +598,13 @@ public:
switch (Dim) {
case 0:
assert(hasWorkGroupIDX());
- return WorkGroupIDXSystemSGPR;
+ return ArgInfo.WorkGroupIDX.getRegister();
case 1:
assert(hasWorkGroupIDY());
- return WorkGroupIDYSystemSGPR;
+ return ArgInfo.WorkGroupIDY.getRegister();
case 2:
assert(hasWorkGroupIDZ());
- return WorkGroupIDZSystemSGPR;
+ return ArgInfo.WorkGroupIDZ.getRegister();
}
llvm_unreachable("unexpected dimension");
}
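
The accessors above replace the per-input SGPR fields with a single AMDGPUFunctionArgInfo queried through getPreloadedValue()/getPreloadedReg(). A minimal sketch of how lowering code might look up one preloaded input after this change; the helper name and the QUEUE_PTR enumerator of AMDGPUFunctionArgInfo::PreloadedValue are illustrative assumptions, not part of this patch:

  #include "SIMachineFunctionInfo.h"
  #include "llvm/CodeGen/MachineFunction.h"

  // Hypothetical helper: fetch the SGPR holding the queue pointer, assuming the
  // input was actually allocated for this function (getPreloadedReg
  // dereferences the argument descriptor unconditionally).
  static unsigned getQueuePtrReg(const llvm::MachineFunction &MF) {
    const llvm::SIMachineFunctionInfo *MFI =
        MF.getInfo<llvm::SIMachineFunctionInfo>();
    return MFI->getPreloadedReg(llvm::AMDGPUFunctionArgInfo::QUEUE_PTR);
  }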
diff --git a/lib/Target/AMDGPU/SIMachineScheduler.cpp b/lib/Target/AMDGPU/SIMachineScheduler.cpp
index 34886c48f461..6b67b76652ed 100644
--- a/lib/Target/AMDGPU/SIMachineScheduler.cpp
+++ b/lib/Target/AMDGPU/SIMachineScheduler.cpp
@@ -19,16 +19,16 @@
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/LiveInterval.h"
-#include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/CodeGen/RegisterPressure.h"
#include "llvm/CodeGen/SlotIndexes.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetRegisterInfo.h"
#include <algorithm>
#include <cassert>
#include <map>
@@ -595,11 +595,11 @@ void SIScheduleBlock::printDebug(bool full) {
<< LiveOutPressure[DAG->getVGPRSetID()] << "\n\n";
dbgs() << "LiveIns:\n";
for (unsigned Reg : LiveInRegs)
- dbgs() << PrintVRegOrUnit(Reg, DAG->getTRI()) << ' ';
+ dbgs() << printVRegOrUnit(Reg, DAG->getTRI()) << ' ';
dbgs() << "\nLiveOuts:\n";
for (unsigned Reg : LiveOutRegs)
- dbgs() << PrintVRegOrUnit(Reg, DAG->getTRI()) << ' ';
+ dbgs() << printVRegOrUnit(Reg, DAG->getTRI()) << ' ';
}
dbgs() << "\nInstructions:\n";
@@ -1130,6 +1130,62 @@ void SIScheduleBlockCreator::regroupNoUserInstructions() {
}
}
+void SIScheduleBlockCreator::colorExports() {
+ unsigned ExportColor = NextNonReservedID++;
+ SmallVector<unsigned, 8> ExpGroup;
+
+ // Put all exports together in a block.
+ // The block will naturally end up being scheduled last,
+ // thus putting exports at the end of the schedule, which
+ // is better for performance.
+ // However, we must ensure, for safety, that the exports can be put
+ // together in the same block without any other instruction.
+ // This could fail, for example, when scheduling after regalloc, if a
+ // spilled register is reloaded from memory using the same register as a
+ // previous export.
+ // If that happens, do not regroup the exports.
+ for (unsigned SUNum : DAG->TopDownIndex2SU) {
+ const SUnit &SU = DAG->SUnits[SUNum];
+ if (SIInstrInfo::isEXP(*SU.getInstr())) {
+ // Check the EXP can be added to the group safely,
+ // ie without needing any other instruction.
+ // The EXP is allowed to depend on other EXP
+ // (they will be in the same group).
+ for (unsigned j : ExpGroup) {
+ bool HasSubGraph;
+ std::vector<int> SubGraph;
+ // By construction (topological order), if SU and
+ // DAG->SUnits[j] are linked, DAG->SUnits[j] is necessarily
+ // in the parent graph of SU.
+#ifndef NDEBUG
+ SubGraph = DAG->GetTopo()->GetSubGraph(SU, DAG->SUnits[j],
+ HasSubGraph);
+ assert(!HasSubGraph);
+#endif
+ SubGraph = DAG->GetTopo()->GetSubGraph(DAG->SUnits[j], SU,
+ HasSubGraph);
+ if (!HasSubGraph)
+ continue; // No dependencies between each other
+
+ // SubGraph contains all the instructions required
+ // between EXP SUnits[j] and EXP SU.
+ for (unsigned k : SubGraph) {
+ if (!SIInstrInfo::isEXP(*DAG->SUnits[k].getInstr()))
+ // Instructions other than EXP would be required in the group.
+ // Abort the grouping.
+ return;
+ }
+ }
+
+ ExpGroup.push_back(SUNum);
+ }
+ }
+
+ // The group can be formed. Give the color.
+ for (unsigned j : ExpGroup)
+ CurrentColoring[j] = ExportColor;
+}
+
void SIScheduleBlockCreator::createBlocksForVariant(SISchedulerBlockCreatorVariant BlockVariant) {
unsigned DAGSize = DAG->SUnits.size();
std::map<unsigned,unsigned> RealID;
@@ -1159,6 +1215,7 @@ void SIScheduleBlockCreator::createBlocksForVariant(SISchedulerBlockCreatorVaria
regroupNoUserInstructions();
colorMergeConstantLoadsNextGroup();
colorMergeIfPossibleNextGroupOnlyForReserved();
+ colorExports();
// Put SUs of same color into same block
Node2CurrentBlock.resize(DAGSize, -1);
@@ -1365,8 +1422,8 @@ void SIScheduleBlockCreator::fillStats() {
else {
unsigned Depth = 0;
for (SIScheduleBlock *Pred : Block->getPreds()) {
- if (Depth < Pred->Depth + 1)
- Depth = Pred->Depth + 1;
+ if (Depth < Pred->Depth + Pred->getCost())
+ Depth = Pred->Depth + Pred->getCost();
}
Block->Depth = Depth;
}
@@ -1380,7 +1437,7 @@ void SIScheduleBlockCreator::fillStats() {
else {
unsigned Height = 0;
for (const auto &Succ : Block->getSuccs())
- Height = std::min(Height, Succ.first->Height + 1);
+ Height = std::max(Height, Succ.first->Height + Succ.first->getCost());
Block->Height = Height;
}
}
@@ -1578,7 +1635,7 @@ SIScheduleBlock *SIScheduleBlockScheduler::pickBlock() {
dbgs() << Block->getID() << ' ';
dbgs() << "\nCurrent Live:\n";
for (unsigned Reg : LiveRegs)
- dbgs() << PrintVRegOrUnit(Reg, DAG->getTRI()) << ' ';
+ dbgs() << printVRegOrUnit(Reg, DAG->getTRI()) << ' ';
dbgs() << '\n';
dbgs() << "Current VGPRs: " << VregCurrentUsage << '\n';
dbgs() << "Current SGPRs: " << SregCurrentUsage << '\n';
@@ -1993,9 +2050,9 @@ void SIScheduleDAGMI::schedule()
placeDebugValues();
DEBUG({
- unsigned BBNum = begin()->getParent()->getNumber();
- dbgs() << "*** Final schedule for BB#" << BBNum << " ***\n";
- dumpSchedule();
- dbgs() << '\n';
- });
+ dbgs() << "*** Final schedule for "
+ << printMBBReference(*begin()->getParent()) << " ***\n";
+ dumpSchedule();
+ dbgs() << '\n';
+ });
}
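
The fillStats() changes above make block depth and height account for each predecessor's or successor's cost instead of a flat +1, and fix the height accumulation to use std::max. A small self-contained sketch of the corrected depth recurrence, using plain STL types instead of SIScheduleBlock purely for illustration:

  #include <algorithm>
  #include <utility>
  #include <vector>

  // Depth(B) = max over predecessors P of (Depth(P) + Cost(P)); each pair is
  // {PredDepth, PredCost}. Height is accumulated the same way over successors.
  static unsigned blockDepth(const std::vector<std::pair<unsigned, unsigned>> &Preds) {
    unsigned Depth = 0;
    for (const auto &P : Preds)
      Depth = std::max(Depth, P.first + P.second);
    return Depth;
  }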
diff --git a/lib/Target/AMDGPU/SIMachineScheduler.h b/lib/Target/AMDGPU/SIMachineScheduler.h
index 122d0f67ca8c..d824e38504e6 100644
--- a/lib/Target/AMDGPU/SIMachineScheduler.h
+++ b/lib/Target/AMDGPU/SIMachineScheduler.h
@@ -302,6 +302,9 @@ private:
// (we'd want these groups be at the end).
void regroupNoUserInstructions();
+ // Give export instructions their own color so they are grouped together
+ void colorExports();
+
void createBlocksForVariant(SISchedulerBlockCreatorVariant BlockVariant);
void topologicalSort();
diff --git a/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
new file mode 100644
index 000000000000..c73fb10b7ea0
--- /dev/null
+++ b/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
@@ -0,0 +1,627 @@
+//===- SIMemoryLegalizer.cpp ----------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief Memory legalizer - implements memory model. More information can be
+/// found here:
+/// http://llvm.org/docs/AMDGPUUsage.html#memory-model
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "AMDGPUMachineModuleInfo.h"
+#include "AMDGPUSubtarget.h"
+#include "SIDefines.h"
+#include "SIInstrInfo.h"
+#include "Utils/AMDGPUBaseInfo.h"
+#include "llvm/ADT/None.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/IR/DebugLoc.h"
+#include "llvm/IR/DiagnosticInfo.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/MC/MCInstrDesc.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/AtomicOrdering.h"
+#include <cassert>
+#include <list>
+
+using namespace llvm;
+using namespace llvm::AMDGPU;
+
+#define DEBUG_TYPE "si-memory-legalizer"
+#define PASS_NAME "SI Memory Legalizer"
+
+namespace {
+
+class SIMemOpInfo final {
+private:
+ SyncScope::ID SSID = SyncScope::System;
+ AtomicOrdering Ordering = AtomicOrdering::NotAtomic;
+ AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic;
+ bool IsNonTemporal = false;
+
+ SIMemOpInfo(SyncScope::ID SSID, AtomicOrdering Ordering)
+ : SSID(SSID), Ordering(Ordering) {}
+
+ SIMemOpInfo(SyncScope::ID SSID, AtomicOrdering Ordering,
+ AtomicOrdering FailureOrdering, bool IsNonTemporal = false)
+ : SSID(SSID), Ordering(Ordering), FailureOrdering(FailureOrdering),
+ IsNonTemporal(IsNonTemporal) {}
+
+ /// \returns Info constructed from \p MI, which has at least machine memory
+ /// operand.
+ static Optional<SIMemOpInfo> constructFromMIWithMMO(
+ const MachineBasicBlock::iterator &MI);
+
+public:
+ /// \returns Synchronization scope ID of the machine instruction used to
+ /// create this SIMemOpInfo.
+ SyncScope::ID getSSID() const {
+ return SSID;
+ }
+ /// \returns Ordering constraint of the machine instruction used to
+ /// create this SIMemOpInfo.
+ AtomicOrdering getOrdering() const {
+ return Ordering;
+ }
+ /// \returns Failure ordering constraint of the machine instruction used to
+ /// create this SIMemOpInfo.
+ AtomicOrdering getFailureOrdering() const {
+ return FailureOrdering;
+ }
+ /// \returns True if memory access of the machine instruction used to
+ /// create this SIMemOpInfo is non-temporal, false otherwise.
+ bool isNonTemporal() const {
+ return IsNonTemporal;
+ }
+
+ /// \returns True if ordering constraint of the machine instruction used to
+ /// create this SIMemOpInfo is unordered or higher, false otherwise.
+ bool isAtomic() const {
+ return Ordering != AtomicOrdering::NotAtomic;
+ }
+
+ /// \returns Load info if \p MI is a load operation, "None" otherwise.
+ static Optional<SIMemOpInfo> getLoadInfo(
+ const MachineBasicBlock::iterator &MI);
+ /// \returns Store info if \p MI is a store operation, "None" otherwise.
+ static Optional<SIMemOpInfo> getStoreInfo(
+ const MachineBasicBlock::iterator &MI);
+ /// \returns Atomic fence info if \p MI is an atomic fence operation,
+ /// "None" otherwise.
+ static Optional<SIMemOpInfo> getAtomicFenceInfo(
+ const MachineBasicBlock::iterator &MI);
+ /// \returns Atomic cmpxchg info if \p MI is an atomic cmpxchg operation,
+ /// "None" otherwise.
+ static Optional<SIMemOpInfo> getAtomicCmpxchgInfo(
+ const MachineBasicBlock::iterator &MI);
+ /// \returns Atomic rmw info if \p MI is an atomic rmw operation,
+ /// "None" otherwise.
+ static Optional<SIMemOpInfo> getAtomicRmwInfo(
+ const MachineBasicBlock::iterator &MI);
+
+ /// \brief Reports unknown synchronization scope used in \p MI to LLVM
+ /// context.
+ static void reportUnknownSyncScope(
+ const MachineBasicBlock::iterator &MI);
+};
+
+class SIMemoryLegalizer final : public MachineFunctionPass {
+private:
+ /// \brief Machine module info.
+ const AMDGPUMachineModuleInfo *MMI = nullptr;
+
+ /// \brief Instruction info.
+ const SIInstrInfo *TII = nullptr;
+
+ /// \brief Immediate for "vmcnt(0)".
+ unsigned Vmcnt0Immediate = 0;
+
+ /// \brief Opcode for cache invalidation instruction (L1).
+ unsigned Wbinvl1Opcode = 0;
+
+ /// \brief List of atomic pseudo instructions.
+ std::list<MachineBasicBlock::iterator> AtomicPseudoMIs;
+
+ /// \brief Sets named bit (BitName) to "true" if present in \p MI. Returns
+ /// true if \p MI is modified, false otherwise.
+ template <uint16_t BitName>
+ bool enableNamedBit(const MachineBasicBlock::iterator &MI) const {
+ int BitIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), BitName);
+ if (BitIdx == -1)
+ return false;
+
+ MachineOperand &Bit = MI->getOperand(BitIdx);
+ if (Bit.getImm() != 0)
+ return false;
+
+ Bit.setImm(1);
+ return true;
+ }
+
+ /// \brief Sets GLC bit to "true" if present in \p MI. Returns true if \p MI
+ /// is modified, false otherwise.
+ bool enableGLCBit(const MachineBasicBlock::iterator &MI) const {
+ return enableNamedBit<AMDGPU::OpName::glc>(MI);
+ }
+
+ /// \brief Sets SLC bit to "true" if present in \p MI. Returns true if \p MI
+ /// is modified, false otherwise.
+ bool enableSLCBit(const MachineBasicBlock::iterator &MI) const {
+ return enableNamedBit<AMDGPU::OpName::slc>(MI);
+ }
+
+ /// \brief Inserts "buffer_wbinvl1_vol" instruction \p Before or after \p MI.
+ /// Always returns true.
+ bool insertBufferWbinvl1Vol(MachineBasicBlock::iterator &MI,
+ bool Before = true) const;
+ /// \brief Inserts "s_waitcnt vmcnt(0)" instruction \p Before or after \p MI.
+ /// Always returns true.
+ bool insertWaitcntVmcnt0(MachineBasicBlock::iterator &MI,
+ bool Before = true) const;
+
+ /// \brief Removes all processed atomic pseudo instructions from the current
+ /// function. Returns true if current function is modified, false otherwise.
+ bool removeAtomicPseudoMIs();
+
+ /// \brief Expands load operation \p MI. Returns true if instructions are
+ /// added/deleted or \p MI is modified, false otherwise.
+ bool expandLoad(const SIMemOpInfo &MOI,
+ MachineBasicBlock::iterator &MI);
+ /// \brief Expands store operation \p MI. Returns true if instructions are
+ /// added/deleted or \p MI is modified, false otherwise.
+ bool expandStore(const SIMemOpInfo &MOI,
+ MachineBasicBlock::iterator &MI);
+ /// \brief Expands atomic fence operation \p MI. Returns true if
+ /// instructions are added/deleted or \p MI is modified, false otherwise.
+ bool expandAtomicFence(const SIMemOpInfo &MOI,
+ MachineBasicBlock::iterator &MI);
+ /// \brief Expands atomic cmpxchg operation \p MI. Returns true if
+ /// instructions are added/deleted or \p MI is modified, false otherwise.
+ bool expandAtomicCmpxchg(const SIMemOpInfo &MOI,
+ MachineBasicBlock::iterator &MI);
+ /// \brief Expands atomic rmw operation \p MI. Returns true if
+ /// instructions are added/deleted or \p MI is modified, false otherwise.
+ bool expandAtomicRmw(const SIMemOpInfo &MOI,
+ MachineBasicBlock::iterator &MI);
+
+public:
+ static char ID;
+
+ SIMemoryLegalizer() : MachineFunctionPass(ID) {}
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+ StringRef getPassName() const override {
+ return PASS_NAME;
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+};
+
+} // end anonymous namespace
+
+/* static */
+Optional<SIMemOpInfo> SIMemOpInfo::constructFromMIWithMMO(
+ const MachineBasicBlock::iterator &MI) {
+ assert(MI->getNumMemOperands() > 0);
+
+ const MachineFunction *MF = MI->getParent()->getParent();
+ const AMDGPUMachineModuleInfo *MMI =
+ &MF->getMMI().getObjFileInfo<AMDGPUMachineModuleInfo>();
+
+ SyncScope::ID SSID = SyncScope::SingleThread;
+ AtomicOrdering Ordering = AtomicOrdering::NotAtomic;
+ AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic;
+ bool IsNonTemporal = true;
+
+ // Validator should check whether or not MMOs cover the entire set of
+ // locations accessed by the memory instruction.
+ for (const auto &MMO : MI->memoperands()) {
+ const auto &IsSyncScopeInclusion =
+ MMI->isSyncScopeInclusion(SSID, MMO->getSyncScopeID());
+ if (!IsSyncScopeInclusion) {
+ reportUnknownSyncScope(MI);
+ return None;
+ }
+
+ SSID = IsSyncScopeInclusion.getValue() ? SSID : MMO->getSyncScopeID();
+ Ordering =
+ isStrongerThan(Ordering, MMO->getOrdering()) ?
+ Ordering : MMO->getOrdering();
+ FailureOrdering =
+ isStrongerThan(FailureOrdering, MMO->getFailureOrdering()) ?
+ FailureOrdering : MMO->getFailureOrdering();
+
+ if (!(MMO->getFlags() & MachineMemOperand::MONonTemporal))
+ IsNonTemporal = false;
+ }
+
+ return SIMemOpInfo(SSID, Ordering, FailureOrdering, IsNonTemporal);
+}
+
+/* static */
+Optional<SIMemOpInfo> SIMemOpInfo::getLoadInfo(
+ const MachineBasicBlock::iterator &MI) {
+ assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
+
+ if (!(MI->mayLoad() && !MI->mayStore()))
+ return None;
+
+ // Be conservative if there are no memory operands.
+ if (MI->getNumMemOperands() == 0)
+ return SIMemOpInfo(SyncScope::System,
+ AtomicOrdering::SequentiallyConsistent);
+
+ return SIMemOpInfo::constructFromMIWithMMO(MI);
+}
+
+/* static */
+Optional<SIMemOpInfo> SIMemOpInfo::getStoreInfo(
+ const MachineBasicBlock::iterator &MI) {
+ assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
+
+ if (!(!MI->mayLoad() && MI->mayStore()))
+ return None;
+
+ // Be conservative if there are no memory operands.
+ if (MI->getNumMemOperands() == 0)
+ return SIMemOpInfo(SyncScope::System,
+ AtomicOrdering::SequentiallyConsistent);
+
+ return SIMemOpInfo::constructFromMIWithMMO(MI);
+}
+
+/* static */
+Optional<SIMemOpInfo> SIMemOpInfo::getAtomicFenceInfo(
+ const MachineBasicBlock::iterator &MI) {
+ assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
+
+ if (MI->getOpcode() != AMDGPU::ATOMIC_FENCE)
+ return None;
+
+ SyncScope::ID SSID =
+ static_cast<SyncScope::ID>(MI->getOperand(1).getImm());
+ AtomicOrdering Ordering =
+ static_cast<AtomicOrdering>(MI->getOperand(0).getImm());
+ return SIMemOpInfo(SSID, Ordering);
+}
+
+/* static */
+Optional<SIMemOpInfo> SIMemOpInfo::getAtomicCmpxchgInfo(
+ const MachineBasicBlock::iterator &MI) {
+ assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
+
+ if (!(MI->mayLoad() && MI->mayStore()))
+ return None;
+
+ // Be conservative if there are no memory operands.
+ if (MI->getNumMemOperands() == 0)
+ return SIMemOpInfo(SyncScope::System,
+ AtomicOrdering::SequentiallyConsistent,
+ AtomicOrdering::SequentiallyConsistent);
+
+ return SIMemOpInfo::constructFromMIWithMMO(MI);
+}
+
+/* static */
+Optional<SIMemOpInfo> SIMemOpInfo::getAtomicRmwInfo(
+ const MachineBasicBlock::iterator &MI) {
+ assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
+
+ if (!(MI->mayLoad() && MI->mayStore()))
+ return None;
+
+ // Be conservative if there are no memory operands.
+ if (MI->getNumMemOperands() == 0)
+ return SIMemOpInfo(SyncScope::System,
+ AtomicOrdering::SequentiallyConsistent);
+
+ return SIMemOpInfo::constructFromMIWithMMO(MI);
+}
+
+/* static */
+void SIMemOpInfo::reportUnknownSyncScope(
+ const MachineBasicBlock::iterator &MI) {
+ DiagnosticInfoUnsupported Diag(MI->getParent()->getParent()->getFunction(),
+ "Unsupported synchronization scope");
+ LLVMContext *CTX = &MI->getParent()->getParent()->getFunction().getContext();
+ CTX->diagnose(Diag);
+}
+
+bool SIMemoryLegalizer::insertBufferWbinvl1Vol(MachineBasicBlock::iterator &MI,
+ bool Before) const {
+ MachineBasicBlock &MBB = *MI->getParent();
+ DebugLoc DL = MI->getDebugLoc();
+
+ if (!Before)
+ ++MI;
+
+ BuildMI(MBB, MI, DL, TII->get(Wbinvl1Opcode));
+
+ if (!Before)
+ --MI;
+
+ return true;
+}
+
+bool SIMemoryLegalizer::insertWaitcntVmcnt0(MachineBasicBlock::iterator &MI,
+ bool Before) const {
+ MachineBasicBlock &MBB = *MI->getParent();
+ DebugLoc DL = MI->getDebugLoc();
+
+ if (!Before)
+ ++MI;
+
+ BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(Vmcnt0Immediate);
+
+ if (!Before)
+ --MI;
+
+ return true;
+}
+
+bool SIMemoryLegalizer::removeAtomicPseudoMIs() {
+ if (AtomicPseudoMIs.empty())
+ return false;
+
+ for (auto &MI : AtomicPseudoMIs)
+ MI->eraseFromParent();
+
+ AtomicPseudoMIs.clear();
+ return true;
+}
+
+bool SIMemoryLegalizer::expandLoad(const SIMemOpInfo &MOI,
+ MachineBasicBlock::iterator &MI) {
+ assert(MI->mayLoad() && !MI->mayStore());
+
+ bool Changed = false;
+
+ if (MOI.isAtomic()) {
+ if (MOI.getSSID() == SyncScope::System ||
+ MOI.getSSID() == MMI->getAgentSSID()) {
+ if (MOI.getOrdering() == AtomicOrdering::Acquire ||
+ MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
+ Changed |= enableGLCBit(MI);
+
+ if (MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
+ Changed |= insertWaitcntVmcnt0(MI);
+
+ if (MOI.getOrdering() == AtomicOrdering::Acquire ||
+ MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
+ Changed |= insertWaitcntVmcnt0(MI, false);
+ Changed |= insertBufferWbinvl1Vol(MI, false);
+ }
+
+ return Changed;
+ }
+
+ if (MOI.getSSID() == SyncScope::SingleThread ||
+ MOI.getSSID() == MMI->getWorkgroupSSID() ||
+ MOI.getSSID() == MMI->getWavefrontSSID()) {
+ return Changed;
+ }
+
+ llvm_unreachable("Unsupported synchronization scope");
+ }
+
+ // Atomic instructions do not have the nontemporal attribute.
+ if (MOI.isNonTemporal()) {
+ Changed |= enableGLCBit(MI);
+ Changed |= enableSLCBit(MI);
+ return Changed;
+ }
+
+ return Changed;
+}
+
+bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI,
+ MachineBasicBlock::iterator &MI) {
+ assert(!MI->mayLoad() && MI->mayStore());
+
+ bool Changed = false;
+
+ if (MOI.isAtomic()) {
+ if (MOI.getSSID() == SyncScope::System ||
+ MOI.getSSID() == MMI->getAgentSSID()) {
+ if (MOI.getOrdering() == AtomicOrdering::Release ||
+ MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
+ Changed |= insertWaitcntVmcnt0(MI);
+
+ return Changed;
+ }
+
+ if (MOI.getSSID() == SyncScope::SingleThread ||
+ MOI.getSSID() == MMI->getWorkgroupSSID() ||
+ MOI.getSSID() == MMI->getWavefrontSSID()) {
+ return Changed;
+ }
+
+ llvm_unreachable("Unsupported synchronization scope");
+ }
+
+ // Atomic instructions do not have the nontemporal attribute.
+ if (MOI.isNonTemporal()) {
+ Changed |= enableGLCBit(MI);
+ Changed |= enableSLCBit(MI);
+ return Changed;
+ }
+
+ return Changed;
+}
+
+bool SIMemoryLegalizer::expandAtomicFence(const SIMemOpInfo &MOI,
+ MachineBasicBlock::iterator &MI) {
+ assert(MI->getOpcode() == AMDGPU::ATOMIC_FENCE);
+
+ bool Changed = false;
+
+ if (MOI.isAtomic()) {
+ if (MOI.getSSID() == SyncScope::System ||
+ MOI.getSSID() == MMI->getAgentSSID()) {
+ if (MOI.getOrdering() == AtomicOrdering::Acquire ||
+ MOI.getOrdering() == AtomicOrdering::Release ||
+ MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
+ MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
+ Changed |= insertWaitcntVmcnt0(MI);
+
+ if (MOI.getOrdering() == AtomicOrdering::Acquire ||
+ MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
+ MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
+ Changed |= insertBufferWbinvl1Vol(MI);
+
+ AtomicPseudoMIs.push_back(MI);
+ return Changed;
+ }
+
+ if (MOI.getSSID() == SyncScope::SingleThread ||
+ MOI.getSSID() == MMI->getWorkgroupSSID() ||
+ MOI.getSSID() == MMI->getWavefrontSSID()) {
+ AtomicPseudoMIs.push_back(MI);
+ return Changed;
+ }
+
+ SIMemOpInfo::reportUnknownSyncScope(MI);
+ }
+
+ return Changed;
+}
+
+bool SIMemoryLegalizer::expandAtomicCmpxchg(const SIMemOpInfo &MOI,
+ MachineBasicBlock::iterator &MI) {
+ assert(MI->mayLoad() && MI->mayStore());
+
+ bool Changed = false;
+
+ if (MOI.isAtomic()) {
+ if (MOI.getSSID() == SyncScope::System ||
+ MOI.getSSID() == MMI->getAgentSSID()) {
+ if (MOI.getOrdering() == AtomicOrdering::Release ||
+ MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
+ MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent ||
+ MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent)
+ Changed |= insertWaitcntVmcnt0(MI);
+
+ if (MOI.getOrdering() == AtomicOrdering::Acquire ||
+ MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
+ MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent ||
+ MOI.getFailureOrdering() == AtomicOrdering::Acquire ||
+ MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent) {
+ Changed |= insertWaitcntVmcnt0(MI, false);
+ Changed |= insertBufferWbinvl1Vol(MI, false);
+ }
+
+ return Changed;
+ }
+
+ if (MOI.getSSID() == SyncScope::SingleThread ||
+ MOI.getSSID() == MMI->getWorkgroupSSID() ||
+ MOI.getSSID() == MMI->getWavefrontSSID()) {
+ Changed |= enableGLCBit(MI);
+ return Changed;
+ }
+
+ llvm_unreachable("Unsupported synchronization scope");
+ }
+
+ return Changed;
+}
+
+bool SIMemoryLegalizer::expandAtomicRmw(const SIMemOpInfo &MOI,
+ MachineBasicBlock::iterator &MI) {
+ assert(MI->mayLoad() && MI->mayStore());
+
+ bool Changed = false;
+
+ if (MOI.isAtomic()) {
+ if (MOI.getSSID() == SyncScope::System ||
+ MOI.getSSID() == MMI->getAgentSSID()) {
+ if (MOI.getOrdering() == AtomicOrdering::Release ||
+ MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
+ MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
+ Changed |= insertWaitcntVmcnt0(MI);
+
+ if (MOI.getOrdering() == AtomicOrdering::Acquire ||
+ MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
+ MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
+ Changed |= insertWaitcntVmcnt0(MI, false);
+ Changed |= insertBufferWbinvl1Vol(MI, false);
+ }
+
+ return Changed;
+ }
+
+ if (MOI.getSSID() == SyncScope::SingleThread ||
+ MOI.getSSID() == MMI->getWorkgroupSSID() ||
+ MOI.getSSID() == MMI->getWavefrontSSID()) {
+ Changed |= enableGLCBit(MI);
+ return Changed;
+ }
+
+ llvm_unreachable("Unsupported synchronization scope");
+ }
+
+ return Changed;
+}
+
+bool SIMemoryLegalizer::runOnMachineFunction(MachineFunction &MF) {
+ bool Changed = false;
+ const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+ const IsaInfo::IsaVersion IV = IsaInfo::getIsaVersion(ST.getFeatureBits());
+
+ MMI = &MF.getMMI().getObjFileInfo<AMDGPUMachineModuleInfo>();
+ TII = ST.getInstrInfo();
+
+ Vmcnt0Immediate =
+ AMDGPU::encodeWaitcnt(IV, 0, getExpcntBitMask(IV), getLgkmcntBitMask(IV));
+ Wbinvl1Opcode = ST.getGeneration() <= AMDGPUSubtarget::SOUTHERN_ISLANDS ?
+ AMDGPU::BUFFER_WBINVL1 : AMDGPU::BUFFER_WBINVL1_VOL;
+
+ for (auto &MBB : MF) {
+ for (auto MI = MBB.begin(); MI != MBB.end(); ++MI) {
+ if (!(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic))
+ continue;
+
+ if (const auto &MOI = SIMemOpInfo::getLoadInfo(MI))
+ Changed |= expandLoad(MOI.getValue(), MI);
+ else if (const auto &MOI = SIMemOpInfo::getStoreInfo(MI))
+ Changed |= expandStore(MOI.getValue(), MI);
+ else if (const auto &MOI = SIMemOpInfo::getAtomicFenceInfo(MI))
+ Changed |= expandAtomicFence(MOI.getValue(), MI);
+ else if (const auto &MOI = SIMemOpInfo::getAtomicCmpxchgInfo(MI))
+ Changed |= expandAtomicCmpxchg(MOI.getValue(), MI);
+ else if (const auto &MOI = SIMemOpInfo::getAtomicRmwInfo(MI))
+ Changed |= expandAtomicRmw(MOI.getValue(), MI);
+ }
+ }
+
+ Changed |= removeAtomicPseudoMIs();
+ return Changed;
+}
+
+INITIALIZE_PASS(SIMemoryLegalizer, DEBUG_TYPE, PASS_NAME, false, false)
+
+char SIMemoryLegalizer::ID = 0;
+char &llvm::SIMemoryLegalizerID = SIMemoryLegalizer::ID;
+
+FunctionPass *llvm::createSIMemoryLegalizerPass() {
+ return new SIMemoryLegalizer();
+}
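
SIMemOpInfo::constructFromMIWithMMO() above merges the synchronization scopes and atomic orderings of all memory operands on an instruction, keeping the most inclusive scope and the strongest ordering. A minimal sketch of the ordering half of that merge in isolation, built on the existing llvm::isStrongerThan() helper from llvm/Support/AtomicOrdering.h; the free function and the use of a plain vector are illustrative assumptions:

  #include "llvm/Support/AtomicOrdering.h"
  #include <vector>

  // Start from NotAtomic and keep the strongest ordering seen, mirroring the
  // per-MMO merge in constructFromMIWithMMO().
  static llvm::AtomicOrdering
  mergeOrderings(const std::vector<llvm::AtomicOrdering> &Orderings) {
    llvm::AtomicOrdering Result = llvm::AtomicOrdering::NotAtomic;
    for (llvm::AtomicOrdering AO : Orderings)
      if (llvm::isStrongerThan(AO, Result))
        Result = AO;
    return Result;
  }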
diff --git a/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp b/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp
index 4d2f917278e9..2dc6f2702b3b 100644
--- a/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp
+++ b/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp
@@ -10,7 +10,7 @@
#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
-#include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "llvm/ADT/SmallSet.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
@@ -87,6 +87,30 @@ static unsigned isCopyToExec(const MachineInstr &MI) {
return AMDGPU::NoRegister;
}
+/// If \p MI is a logical operation on an exec value,
+/// return the register it defines.
+static unsigned isLogicalOpOnExec(const MachineInstr &MI) {
+ switch (MI.getOpcode()) {
+ case AMDGPU::S_AND_B64:
+ case AMDGPU::S_OR_B64:
+ case AMDGPU::S_XOR_B64:
+ case AMDGPU::S_ANDN2_B64:
+ case AMDGPU::S_ORN2_B64:
+ case AMDGPU::S_NAND_B64:
+ case AMDGPU::S_NOR_B64:
+ case AMDGPU::S_XNOR_B64: {
+ const MachineOperand &Src1 = MI.getOperand(1);
+ if (Src1.isReg() && Src1.getReg() == AMDGPU::EXEC)
+ return MI.getOperand(0).getReg();
+ const MachineOperand &Src2 = MI.getOperand(2);
+ if (Src2.isReg() && Src2.getReg() == AMDGPU::EXEC)
+ return MI.getOperand(0).getReg();
+ }
+ }
+
+ return AMDGPU::NoRegister;
+}
+
static unsigned getSaveExecOp(unsigned Opc) {
switch (Opc) {
case AMDGPU::S_AND_B64:
@@ -181,6 +205,9 @@ static bool isLiveOut(const MachineBasicBlock &MBB, unsigned Reg) {
}
bool SIOptimizeExecMasking::runOnMachineFunction(MachineFunction &MF) {
+ if (skipFunction(MF.getFunction()))
+ return false;
+
const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
const SIRegisterInfo *TRI = ST.getRegisterInfo();
const SIInstrInfo *TII = ST.getInstrInfo();
@@ -209,8 +236,24 @@ bool SIOptimizeExecMasking::runOnMachineFunction(MachineFunction &MF) {
// Scan backwards to find the def.
auto CopyToExecInst = &*I;
auto CopyFromExecInst = findExecCopy(*TII, MBB, I, CopyToExec);
- if (CopyFromExecInst == E)
+ if (CopyFromExecInst == E) {
+ auto PrepareExecInst = std::next(I);
+ if (PrepareExecInst == E)
+ continue;
+ // Fold exec = COPY (S_AND_B64 reg, exec) -> exec = S_AND_B64 reg, exec
+ if (CopyToExecInst->getOperand(1).isKill() &&
+ isLogicalOpOnExec(*PrepareExecInst) == CopyToExec) {
+ DEBUG(dbgs() << "Fold exec copy: " << *PrepareExecInst);
+
+ PrepareExecInst->getOperand(0).setReg(AMDGPU::EXEC);
+
+ DEBUG(dbgs() << "into: " << *PrepareExecInst << '\n');
+
+ CopyToExecInst->eraseFromParent();
+ }
+
continue;
+ }
if (isLiveOut(MBB, CopyToExec)) {
// The copied register is live out and has a second use in another block.
@@ -233,10 +276,12 @@ bool SIOptimizeExecMasking::runOnMachineFunction(MachineFunction &MF) {
break;
}
+ bool ReadsCopyFromExec = J->readsRegister(CopyFromExec, TRI);
+
if (J->modifiesRegister(CopyToExec, TRI)) {
if (SaveExecInst) {
DEBUG(dbgs() << "Multiple instructions modify "
- << PrintReg(CopyToExec, TRI) << '\n');
+ << printReg(CopyToExec, TRI) << '\n');
SaveExecInst = nullptr;
break;
}
@@ -245,7 +290,7 @@ bool SIOptimizeExecMasking::runOnMachineFunction(MachineFunction &MF) {
if (SaveExecOp == AMDGPU::INSTRUCTION_LIST_END)
break;
- if (J->readsRegister(CopyFromExec, TRI)) {
+ if (ReadsCopyFromExec) {
SaveExecInst = &*J;
DEBUG(dbgs() << "Found save exec op: " << *SaveExecInst << '\n');
continue;
@@ -253,6 +298,18 @@ bool SIOptimizeExecMasking::runOnMachineFunction(MachineFunction &MF) {
DEBUG(dbgs() << "Instruction does not read exec copy: " << *J << '\n');
break;
}
+ } else if (ReadsCopyFromExec && !SaveExecInst) {
+ // Make sure no other instruction is trying to use this copy before it is
+ // rewritten by the saveexec, i.e. hasOneUse. There may have been another
+ // use, such as an inserted spill. For example:
+ //
+ // %sgpr0_sgpr1 = COPY %exec
+ // spill %sgpr0_sgpr1
+ // %sgpr2_sgpr3 = S_AND_B64 %sgpr0_sgpr1
+ //
+ DEBUG(dbgs() << "Found second use of save inst candidate: "
+ << *J << '\n');
+ break;
}
if (SaveExecInst && J->readsRegister(CopyToExec, TRI)) {
diff --git a/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp b/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp
new file mode 100644
index 000000000000..83074773c495
--- /dev/null
+++ b/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp
@@ -0,0 +1,252 @@
+//===-- SIOptimizeExecMaskingPreRA.cpp ------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief This pass removes redundant S_OR_B64 instructions enabling lanes in
+/// the exec mask. If two SI_END_CF (lowered as S_OR_B64) come together without
+/// any vector instructions between them, we can keep only the outer SI_END_CF:
+/// the CFG is structured, so the exec bits re-enabled by the outer end
+/// statement are always a superset of the exec bits re-enabled by the inner
+/// one. For example, when an inner if ends right where its enclosing if ends,
+/// the two back-to-back exec restores collapse into the outer one.
+///
+/// This needs to be done before RA to eliminate the saved exec bit registers,
+/// but after the register coalescer so that there are no vector register
+/// copies in between the different end-cf statements.
+///
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "AMDGPUSubtarget.h"
+#include "SIInstrInfo.h"
+#include "llvm/CodeGen/LiveIntervals.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "si-optimize-exec-masking-pre-ra"
+
+namespace {
+
+class SIOptimizeExecMaskingPreRA : public MachineFunctionPass {
+public:
+ static char ID;
+
+public:
+ SIOptimizeExecMaskingPreRA() : MachineFunctionPass(ID) {
+ initializeSIOptimizeExecMaskingPreRAPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ StringRef getPassName() const override {
+ return "SI optimize exec mask operations pre-RA";
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<LiveIntervals>();
+ AU.setPreservesAll();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+};
+
+} // End anonymous namespace.
+
+INITIALIZE_PASS_BEGIN(SIOptimizeExecMaskingPreRA, DEBUG_TYPE,
+ "SI optimize exec mask operations pre-RA", false, false)
+INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
+INITIALIZE_PASS_END(SIOptimizeExecMaskingPreRA, DEBUG_TYPE,
+ "SI optimize exec mask operations pre-RA", false, false)
+
+char SIOptimizeExecMaskingPreRA::ID = 0;
+
+char &llvm::SIOptimizeExecMaskingPreRAID = SIOptimizeExecMaskingPreRA::ID;
+
+FunctionPass *llvm::createSIOptimizeExecMaskingPreRAPass() {
+ return new SIOptimizeExecMaskingPreRA();
+}
+
+static bool isEndCF(const MachineInstr& MI, const SIRegisterInfo* TRI) {
+ return MI.getOpcode() == AMDGPU::S_OR_B64 &&
+ MI.modifiesRegister(AMDGPU::EXEC, TRI);
+}
+
+static bool isFullExecCopy(const MachineInstr& MI) {
+ return MI.isFullCopy() && MI.getOperand(1).getReg() == AMDGPU::EXEC;
+}
+
+static unsigned getOrNonExecReg(const MachineInstr &MI,
+ const SIInstrInfo &TII) {
+ auto Op = TII.getNamedOperand(MI, AMDGPU::OpName::src1);
+ if (Op->isReg() && Op->getReg() != AMDGPU::EXEC)
+ return Op->getReg();
+ Op = TII.getNamedOperand(MI, AMDGPU::OpName::src0);
+ if (Op->isReg() && Op->getReg() != AMDGPU::EXEC)
+ return Op->getReg();
+ return AMDGPU::NoRegister;
+}
+
+static MachineInstr* getOrExecSource(const MachineInstr &MI,
+ const SIInstrInfo &TII,
+ const MachineRegisterInfo &MRI) {
+ auto SavedExec = getOrNonExecReg(MI, TII);
+ if (SavedExec == AMDGPU::NoRegister)
+ return nullptr;
+ auto SaveExecInst = MRI.getUniqueVRegDef(SavedExec);
+ if (!SaveExecInst || !isFullExecCopy(*SaveExecInst))
+ return nullptr;
+ return SaveExecInst;
+}
+
+bool SIOptimizeExecMaskingPreRA::runOnMachineFunction(MachineFunction &MF) {
+ if (skipFunction(MF.getFunction()))
+ return false;
+
+ const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+ const SIRegisterInfo *TRI = ST.getRegisterInfo();
+ const SIInstrInfo *TII = ST.getInstrInfo();
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ LiveIntervals *LIS = &getAnalysis<LiveIntervals>();
+ DenseSet<unsigned> RecalcRegs({AMDGPU::EXEC_LO, AMDGPU::EXEC_HI});
+ bool Changed = false;
+
+ for (MachineBasicBlock &MBB : MF) {
+
+ // Try to remove unneeded instructions before s_endpgm.
+ if (MBB.succ_empty()) {
+ if (MBB.empty() || MBB.back().getOpcode() != AMDGPU::S_ENDPGM)
+ continue;
+
+ SmallVector<MachineBasicBlock*, 4> Blocks({&MBB});
+
+ while (!Blocks.empty()) {
+ auto CurBB = Blocks.pop_back_val();
+ auto I = CurBB->rbegin(), E = CurBB->rend();
+ if (I != E) {
+ if (I->isUnconditionalBranch() || I->getOpcode() == AMDGPU::S_ENDPGM)
+ ++I;
+ else if (I->isBranch())
+ continue;
+ }
+
+ while (I != E) {
+ if (I->isDebugValue()) {
+ I = std::next(I);
+ continue;
+ }
+
+ if (I->mayStore() || I->isBarrier() || I->isCall() ||
+ I->hasUnmodeledSideEffects() || I->hasOrderedMemoryRef())
+ break;
+
+ DEBUG(dbgs() << "Removing no effect instruction: " << *I << '\n');
+
+ for (auto &Op : I->operands()) {
+ if (Op.isReg())
+ RecalcRegs.insert(Op.getReg());
+ }
+
+ auto Next = std::next(I);
+ LIS->RemoveMachineInstrFromMaps(*I);
+ I->eraseFromParent();
+ I = Next;
+
+ Changed = true;
+ }
+
+ if (I != E)
+ continue;
+
+ // Try to ascend predecessors.
+ for (auto *Pred : CurBB->predecessors()) {
+ if (Pred->succ_size() == 1)
+ Blocks.push_back(Pred);
+ }
+ }
+ continue;
+ }
+
+ // Try to collapse adjacent endifs.
+ auto Lead = MBB.begin(), E = MBB.end();
+ if (MBB.succ_size() != 1 || Lead == E || !isEndCF(*Lead, TRI))
+ continue;
+
+ const MachineBasicBlock* Succ = *MBB.succ_begin();
+ if (!MBB.isLayoutSuccessor(Succ))
+ continue;
+
+ auto I = std::next(Lead);
+
+ for ( ; I != E; ++I)
+ if (!TII->isSALU(*I) || I->readsRegister(AMDGPU::EXEC, TRI))
+ break;
+
+ if (I != E)
+ continue;
+
+ const auto NextLead = Succ->begin();
+ if (NextLead == Succ->end() || !isEndCF(*NextLead, TRI) ||
+ !getOrExecSource(*NextLead, *TII, MRI))
+ continue;
+
+ DEBUG(dbgs() << "Redundant EXEC = S_OR_B64 found: " << *Lead << '\n');
+
+ auto SaveExec = getOrExecSource(*Lead, *TII, MRI);
+ unsigned SaveExecReg = getOrNonExecReg(*Lead, *TII);
+ for (auto &Op : Lead->operands()) {
+ if (Op.isReg())
+ RecalcRegs.insert(Op.getReg());
+ }
+
+ LIS->RemoveMachineInstrFromMaps(*Lead);
+ Lead->eraseFromParent();
+ if (SaveExecReg) {
+ LIS->removeInterval(SaveExecReg);
+ LIS->createAndComputeVirtRegInterval(SaveExecReg);
+ }
+
+ Changed = true;
+
+ // If the only use of saved exec in the removed instruction is S_AND_B64
+ // fold the copy now.
+ if (!SaveExec || !SaveExec->isFullCopy())
+ continue;
+
+ unsigned SavedExec = SaveExec->getOperand(0).getReg();
+ bool SafeToReplace = true;
+ for (auto& U : MRI.use_nodbg_instructions(SavedExec)) {
+ if (U.getParent() != SaveExec->getParent()) {
+ SafeToReplace = false;
+ break;
+ }
+
+ DEBUG(dbgs() << "Redundant EXEC COPY: " << *SaveExec << '\n');
+ }
+
+ if (SafeToReplace) {
+ LIS->RemoveMachineInstrFromMaps(*SaveExec);
+ SaveExec->eraseFromParent();
+ MRI.replaceRegWith(SavedExec, AMDGPU::EXEC);
+ LIS->removeInterval(SavedExec);
+ }
+ }
+
+ if (Changed) {
+ for (auto Reg : RecalcRegs) {
+ if (TargetRegisterInfo::isVirtualRegister(Reg)) {
+ LIS->removeInterval(Reg);
+ if (!MRI.reg_empty(Reg))
+ LIS->createAndComputeVirtRegInterval(Reg);
+ } else {
+ for (MCRegUnitIterator U(Reg, TRI); U.isValid(); ++U)
+ LIS->removeRegUnit(*U);
+ }
+ }
+ }
+
+ return Changed;
+}
diff --git a/lib/Target/AMDGPU/SIPeepholeSDWA.cpp b/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
index e2ac6631d2f3..5ed7fdf220bf 100644
--- a/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
+++ b/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
@@ -1,4 +1,4 @@
-//===-- SIPeepholeSDWA.cpp - Peephole optimization for SDWA instructions --===//
+//===- SIPeepholeSDWA.cpp - Peephole optimization for SDWA instructions ---===//
//
// The LLVM Compiler Infrastructure
//
@@ -10,12 +10,12 @@
/// \file This pass tries to apply several peephole SDWA patterns.
///
/// E.g. original:
-/// V_LSHRREV_B32_e32 %vreg0, 16, %vreg1
-/// V_ADD_I32_e32 %vreg2, %vreg0, %vreg3
-/// V_LSHLREV_B32_e32 %vreg4, 16, %vreg2
+/// V_LSHRREV_B32_e32 %0, 16, %1
+/// V_ADD_I32_e32 %2, %0, %3
+/// V_LSHLREV_B32_e32 %4, 16, %2
///
/// Replace:
-/// V_ADD_I32_sdwa %vreg4, %vreg1, %vreg3
+/// V_ADD_I32_sdwa %4, %1, %3
/// dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
///
//===----------------------------------------------------------------------===//
@@ -24,12 +24,31 @@
#include "AMDGPUSubtarget.h"
#include "SIDefines.h"
#include "SIInstrInfo.h"
+#include "SIRegisterInfo.h"
+#include "Utils/AMDGPUBaseInfo.h"
+#include "llvm/ADT/None.h"
+#include "llvm/ADT/Optional.h"
#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/MC/LaneBitmask.h"
+#include "llvm/MC/MCInstrDesc.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <memory>
#include <unordered_map>
-#include <unordered_set>
using namespace llvm;
@@ -42,10 +61,11 @@ STATISTIC(NumSDWAInstructionsPeepholed,
namespace {
class SDWAOperand;
+class SDWADstOperand;
class SIPeepholeSDWA : public MachineFunctionPass {
public:
- typedef SmallVector<SDWAOperand *, 4> SDWAOperandsVector;
+ using SDWAOperandsVector = SmallVector<SDWAOperand *, 4>;
private:
MachineRegisterInfo *MRI;
@@ -67,6 +87,7 @@ public:
bool runOnMachineFunction(MachineFunction &MF) override;
void matchSDWAOperands(MachineFunction &MF);
+ std::unique_ptr<SDWAOperand> matchSDWAOperand(MachineInstr &MI);
bool isConvertibleToSDWA(const MachineInstr &MI, const SISubtarget &ST) const;
bool convertToSDWA(MachineInstr &MI, const SDWAOperandsVector &SDWAOperands);
void legalizeScalarOperands(MachineInstr &MI, const SISubtarget &ST) const;
@@ -91,7 +112,7 @@ public:
assert(Replaced->isReg());
}
- virtual ~SDWAOperand() {}
+ virtual ~SDWAOperand() = default;
virtual MachineInstr *potentialToConvert(const SIInstrInfo *TII) = 0;
virtual bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) = 0;
@@ -99,9 +120,15 @@ public:
MachineOperand *getTargetOperand() const { return Target; }
MachineOperand *getReplacedOperand() const { return Replaced; }
MachineInstr *getParentInst() const { return Target->getParent(); }
+
MachineRegisterInfo *getMRI() const {
return &getParentInst()->getParent()->getParent()->getRegInfo();
}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+ virtual void print(raw_ostream& OS) const = 0;
+ void dump() const { print(dbgs()); }
+#endif
};
using namespace AMDGPU::SDWA;
@@ -117,11 +144,11 @@ public:
SDWASrcOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp,
SdwaSel SrcSel_ = DWORD, bool Abs_ = false, bool Neg_ = false,
bool Sext_ = false)
- : SDWAOperand(TargetOp, ReplacedOp), SrcSel(SrcSel_), Abs(Abs_),
- Neg(Neg_), Sext(Sext_) {}
+ : SDWAOperand(TargetOp, ReplacedOp),
+ SrcSel(SrcSel_), Abs(Abs_), Neg(Neg_), Sext(Sext_) {}
- virtual MachineInstr *potentialToConvert(const SIInstrInfo *TII) override;
- virtual bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) override;
+ MachineInstr *potentialToConvert(const SIInstrInfo *TII) override;
+ bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) override;
SdwaSel getSrcSel() const { return SrcSel; }
bool getAbs() const { return Abs; }
@@ -130,6 +157,10 @@ public:
uint64_t getSrcMods(const SIInstrInfo *TII,
const MachineOperand *SrcOp) const;
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+ void print(raw_ostream& OS) const override;
+#endif
};
class SDWADstOperand : public SDWAOperand {
@@ -138,18 +169,42 @@ private:
DstUnused DstUn;
public:
+
SDWADstOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp,
SdwaSel DstSel_ = DWORD, DstUnused DstUn_ = UNUSED_PAD)
- : SDWAOperand(TargetOp, ReplacedOp), DstSel(DstSel_), DstUn(DstUn_) {}
+ : SDWAOperand(TargetOp, ReplacedOp), DstSel(DstSel_), DstUn(DstUn_) {}
- virtual MachineInstr *potentialToConvert(const SIInstrInfo *TII) override;
- virtual bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) override;
+ MachineInstr *potentialToConvert(const SIInstrInfo *TII) override;
+ bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) override;
SdwaSel getDstSel() const { return DstSel; }
DstUnused getDstUnused() const { return DstUn; }
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+ void print(raw_ostream& OS) const override;
+#endif
+};
+
+class SDWADstPreserveOperand : public SDWADstOperand {
+private:
+ MachineOperand *Preserve;
+
+public:
+ SDWADstPreserveOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp,
+ MachineOperand *PreserveOp, SdwaSel DstSel_ = DWORD)
+ : SDWADstOperand(TargetOp, ReplacedOp, DstSel_, UNUSED_PRESERVE),
+ Preserve(PreserveOp) {}
+
+ bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) override;
+
+ MachineOperand *getPreservedOperand() const { return Preserve; }
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+ void print(raw_ostream& OS) const override;
+#endif
};
-} // End anonymous namespace.
+} // end anonymous namespace
INITIALIZE_PASS(SIPeepholeSDWA, DEBUG_TYPE, "SI Peephole SDWA", false, false)
@@ -161,8 +216,8 @@ FunctionPass *llvm::createSIPeepholeSDWAPass() {
return new SIPeepholeSDWA();
}
-#ifndef NDEBUG
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
static raw_ostream& operator<<(raw_ostream &OS, const SdwaSel &Sel) {
switch(Sel) {
case BYTE_0: OS << "BYTE_0"; break;
@@ -185,19 +240,31 @@ static raw_ostream& operator<<(raw_ostream &OS, const DstUnused &Un) {
return OS;
}
-static raw_ostream& operator<<(raw_ostream &OS, const SDWASrcOperand &Src) {
- OS << "SDWA src: " << *Src.getTargetOperand()
- << " src_sel:" << Src.getSrcSel()
- << " abs:" << Src.getAbs() << " neg:" << Src.getNeg()
- << " sext:" << Src.getSext() << '\n';
+static raw_ostream& operator<<(raw_ostream &OS, const SDWAOperand &Operand) {
+ Operand.print(OS);
return OS;
}
-static raw_ostream& operator<<(raw_ostream &OS, const SDWADstOperand &Dst) {
- OS << "SDWA dst: " << *Dst.getTargetOperand()
- << " dst_sel:" << Dst.getDstSel()
- << " dst_unused:" << Dst.getDstUnused() << '\n';
- return OS;
+LLVM_DUMP_METHOD
+void SDWASrcOperand::print(raw_ostream& OS) const {
+ OS << "SDWA src: " << *getTargetOperand()
+ << " src_sel:" << getSrcSel()
+ << " abs:" << getAbs() << " neg:" << getNeg()
+ << " sext:" << getSext() << '\n';
+}
+
+LLVM_DUMP_METHOD
+void SDWADstOperand::print(raw_ostream& OS) const {
+ OS << "SDWA dst: " << *getTargetOperand()
+ << " dst_sel:" << getDstSel()
+ << " dst_unused:" << getDstUnused() << '\n';
+}
+
+LLVM_DUMP_METHOD
+void SDWADstPreserveOperand::print(raw_ostream& OS) const {
+ OS << "SDWA preserve dst: " << *getTargetOperand()
+ << " dst_sel:" << getDstSel()
+ << " preserve:" << *getPreservedOperand() << '\n';
}
#endif
@@ -221,23 +288,44 @@ static bool isSameReg(const MachineOperand &LHS, const MachineOperand &RHS) {
LHS.getSubReg() == RHS.getSubReg();
}
-static bool isSubregOf(const MachineOperand &SubReg,
- const MachineOperand &SuperReg,
- const TargetRegisterInfo *TRI) {
+static MachineOperand *findSingleRegUse(const MachineOperand *Reg,
+ const MachineRegisterInfo *MRI) {
+ if (!Reg->isReg() || !Reg->isDef())
+ return nullptr;
- if (!SuperReg.isReg() || !SubReg.isReg())
- return false;
+ MachineOperand *ResMO = nullptr;
+ for (MachineOperand &UseMO : MRI->use_nodbg_operands(Reg->getReg())) {
+ // If there exists a use of a subreg of Reg then return nullptr
+ if (!isSameReg(UseMO, *Reg))
+ return nullptr;
- if (isSameReg(SuperReg, SubReg))
- return true;
+ // Check that there is only one instruction that uses Reg
+ if (!ResMO) {
+ ResMO = &UseMO;
+ } else if (ResMO->getParent() != UseMO.getParent()) {
+ return nullptr;
+ }
+ }
- if (SuperReg.getReg() != SubReg.getReg())
- return false;
+ return ResMO;
+}
+
+static MachineOperand *findSingleRegDef(const MachineOperand *Reg,
+ const MachineRegisterInfo *MRI) {
+ if (!Reg->isReg())
+ return nullptr;
+
+ MachineInstr *DefInstr = MRI->getUniqueVRegDef(Reg->getReg());
+ if (!DefInstr)
+ return nullptr;
+
+ for (auto &DefMO : DefInstr->defs()) {
+ if (DefMO.isReg() && DefMO.getReg() == Reg->getReg())
+ return &DefMO;
+ }
- LaneBitmask SuperMask = TRI->getSubRegIndexLaneMask(SuperReg.getSubReg());
- LaneBitmask SubMask = TRI->getSubRegIndexLaneMask(SubReg.getSubReg());
- SuperMask |= ~SubMask;
- return SuperMask.all();
+ // Ignore implicit defs.
+ return nullptr;
}
uint64_t SDWASrcOperand::getSrcMods(const SIInstrInfo *TII,
@@ -268,30 +356,11 @@ uint64_t SDWASrcOperand::getSrcMods(const SIInstrInfo *TII,
MachineInstr *SDWASrcOperand::potentialToConvert(const SIInstrInfo *TII) {
// For SDWA src operand potential instruction is one that use register
// defined by parent instruction
- MachineRegisterInfo *MRI = getMRI();
- MachineOperand *Replaced = getReplacedOperand();
- assert(Replaced->isReg());
-
- MachineInstr *PotentialMI = nullptr;
- for (MachineOperand &PotentialMO : MRI->use_operands(Replaced->getReg())) {
- // If this is use of another subreg of dst reg then do nothing
- if (!isSubregOf(*Replaced, PotentialMO, MRI->getTargetRegisterInfo()))
- continue;
+ MachineOperand *PotentialMO = findSingleRegUse(getReplacedOperand(), getMRI());
+ if (!PotentialMO)
+ return nullptr;
- // If there exist use of superreg of dst then we should not combine this
- // opernad
- if (!isSameReg(PotentialMO, *Replaced))
- return nullptr;
-
- // Check that PotentialMI is only instruction that uses dst reg
- if (PotentialMI == nullptr) {
- PotentialMI = PotentialMO.getParent();
- } else if (PotentialMI != PotentialMO.getParent()) {
- return nullptr;
- }
- }
-
- return PotentialMI;
+ return PotentialMO->getParent();
}
bool SDWASrcOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) {
@@ -313,7 +382,7 @@ bool SDWASrcOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) {
if ((MI.getOpcode() == AMDGPU::V_MAC_F16_sdwa ||
MI.getOpcode() == AMDGPU::V_MAC_F32_sdwa) &&
- !isSameReg(*Src, *getReplacedOperand())) {
+ !isSameReg(*Src, *getReplacedOperand())) {
// In case of v_mac_f16/32_sdwa this pass can try to apply src operand to
// src2. This is not allowed.
return false;
@@ -333,29 +402,18 @@ MachineInstr *SDWADstOperand::potentialToConvert(const SIInstrInfo *TII) {
// that this operand uses
MachineRegisterInfo *MRI = getMRI();
MachineInstr *ParentMI = getParentInst();
- MachineOperand *Replaced = getReplacedOperand();
- assert(Replaced->isReg());
- for (MachineOperand &PotentialMO : MRI->def_operands(Replaced->getReg())) {
- if (!isSubregOf(*Replaced, PotentialMO, MRI->getTargetRegisterInfo()))
- continue;
+ MachineOperand *PotentialMO = findSingleRegDef(getReplacedOperand(), MRI);
+ if (!PotentialMO)
+ return nullptr;
- if (!isSameReg(*Replaced, PotentialMO))
+ // Check that ParentMI is the only instruction that uses the replaced register
+ for (MachineInstr &UseInst : MRI->use_nodbg_instructions(PotentialMO->getReg())) {
+ if (&UseInst != ParentMI)
return nullptr;
-
- // Check that ParentMI is the only instruction that uses replaced register
- for (MachineOperand &UseMO : MRI->use_operands(PotentialMO.getReg())) {
- if (isSubregOf(UseMO, PotentialMO, MRI->getTargetRegisterInfo()) &&
- UseMO.getParent() != ParentMI) {
- return nullptr;
- }
- }
-
- // Due to SSA this should be onle def of replaced register, so return it
- return PotentialMO.getParent();
}
- return nullptr;
+ return PotentialMO->getParent();
}
bool SDWADstOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) {
@@ -386,13 +444,43 @@ bool SDWADstOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) {
return true;
}
+bool SDWADstPreserveOperand::convertToSDWA(MachineInstr &MI,
+ const SIInstrInfo *TII) {
+ // MI should be moved right before v_or_b32.
+ // For this we should clear all kill flags on uses of MI src-operands, or else
+ // we can encounter a problem with a use of a killed operand.
+ for (MachineOperand &MO : MI.uses()) {
+ if (!MO.isReg())
+ continue;
+ getMRI()->clearKillFlags(MO.getReg());
+ }
+
+ // Move MI before v_or_b32
+ auto MBB = MI.getParent();
+ MBB->remove(&MI);
+ MBB->insert(getParentInst(), &MI);
+
+ // Add Implicit use of preserved register
+ MachineInstrBuilder MIB(*MBB->getParent(), MI);
+ MIB.addReg(getPreservedOperand()->getReg(),
+ RegState::ImplicitKill,
+ getPreservedOperand()->getSubReg());
+
+ // Tie dst to implicit use
+ MI.tieOperands(AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdst),
+ MI.getNumOperands() - 1);
+
+ // Convert MI as any other SDWADstOperand and remove v_or_b32
+ return SDWADstOperand::convertToSDWA(MI, TII);
+}
+
Optional<int64_t> SIPeepholeSDWA::foldToImm(const MachineOperand &Op) const {
if (Op.isImm()) {
return Op.getImm();
}
  // If this is not an immediate then it can be a copy of an immediate value, e.g.:
- // %vreg1<def> = S_MOV_B32 255;
+ // %1 = S_MOV_B32 255;
if (Op.isReg()) {
for (const MachineOperand &Def : MRI->def_operands(Op.getReg())) {
if (!isSameReg(Op, Def))
@@ -413,195 +501,316 @@ Optional<int64_t> SIPeepholeSDWA::foldToImm(const MachineOperand &Op) const {
return None;
}
-void SIPeepholeSDWA::matchSDWAOperands(MachineFunction &MF) {
- for (MachineBasicBlock &MBB : MF) {
- for (MachineInstr &MI : MBB) {
- unsigned Opcode = MI.getOpcode();
- switch (Opcode) {
- case AMDGPU::V_LSHRREV_B32_e32:
- case AMDGPU::V_ASHRREV_I32_e32:
- case AMDGPU::V_LSHLREV_B32_e32:
- case AMDGPU::V_LSHRREV_B32_e64:
- case AMDGPU::V_ASHRREV_I32_e64:
- case AMDGPU::V_LSHLREV_B32_e64: {
- // from: v_lshrrev_b32_e32 v1, 16/24, v0
- // to SDWA src:v0 src_sel:WORD_1/BYTE_3
-
- // from: v_ashrrev_i32_e32 v1, 16/24, v0
- // to SDWA src:v0 src_sel:WORD_1/BYTE_3 sext:1
-
- // from: v_lshlrev_b32_e32 v1, 16/24, v0
- // to SDWA dst:v1 dst_sel:WORD_1/BYTE_3 dst_unused:UNUSED_PAD
- MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
- auto Imm = foldToImm(*Src0);
- if (!Imm)
- break;
-
- if (*Imm != 16 && *Imm != 24)
- break;
-
- MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
- MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
- if (TRI->isPhysicalRegister(Src1->getReg()) ||
- TRI->isPhysicalRegister(Dst->getReg()))
- break;
-
- if (Opcode == AMDGPU::V_LSHLREV_B32_e32 ||
- Opcode == AMDGPU::V_LSHLREV_B32_e64) {
- auto SDWADst = make_unique<SDWADstOperand>(
- Dst, Src1, *Imm == 16 ? WORD_1 : BYTE_3, UNUSED_PAD);
- DEBUG(dbgs() << "Match: " << MI << "To: " << *SDWADst << '\n');
- SDWAOperands[&MI] = std::move(SDWADst);
- ++NumSDWAPatternsFound;
- } else {
- auto SDWASrc = make_unique<SDWASrcOperand>(
- Src1, Dst, *Imm == 16 ? WORD_1 : BYTE_3, false, false,
- Opcode != AMDGPU::V_LSHRREV_B32_e32 &&
- Opcode != AMDGPU::V_LSHRREV_B32_e64);
- DEBUG(dbgs() << "Match: " << MI << "To: " << *SDWASrc << '\n');
- SDWAOperands[&MI] = std::move(SDWASrc);
- ++NumSDWAPatternsFound;
- }
- break;
- }
+std::unique_ptr<SDWAOperand>
+SIPeepholeSDWA::matchSDWAOperand(MachineInstr &MI) {
+ unsigned Opcode = MI.getOpcode();
+ switch (Opcode) {
+ case AMDGPU::V_LSHRREV_B32_e32:
+ case AMDGPU::V_ASHRREV_I32_e32:
+ case AMDGPU::V_LSHLREV_B32_e32:
+ case AMDGPU::V_LSHRREV_B32_e64:
+ case AMDGPU::V_ASHRREV_I32_e64:
+ case AMDGPU::V_LSHLREV_B32_e64: {
+ // from: v_lshrrev_b32_e32 v1, 16/24, v0
+ // to SDWA src:v0 src_sel:WORD_1/BYTE_3
+
+ // from: v_ashrrev_i32_e32 v1, 16/24, v0
+ // to SDWA src:v0 src_sel:WORD_1/BYTE_3 sext:1
+
+ // from: v_lshlrev_b32_e32 v1, 16/24, v0
+ // to SDWA dst:v1 dst_sel:WORD_1/BYTE_3 dst_unused:UNUSED_PAD
+ MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
+ auto Imm = foldToImm(*Src0);
+ if (!Imm)
+ break;
+
+ if (*Imm != 16 && *Imm != 24)
+ break;
+
+ MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
+ MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
+ if (TRI->isPhysicalRegister(Src1->getReg()) ||
+ TRI->isPhysicalRegister(Dst->getReg()))
+ break;
+
+ if (Opcode == AMDGPU::V_LSHLREV_B32_e32 ||
+ Opcode == AMDGPU::V_LSHLREV_B32_e64) {
+ return make_unique<SDWADstOperand>(
+ Dst, Src1, *Imm == 16 ? WORD_1 : BYTE_3, UNUSED_PAD);
+ } else {
+ return make_unique<SDWASrcOperand>(
+ Src1, Dst, *Imm == 16 ? WORD_1 : BYTE_3, false, false,
+ Opcode != AMDGPU::V_LSHRREV_B32_e32 &&
+ Opcode != AMDGPU::V_LSHRREV_B32_e64);
+ }
+ break;
+ }
- case AMDGPU::V_LSHRREV_B16_e32:
- case AMDGPU::V_ASHRREV_I16_e32:
- case AMDGPU::V_LSHLREV_B16_e32:
- case AMDGPU::V_LSHRREV_B16_e64:
- case AMDGPU::V_ASHRREV_I16_e64:
- case AMDGPU::V_LSHLREV_B16_e64: {
- // from: v_lshrrev_b16_e32 v1, 8, v0
- // to SDWA src:v0 src_sel:BYTE_1
-
- // from: v_ashrrev_i16_e32 v1, 8, v0
- // to SDWA src:v0 src_sel:BYTE_1 sext:1
-
- // from: v_lshlrev_b16_e32 v1, 8, v0
- // to SDWA dst:v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD
- MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
- auto Imm = foldToImm(*Src0);
- if (!Imm || *Imm != 8)
- break;
-
- MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
- MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
-
- if (TRI->isPhysicalRegister(Src1->getReg()) ||
- TRI->isPhysicalRegister(Dst->getReg()))
- break;
-
- if (Opcode == AMDGPU::V_LSHLREV_B16_e32 ||
- Opcode == AMDGPU::V_LSHLREV_B16_e64) {
- auto SDWADst =
- make_unique<SDWADstOperand>(Dst, Src1, BYTE_1, UNUSED_PAD);
- DEBUG(dbgs() << "Match: " << MI << "To: " << *SDWADst << '\n');
- SDWAOperands[&MI] = std::move(SDWADst);
- ++NumSDWAPatternsFound;
- } else {
- auto SDWASrc = make_unique<SDWASrcOperand>(
- Src1, Dst, BYTE_1, false, false,
- Opcode != AMDGPU::V_LSHRREV_B16_e32 &&
- Opcode != AMDGPU::V_LSHRREV_B16_e64);
- DEBUG(dbgs() << "Match: " << MI << "To: " << *SDWASrc << '\n');
- SDWAOperands[&MI] = std::move(SDWASrc);
- ++NumSDWAPatternsFound;
- }
- break;
- }
+ case AMDGPU::V_LSHRREV_B16_e32:
+ case AMDGPU::V_ASHRREV_I16_e32:
+ case AMDGPU::V_LSHLREV_B16_e32:
+ case AMDGPU::V_LSHRREV_B16_e64:
+ case AMDGPU::V_ASHRREV_I16_e64:
+ case AMDGPU::V_LSHLREV_B16_e64: {
+ // from: v_lshrrev_b16_e32 v1, 8, v0
+ // to SDWA src:v0 src_sel:BYTE_1
+
+ // from: v_ashrrev_i16_e32 v1, 8, v0
+ // to SDWA src:v0 src_sel:BYTE_1 sext:1
+
+ // from: v_lshlrev_b16_e32 v1, 8, v0
+ // to SDWA dst:v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD
+ MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
+ auto Imm = foldToImm(*Src0);
+ if (!Imm || *Imm != 8)
+ break;
+
+ MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
+ MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
+
+ if (TRI->isPhysicalRegister(Src1->getReg()) ||
+ TRI->isPhysicalRegister(Dst->getReg()))
+ break;
+
+ if (Opcode == AMDGPU::V_LSHLREV_B16_e32 ||
+ Opcode == AMDGPU::V_LSHLREV_B16_e64) {
+ return make_unique<SDWADstOperand>(Dst, Src1, BYTE_1, UNUSED_PAD);
+ } else {
+ return make_unique<SDWASrcOperand>(
+ Src1, Dst, BYTE_1, false, false,
+ Opcode != AMDGPU::V_LSHRREV_B16_e32 &&
+ Opcode != AMDGPU::V_LSHRREV_B16_e64);
+ }
+ break;
+ }
- case AMDGPU::V_BFE_I32:
- case AMDGPU::V_BFE_U32: {
- // e.g.:
- // from: v_bfe_u32 v1, v0, 8, 8
- // to SDWA src:v0 src_sel:BYTE_1
-
- // offset | width | src_sel
- // ------------------------
- // 0 | 8 | BYTE_0
- // 0 | 16 | WORD_0
- // 0 | 32 | DWORD ?
- // 8 | 8 | BYTE_1
- // 16 | 8 | BYTE_2
- // 16 | 16 | WORD_1
- // 24 | 8 | BYTE_3
-
- MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
- auto Offset = foldToImm(*Src1);
- if (!Offset)
- break;
-
- MachineOperand *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2);
- auto Width = foldToImm(*Src2);
- if (!Width)
- break;
-
- SdwaSel SrcSel = DWORD;
-
- if (*Offset == 0 && *Width == 8)
- SrcSel = BYTE_0;
- else if (*Offset == 0 && *Width == 16)
- SrcSel = WORD_0;
- else if (*Offset == 0 && *Width == 32)
- SrcSel = DWORD;
- else if (*Offset == 8 && *Width == 8)
- SrcSel = BYTE_1;
- else if (*Offset == 16 && *Width == 8)
- SrcSel = BYTE_2;
- else if (*Offset == 16 && *Width == 16)
- SrcSel = WORD_1;
- else if (*Offset == 24 && *Width == 8)
- SrcSel = BYTE_3;
- else
- break;
-
- MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
- MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
-
- if (TRI->isPhysicalRegister(Src0->getReg()) ||
- TRI->isPhysicalRegister(Dst->getReg()))
- break;
-
- auto SDWASrc = make_unique<SDWASrcOperand>(
- Src0, Dst, SrcSel, false, false,
- Opcode == AMDGPU::V_BFE_U32 ? false : true);
- DEBUG(dbgs() << "Match: " << MI << "To: " << *SDWASrc << '\n');
- SDWAOperands[&MI] = std::move(SDWASrc);
- ++NumSDWAPatternsFound;
+ case AMDGPU::V_BFE_I32:
+ case AMDGPU::V_BFE_U32: {
+ // e.g.:
+ // from: v_bfe_u32 v1, v0, 8, 8
+ // to SDWA src:v0 src_sel:BYTE_1
+
+ // offset | width | src_sel
+ // ------------------------
+ // 0 | 8 | BYTE_0
+ // 0 | 16 | WORD_0
+ // 0 | 32 | DWORD ?
+ // 8 | 8 | BYTE_1
+ // 16 | 8 | BYTE_2
+ // 16 | 16 | WORD_1
+ // 24 | 8 | BYTE_3
+
+ MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
+ auto Offset = foldToImm(*Src1);
+ if (!Offset)
+ break;
+
+ MachineOperand *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2);
+ auto Width = foldToImm(*Src2);
+ if (!Width)
+ break;
+
+ SdwaSel SrcSel = DWORD;
+
+ if (*Offset == 0 && *Width == 8)
+ SrcSel = BYTE_0;
+ else if (*Offset == 0 && *Width == 16)
+ SrcSel = WORD_0;
+ else if (*Offset == 0 && *Width == 32)
+ SrcSel = DWORD;
+ else if (*Offset == 8 && *Width == 8)
+ SrcSel = BYTE_1;
+ else if (*Offset == 16 && *Width == 8)
+ SrcSel = BYTE_2;
+ else if (*Offset == 16 && *Width == 16)
+ SrcSel = WORD_1;
+ else if (*Offset == 24 && *Width == 8)
+ SrcSel = BYTE_3;
+ else
+ break;
+
+ MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
+ MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
+
+ if (TRI->isPhysicalRegister(Src0->getReg()) ||
+ TRI->isPhysicalRegister(Dst->getReg()))
+ break;
+
+ return make_unique<SDWASrcOperand>(
+ Src0, Dst, SrcSel, false, false, Opcode != AMDGPU::V_BFE_U32);
+ }
+
+ case AMDGPU::V_AND_B32_e32:
+ case AMDGPU::V_AND_B32_e64: {
+ // e.g.:
+ // from: v_and_b32_e32 v1, 0x0000ffff/0x000000ff, v0
+ // to SDWA src:v0 src_sel:WORD_0/BYTE_0
+
+ MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
+ MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
+ auto ValSrc = Src1;
+ auto Imm = foldToImm(*Src0);
+
+ if (!Imm) {
+ Imm = foldToImm(*Src1);
+ ValSrc = Src0;
+ }
+
+ if (!Imm || (*Imm != 0x0000ffff && *Imm != 0x000000ff))
+ break;
+
+ MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
+
+ if (TRI->isPhysicalRegister(Src1->getReg()) ||
+ TRI->isPhysicalRegister(Dst->getReg()))
+ break;
+
+ return make_unique<SDWASrcOperand>(
+ ValSrc, Dst, *Imm == 0x0000ffff ? WORD_0 : BYTE_0);
+ }
+
+ case AMDGPU::V_OR_B32_e32:
+ case AMDGPU::V_OR_B32_e64: {
+ // Patterns for dst_unused:UNUSED_PRESERVE.
+ // e.g., from:
+ // v_add_f16_sdwa v0, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD
+  //                 src0_sel:WORD_1 src1_sel:WORD_1
+ // v_add_f16_e32 v3, v1, v2
+ // v_or_b32_e32 v4, v0, v3
+ // to SDWA preserve dst:v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE preserve:v3
+
+  // Check if one of the operands of v_or_b32 is an SDWA instruction
+ using CheckRetType = Optional<std::pair<MachineOperand *, MachineOperand *>>;
+ auto CheckOROperandsForSDWA =
+ [&](const MachineOperand *Op1, const MachineOperand *Op2) -> CheckRetType {
+ if (!Op1 || !Op1->isReg() || !Op2 || !Op2->isReg())
+ return CheckRetType(None);
+
+ MachineOperand *Op1Def = findSingleRegDef(Op1, MRI);
+ if (!Op1Def)
+ return CheckRetType(None);
+
+ MachineInstr *Op1Inst = Op1Def->getParent();
+ if (!TII->isSDWA(*Op1Inst))
+ return CheckRetType(None);
+
+ MachineOperand *Op2Def = findSingleRegDef(Op2, MRI);
+ if (!Op2Def)
+ return CheckRetType(None);
+
+ return CheckRetType(std::make_pair(Op1Def, Op2Def));
+ };
+
+ MachineOperand *OrSDWA = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
+ MachineOperand *OrOther = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
+ assert(OrSDWA && OrOther);
+ auto Res = CheckOROperandsForSDWA(OrSDWA, OrOther);
+ if (!Res) {
+ OrSDWA = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
+ OrOther = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
+ assert(OrSDWA && OrOther);
+ Res = CheckOROperandsForSDWA(OrSDWA, OrOther);
+ if (!Res)
break;
- }
- case AMDGPU::V_AND_B32_e32:
- case AMDGPU::V_AND_B32_e64: {
- // e.g.:
- // from: v_and_b32_e32 v1, 0x0000ffff/0x000000ff, v0
- // to SDWA src:v0 src_sel:WORD_0/BYTE_0
-
- MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
- MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
- auto ValSrc = Src1;
- auto Imm = foldToImm(*Src0);
-
- if (!Imm) {
- Imm = foldToImm(*Src1);
- ValSrc = Src0;
- }
-
- if (!Imm || (*Imm != 0x0000ffff && *Imm != 0x000000ff))
- break;
-
- MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
-
- if (TRI->isPhysicalRegister(Src1->getReg()) ||
- TRI->isPhysicalRegister(Dst->getReg()))
- break;
-
- auto SDWASrc = make_unique<SDWASrcOperand>(
- ValSrc, Dst, *Imm == 0x0000ffff ? WORD_0 : BYTE_0);
- DEBUG(dbgs() << "Match: " << MI << "To: " << *SDWASrc << '\n');
- SDWAOperands[&MI] = std::move(SDWASrc);
+ }
+
+ MachineOperand *OrSDWADef = Res->first;
+ MachineOperand *OrOtherDef = Res->second;
+ assert(OrSDWADef && OrOtherDef);
+
+ MachineInstr *SDWAInst = OrSDWADef->getParent();
+ MachineInstr *OtherInst = OrOtherDef->getParent();
+
+  // Check that OtherInst is actually bitwise compatible with SDWAInst, i.e.
+  // their destination bit patterns don't overlap. A compatible instruction can
+  // be either a regular instruction with compatible bitness or an SDWA
+  // instruction with the correct dst_sel:
+ // SDWAInst | OtherInst bitness / OtherInst dst_sel
+ // -----------------------------------------------------
+ // DWORD | no / no
+ // WORD_0 | no / BYTE_2/3, WORD_1
+ // WORD_1 | 8/16-bit instructions / BYTE_0/1, WORD_0
+ // BYTE_0 | no / BYTE_1/2/3, WORD_1
+ // BYTE_1 | 8-bit / BYTE_0/2/3, WORD_1
+  // BYTE_2   | 8/16-bit               / BYTE_0/1/3, WORD_0
+ // BYTE_3 | 8/16/24-bit / BYTE_0/1/2, WORD_0
+ // E.g. if SDWAInst is v_add_f16_sdwa dst_sel:WORD_1 then v_add_f16 is OK
+ // but v_add_f32 is not.
+
+ // TODO: add support for non-SDWA instructions as OtherInst.
+  // For now this only works with SDWA instructions. For regular instructions
+  // there is no way to determine if the instruction writes only 8/16/24 bits
+  // of the full register size, since all registers are at least 32 bits wide.
+ if (!TII->isSDWA(*OtherInst))
+ break;
+
+ SdwaSel DstSel = static_cast<SdwaSel>(
+      TII->getNamedImmOperand(*SDWAInst, AMDGPU::OpName::dst_sel));
+ SdwaSel OtherDstSel = static_cast<SdwaSel>(
+ TII->getNamedImmOperand(*OtherInst, AMDGPU::OpName::dst_sel));
+
+ bool DstSelAgree = false;
+ switch (DstSel) {
+ case WORD_0: DstSelAgree = ((OtherDstSel == BYTE_2) ||
+ (OtherDstSel == BYTE_3) ||
+ (OtherDstSel == WORD_1));
+ break;
+ case WORD_1: DstSelAgree = ((OtherDstSel == BYTE_0) ||
+ (OtherDstSel == BYTE_1) ||
+ (OtherDstSel == WORD_0));
+ break;
+ case BYTE_0: DstSelAgree = ((OtherDstSel == BYTE_1) ||
+ (OtherDstSel == BYTE_2) ||
+ (OtherDstSel == BYTE_3) ||
+ (OtherDstSel == WORD_1));
+ break;
+ case BYTE_1: DstSelAgree = ((OtherDstSel == BYTE_0) ||
+ (OtherDstSel == BYTE_2) ||
+ (OtherDstSel == BYTE_3) ||
+ (OtherDstSel == WORD_1));
+ break;
+ case BYTE_2: DstSelAgree = ((OtherDstSel == BYTE_0) ||
+ (OtherDstSel == BYTE_1) ||
+ (OtherDstSel == BYTE_3) ||
+ (OtherDstSel == WORD_0));
+ break;
+ case BYTE_3: DstSelAgree = ((OtherDstSel == BYTE_0) ||
+ (OtherDstSel == BYTE_1) ||
+ (OtherDstSel == BYTE_2) ||
+ (OtherDstSel == WORD_0));
+ break;
+ default: DstSelAgree = false;
+ }
+
+ if (!DstSelAgree)
+ break;
+
+  // OtherInst's dst_unused must also be UNUSED_PAD
+ DstUnused OtherDstUnused = static_cast<DstUnused>(
+ TII->getNamedImmOperand(*OtherInst, AMDGPU::OpName::dst_unused));
+ if (OtherDstUnused != DstUnused::UNUSED_PAD)
+ break;
+
+ // Create DstPreserveOperand
+ MachineOperand *OrDst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
+ assert(OrDst && OrDst->isReg());
+
+ return make_unique<SDWADstPreserveOperand>(
+ OrDst, OrSDWADef, OrOtherDef, DstSel);
+
+ }
+ }
+
+ return std::unique_ptr<SDWAOperand>(nullptr);
+}
+
+void SIPeepholeSDWA::matchSDWAOperands(MachineFunction &MF) {
+ for (MachineBasicBlock &MBB : MF) {
+ for (MachineInstr &MI : MBB) {
+ if (auto Operand = matchSDWAOperand(MI)) {
+ DEBUG(dbgs() << "Match: " << MI << "To: " << *Operand << '\n');
+ SDWAOperands[&MI] = std::move(Operand);
++NumSDWAPatternsFound;
- break;
- }
}
}
}
@@ -609,12 +818,16 @@ void SIPeepholeSDWA::matchSDWAOperands(MachineFunction &MF) {
bool SIPeepholeSDWA::isConvertibleToSDWA(const MachineInstr &MI,
const SISubtarget &ST) const {
+ // Check if this is already an SDWA instruction
+ unsigned Opc = MI.getOpcode();
+ if (TII->isSDWA(Opc))
+ return true;
+
// Check if this instruction has opcode that supports SDWA
- int Opc = MI.getOpcode();
if (AMDGPU::getSDWAOp(Opc) == -1)
Opc = AMDGPU::getVOPe32(Opc);
- if (Opc == -1 || AMDGPU::getSDWAOp(Opc) == -1)
+ if (AMDGPU::getSDWAOp(Opc) == -1)
return false;
if (!ST.hasSDWAOmod() && TII->hasModifiersSet(MI, AMDGPU::OpName::omod))
@@ -647,9 +860,15 @@ bool SIPeepholeSDWA::isConvertibleToSDWA(const MachineInstr &MI,
bool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI,
const SDWAOperandsVector &SDWAOperands) {
// Convert to sdwa
- int SDWAOpcode = AMDGPU::getSDWAOp(MI.getOpcode());
- if (SDWAOpcode == -1)
- SDWAOpcode = AMDGPU::getSDWAOp(AMDGPU::getVOPe32(MI.getOpcode()));
+ int SDWAOpcode;
+ unsigned Opcode = MI.getOpcode();
+ if (TII->isSDWA(Opcode)) {
+ SDWAOpcode = Opcode;
+ } else {
+ SDWAOpcode = AMDGPU::getSDWAOp(Opcode);
+ if (SDWAOpcode == -1)
+ SDWAOpcode = AMDGPU::getSDWAOp(AMDGPU::getVOPe32(Opcode));
+ }
assert(SDWAOpcode != -1);
const MCInstrDesc &SDWADesc = TII->get(SDWAOpcode);
@@ -725,25 +944,44 @@ bool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI,
}
}
- // Initialize dst_sel if present
+ // Copy dst_sel if present, initialize otherwise if needed
if (AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::dst_sel) != -1) {
- SDWAInst.addImm(AMDGPU::SDWA::SdwaSel::DWORD);
+ MachineOperand *DstSel = TII->getNamedOperand(MI, AMDGPU::OpName::dst_sel);
+ if (DstSel) {
+ SDWAInst.add(*DstSel);
+ } else {
+ SDWAInst.addImm(AMDGPU::SDWA::SdwaSel::DWORD);
+ }
}
- // Initialize dst_unused if present
+ // Copy dst_unused if present, initialize otherwise if needed
if (AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::dst_unused) != -1) {
- SDWAInst.addImm(AMDGPU::SDWA::DstUnused::UNUSED_PAD);
+ MachineOperand *DstUnused = TII->getNamedOperand(MI, AMDGPU::OpName::dst_unused);
+ if (DstUnused) {
+ SDWAInst.add(*DstUnused);
+ } else {
+ SDWAInst.addImm(AMDGPU::SDWA::DstUnused::UNUSED_PAD);
+ }
}
- // Initialize src0_sel
+ // Copy src0_sel if present, initialize otherwise
assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src0_sel) != -1);
- SDWAInst.addImm(AMDGPU::SDWA::SdwaSel::DWORD);
-
+ MachineOperand *Src0Sel = TII->getNamedOperand(MI, AMDGPU::OpName::src0_sel);
+ if (Src0Sel) {
+ SDWAInst.add(*Src0Sel);
+ } else {
+ SDWAInst.addImm(AMDGPU::SDWA::SdwaSel::DWORD);
+ }
- // Initialize src1_sel if present
+ // Copy src1_sel if present, initialize otherwise if needed
if (Src1) {
assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src1_sel) != -1);
- SDWAInst.addImm(AMDGPU::SDWA::SdwaSel::DWORD);
+ MachineOperand *Src1Sel = TII->getNamedOperand(MI, AMDGPU::OpName::src1_sel);
+ if (Src1Sel) {
+ SDWAInst.add(*Src1Sel);
+ } else {
+ SDWAInst.addImm(AMDGPU::SDWA::SdwaSel::DWORD);
+ }
}
  // Apply all SDWA operand patterns
@@ -782,7 +1020,7 @@ bool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI,
void SIPeepholeSDWA::legalizeScalarOperands(MachineInstr &MI, const SISubtarget &ST) const {
const MCInstrDesc &Desc = TII->get(MI.getOpcode());
unsigned ConstantBusCount = 0;
- for (MachineOperand &Op: MI.explicit_uses()) {
+ for (MachineOperand &Op : MI.explicit_uses()) {
if (!Op.isImm() && !(Op.isReg() && !TRI->isVGPR(*MRI, Op.getReg())))
continue;
@@ -812,7 +1050,7 @@ void SIPeepholeSDWA::legalizeScalarOperands(MachineInstr &MI, const SISubtarget
bool SIPeepholeSDWA::runOnMachineFunction(MachineFunction &MF) {
const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
- if (!ST.hasSDWA())
+ if (!ST.hasSDWA() || skipFunction(MF.getFunction()))
return false;
MRI = &MF.getRegInfo();
@@ -820,27 +1058,35 @@ bool SIPeepholeSDWA::runOnMachineFunction(MachineFunction &MF) {
TII = ST.getInstrInfo();
// Find all SDWA operands in MF.
- matchSDWAOperands(MF);
+ bool Changed = false;
+ bool Ret = false;
+ do {
+ matchSDWAOperands(MF);
+
+ for (const auto &OperandPair : SDWAOperands) {
+ const auto &Operand = OperandPair.second;
+ MachineInstr *PotentialMI = Operand->potentialToConvert(TII);
+ if (PotentialMI && isConvertibleToSDWA(*PotentialMI, ST)) {
+ PotentialMatches[PotentialMI].push_back(Operand.get());
+ }
+ }
- for (const auto &OperandPair : SDWAOperands) {
- const auto &Operand = OperandPair.second;
- MachineInstr *PotentialMI = Operand->potentialToConvert(TII);
- if (PotentialMI && isConvertibleToSDWA(*PotentialMI, ST)) {
- PotentialMatches[PotentialMI].push_back(Operand.get());
+ for (auto &PotentialPair : PotentialMatches) {
+ MachineInstr &PotentialMI = *PotentialPair.first;
+ convertToSDWA(PotentialMI, PotentialPair.second);
}
- }
- for (auto &PotentialPair : PotentialMatches) {
- MachineInstr &PotentialMI = *PotentialPair.first;
- convertToSDWA(PotentialMI, PotentialPair.second);
- }
+ PotentialMatches.clear();
+ SDWAOperands.clear();
+
+ Changed = !ConvertedInstructions.empty();
- PotentialMatches.clear();
- SDWAOperands.clear();
+ if (Changed)
+ Ret = true;
- bool Ret = !ConvertedInstructions.empty();
- while (!ConvertedInstructions.empty())
- legalizeScalarOperands(*ConvertedInstructions.pop_back_val(), ST);
+ while (!ConvertedInstructions.empty())
+ legalizeScalarOperands(*ConvertedInstructions.pop_back_val(), ST);
+ } while (Changed);
return Ret;
}
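
The DstSelAgree switch in matchSDWAOperand above is just a disjointness check over the byte lanes that each dst_sel writes. The standalone sketch below models that check; the SdwaSel enum is a local stand-in for AMDGPU::SDWA::SdwaSel (only the selector names come from the code above), so treat it as an illustration rather than the pass's actual helper.

    #include <cassert>

    // Local stand-in for AMDGPU::SDWA::SdwaSel; only the selector names matter here.
    enum SdwaSel { BYTE_0, BYTE_1, BYTE_2, BYTE_3, WORD_0, WORD_1, DWORD };

    // Map each selector to the byte lanes of the 32-bit destination it writes.
    static unsigned selBytes(SdwaSel S) {
      switch (S) {
      case BYTE_0: return 0x1;
      case BYTE_1: return 0x2;
      case BYTE_2: return 0x4;
      case BYTE_3: return 0x8;
      case WORD_0: return 0x3;
      case WORD_1: return 0xC;
      case DWORD:  return 0xF;
      }
      return 0xF;
    }

    // Two SDWA destinations can be merged by v_or_b32 only if their byte lanes
    // are disjoint -- the same condition the DstSelAgree switch checks above.
    static bool dstSelsAgree(SdwaSel A, SdwaSel B) {
      return (selBytes(A) & selBytes(B)) == 0;
    }

    int main() {
      assert(dstSelsAgree(WORD_1, WORD_0));   // WORD_1 row: WORD_0 is allowed
      assert(dstSelsAgree(BYTE_0, WORD_1));   // BYTE_0 row: WORD_1 is allowed
      assert(!dstSelsAgree(WORD_0, BYTE_0));  // overlapping low byte
      assert(!dstSelsAgree(DWORD, WORD_1));   // DWORD never agrees
      return 0;
    }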
diff --git a/lib/Target/AMDGPU/SIRegisterInfo.cpp b/lib/Target/AMDGPU/SIRegisterInfo.cpp
index 4a3fbb4593bb..65cdc13e03cd 100644
--- a/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -148,7 +148,6 @@ unsigned SIRegisterInfo::reservedStackPtrOffsetReg(
BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
BitVector Reserved(getNumRegs());
- Reserved.set(AMDGPU::INDIRECT_BASE_ADDR);
// EXEC_LO and EXEC_HI could be allocated and used as regular register, but
// this seems likely to result in bugs, so I'm marking them as reserved.
@@ -173,6 +172,8 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
reserveRegisterTuples(Reserved, AMDGPU::TTMP6_TTMP7);
reserveRegisterTuples(Reserved, AMDGPU::TTMP8_TTMP9);
reserveRegisterTuples(Reserved, AMDGPU::TTMP10_TTMP11);
+ reserveRegisterTuples(Reserved, AMDGPU::TTMP12_TTMP13);
+ reserveRegisterTuples(Reserved, AMDGPU::TTMP14_TTMP15);
const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
@@ -237,8 +238,15 @@ bool SIRegisterInfo::requiresRegisterScavenging(const MachineFunction &Fn) const
return true;
}
-bool SIRegisterInfo::requiresFrameIndexScavenging(const MachineFunction &MF) const {
- return MF.getFrameInfo().hasStackObjects();
+bool SIRegisterInfo::requiresFrameIndexScavenging(
+ const MachineFunction &MF) const {
+ const MachineFrameInfo &MFI = MF.getFrameInfo();
+ if (MFI.hasStackObjects())
+ return true;
+
+ // May need to deal with callee saved registers.
+ const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
+ return !Info->isEntryFunction();
}
bool SIRegisterInfo::requiresFrameIndexReplacementScavenging(
@@ -429,6 +437,10 @@ static int getOffsetMUBUFStore(unsigned Opc) {
return AMDGPU::BUFFER_STORE_DWORDX2_OFFSET;
case AMDGPU::BUFFER_STORE_DWORDX4_OFFEN:
return AMDGPU::BUFFER_STORE_DWORDX4_OFFSET;
+ case AMDGPU::BUFFER_STORE_SHORT_D16_HI_OFFEN:
+ return AMDGPU::BUFFER_STORE_SHORT_D16_HI_OFFSET;
+ case AMDGPU::BUFFER_STORE_BYTE_D16_HI_OFFEN:
+ return AMDGPU::BUFFER_STORE_BYTE_D16_HI_OFFSET;
default:
return -1;
}
@@ -450,6 +462,18 @@ static int getOffsetMUBUFLoad(unsigned Opc) {
return AMDGPU::BUFFER_LOAD_DWORDX2_OFFSET;
case AMDGPU::BUFFER_LOAD_DWORDX4_OFFEN:
return AMDGPU::BUFFER_LOAD_DWORDX4_OFFSET;
+ case AMDGPU::BUFFER_LOAD_UBYTE_D16_OFFEN:
+ return AMDGPU::BUFFER_LOAD_UBYTE_D16_OFFSET;
+ case AMDGPU::BUFFER_LOAD_UBYTE_D16_HI_OFFEN:
+ return AMDGPU::BUFFER_LOAD_UBYTE_D16_HI_OFFSET;
+ case AMDGPU::BUFFER_LOAD_SBYTE_D16_OFFEN:
+ return AMDGPU::BUFFER_LOAD_SBYTE_D16_OFFSET;
+ case AMDGPU::BUFFER_LOAD_SBYTE_D16_HI_OFFEN:
+ return AMDGPU::BUFFER_LOAD_SBYTE_D16_HI_OFFSET;
+ case AMDGPU::BUFFER_LOAD_SHORT_D16_OFFEN:
+ return AMDGPU::BUFFER_LOAD_SHORT_D16_OFFSET;
+ case AMDGPU::BUFFER_LOAD_SHORT_D16_HI_OFFEN:
+ return AMDGPU::BUFFER_LOAD_SHORT_D16_HI_OFFSET;
default:
return -1;
}
@@ -472,17 +496,21 @@ static bool buildMUBUFOffsetLoadStore(const SIInstrInfo *TII,
if (LoadStoreOp == -1)
return false;
- unsigned Reg = TII->getNamedOperand(*MI, AMDGPU::OpName::vdata)->getReg();
-
- BuildMI(*MBB, MI, DL, TII->get(LoadStoreOp))
- .addReg(Reg, getDefRegState(!IsStore))
- .add(*TII->getNamedOperand(*MI, AMDGPU::OpName::srsrc))
- .add(*TII->getNamedOperand(*MI, AMDGPU::OpName::soffset))
- .addImm(Offset)
- .addImm(0) // glc
- .addImm(0) // slc
- .addImm(0) // tfe
- .setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
+ const MachineOperand *Reg = TII->getNamedOperand(*MI, AMDGPU::OpName::vdata);
+ MachineInstrBuilder NewMI = BuildMI(*MBB, MI, DL, TII->get(LoadStoreOp))
+ .add(*Reg)
+ .add(*TII->getNamedOperand(*MI, AMDGPU::OpName::srsrc))
+ .add(*TII->getNamedOperand(*MI, AMDGPU::OpName::soffset))
+ .addImm(Offset)
+ .addImm(0) // glc
+ .addImm(0) // slc
+ .addImm(0) // tfe
+ .setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
+
+ const MachineOperand *VDataIn = TII->getNamedOperand(*MI,
+ AMDGPU::OpName::vdata_in);
+ if (VDataIn)
+ NewMI.add(*VDataIn);
return true;
}
@@ -1045,8 +1073,6 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
.addImm(Log2_32(ST.getWavefrontSize()))
.addReg(DiffReg);
} else {
- unsigned CarryOut
- = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
unsigned ScaledReg
= MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
@@ -1056,8 +1082,7 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
// TODO: Fold if use instruction is another add of a constant.
if (AMDGPU::isInlinableLiteral32(Offset, ST.hasInv2PiInlineImm())) {
- BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_ADD_I32_e64), ResultReg)
- .addReg(CarryOut, RegState::Define | RegState::Dead)
+ TII->getAddNoCarry(*MBB, MI, DL, ResultReg)
.addImm(Offset)
.addReg(ScaledReg, RegState::Kill);
} else {
@@ -1066,13 +1091,10 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), ConstOffsetReg)
.addImm(Offset);
- BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_ADD_I32_e64), ResultReg)
- .addReg(CarryOut, RegState::Define | RegState::Dead)
+ TII->getAddNoCarry(*MBB, MI, DL, ResultReg)
.addReg(ConstOffsetReg, RegState::Kill)
.addReg(ScaledReg, RegState::Kill);
}
-
- MRI.setRegAllocationHint(CarryOut, 0, AMDGPU::VCC);
}
// Don't introduce an extra copy if we're just materializing in a mov.
@@ -1275,8 +1297,7 @@ const TargetRegisterClass *SIRegisterInfo::getSubRegClass(
return RC;
// We can assume that each lane corresponds to one 32-bit register.
- LaneBitmask::Type Mask = getSubRegIndexLaneMask(SubIdx).getAsInteger();
- unsigned Count = countPopulation(Mask);
+ unsigned Count = getSubRegIndexLaneMask(SubIdx).getNumLanes();
if (isSGPRClass(RC)) {
switch (Count) {
case 1:
@@ -1322,73 +1343,18 @@ bool SIRegisterInfo::shouldRewriteCopySrc(
// class.
//
// e.g. if we have something like
- // vreg0 = ...
- // vreg1 = ...
- // vreg2 = REG_SEQUENCE vreg0, sub0, vreg1, sub1, vreg2, sub2
- // vreg3 = COPY vreg2, sub0
+ // %0 = ...
+ // %1 = ...
+ // %2 = REG_SEQUENCE %0, sub0, %1, sub1, %2, sub2
+ // %3 = COPY %2, sub0
//
// We want to look through the COPY to find:
- // => vreg3 = COPY vreg0
+ // => %3 = COPY %0
// Plain copy.
return getCommonSubClass(DefRC, SrcRC) != nullptr;
}
-// FIXME: Most of these are flexible with HSA and we don't need to reserve them
-// as input registers if unused. Whether the dispatch ptr is necessary should be
-// easy to detect from used intrinsics. Scratch setup is harder to know.
-unsigned SIRegisterInfo::getPreloadedValue(const MachineFunction &MF,
- enum PreloadedValue Value) const {
-
- const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
- const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
- (void)ST;
- switch (Value) {
- case SIRegisterInfo::WORKGROUP_ID_X:
- assert(MFI->hasWorkGroupIDX());
- return MFI->WorkGroupIDXSystemSGPR;
- case SIRegisterInfo::WORKGROUP_ID_Y:
- assert(MFI->hasWorkGroupIDY());
- return MFI->WorkGroupIDYSystemSGPR;
- case SIRegisterInfo::WORKGROUP_ID_Z:
- assert(MFI->hasWorkGroupIDZ());
- return MFI->WorkGroupIDZSystemSGPR;
- case SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET:
- return MFI->PrivateSegmentWaveByteOffsetSystemSGPR;
- case SIRegisterInfo::PRIVATE_SEGMENT_BUFFER:
- assert(MFI->hasPrivateSegmentBuffer());
- return MFI->PrivateSegmentBufferUserSGPR;
- case SIRegisterInfo::IMPLICIT_BUFFER_PTR:
- assert(MFI->hasImplicitBufferPtr());
- return MFI->ImplicitBufferPtrUserSGPR;
- case SIRegisterInfo::KERNARG_SEGMENT_PTR:
- assert(MFI->hasKernargSegmentPtr());
- return MFI->KernargSegmentPtrUserSGPR;
- case SIRegisterInfo::DISPATCH_ID:
- assert(MFI->hasDispatchID());
- return MFI->DispatchIDUserSGPR;
- case SIRegisterInfo::FLAT_SCRATCH_INIT:
- assert(MFI->hasFlatScratchInit());
- return MFI->FlatScratchInitUserSGPR;
- case SIRegisterInfo::DISPATCH_PTR:
- assert(MFI->hasDispatchPtr());
- return MFI->DispatchPtrUserSGPR;
- case SIRegisterInfo::QUEUE_PTR:
- assert(MFI->hasQueuePtr());
- return MFI->QueuePtrUserSGPR;
- case SIRegisterInfo::WORKITEM_ID_X:
- assert(MFI->hasWorkItemIDX());
- return AMDGPU::VGPR0;
- case SIRegisterInfo::WORKITEM_ID_Y:
- assert(MFI->hasWorkItemIDY());
- return AMDGPU::VGPR1;
- case SIRegisterInfo::WORKITEM_ID_Z:
- assert(MFI->hasWorkItemIDZ());
- return AMDGPU::VGPR2;
- }
- llvm_unreachable("unexpected preloaded value type");
-}
-
/// \brief Returns a register that is not used at any point in the function.
/// If all registers are used, then this function will return
/// AMDGPU::NoRegister.
@@ -1525,7 +1491,8 @@ bool SIRegisterInfo::shouldCoalesce(MachineInstr *MI,
unsigned SubReg,
const TargetRegisterClass *DstRC,
unsigned DstSubReg,
- const TargetRegisterClass *NewRC) const {
+ const TargetRegisterClass *NewRC,
+ LiveIntervals &LIS) const {
unsigned SrcSize = getRegSizeInBits(*SrcRC);
unsigned DstSize = getRegSizeInBits(*DstRC);
unsigned NewSize = getRegSizeInBits(*NewRC);
@@ -1547,7 +1514,7 @@ unsigned SIRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC,
const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
unsigned Occupancy = ST.getOccupancyWithLocalMemSize(MFI->getLDSSize(),
- *MF.getFunction());
+ MF.getFunction());
switch (RC->getID()) {
default:
return AMDGPURegisterInfo::getRegPressureLimit(RC, MF);
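
The getSubRegClass change above swaps countPopulation(getAsInteger()) for LaneBitmask::getNumLanes(); the two compute the same value, since getNumLanes() is a population count of the lane mask. A minimal sketch, assuming a LaneMask stand-in with one bit per 32-bit lane:

    #include <bitset>
    #include <cassert>
    #include <cstdint>

    // Minimal stand-in for LaneBitmask: an integer mask with one bit per 32-bit lane.
    struct LaneMask {
      uint64_t Bits;
      uint64_t getAsInteger() const { return Bits; }
      unsigned getNumLanes() const { return std::bitset<64>(Bits).count(); }
    };

    int main() {
      // A sub-register index covering two 32-bit lanes, e.g. a 64-bit subreg.
      LaneMask M{0x3};
      // getNumLanes() is simply a popcount of the mask, so the new
      // getSubRegIndexLaneMask(SubIdx).getNumLanes() call yields the same
      // count as the old countPopulation(getAsInteger()) form.
      assert(M.getNumLanes() == std::bitset<64>(M.getAsInteger()).count());
      assert(M.getNumLanes() == 2);
      return 0;
    }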
diff --git a/lib/Target/AMDGPU/SIRegisterInfo.h b/lib/Target/AMDGPU/SIRegisterInfo.h
index 600cc886cb59..bf814b6974a8 100644
--- a/lib/Target/AMDGPU/SIRegisterInfo.h
+++ b/lib/Target/AMDGPU/SIRegisterInfo.h
@@ -22,6 +22,7 @@
namespace llvm {
+class LiveIntervals;
class MachineRegisterInfo;
class SISubtarget;
class SIMachineFunctionInfo;
@@ -63,6 +64,7 @@ public:
BitVector getReservedRegs(const MachineFunction &MF) const override;
const MCPhysReg *getCalleeSavedRegs(const MachineFunction *MF) const override;
+ const MCPhysReg *getCalleeSavedRegsViaCopy(const MachineFunction *MF) const;
const uint32_t *getCallPreservedMask(const MachineFunction &MF,
CallingConv::ID) const override;
@@ -185,31 +187,6 @@ public:
OpType <= AMDGPU::OPERAND_SRC_LAST;
}
- enum PreloadedValue {
- // SGPRS:
- PRIVATE_SEGMENT_BUFFER = 0,
- DISPATCH_PTR = 1,
- QUEUE_PTR = 2,
- KERNARG_SEGMENT_PTR = 3,
- DISPATCH_ID = 4,
- FLAT_SCRATCH_INIT = 5,
- WORKGROUP_ID_X = 10,
- WORKGROUP_ID_Y = 11,
- WORKGROUP_ID_Z = 12,
- PRIVATE_SEGMENT_WAVE_BYTE_OFFSET = 14,
- IMPLICIT_BUFFER_PTR = 15,
-
- // VGPRS:
- FIRST_VGPR_VALUE = 16,
- WORKITEM_ID_X = FIRST_VGPR_VALUE,
- WORKITEM_ID_Y = 17,
- WORKITEM_ID_Z = 18
- };
-
- /// \brief Returns the physical register that \p Value is stored in.
- unsigned getPreloadedValue(const MachineFunction &MF,
- enum PreloadedValue Value) const;
-
unsigned findUnusedRegister(const MachineRegisterInfo &MRI,
const TargetRegisterClass *RC,
const MachineFunction &MF) const;
@@ -236,7 +213,8 @@ public:
unsigned SubReg,
const TargetRegisterClass *DstRC,
unsigned DstSubReg,
- const TargetRegisterClass *NewRC) const override;
+ const TargetRegisterClass *NewRC,
+ LiveIntervals &LIS) const override;
unsigned getRegPressureLimit(const TargetRegisterClass *RC,
MachineFunction &MF) const override;
diff --git a/lib/Target/AMDGPU/SIRegisterInfo.td b/lib/Target/AMDGPU/SIRegisterInfo.td
index d097b78890e3..6b7c3ffb7bb8 100644
--- a/lib/Target/AMDGPU/SIRegisterInfo.td
+++ b/lib/Target/AMDGPU/SIRegisterInfo.td
@@ -77,18 +77,11 @@ def TMA : RegisterWithSubRegs<"tma", [TMA_LO, TMA_HI]>,
let HWEncoding = 110;
}
-def TTMP0 : SIReg <"ttmp0", 112>;
-def TTMP1 : SIReg <"ttmp1", 113>;
-def TTMP2 : SIReg <"ttmp2", 114>;
-def TTMP3 : SIReg <"ttmp3", 115>;
-def TTMP4 : SIReg <"ttmp4", 116>;
-def TTMP5 : SIReg <"ttmp5", 117>;
-def TTMP6 : SIReg <"ttmp6", 118>;
-def TTMP7 : SIReg <"ttmp7", 119>;
-def TTMP8 : SIReg <"ttmp8", 120>;
-def TTMP9 : SIReg <"ttmp9", 121>;
-def TTMP10 : SIReg <"ttmp10", 122>;
-def TTMP11 : SIReg <"ttmp11", 123>;
+foreach Index = 0-15 in {
+ def TTMP#Index#_vi : SIReg<"ttmp"#Index, !add(112, Index)>;
+ def TTMP#Index#_gfx9 : SIReg<"ttmp"#Index, !add(108, Index)>;
+ def TTMP#Index : SIReg<"", 0>;
+}
multiclass FLAT_SCR_LOHI_m <string n, bits<16> ci_e, bits<16> vi_e> {
def _ci : SIReg<n, ci_e>;
@@ -192,7 +185,7 @@ def SGPR_512 : RegisterTuples<[sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7,
// Trap handler TMP 32-bit registers
def TTMP_32 : RegisterClass<"AMDGPU", [i32, f32, v2i16, v2f16], 32,
- (add (sequence "TTMP%u", 0, 11))> {
+ (add (sequence "TTMP%u", 0, 15))> {
let isAllocatable = 0;
}
@@ -208,6 +201,36 @@ def TTMP_128Regs : RegisterTuples<[sub0, sub1, sub2, sub3],
(add (decimate (shl TTMP_32, 2), 4)),
(add (decimate (shl TTMP_32, 3), 4))]>;
+class TmpRegTuples <string tgt,
+ bit Is64Bit,
+ int Index0,
+ int Index1 = !add(Index0, 1),
+ int Index2 = !add(Index0, !if(Is64Bit, 1, 2)),
+ int Index3 = !add(Index0, !if(Is64Bit, 1, 3)),
+ string name = "ttmp["#Index0#":"#Index3#"]",
+ Register r0 = !cast<Register>("TTMP"#Index0#tgt),
+ Register r1 = !cast<Register>("TTMP"#Index1#tgt),
+ Register r2 = !cast<Register>("TTMP"#Index2#tgt),
+ Register r3 = !cast<Register>("TTMP"#Index3#tgt)> :
+ RegisterWithSubRegs<name, !if(Is64Bit, [r0, r1], [r0, r1, r2, r3])> {
+ let SubRegIndices = !if(Is64Bit, [sub0, sub1], [sub0, sub1, sub2, sub3]);
+ let HWEncoding = r0.HWEncoding;
+}
+
+foreach Index = {0, 2, 4, 6, 8, 10, 12, 14} in {
+ def TTMP#Index#_TTMP#!add(Index,1)#_vi : TmpRegTuples<"_vi", 1, Index>;
+ def TTMP#Index#_TTMP#!add(Index,1)#_gfx9 : TmpRegTuples<"_gfx9", 1, Index>;
+}
+
+foreach Index = {0, 4, 8, 12} in {
+ def TTMP#Index#_TTMP#!add(Index,1)#
+ _TTMP#!add(Index,2)#
+ _TTMP#!add(Index,3)#_vi : TmpRegTuples<"_vi", 0, Index>;
+ def TTMP#Index#_TTMP#!add(Index,1)#
+ _TTMP#!add(Index,2)#
+ _TTMP#!add(Index,3)#_gfx9 : TmpRegTuples<"_gfx9", 0, Index>;
+}
+
// VGPR 32-bit registers
// i16/f16 only on VI+
def VGPR_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,
@@ -269,6 +292,18 @@ def VGPR_512 : RegisterTuples<[sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7,
// Register classes used as source and destination
//===----------------------------------------------------------------------===//
+def Pseudo_SReg_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,
+ (add FP_REG, SP_REG, SCRATCH_WAVE_OFFSET_REG)> {
+ let isAllocatable = 0;
+ let CopyCost = -1;
+}
+
+def Pseudo_SReg_128 : RegisterClass<"AMDGPU", [v4i32, v2i64], 32,
+ (add PRIVATE_RSRC_REG)> {
+ let isAllocatable = 0;
+ let CopyCost = -1;
+}
+
// Subset of SReg_32 without M0 for SMRD instructions and alike.
// See comments in SIInstructions.td for more info.
def SReg_32_XM0_XEXEC : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,
@@ -278,6 +313,11 @@ def SReg_32_XM0_XEXEC : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f1
let AllocationPriority = 7;
}
+def SReg_32_XEXEC_HI : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,
+ (add SReg_32_XM0_XEXEC, EXEC_LO, M0_CLASS)> {
+ let AllocationPriority = 7;
+}
+
def SReg_32_XM0 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,
(add SReg_32_XM0_XEXEC, EXEC_LO, EXEC_HI)> {
let AllocationPriority = 7;
@@ -285,7 +325,7 @@ def SReg_32_XM0 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32
// Register class for all scalar registers (SGPRs + Special Registers)
def SReg_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,
- (add SReg_32_XM0, M0_CLASS, EXEC_LO, EXEC_HI)> {
+ (add SReg_32_XM0, M0_CLASS, EXEC_LO, EXEC_HI, SReg_32_XEXEC_HI)> {
let AllocationPriority = 7;
}
@@ -466,6 +506,8 @@ defm SSrc : RegImmOperand<"SReg", "SSrc">;
defm SCSrc : RegInlineOperand<"SReg", "SCSrc"> ;
+def SCSrc_i1 : RegisterOperand<SReg_64_XEXEC>;
+
//===----------------------------------------------------------------------===//
// VSrc_* Operands with an SGPR, VGPR or a 32-bit immediate
//===----------------------------------------------------------------------===//
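
For reference, the per-target TTMP definitions above differ only in their hardware-encoding base: !add(112, Index) on VI and !add(108, Index) on GFX9. A small sketch that just prints the resulting encodings (nothing beyond that arithmetic is taken from the .td):

    #include <cstdio>

    int main() {
      // ttmpN hardware encodings per target, mirroring the foreach above.
      for (int Index = 0; Index <= 15; ++Index)
        std::printf("ttmp%-2d  vi:%3d  gfx9:%3d\n", Index, 112 + Index, 108 + Index);
      return 0;
    }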
diff --git a/lib/Target/AMDGPU/SIShrinkInstructions.cpp b/lib/Target/AMDGPU/SIShrinkInstructions.cpp
index 874fbadca7f3..41f989ad3228 100644
--- a/lib/Target/AMDGPU/SIShrinkInstructions.cpp
+++ b/lib/Target/AMDGPU/SIShrinkInstructions.cpp
@@ -286,7 +286,7 @@ static void shrinkScalarCompare(const SIInstrInfo *TII, MachineInstr &MI) {
}
bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
- if (skipFunction(*MF.getFunction()))
+ if (skipFunction(MF.getFunction()))
return false;
MachineRegisterInfo &MRI = MF.getRegInfo();
diff --git a/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/lib/Target/AMDGPU/SIWholeQuadMode.cpp
index a613a220e29d..53aefe829737 100644
--- a/lib/Target/AMDGPU/SIWholeQuadMode.cpp
+++ b/lib/Target/AMDGPU/SIWholeQuadMode.cpp
@@ -9,7 +9,7 @@
//
/// \file
/// \brief This pass adds instructions to enable whole quad mode for pixel
-/// shaders.
+/// shaders, and whole wavefront mode for all programs.
///
/// Whole quad mode is required for derivative computations, but it interferes
/// with shader side effects (stores and atomics). This pass is run on the
@@ -29,6 +29,13 @@
/// ...
/// S_MOV_B64 EXEC, Tmp
///
+/// We also compute when a sequence of instructions requires Whole Wavefront
+/// Mode (WWM) and insert instructions to save and restore it:
+///
+/// S_OR_SAVEEXEC_B64 Tmp, -1
+/// ...
+/// S_MOV_B64 EXEC, Tmp
+///
/// In order to avoid excessive switching during sequences of Exact
/// instructions, the pass first analyzes which instructions must be run in WQM
/// (aka which instructions produce values that lead to derivative
@@ -54,10 +61,11 @@
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/CodeGen/LiveInterval.h"
-#include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
@@ -66,13 +74,13 @@
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/SlotIndexes.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/Pass.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetRegisterInfo.h"
#include <cassert>
#include <vector>
@@ -84,7 +92,8 @@ namespace {
enum {
StateWQM = 0x1,
- StateExact = 0x2,
+ StateWWM = 0x2,
+ StateExact = 0x4,
};
struct PrintState {
@@ -94,20 +103,28 @@ public:
explicit PrintState(int State) : State(State) {}
};
+#ifndef NDEBUG
static raw_ostream &operator<<(raw_ostream &OS, const PrintState &PS) {
if (PS.State & StateWQM)
OS << "WQM";
- if (PS.State & StateExact) {
+ if (PS.State & StateWWM) {
if (PS.State & StateWQM)
OS << '|';
+ OS << "WWM";
+ }
+ if (PS.State & StateExact) {
+ if (PS.State & (StateWQM | StateWWM))
+ OS << '|';
OS << "Exact";
}
return OS;
}
+#endif
struct InstrInfo {
char Needs = 0;
+ char Disabled = 0;
char OutNeeds = 0;
};
@@ -128,6 +145,7 @@ struct WorkItem {
class SIWholeQuadMode : public MachineFunctionPass {
private:
+ CallingConv::ID CallingConv;
const SIInstrInfo *TII;
const SIRegisterInfo *TRI;
MachineRegisterInfo *MRI;
@@ -136,12 +154,14 @@ private:
DenseMap<const MachineInstr *, InstrInfo> Instructions;
DenseMap<MachineBasicBlock *, BlockInfo> Blocks;
SmallVector<MachineInstr *, 1> LiveMaskQueries;
+ SmallVector<MachineInstr *, 4> LowerToCopyInstrs;
void printInfo();
void markInstruction(MachineInstr &MI, char Flag,
std::vector<WorkItem> &Worklist);
- void markUsesWQM(const MachineInstr &MI, std::vector<WorkItem> &Worklist);
+ void markInstructionUses(const MachineInstr &MI, char Flag,
+ std::vector<WorkItem> &Worklist);
char scanInstructions(MachineFunction &MF, std::vector<WorkItem> &Worklist);
void propagateInstruction(MachineInstr &MI, std::vector<WorkItem> &Worklist);
void propagateBlock(MachineBasicBlock &MBB, std::vector<WorkItem> &Worklist);
@@ -159,9 +179,14 @@ private:
unsigned SaveWQM, unsigned LiveMaskReg);
void toWQM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
unsigned SavedWQM);
+ void toWWM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
+ unsigned SaveOrig);
+ void fromWWM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
+ unsigned SavedOrig);
void processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg, bool isEntry);
void lowerLiveMaskQueries(unsigned LiveMaskReg);
+ void lowerCopyInstrs();
public:
static char ID;
@@ -196,9 +221,11 @@ FunctionPass *llvm::createSIWholeQuadModePass() {
return new SIWholeQuadMode;
}
-void SIWholeQuadMode::printInfo() {
+#ifndef NDEBUG
+LLVM_DUMP_METHOD void SIWholeQuadMode::printInfo() {
for (const auto &BII : Blocks) {
- dbgs() << "\nBB#" << BII.first->getNumber() << ":\n"
+ dbgs() << "\n"
+ << printMBBReference(*BII.first) << ":\n"
<< " InNeeds = " << PrintState(BII.second.InNeeds)
<< ", Needs = " << PrintState(BII.second.Needs)
<< ", OutNeeds = " << PrintState(BII.second.OutNeeds) << "\n\n";
@@ -213,27 +240,32 @@ void SIWholeQuadMode::printInfo() {
}
}
}
+#endif
void SIWholeQuadMode::markInstruction(MachineInstr &MI, char Flag,
std::vector<WorkItem> &Worklist) {
InstrInfo &II = Instructions[&MI];
- assert(Flag == StateWQM || Flag == StateExact);
+ assert(!(Flag & StateExact) && Flag != 0);
- // Ignore if the instruction is already marked. The typical case is that we
- // mark an instruction WQM multiple times, but for atomics it can happen that
- // Flag is StateWQM, but Needs is already set to StateExact. In this case,
- // letting the atomic run in StateExact is correct as per the relevant specs.
- if (II.Needs)
+ // Remove any disabled states from the flag. The user that required it gets
+ // an undefined value in the helper lanes. For example, this can happen if
+  // the result of an atomic is used by an instruction that requires WQM, where
+ // ignoring the request for WQM is correct as per the relevant specs.
+ Flag &= ~II.Disabled;
+
+ // Ignore if the flag is already encompassed by the existing needs, or we
+ // just disabled everything.
+ if ((II.Needs & Flag) == Flag)
return;
- II.Needs = Flag;
+ II.Needs |= Flag;
Worklist.push_back(&MI);
}
-/// Mark all instructions defining the uses in \p MI as WQM.
-void SIWholeQuadMode::markUsesWQM(const MachineInstr &MI,
- std::vector<WorkItem> &Worklist) {
+/// Mark all instructions defining the uses in \p MI with \p Flag.
+void SIWholeQuadMode::markInstructionUses(const MachineInstr &MI, char Flag,
+ std::vector<WorkItem> &Worklist) {
for (const MachineOperand &Use : MI.uses()) {
if (!Use.isReg() || !Use.isUse())
continue;
@@ -258,7 +290,7 @@ void SIWholeQuadMode::markUsesWQM(const MachineInstr &MI,
if (Value->isPHIDef())
continue;
- markInstruction(*LIS->getInstructionFromIndex(Value->def), StateWQM,
+ markInstruction(*LIS->getInstructionFromIndex(Value->def), Flag,
Worklist);
}
@@ -266,7 +298,7 @@ void SIWholeQuadMode::markUsesWQM(const MachineInstr &MI,
}
for (MachineInstr &DefMI : MRI->def_instructions(Use.getReg()))
- markInstruction(DefMI, StateWQM, Worklist);
+ markInstruction(DefMI, Flag, Worklist);
}
}
@@ -275,27 +307,72 @@ void SIWholeQuadMode::markUsesWQM(const MachineInstr &MI,
char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
std::vector<WorkItem> &Worklist) {
char GlobalFlags = 0;
- bool WQMOutputs = MF.getFunction()->hasFnAttribute("amdgpu-ps-wqm-outputs");
-
- for (auto BI = MF.begin(), BE = MF.end(); BI != BE; ++BI) {
- MachineBasicBlock &MBB = *BI;
+ bool WQMOutputs = MF.getFunction().hasFnAttribute("amdgpu-ps-wqm-outputs");
+ SmallVector<MachineInstr *, 4> SetInactiveInstrs;
+
+ // We need to visit the basic blocks in reverse post-order so that we visit
+ // defs before uses, in particular so that we don't accidentally mark an
+ // instruction as needing e.g. WQM before visiting it and realizing it needs
+ // WQM disabled.
+ ReversePostOrderTraversal<MachineFunction *> RPOT(&MF);
+ for (auto BI = RPOT.begin(), BE = RPOT.end(); BI != BE; ++BI) {
+ MachineBasicBlock &MBB = **BI;
+ BlockInfo &BBI = Blocks[&MBB];
for (auto II = MBB.begin(), IE = MBB.end(); II != IE; ++II) {
MachineInstr &MI = *II;
+ InstrInfo &III = Instructions[&MI];
unsigned Opcode = MI.getOpcode();
char Flags = 0;
- if (TII->isDS(Opcode)) {
+ if (TII->isDS(Opcode) && CallingConv == CallingConv::AMDGPU_PS) {
Flags = StateWQM;
} else if (TII->isWQM(Opcode)) {
// Sampling instructions don't need to produce results for all pixels
// in a quad, they just require all inputs of a quad to have been
// computed for derivatives.
- markUsesWQM(MI, Worklist);
+ markInstructionUses(MI, StateWQM, Worklist);
GlobalFlags |= StateWQM;
continue;
+ } else if (Opcode == AMDGPU::WQM) {
+ // The WQM intrinsic requires its output to have all the helper lanes
+ // correct, so we need it to be in WQM.
+ Flags = StateWQM;
+ LowerToCopyInstrs.push_back(&MI);
+ } else if (Opcode == AMDGPU::WWM) {
+  // The WWM intrinsic doesn't make the same guarantee, and it also needs
+ // to be executed in WQM or Exact so that its copy doesn't clobber
+ // inactive lanes.
+ markInstructionUses(MI, StateWWM, Worklist);
+ GlobalFlags |= StateWWM;
+ LowerToCopyInstrs.push_back(&MI);
+ continue;
+ } else if (Opcode == AMDGPU::V_SET_INACTIVE_B32 ||
+ Opcode == AMDGPU::V_SET_INACTIVE_B64) {
+ III.Disabled = StateWWM;
+ MachineOperand &Inactive = MI.getOperand(2);
+ if (Inactive.isReg()) {
+ if (Inactive.isUndef()) {
+ LowerToCopyInstrs.push_back(&MI);
+ } else {
+ unsigned Reg = Inactive.getReg();
+ if (TargetRegisterInfo::isVirtualRegister(Reg)) {
+ for (MachineInstr &DefMI : MRI->def_instructions(Reg))
+ markInstruction(DefMI, StateWWM, Worklist);
+ }
+ }
+ }
+ SetInactiveInstrs.push_back(&MI);
+ continue;
} else if (TII->isDisableWQM(MI)) {
- Flags = StateExact;
+ BBI.Needs |= StateExact;
+ if (!(BBI.InNeeds & StateExact)) {
+ BBI.InNeeds |= StateExact;
+ Worklist.push_back(&MBB);
+ }
+ GlobalFlags |= StateExact;
+ III.Disabled = StateWQM | StateWWM;
+ continue;
} else {
if (Opcode == AMDGPU::SI_PS_LIVE) {
LiveMaskQueries.push_back(&MI);
@@ -326,6 +403,14 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
}
}
+  // Make sure that any SET_INACTIVE instructions are computed in WQM if WQM is
+ // ever used anywhere in the function. This implements the corresponding
+ // semantics of @llvm.amdgcn.set.inactive.
+ if (GlobalFlags & StateWQM) {
+ for (MachineInstr *MI : SetInactiveInstrs)
+ markInstruction(*MI, StateWQM, Worklist);
+ }
+
return GlobalFlags;
}
@@ -337,22 +422,24 @@ void SIWholeQuadMode::propagateInstruction(MachineInstr &MI,
// Control flow-type instructions and stores to temporary memory that are
// followed by WQM computations must themselves be in WQM.
- if ((II.OutNeeds & StateWQM) && !II.Needs &&
+ if ((II.OutNeeds & StateWQM) && !(II.Disabled & StateWQM) &&
(MI.isTerminator() || (TII->usesVM_CNT(MI) && MI.mayStore()))) {
Instructions[&MI].Needs = StateWQM;
II.Needs = StateWQM;
}
// Propagate to block level
- BI.Needs |= II.Needs;
- if ((BI.InNeeds | II.Needs) != BI.InNeeds) {
- BI.InNeeds |= II.Needs;
- Worklist.push_back(MBB);
+ if (II.Needs & StateWQM) {
+ BI.Needs |= StateWQM;
+ if (!(BI.InNeeds & StateWQM)) {
+ BI.InNeeds |= StateWQM;
+ Worklist.push_back(MBB);
+ }
}
// Propagate backwards within block
if (MachineInstr *PrevMI = MI.getPrevNode()) {
- char InNeeds = II.Needs | II.OutNeeds;
+ char InNeeds = (II.Needs & ~StateWWM) | II.OutNeeds;
if (!PrevMI->isPHI()) {
InstrInfo &PrevII = Instructions[PrevMI];
if ((PrevII.OutNeeds | InNeeds) != PrevII.OutNeeds) {
@@ -363,10 +450,10 @@ void SIWholeQuadMode::propagateInstruction(MachineInstr &MI,
}
// Propagate WQM flag to instruction inputs
- assert(II.Needs != (StateWQM | StateExact));
+ assert(!(II.Needs & StateExact));
- if (II.Needs == StateWQM)
- markUsesWQM(MI, Worklist);
+ if (II.Needs != 0)
+ markInstructionUses(MI, II.Needs, Worklist);
}
void SIWholeQuadMode::propagateBlock(MachineBasicBlock &MBB,
@@ -558,6 +645,29 @@ void SIWholeQuadMode::toWQM(MachineBasicBlock &MBB,
LIS->InsertMachineInstrInMaps(*MI);
}
+void SIWholeQuadMode::toWWM(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator Before,
+ unsigned SaveOrig) {
+ MachineInstr *MI;
+
+ assert(SaveOrig);
+ MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::S_OR_SAVEEXEC_B64),
+ SaveOrig)
+ .addImm(-1);
+ LIS->InsertMachineInstrInMaps(*MI);
+}
+
+void SIWholeQuadMode::fromWWM(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator Before,
+ unsigned SavedOrig) {
+ MachineInstr *MI;
+
+ assert(SavedOrig);
+ MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::EXIT_WWM), AMDGPU::EXEC)
+ .addReg(SavedOrig);
+ LIS->InsertMachineInstrInMaps(*MI);
+}
+
void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg,
bool isEntry) {
auto BII = Blocks.find(&MBB);
@@ -566,45 +676,66 @@ void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg,
const BlockInfo &BI = BII->second;
- if (!(BI.InNeeds & StateWQM))
- return;
-
// This is a non-entry block that is WQM throughout, so no need to do
// anything.
- if (!isEntry && !(BI.Needs & StateExact) && BI.OutNeeds != StateExact)
+ if (!isEntry && BI.Needs == StateWQM && BI.OutNeeds != StateExact)
return;
- DEBUG(dbgs() << "\nProcessing block BB#" << MBB.getNumber() << ":\n");
+ DEBUG(dbgs() << "\nProcessing block " << printMBBReference(MBB) << ":\n");
unsigned SavedWQMReg = 0;
+ unsigned SavedNonWWMReg = 0;
bool WQMFromExec = isEntry;
- char State = isEntry ? StateExact : StateWQM;
+ char State = (isEntry || !(BI.InNeeds & StateWQM)) ? StateExact : StateWQM;
+ char NonWWMState = 0;
auto II = MBB.getFirstNonPHI(), IE = MBB.end();
if (isEntry)
++II; // Skip the instruction that saves LiveMask
- MachineBasicBlock::iterator First = IE;
+ // This stores the first instruction where it's safe to switch from WQM to
+ // Exact or vice versa.
+ MachineBasicBlock::iterator FirstWQM = IE;
+
+ // This stores the first instruction where it's safe to switch from WWM to
+ // Exact/WQM or to switch to WWM. It must always be the same as, or after,
+ // FirstWQM since if it's safe to switch to/from WWM, it must be safe to
+ // switch to/from WQM as well.
+ MachineBasicBlock::iterator FirstWWM = IE;
for (;;) {
MachineBasicBlock::iterator Next = II;
- char Needs = 0;
+ char Needs = StateExact | StateWQM; // WWM is disabled by default
char OutNeeds = 0;
- if (First == IE)
- First = II;
+ if (FirstWQM == IE)
+ FirstWQM = II;
+
+ if (FirstWWM == IE)
+ FirstWWM = II;
+ // First, figure out the allowed states (Needs) based on the propagated
+ // flags.
if (II != IE) {
MachineInstr &MI = *II;
if (requiresCorrectState(MI)) {
auto III = Instructions.find(&MI);
if (III != Instructions.end()) {
- Needs = III->second.Needs;
+ if (III->second.Needs & StateWWM)
+ Needs = StateWWM;
+ else if (III->second.Needs & StateWQM)
+ Needs = StateWQM;
+ else
+ Needs &= ~III->second.Disabled;
OutNeeds = III->second.OutNeeds;
}
+ } else {
+ // If the instruction doesn't actually need a correct EXEC, then we can
+ // safely leave WWM enabled.
+ Needs = StateExact | StateWQM | StateWWM;
}
- if (MI.isTerminator() && !Needs && OutNeeds == StateExact)
+ if (MI.isTerminator() && OutNeeds == StateExact)
Needs = StateExact;
if (MI.getOpcode() == AMDGPU::SI_ELSE && BI.OutNeeds == StateExact)
@@ -617,20 +748,45 @@ void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg,
Needs = StateWQM;
else if (BI.OutNeeds == StateExact)
Needs = StateExact;
+ else
+ Needs = StateWQM | StateExact;
}
- if (Needs) {
- if (Needs != State) {
- MachineBasicBlock::iterator Before =
- prepareInsertion(MBB, First, II, Needs == StateWQM,
- Needs == StateExact || WQMFromExec);
+ // Now, transition if necessary.
+ if (!(Needs & State)) {
+ MachineBasicBlock::iterator First;
+ if (State == StateWWM || Needs == StateWWM) {
+ // We must switch to or from WWM
+ First = FirstWWM;
+ } else {
+ // We only need to switch to/from WQM, so we can use FirstWQM
+ First = FirstWQM;
+ }
- if (Needs == StateExact) {
+ MachineBasicBlock::iterator Before =
+ prepareInsertion(MBB, First, II, Needs == StateWQM,
+ Needs == StateExact || WQMFromExec);
+
+ if (State == StateWWM) {
+ assert(SavedNonWWMReg);
+ fromWWM(MBB, Before, SavedNonWWMReg);
+ State = NonWWMState;
+ }
+
+ if (Needs == StateWWM) {
+ NonWWMState = State;
+ SavedNonWWMReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
+ toWWM(MBB, Before, SavedNonWWMReg);
+ State = StateWWM;
+ } else {
+ if (State == StateWQM && (Needs & StateExact) && !(Needs & StateWQM)) {
if (!WQMFromExec && (OutNeeds & StateWQM))
SavedWQMReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
toExact(MBB, Before, SavedWQMReg, LiveMaskReg);
- } else {
+ State = StateExact;
+ } else if (State == StateExact && (Needs & StateWQM) &&
+ !(Needs & StateExact)) {
assert(WQMFromExec == (SavedWQMReg == 0));
toWQM(MBB, Before, SavedWQMReg);
@@ -639,12 +795,19 @@ void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg,
LIS->createAndComputeVirtRegInterval(SavedWQMReg);
SavedWQMReg = 0;
}
+ State = StateWQM;
+ } else {
+ // We can get here if we transitioned from WWM to a non-WWM state that
+ // already matches our needs, but we shouldn't need to do anything.
+ assert(Needs & State);
}
-
- State = Needs;
}
+ }
- First = IE;
+ if (Needs != (StateExact | StateWQM | StateWWM)) {
+ if (Needs != (StateExact | StateWQM))
+ FirstWQM = IE;
+ FirstWWM = IE;
}
if (II == IE)
@@ -666,13 +829,20 @@ void SIWholeQuadMode::lowerLiveMaskQueries(unsigned LiveMaskReg) {
}
}
-bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
- if (MF.getFunction()->getCallingConv() != CallingConv::AMDGPU_PS)
- return false;
+void SIWholeQuadMode::lowerCopyInstrs() {
+ for (MachineInstr *MI : LowerToCopyInstrs) {
+ for (unsigned i = MI->getNumExplicitOperands() - 1; i > 1; i--)
+ MI->RemoveOperand(i);
+ MI->setDesc(TII->get(AMDGPU::COPY));
+ }
+}
+bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
Instructions.clear();
Blocks.clear();
LiveMaskQueries.clear();
+ LowerToCopyInstrs.clear();
+ CallingConv = MF.getFunction().getCallingConv();
const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
@@ -682,14 +852,13 @@ bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
LIS = &getAnalysis<LiveIntervals>();
char GlobalFlags = analyzeFunction(MF);
+ unsigned LiveMaskReg = 0;
if (!(GlobalFlags & StateWQM)) {
lowerLiveMaskQueries(AMDGPU::EXEC);
- return !LiveMaskQueries.empty();
- }
-
- // Store a copy of the original live mask when required
- unsigned LiveMaskReg = 0;
- {
+ if (!(GlobalFlags & StateWWM))
+ return !LiveMaskQueries.empty();
+ } else {
+ // Store a copy of the original live mask when required
MachineBasicBlock &Entry = MF.front();
MachineBasicBlock::iterator EntryMI = Entry.getFirstNonPHI();
@@ -701,13 +870,15 @@ bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
LIS->InsertMachineInstrInMaps(*MI);
}
+ lowerLiveMaskQueries(LiveMaskReg);
+
if (GlobalFlags == StateWQM) {
// For a shader that needs only WQM, we can just set it once.
BuildMI(Entry, EntryMI, DebugLoc(), TII->get(AMDGPU::S_WQM_B64),
AMDGPU::EXEC)
.addReg(AMDGPU::EXEC);
- lowerLiveMaskQueries(LiveMaskReg);
+ lowerCopyInstrs();
// EntryMI may become invalid here
return true;
}
@@ -715,7 +886,7 @@ bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
DEBUG(printInfo());
- lowerLiveMaskQueries(LiveMaskReg);
+ lowerCopyInstrs();
// Handle the general case
for (auto BII : Blocks)
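The processBlock rework above turns the single WQM/Exact toggle into a small state machine: Needs becomes a bitmask of acceptable states (WQM, Exact, WWM) and a transition is only emitted when the current State is not in that set. A minimal standalone sketch of that check, with sketch-local constants and a helper (nextState) standing in for the pass's real bookkeeping:

#include <cstdio>

// Sketch-local state bits; the values are arbitrary, only membership matters.
enum : unsigned { StateWQM = 1u << 0, StateExact = 1u << 1, StateWWM = 1u << 2 };

// Decide the next state: stay put when the current state already satisfies
// Needs, otherwise prefer WWM (which saves/restores the previous state in the
// real pass), then Exact, then WQM.
static unsigned nextState(unsigned State, unsigned Needs) {
  if (Needs & State)
    return State;
  if (Needs & StateWWM)
    return StateWWM;
  return (Needs & StateExact) ? StateExact : StateWQM;
}

int main() {
  // An instruction that tolerates WQM or Exact leaves an Exact block alone.
  printf("%u\n", nextState(StateExact, StateWQM | StateExact)); // 2 (Exact)
  // An Exact-only instruction forces a switch out of WQM.
  printf("%u\n", nextState(StateWQM, StateExact));              // 2 (Exact)
}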
diff --git a/lib/Target/AMDGPU/SMInstructions.td b/lib/Target/AMDGPU/SMInstructions.td
index 73dd8b7daa4e..8f347986eb8a 100644
--- a/lib/Target/AMDGPU/SMInstructions.td
+++ b/lib/Target/AMDGPU/SMInstructions.td
@@ -129,11 +129,8 @@ class SM_Time_Pseudo<string opName, SDPatternOperator node> : SM_Pseudo<
opName, (outs SReg_64_XEXEC:$sdst), (ins),
" $sdst", [(set i64:$sdst, (node))]> {
let hasSideEffects = 1;
- // FIXME: mayStore = ? is a workaround for tablegen bug for different
- // inferred mayStore flags for the instruction pattern vs. standalone
- // Pat. Each considers the other contradictory.
- let mayStore = ?;
- let mayLoad = ?;
+ let mayStore = 0;
+ let mayLoad = 1;
let has_sbase = 0;
let has_offset = 0;
}
@@ -239,27 +236,24 @@ def SMRDImm32 : ComplexPattern<i64, 2, "SelectSMRDImm32">;
def SMRDSgpr : ComplexPattern<i64, 2, "SelectSMRDSgpr">;
def SMRDBufferImm : ComplexPattern<i32, 1, "SelectSMRDBufferImm">;
def SMRDBufferImm32 : ComplexPattern<i32, 1, "SelectSMRDBufferImm32">;
-def SMRDBufferSgpr : ComplexPattern<i32, 1, "SelectSMRDBufferSgpr">;
-
-let Predicates = [isGCN] in {
multiclass SMRD_Pattern <string Instr, ValueType vt> {
// 1. IMM offset
- def : Pat <
+ def : GCNPat <
(smrd_load (SMRDImm i64:$sbase, i32:$offset)),
(vt (!cast<SM_Pseudo>(Instr#"_IMM") $sbase, $offset, 0))
>;
// 2. SGPR offset
- def : Pat <
+ def : GCNPat <
(smrd_load (SMRDSgpr i64:$sbase, i32:$offset)),
(vt (!cast<SM_Pseudo>(Instr#"_SGPR") $sbase, $offset, 0))
>;
}
-let Predicates = [isSICI] in {
-def : Pat <
+let OtherPredicates = [isSICI] in {
+def : GCNPat <
(i64 (readcyclecounter)),
(S_MEMTIME)
>;
@@ -277,29 +271,27 @@ defm : SMRD_Pattern <"S_LOAD_DWORDX8", v8i32>;
defm : SMRD_Pattern <"S_LOAD_DWORDX16", v16i32>;
// 1. Offset as an immediate
-def SM_LOAD_PATTERN : Pat < // name this pattern to reuse AddedComplexity on CI
+def SM_LOAD_PATTERN : GCNPat < // name this pattern to reuse AddedComplexity on CI
(SIload_constant v4i32:$sbase, (SMRDBufferImm i32:$offset)),
(S_BUFFER_LOAD_DWORD_IMM $sbase, $offset, 0)
>;
// 2. Offset loaded in a 32-bit SGPR
-def : Pat <
- (SIload_constant v4i32:$sbase, (SMRDBufferSgpr i32:$offset)),
+def : GCNPat <
+ (SIload_constant v4i32:$sbase, i32:$offset),
(S_BUFFER_LOAD_DWORD_SGPR $sbase, $offset, 0)
>;
} // End let AddedComplexity = 100
-} // let Predicates = [isGCN]
-
-let Predicates = [isVI] in {
+let OtherPredicates = [isVI] in {
-def : Pat <
+def : GCNPat <
(i64 (readcyclecounter)),
(S_MEMREALTIME)
>;
-} // let Predicates = [isVI]
+} // let OtherPredicates = [isVI]
//===----------------------------------------------------------------------===//
@@ -508,10 +500,10 @@ def S_DCACHE_INV_VOL_ci : SMRD_Real_ci <0x1d, S_DCACHE_INV_VOL>;
let AddedComplexity = SM_LOAD_PATTERN.AddedComplexity in {
-class SMRD_Pattern_ci <string Instr, ValueType vt> : Pat <
+class SMRD_Pattern_ci <string Instr, ValueType vt> : GCNPat <
(smrd_load (SMRDImm32 i64:$sbase, i32:$offset)),
(vt (!cast<SM_Pseudo>(Instr#"_IMM_ci") $sbase, $offset, 0))> {
- let Predicates = [isCIOnly];
+ let OtherPredicates = [isCIOnly];
}
def : SMRD_Pattern_ci <"S_LOAD_DWORD", i32>;
@@ -520,10 +512,10 @@ def : SMRD_Pattern_ci <"S_LOAD_DWORDX4", v4i32>;
def : SMRD_Pattern_ci <"S_LOAD_DWORDX8", v8i32>;
def : SMRD_Pattern_ci <"S_LOAD_DWORDX16", v16i32>;
-def : Pat <
+def : GCNPat <
(SIload_constant v4i32:$sbase, (SMRDBufferImm32 i32:$offset)),
(S_BUFFER_LOAD_DWORD_IMM_ci $sbase, $offset, 0)> {
- let Predicates = [isCI]; // should this be isCIOnly?
+ let OtherPredicates = [isCI]; // should this be isCIOnly?
}
} // End let AddedComplexity = SM_LOAD_PATTERN.AddedComplexity
diff --git a/lib/Target/AMDGPU/SOPInstructions.td b/lib/Target/AMDGPU/SOPInstructions.td
index ec29a66c8bbb..02a95a4b6f24 100644
--- a/lib/Target/AMDGPU/SOPInstructions.td
+++ b/lib/Target/AMDGPU/SOPInstructions.td
@@ -139,7 +139,9 @@ let Defs = [SCC] in {
[(set i64:$sdst, (not i64:$src0))]
>;
def S_WQM_B32 : SOP1_32 <"s_wqm_b32">;
- def S_WQM_B64 : SOP1_64 <"s_wqm_b64">;
+ def S_WQM_B64 : SOP1_64 <"s_wqm_b64",
+ [(set i1:$sdst, (int_amdgcn_wqm_vote i1:$src0))]
+ >;
} // End Defs = [SCC]
@@ -159,10 +161,11 @@ def S_BCNT1_I32_B64 : SOP1_32_64 <"s_bcnt1_i32_b64">;
def S_FF0_I32_B32 : SOP1_32 <"s_ff0_i32_b32">;
def S_FF0_I32_B64 : SOP1_32_64 <"s_ff0_i32_b64">;
+def S_FF1_I32_B64 : SOP1_32_64 <"s_ff1_i32_b64">;
+
def S_FF1_I32_B32 : SOP1_32 <"s_ff1_i32_b32",
- [(set i32:$sdst, (cttz_zero_undef i32:$src0))]
+ [(set i32:$sdst, (AMDGPUffbl_b32 i32:$src0))]
>;
-def S_FF1_I32_B64 : SOP1_32_64 <"s_ff1_i32_b64">;
def S_FLBIT_I32_B32 : SOP1_32 <"s_flbit_i32_b32",
[(set i32:$sdst, (AMDGPUffbh_u32 i32:$src0))]
@@ -391,6 +394,14 @@ def S_XOR_B32 : SOP2_32 <"s_xor_b32",
def S_XOR_B64 : SOP2_64 <"s_xor_b64",
[(set i64:$sdst, (xor i64:$src0, i64:$src1))]
>;
+
+def S_XNOR_B32 : SOP2_32 <"s_xnor_b32",
+ [(set i32:$sdst, (not (xor_oneuse i32:$src0, i32:$src1)))]
+>;
+
+def S_XNOR_B64 : SOP2_64 <"s_xnor_b64",
+ [(set i64:$sdst, (not (xor_oneuse i64:$src0, i64:$src1)))]
+>;
} // End isCommutable = 1
def S_ANDN2_B32 : SOP2_32 <"s_andn2_b32">;
@@ -401,8 +412,6 @@ def S_NAND_B32 : SOP2_32 <"s_nand_b32">;
def S_NAND_B64 : SOP2_64 <"s_nand_b64">;
def S_NOR_B32 : SOP2_32 <"s_nor_b32">;
def S_NOR_B64 : SOP2_64 <"s_nor_b64">;
-def S_XNOR_B32 : SOP2_32 <"s_xnor_b32">;
-def S_XNOR_B64 : SOP2_64 <"s_xnor_b64">;
} // End Defs = [SCC]
// Use added complexity so these patterns are preferred to the VALU patterns.
@@ -811,8 +820,7 @@ def S_CBRANCH_SCC0 : SOPP <
>;
def S_CBRANCH_SCC1 : SOPP <
0x00000005, (ins sopp_brtarget:$simm16),
- "s_cbranch_scc1 $simm16",
- [(si_uniform_br_scc SCC, bb:$simm16)]
+ "s_cbranch_scc1 $simm16"
>;
} // End Uses = [SCC]
@@ -942,12 +950,10 @@ def S_SET_GPR_IDX_MODE : SOPP<0x1d, (ins GPRIdxMode:$simm16),
}
}
-let Predicates = [isGCN] in {
-
//===----------------------------------------------------------------------===//
// S_GETREG_B32 Intrinsic Pattern.
//===----------------------------------------------------------------------===//
-def : Pat <
+def : GCNPat <
(int_amdgcn_s_getreg imm:$simm16),
(S_GETREG_B32 (as_i16imm $simm16))
>;
@@ -956,25 +962,25 @@ def : Pat <
// SOP1 Patterns
//===----------------------------------------------------------------------===//
-def : Pat <
+def : GCNPat <
(i64 (ctpop i64:$src)),
(i64 (REG_SEQUENCE SReg_64,
(i32 (COPY_TO_REGCLASS (S_BCNT1_I32_B64 $src), SReg_32)), sub0,
(S_MOV_B32 (i32 0)), sub1))
>;
-def : Pat <
+def : GCNPat <
(i32 (smax i32:$x, (i32 (ineg i32:$x)))),
(S_ABS_I32 $x)
>;
-def : Pat <
+def : GCNPat <
(i16 imm:$imm),
(S_MOV_B32 imm:$imm)
>;
// Same as a 32-bit inreg
-def : Pat<
+def : GCNPat<
(i32 (sext i16:$src)),
(S_SEXT_I32_I16 $src)
>;
@@ -986,7 +992,7 @@ def : Pat<
// V_ADD_I32_e32/S_ADD_U32 produces carry in VCC/SCC. For the vector
// case, the sgpr-copies pass will fix this to use the vector version.
-def : Pat <
+def : GCNPat <
(i32 (addc i32:$src0, i32:$src1)),
(S_ADD_U32 $src0, $src1)
>;
@@ -994,20 +1000,20 @@ def : Pat <
// FIXME: We need to use COPY_TO_REGCLASS to work-around the fact that
// REG_SEQUENCE patterns don't support instructions with multiple
// outputs.
-def : Pat<
+def : GCNPat<
(i64 (zext i16:$src)),
(REG_SEQUENCE SReg_64,
(i32 (COPY_TO_REGCLASS (S_AND_B32 $src, (S_MOV_B32 (i32 0xffff))), SGPR_32)), sub0,
(S_MOV_B32 (i32 0)), sub1)
>;
-def : Pat <
+def : GCNPat <
(i64 (sext i16:$src)),
(REG_SEQUENCE SReg_64, (i32 (S_SEXT_I32_I16 $src)), sub0,
(i32 (COPY_TO_REGCLASS (S_ASHR_I32 (i32 (S_SEXT_I32_I16 $src)), (S_MOV_B32 (i32 31))), SGPR_32)), sub1)
>;
-def : Pat<
+def : GCNPat<
(i32 (zext i16:$src)),
(S_AND_B32 (S_MOV_B32 (i32 0xffff)), $src)
>;
@@ -1018,13 +1024,11 @@ def : Pat<
// SOPP Patterns
//===----------------------------------------------------------------------===//
-def : Pat <
+def : GCNPat <
(int_amdgcn_s_waitcnt i32:$simm16),
(S_WAITCNT (as_i16imm $simm16))
>;
-} // End isGCN predicate
-
//===----------------------------------------------------------------------===//
// Real target instructions, move this to the appropriate subtarget TD file
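S_XNOR_B32/B64 gain selection patterns above that fold a NOT of a single-use XOR into one instruction. A short standalone sketch of the value being computed (xnor32 is a stand-in name, not backend code):

#include <cstdint>
#include <cstdio>

// Sketch of the identity the S_XNOR patterns select on: the complement of an
// exclusive-or. The xor_oneuse operand restricts the fold to XORs with a
// single use, so the XOR never has to be materialized on its own as well.
static uint32_t xnor32(uint32_t a, uint32_t b) { return ~(a ^ b); }

int main() {
  // Bits that agree come out as 1, bits that differ come out as 0.
  printf("0x%08x\n", (unsigned)xnor32(0xff00ff00u, 0xffff0000u)); // 0xff0000ff
}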
diff --git a/lib/Target/AMDGPU/TargetInfo/AMDGPUTargetInfo.cpp b/lib/Target/AMDGPU/TargetInfo/AMDGPUTargetInfo.cpp
index 92fb762ebd73..f61e2e413ad4 100644
--- a/lib/Target/AMDGPU/TargetInfo/AMDGPUTargetInfo.cpp
+++ b/lib/Target/AMDGPU/TargetInfo/AMDGPUTargetInfo.cpp
@@ -31,7 +31,7 @@ Target &llvm::getTheGCNTarget() {
/// \brief Extern function to initialize the targets for the AMDGPU backend
extern "C" void LLVMInitializeAMDGPUTargetInfo() {
RegisterTarget<Triple::r600, false> R600(getTheAMDGPUTarget(), "r600",
- "AMD GPUs HD2XXX-HD6XXX");
+ "AMD GPUs HD2XXX-HD6XXX", "AMDGPU");
RegisterTarget<Triple::amdgcn, false> GCN(getTheGCNTarget(), "amdgcn",
- "AMD GCN GPUs");
+ "AMD GCN GPUs", "AMDGPU");
}
diff --git a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index 67ad904ca972..819a7add0be4 100644
--- a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -23,6 +23,7 @@
#include "llvm/IR/Module.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCInstrDesc.h"
+#include "llvm/MC/MCInstrInfo.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCSectionELF.h"
#include "llvm/MC/MCSubtargetInfo.h"
@@ -39,7 +40,9 @@
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#define GET_INSTRINFO_NAMED_OPS
+#define GET_INSTRMAP_INFO
#include "AMDGPUGenInstrInfo.inc"
+#undef GET_INSTRMAP_INFO
#undef GET_INSTRINFO_NAMED_OPS
namespace {
@@ -100,15 +103,76 @@ static cl::opt<bool> EnablePackedInlinableLiterals(
namespace AMDGPU {
+LLVM_READNONE
+static inline Channels indexToChannel(unsigned Channel) {
+ switch (Channel) {
+ case 1:
+ return AMDGPU::Channels_1;
+ case 2:
+ return AMDGPU::Channels_2;
+ case 3:
+ return AMDGPU::Channels_3;
+ case 4:
+ return AMDGPU::Channels_4;
+ default:
+ llvm_unreachable("invalid MIMG channel");
+ }
+}
+
+
+// FIXME: Need to handle d16 images correctly.
+static unsigned rcToChannels(unsigned RCID) {
+ switch (RCID) {
+ case AMDGPU::VGPR_32RegClassID:
+ return 1;
+ case AMDGPU::VReg_64RegClassID:
+ return 2;
+ case AMDGPU::VReg_96RegClassID:
+ return 3;
+ case AMDGPU::VReg_128RegClassID:
+ return 4;
+ default:
+ llvm_unreachable("invalid MIMG register class");
+ }
+}
+
+int getMaskedMIMGOp(const MCInstrInfo &MII, unsigned Opc, unsigned NewChannels) {
+ AMDGPU::Channels Channel = AMDGPU::indexToChannel(NewChannels);
+ unsigned OrigChannels = rcToChannels(MII.get(Opc).OpInfo[0].RegClass);
+ if (NewChannels == OrigChannels)
+ return Opc;
+
+ switch (OrigChannels) {
+ case 1:
+ return AMDGPU::getMaskedMIMGOp1(Opc, Channel);
+ case 2:
+ return AMDGPU::getMaskedMIMGOp2(Opc, Channel);
+ case 3:
+ return AMDGPU::getMaskedMIMGOp3(Opc, Channel);
+ case 4:
+ return AMDGPU::getMaskedMIMGOp4(Opc, Channel);
+ default:
+ llvm_unreachable("invalid MIMG channel");
+ }
+}
+
+// Wrapper for a TableGen'd function. enum Subtarget is not defined in any
+// header file, so we need to wrap it in a function that takes unsigned
+// instead.
+int getMCOpcode(uint16_t Opcode, unsigned Gen) {
+ return getMCOpcodeGen(Opcode, static_cast<Subtarget>(Gen));
+}
+
namespace IsaInfo {
IsaVersion getIsaVersion(const FeatureBitset &Features) {
- // SI.
+ // GCN GFX6 (Southern Islands (SI)).
if (Features.test(FeatureISAVersion6_0_0))
return {6, 0, 0};
if (Features.test(FeatureISAVersion6_0_1))
return {6, 0, 1};
- // CI.
+
+ // GCN GFX7 (Sea Islands (CI)).
if (Features.test(FeatureISAVersion7_0_0))
return {7, 0, 0};
if (Features.test(FeatureISAVersion7_0_1))
@@ -117,8 +181,10 @@ IsaVersion getIsaVersion(const FeatureBitset &Features) {
return {7, 0, 2};
if (Features.test(FeatureISAVersion7_0_3))
return {7, 0, 3};
+ if (Features.test(FeatureISAVersion7_0_4))
+ return {7, 0, 4};
- // VI.
+ // GCN GFX8 (Volcanic Islands (VI)).
if (Features.test(FeatureISAVersion8_0_0))
return {8, 0, 0};
if (Features.test(FeatureISAVersion8_0_1))
@@ -127,26 +193,39 @@ IsaVersion getIsaVersion(const FeatureBitset &Features) {
return {8, 0, 2};
if (Features.test(FeatureISAVersion8_0_3))
return {8, 0, 3};
- if (Features.test(FeatureISAVersion8_0_4))
- return {8, 0, 4};
if (Features.test(FeatureISAVersion8_1_0))
return {8, 1, 0};
- // GFX9.
+ // GCN GFX9.
if (Features.test(FeatureISAVersion9_0_0))
return {9, 0, 0};
- if (Features.test(FeatureISAVersion9_0_1))
- return {9, 0, 1};
if (Features.test(FeatureISAVersion9_0_2))
return {9, 0, 2};
- if (Features.test(FeatureISAVersion9_0_3))
- return {9, 0, 3};
if (!Features.test(FeatureGCN) || Features.test(FeatureSouthernIslands))
return {0, 0, 0};
return {7, 0, 0};
}
+void streamIsaVersion(const MCSubtargetInfo *STI, raw_ostream &Stream) {
+ auto TargetTriple = STI->getTargetTriple();
+ auto ISAVersion = IsaInfo::getIsaVersion(STI->getFeatureBits());
+
+ Stream << TargetTriple.getArchName() << '-'
+ << TargetTriple.getVendorName() << '-'
+ << TargetTriple.getOSName() << '-'
+ << TargetTriple.getEnvironmentName() << '-'
+ << "gfx"
+ << ISAVersion.Major
+ << ISAVersion.Minor
+ << ISAVersion.Stepping;
+ Stream.flush();
+}
+
+bool hasCodeObjectV3(const FeatureBitset &Features) {
+ return Features.test(FeatureCodeObjectV3);
+}
+
unsigned getWavefrontSize(const FeatureBitset &Features) {
if (Features.test(FeatureWavefrontSize16))
return 16;
@@ -337,16 +416,16 @@ void initDefaultAMDKernelCodeT(amd_kernel_code_t &Header,
Header.private_segment_alignment = 4;
}
-bool isGroupSegment(const GlobalValue *GV, AMDGPUAS AS) {
- return GV->getType()->getAddressSpace() == AS.LOCAL_ADDRESS;
+bool isGroupSegment(const GlobalValue *GV) {
+ return GV->getType()->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS;
}
-bool isGlobalSegment(const GlobalValue *GV, AMDGPUAS AS) {
- return GV->getType()->getAddressSpace() == AS.GLOBAL_ADDRESS;
+bool isGlobalSegment(const GlobalValue *GV) {
+ return GV->getType()->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS;
}
-bool isReadOnlySegment(const GlobalValue *GV, AMDGPUAS AS) {
- return GV->getType()->getAddressSpace() == AS.CONSTANT_ADDRESS;
+bool isReadOnlySegment(const GlobalValue *GV) {
+ return GV->getType()->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS;
}
bool shouldEmitConstantsToTextSection(const Triple &TT) {
@@ -486,7 +565,9 @@ unsigned getInitialPSInputAddr(const Function &F) {
bool isShader(CallingConv::ID cc) {
switch(cc) {
case CallingConv::AMDGPU_VS:
+ case CallingConv::AMDGPU_LS:
case CallingConv::AMDGPU_HS:
+ case CallingConv::AMDGPU_ES:
case CallingConv::AMDGPU_GS:
case CallingConv::AMDGPU_PS:
case CallingConv::AMDGPU_CS:
@@ -508,7 +589,9 @@ bool isEntryFunctionCC(CallingConv::ID CC) {
case CallingConv::AMDGPU_GS:
case CallingConv::AMDGPU_PS:
case CallingConv::AMDGPU_CS:
+ case CallingConv::AMDGPU_ES:
case CallingConv::AMDGPU_HS:
+ case CallingConv::AMDGPU_LS:
return true;
default:
return false;
@@ -531,6 +614,10 @@ bool isGFX9(const MCSubtargetInfo &STI) {
return STI.getFeatureBits()[AMDGPU::FeatureGFX9];
}
+bool isGCN3Encoding(const MCSubtargetInfo &STI) {
+ return STI.getFeatureBits()[AMDGPU::FeatureGCN3Encoding];
+}
+
bool isSGPR(unsigned Reg, const MCRegisterInfo* TRI) {
const MCRegisterClass SGPRClass = TRI->getRegClass(AMDGPU::SReg_32RegClassID);
const unsigned FirstSubReg = TRI->getSubReg(Reg, 1);
@@ -545,44 +632,68 @@ bool isRegIntersect(unsigned Reg0, unsigned Reg1, const MCRegisterInfo* TRI) {
return false;
}
-unsigned getMCReg(unsigned Reg, const MCSubtargetInfo &STI) {
+#define MAP_REG2REG \
+ using namespace AMDGPU; \
+ switch(Reg) { \
+ default: return Reg; \
+ CASE_CI_VI(FLAT_SCR) \
+ CASE_CI_VI(FLAT_SCR_LO) \
+ CASE_CI_VI(FLAT_SCR_HI) \
+ CASE_VI_GFX9(TTMP0) \
+ CASE_VI_GFX9(TTMP1) \
+ CASE_VI_GFX9(TTMP2) \
+ CASE_VI_GFX9(TTMP3) \
+ CASE_VI_GFX9(TTMP4) \
+ CASE_VI_GFX9(TTMP5) \
+ CASE_VI_GFX9(TTMP6) \
+ CASE_VI_GFX9(TTMP7) \
+ CASE_VI_GFX9(TTMP8) \
+ CASE_VI_GFX9(TTMP9) \
+ CASE_VI_GFX9(TTMP10) \
+ CASE_VI_GFX9(TTMP11) \
+ CASE_VI_GFX9(TTMP12) \
+ CASE_VI_GFX9(TTMP13) \
+ CASE_VI_GFX9(TTMP14) \
+ CASE_VI_GFX9(TTMP15) \
+ CASE_VI_GFX9(TTMP0_TTMP1) \
+ CASE_VI_GFX9(TTMP2_TTMP3) \
+ CASE_VI_GFX9(TTMP4_TTMP5) \
+ CASE_VI_GFX9(TTMP6_TTMP7) \
+ CASE_VI_GFX9(TTMP8_TTMP9) \
+ CASE_VI_GFX9(TTMP10_TTMP11) \
+ CASE_VI_GFX9(TTMP12_TTMP13) \
+ CASE_VI_GFX9(TTMP14_TTMP15) \
+ CASE_VI_GFX9(TTMP0_TTMP1_TTMP2_TTMP3) \
+ CASE_VI_GFX9(TTMP4_TTMP5_TTMP6_TTMP7) \
+ CASE_VI_GFX9(TTMP8_TTMP9_TTMP10_TTMP11) \
+ CASE_VI_GFX9(TTMP12_TTMP13_TTMP14_TTMP15) \
+ }
- switch(Reg) {
- default: break;
- case AMDGPU::FLAT_SCR:
- assert(!isSI(STI));
- return isCI(STI) ? AMDGPU::FLAT_SCR_ci : AMDGPU::FLAT_SCR_vi;
+#define CASE_CI_VI(node) \
+ assert(!isSI(STI)); \
+ case node: return isCI(STI) ? node##_ci : node##_vi;
- case AMDGPU::FLAT_SCR_LO:
- assert(!isSI(STI));
- return isCI(STI) ? AMDGPU::FLAT_SCR_LO_ci : AMDGPU::FLAT_SCR_LO_vi;
+#define CASE_VI_GFX9(node) \
+ case node: return isGFX9(STI) ? node##_gfx9 : node##_vi;
- case AMDGPU::FLAT_SCR_HI:
- assert(!isSI(STI));
- return isCI(STI) ? AMDGPU::FLAT_SCR_HI_ci : AMDGPU::FLAT_SCR_HI_vi;
- }
- return Reg;
+unsigned getMCReg(unsigned Reg, const MCSubtargetInfo &STI) {
+ MAP_REG2REG
}
-unsigned mc2PseudoReg(unsigned Reg) {
- switch (Reg) {
- case AMDGPU::FLAT_SCR_ci:
- case AMDGPU::FLAT_SCR_vi:
- return FLAT_SCR;
+#undef CASE_CI_VI
+#undef CASE_VI_GFX9
- case AMDGPU::FLAT_SCR_LO_ci:
- case AMDGPU::FLAT_SCR_LO_vi:
- return AMDGPU::FLAT_SCR_LO;
+#define CASE_CI_VI(node) case node##_ci: case node##_vi: return node;
+#define CASE_VI_GFX9(node) case node##_vi: case node##_gfx9: return node;
- case AMDGPU::FLAT_SCR_HI_ci:
- case AMDGPU::FLAT_SCR_HI_vi:
- return AMDGPU::FLAT_SCR_HI;
-
- default:
- return Reg;
- }
+unsigned mc2PseudoReg(unsigned Reg) {
+ MAP_REG2REG
}
+#undef CASE_CI_VI
+#undef CASE_VI_GFX9
+#undef MAP_REG2REG
+
bool isSISrcOperand(const MCInstrDesc &Desc, unsigned OpNo) {
assert(OpNo < Desc.NumOperands);
unsigned OpType = Desc.OpInfo[OpNo].OperandType;
@@ -730,59 +841,66 @@ bool isInlinableLiteralV216(int32_t Literal, bool HasInv2Pi) {
return Lo16 == Hi16 && isInlinableLiteral16(Lo16, HasInv2Pi);
}
+bool isArgPassedInSGPR(const Argument *A) {
+ const Function *F = A->getParent();
+
+ // Arguments to compute shaders are never a source of divergence.
+ CallingConv::ID CC = F->getCallingConv();
+ switch (CC) {
+ case CallingConv::AMDGPU_KERNEL:
+ case CallingConv::SPIR_KERNEL:
+ return true;
+ case CallingConv::AMDGPU_VS:
+ case CallingConv::AMDGPU_LS:
+ case CallingConv::AMDGPU_HS:
+ case CallingConv::AMDGPU_ES:
+ case CallingConv::AMDGPU_GS:
+ case CallingConv::AMDGPU_PS:
+ case CallingConv::AMDGPU_CS:
+ // For non-compute shaders, SGPR inputs are marked with either inreg or byval.
+ // Everything else is in VGPRs.
+ return F->getAttributes().hasParamAttribute(A->getArgNo(), Attribute::InReg) ||
+ F->getAttributes().hasParamAttribute(A->getArgNo(), Attribute::ByVal);
+ default:
+ // TODO: Should calls support inreg for SGPR inputs?
+ return false;
+ }
+}
+
+// TODO: Should largely merge with AMDGPUTTIImpl::isSourceOfDivergence.
bool isUniformMMO(const MachineMemOperand *MMO) {
const Value *Ptr = MMO->getValue();
// UndefValue means this is a load of a kernel input. These are uniform.
// Sometimes LDS instructions have constant pointers.
// If Ptr is null, then that means this mem operand contains a
// PseudoSourceValue like GOT.
- if (!Ptr || isa<UndefValue>(Ptr) || isa<Argument>(Ptr) ||
+ if (!Ptr || isa<UndefValue>(Ptr) ||
isa<Constant>(Ptr) || isa<GlobalValue>(Ptr))
return true;
+ if (const Argument *Arg = dyn_cast<Argument>(Ptr))
+ return isArgPassedInSGPR(Arg);
+
const Instruction *I = dyn_cast<Instruction>(Ptr);
return I && I->getMetadata("amdgpu.uniform");
}
int64_t getSMRDEncodedOffset(const MCSubtargetInfo &ST, int64_t ByteOffset) {
- if (isSI(ST) || isCI(ST))
- return ByteOffset >> 2;
-
- return ByteOffset;
+ if (isGCN3Encoding(ST))
+ return ByteOffset;
+ return ByteOffset >> 2;
}
bool isLegalSMRDImmOffset(const MCSubtargetInfo &ST, int64_t ByteOffset) {
int64_t EncodedOffset = getSMRDEncodedOffset(ST, ByteOffset);
- return isSI(ST) || isCI(ST) ? isUInt<8>(EncodedOffset) :
- isUInt<20>(EncodedOffset);
+ return isGCN3Encoding(ST) ?
+ isUInt<20>(EncodedOffset) : isUInt<8>(EncodedOffset);
}
+
} // end namespace AMDGPU
} // end namespace llvm
-const unsigned AMDGPUAS::MAX_COMMON_ADDRESS;
-const unsigned AMDGPUAS::GLOBAL_ADDRESS;
-const unsigned AMDGPUAS::LOCAL_ADDRESS;
-const unsigned AMDGPUAS::PARAM_D_ADDRESS;
-const unsigned AMDGPUAS::PARAM_I_ADDRESS;
-const unsigned AMDGPUAS::CONSTANT_BUFFER_0;
-const unsigned AMDGPUAS::CONSTANT_BUFFER_1;
-const unsigned AMDGPUAS::CONSTANT_BUFFER_2;
-const unsigned AMDGPUAS::CONSTANT_BUFFER_3;
-const unsigned AMDGPUAS::CONSTANT_BUFFER_4;
-const unsigned AMDGPUAS::CONSTANT_BUFFER_5;
-const unsigned AMDGPUAS::CONSTANT_BUFFER_6;
-const unsigned AMDGPUAS::CONSTANT_BUFFER_7;
-const unsigned AMDGPUAS::CONSTANT_BUFFER_8;
-const unsigned AMDGPUAS::CONSTANT_BUFFER_9;
-const unsigned AMDGPUAS::CONSTANT_BUFFER_10;
-const unsigned AMDGPUAS::CONSTANT_BUFFER_11;
-const unsigned AMDGPUAS::CONSTANT_BUFFER_12;
-const unsigned AMDGPUAS::CONSTANT_BUFFER_13;
-const unsigned AMDGPUAS::CONSTANT_BUFFER_14;
-const unsigned AMDGPUAS::CONSTANT_BUFFER_15;
-const unsigned AMDGPUAS::UNKNOWN_ADDRESS_SPACE;
-
namespace llvm {
namespace AMDGPU {
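The SMRD offset helpers above now key on the GCN3 encoding instead of isSI/isCI. A standalone sketch, using stand-in names (encodeSMRDOffset, isLegalSMRDImm) rather than the real helpers, of the rule the diff encodes:

#include <cstdint>
#include <cstdio>

// Pre-GCN3 targets encode the offset in dwords (hence the >> 2) and accept
// an 8-bit immediate; GCN3-encoded targets keep the byte offset and accept
// a 20-bit immediate.
static int64_t encodeSMRDOffset(bool IsGCN3, int64_t ByteOffset) {
  return IsGCN3 ? ByteOffset : ByteOffset >> 2;
}

static bool isLegalSMRDImm(bool IsGCN3, int64_t ByteOffset) {
  int64_t Enc = encodeSMRDOffset(IsGCN3, ByteOffset);
  int64_t Limit = IsGCN3 ? (int64_t(1) << 20) : (int64_t(1) << 8);
  return Enc >= 0 && Enc < Limit;
}

int main() {
  // 1020 bytes is 255 dwords, so it fits the 8-bit field on SI/CI.
  printf("%d %d\n", isLegalSMRDImm(false, 1020), isLegalSMRDImm(true, 1020)); // 1 1
  // 4096 bytes is 1024 dwords: too large for SI/CI, fine for GCN3 encoding.
  printf("%d %d\n", isLegalSMRDImm(false, 4096), isLegalSMRDImm(true, 4096)); // 0 1
}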
diff --git a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index 936e4921a709..a215b445378e 100644
--- a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -19,10 +19,12 @@
#include "llvm/Support/Compiler.h"
#include "llvm/Support/ErrorHandling.h"
#include <cstdint>
+#include <string>
#include <utility>
namespace llvm {
+class Argument;
class FeatureBitset;
class Function;
class GlobalValue;
@@ -53,6 +55,13 @@ struct IsaVersion {
/// \returns Isa version for given subtarget \p Features.
IsaVersion getIsaVersion(const FeatureBitset &Features);
+/// \brief Streams isa version string for given subtarget \p STI into \p Stream.
+void streamIsaVersion(const MCSubtargetInfo *STI, raw_ostream &Stream);
+
+/// \returns True if given subtarget \p Features support code object version 3,
+/// false otherwise.
+bool hasCodeObjectV3(const FeatureBitset &Features);
+
/// \returns Wavefront size for given subtarget \p Features.
unsigned getWavefrontSize(const FeatureBitset &Features);
@@ -147,12 +156,18 @@ unsigned getMaxNumVGPRs(const FeatureBitset &Features, unsigned WavesPerEU);
LLVM_READONLY
int16_t getNamedOperandIdx(uint16_t Opcode, uint16_t NamedIdx);
+LLVM_READONLY
+int getMaskedMIMGOp(const MCInstrInfo &MII,
+ unsigned Opc, unsigned NewChannels);
+LLVM_READONLY
+int getMCOpcode(uint16_t Opcode, unsigned Gen);
+
void initDefaultAMDKernelCodeT(amd_kernel_code_t &Header,
const FeatureBitset &Features);
-bool isGroupSegment(const GlobalValue *GV, AMDGPUAS AS);
-bool isGlobalSegment(const GlobalValue *GV, AMDGPUAS AS);
-bool isReadOnlySegment(const GlobalValue *GV, AMDGPUAS AS);
+bool isGroupSegment(const GlobalValue *GV);
+bool isGlobalSegment(const GlobalValue *GV);
+bool isReadOnlySegment(const GlobalValue *GV);
/// \returns True if constants should be emitted to .text section for given
/// target triple \p TT, false otherwise.
@@ -347,6 +362,7 @@ bool isInlinableLiteral16(int16_t Literal, bool HasInv2Pi);
LLVM_READNONE
bool isInlinableLiteralV216(int32_t Literal, bool HasInv2Pi);
+bool isArgPassedInSGPR(const Argument *Arg);
bool isUniformMMO(const MachineMemOperand *MMO);
/// \returns The encoding that will be used for \p ByteOffset in the SMRD
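The header now declares isArgPassedInSGPR, whose definition appears in the AMDGPUBaseInfo.cpp hunk above: kernel arguments are always uniform, while graphics-shader arguments count as SGPR inputs only when marked inreg or byval. A simplified standalone sketch with hypothetical CallConv/Arg types in place of the LLVM ones:

#include <cstdio>

enum class CallConv { Kernel, Shader, Other };

struct Arg {
  CallConv CC;
  bool InReg;
  bool ByVal;
};

static bool argPassedInSGPR(const Arg &A) {
  switch (A.CC) {
  case CallConv::Kernel:
    return true;                // compute kernel inputs are uniform
  case CallConv::Shader:
    return A.InReg || A.ByVal;  // graphics inputs: only the annotated ones
  default:
    return false;               // ordinary calls: assume VGPR for now
  }
}

int main() {
  printf("%d\n", argPassedInSGPR({CallConv::Kernel, false, false})); // 1
  printf("%d\n", argPassedInSGPR({CallConv::Shader, true, false}));  // 1
  printf("%d\n", argPassedInSGPR({CallConv::Shader, false, false})); // 0
}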
diff --git a/lib/Target/AMDGPU/Utils/AMDKernelCodeTUtils.cpp b/lib/Target/AMDGPU/Utils/AMDKernelCodeTUtils.cpp
index 0333b0a14d29..20059f4a1ed7 100644
--- a/lib/Target/AMDGPU/Utils/AMDKernelCodeTUtils.cpp
+++ b/lib/Target/AMDGPU/Utils/AMDKernelCodeTUtils.cpp
@@ -1,4 +1,4 @@
-//===--------------------AMDKernelCodeTUtils.cpp --------------------------===//
+//===- AMDKernelCodeTUtils.cpp --------------------------------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -7,17 +7,21 @@
//
//===----------------------------------------------------------------------===//
//
-//===----------------------------------------------------------------------===//
-//
/// \file - utility functions to parse/print amd_kernel_code_t structure
//
//===----------------------------------------------------------------------===//
#include "AMDKernelCodeTUtils.h"
#include "SIDefines.h"
-#include <llvm/MC/MCParser/MCAsmLexer.h>
-#include <llvm/MC/MCParser/MCAsmParser.h>
-#include <llvm/Support/raw_ostream.h>
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/StringMap.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/MC/MCParser/MCAsmLexer.h"
+#include "llvm/MC/MCParser/MCAsmParser.h"
+#include "llvm/Support/raw_ostream.h"
+#include <cassert>
+#include <cstdint>
+#include <utility>
using namespace llvm;
@@ -62,7 +66,6 @@ static StringRef get_amd_kernel_code_t_FieldName(int index) {
return get_amd_kernel_code_t_FldNames()[index + 1];
}
-
// Field printing
static raw_ostream &printName(raw_ostream &OS, StringRef Name) {
@@ -82,9 +85,7 @@ static void printBitField(StringRef Name, const amd_kernel_code_t &c,
printName(OS, Name) << (int)((c.*ptr >> shift) & Mask);
}
-typedef void(*PrintFx)(StringRef,
- const amd_kernel_code_t &,
- raw_ostream &);
+using PrintFx = void(*)(StringRef, const amd_kernel_code_t &, raw_ostream &);
static ArrayRef<PrintFx> getPrinterTable() {
static const PrintFx Table[] = {
@@ -114,7 +115,6 @@ void llvm::dumpAmdKernelCode(const amd_kernel_code_t *C,
}
}
-
// Field parsing
static bool expectAbsExpression(MCAsmParser &MCParser, int64_t &Value, raw_ostream& Err) {
@@ -154,9 +154,8 @@ static bool parseBitField(amd_kernel_code_t &C, MCAsmParser &MCParser,
return true;
}
-typedef bool(*ParseFx)(amd_kernel_code_t &,
- MCAsmParser &MCParser,
- raw_ostream &Err);
+using ParseFx = bool(*)(amd_kernel_code_t &, MCAsmParser &MCParser,
+ raw_ostream &Err);
static ArrayRef<ParseFx> getParserTable() {
static const ParseFx Table[] = {
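The printer/parser tables above switch from typedef to using aliases for their function-pointer types. A small self-contained sketch of the same table-driven pattern, with a hypothetical Record type standing in for amd_kernel_code_t:

#include <cstdio>

struct Record { int A; int B; };

// A 'using' alias for the function-pointer type that drives a dispatch table.
using PrintFx = void (*)(const Record &, const char *Name);

static void printA(const Record &R, const char *Name) { printf("%s = %d\n", Name, R.A); }
static void printB(const Record &R, const char *Name) { printf("%s = %d\n", Name, R.B); }

int main() {
  // One entry per field, indexed the same way the amd_kernel_code_t tables are.
  static const PrintFx Table[] = {printA, printB};
  static const char *Names[] = {"field_a", "field_b"};
  Record R{1, 2};
  for (unsigned I = 0; I != 2; ++I)
    Table[I](R, Names[I]);
}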
diff --git a/lib/Target/AMDGPU/Utils/AMDKernelCodeTUtils.h b/lib/Target/AMDGPU/Utils/AMDKernelCodeTUtils.h
index d9edca7a82ac..ef9f9bdb6bcb 100644
--- a/lib/Target/AMDGPU/Utils/AMDKernelCodeTUtils.h
+++ b/lib/Target/AMDGPU/Utils/AMDKernelCodeTUtils.h
@@ -1,4 +1,4 @@
-//===- AMDGPUKernelCodeTUtils.h - helpers for amd_kernel_code_t *- C++ -*-===//
+//===- AMDGPUKernelCodeTUtils.h - helpers for amd_kernel_code_t -*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
@@ -6,34 +6,31 @@
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
+//
/// \file AMDKernelCodeTUtils.h
+//
//===----------------------------------------------------------------------===//
-#ifndef AMDKERNELCODETUTILS_H
-#define AMDKERNELCODETUTILS_H
+#ifndef LLVM_LIB_TARGET_AMDGPU_UTILS_AMDKERNELCODETUTILS_H
+#define LLVM_LIB_TARGET_AMDGPU_UTILS_AMDKERNELCODETUTILS_H
#include "AMDKernelCodeT.h"
namespace llvm {
-class MCAsmLexer;
class MCAsmParser;
class raw_ostream;
class StringRef;
-void printAmdKernelCodeField(const amd_kernel_code_t &C,
- int FldIndex,
- raw_ostream &OS);
+void printAmdKernelCodeField(const amd_kernel_code_t &C, int FldIndex,
+ raw_ostream &OS);
-void dumpAmdKernelCode(const amd_kernel_code_t *C,
- raw_ostream &OS,
- const char *tab);
+void dumpAmdKernelCode(const amd_kernel_code_t *C, raw_ostream &OS,
+ const char *tab);
-bool parseAmdKernelCodeField(StringRef ID,
- MCAsmParser &Parser,
- amd_kernel_code_t &C,
- raw_ostream &Err);
+bool parseAmdKernelCodeField(StringRef ID, MCAsmParser &Parser,
+ amd_kernel_code_t &C, raw_ostream &Err);
-}
+} // end namespace llvm
-#endif // AMDKERNELCODETUTILS_H
+#endif // LLVM_LIB_TARGET_AMDGPU_UTILS_AMDKERNELCODETUTILS_H
diff --git a/lib/Target/AMDGPU/VOP1Instructions.td b/lib/Target/AMDGPU/VOP1Instructions.td
index 96b33c373f05..ff2bd2454400 100644
--- a/lib/Target/AMDGPU/VOP1Instructions.td
+++ b/lib/Target/AMDGPU/VOP1Instructions.td
@@ -266,7 +266,8 @@ def VOP_MOVRELD : VOPProfile<[untyped, i32, untyped, untyped]> {
let Outs = (outs);
let Ins32 = (ins Src0RC32:$vdst, VSrc_b32:$src0);
let Ins64 = (ins Src0RC64:$vdst, VSrc_b32:$src0);
- let InsDPP = (ins Src0RC32:$vdst, Src0RC32:$src0, dpp_ctrl:$dpp_ctrl, row_mask:$row_mask,
+ let InsDPP = (ins DstRC:$vdst, DstRC:$old, Src0RC32:$src0,
+ dpp_ctrl:$dpp_ctrl, row_mask:$row_mask,
bank_mask:$bank_mask, bound_ctrl:$bound_ctrl);
let InsSDWA = (ins Src0RC32:$vdst, Src0ModSDWA:$src0_modifiers, Src0SDWA:$src0,
@@ -274,7 +275,7 @@ def VOP_MOVRELD : VOPProfile<[untyped, i32, untyped, untyped]> {
src0_sel:$src0_sel);
let Asm32 = getAsm32<1, 1>.ret;
- let Asm64 = getAsm64<1, 1, 0, 1>.ret;
+ let Asm64 = getAsm64<1, 1, 0, 0, 1>.ret;
let AsmDPP = getAsmDPP<1, 1, 0>.ret;
let AsmSDWA = getAsmSDWA<1, 1>.ret;
let AsmSDWA9 = getAsmSDWA9<1, 0, 1>.ret;
@@ -360,14 +361,14 @@ defm V_COS_F16 : VOP1Inst <"v_cos_f16", VOP_F16_F16, AMDGPUcos>;
}
-let Predicates = [Has16BitInsts] in {
+let OtherPredicates = [Has16BitInsts] in {
-def : Pat<
+def : GCNPat<
(f32 (f16_to_fp i16:$src)),
(V_CVT_F32_F16_e32 $src)
>;
-def : Pat<
+def : GCNPat<
(i16 (AMDGPUfp_to_f16 f32:$src)),
(V_CVT_F16_F32_e32 $src)
>;
@@ -504,8 +505,6 @@ class VOP1_DPP <bits<8> op, VOP1_Pseudo ps, VOPProfile P = ps.Pfl> :
let Uses = ps.Uses;
let SchedRW = ps.SchedRW;
let hasSideEffects = ps.hasSideEffects;
- let Constraints = ps.Constraints;
- let DisableEncoding = ps.DisableEncoding;
bits<8> vdst;
let Inst{8-0} = 0xfa; // dpp
@@ -654,36 +653,44 @@ def V_MOVRELD_B32_V4 : V_MOVRELD_B32_pseudo<VReg_128>;
def V_MOVRELD_B32_V8 : V_MOVRELD_B32_pseudo<VReg_256>;
def V_MOVRELD_B32_V16 : V_MOVRELD_B32_pseudo<VReg_512>;
-let Predicates = [isVI] in {
+let OtherPredicates = [isVI] in {
-def : Pat <
+def : GCNPat <
(i32 (int_amdgcn_mov_dpp i32:$src, imm:$dpp_ctrl, imm:$row_mask, imm:$bank_mask,
imm:$bound_ctrl)),
- (V_MOV_B32_dpp $src, (as_i32imm $dpp_ctrl), (as_i32imm $row_mask),
- (as_i32imm $bank_mask), (as_i1imm $bound_ctrl))
+ (V_MOV_B32_dpp $src, $src, (as_i32imm $dpp_ctrl),
+ (as_i32imm $row_mask), (as_i32imm $bank_mask),
+ (as_i1imm $bound_ctrl))
>;
+def : GCNPat <
+ (i32 (int_amdgcn_update_dpp i32:$old, i32:$src, imm:$dpp_ctrl, imm:$row_mask,
+ imm:$bank_mask, imm:$bound_ctrl)),
+ (V_MOV_B32_dpp $old, $src, (as_i32imm $dpp_ctrl),
+ (as_i32imm $row_mask), (as_i32imm $bank_mask),
+ (as_i1imm $bound_ctrl))
+>;
-def : Pat<
+def : GCNPat<
(i32 (anyext i16:$src)),
(COPY $src)
>;
-def : Pat<
+def : GCNPat<
(i64 (anyext i16:$src)),
(REG_SEQUENCE VReg_64,
(i32 (COPY $src)), sub0,
(V_MOV_B32_e32 (i32 0)), sub1)
>;
-def : Pat<
+def : GCNPat<
(i16 (trunc i32:$src)),
(COPY $src)
>;
-def : Pat <
+def : GCNPat <
(i16 (trunc i64:$src)),
(EXTRACT_SUBREG $src, sub0)
>;
-} // End Predicates = [isVI]
+} // End OtherPredicates = [isVI]
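The DPP patterns above thread an explicit $old operand through V_MOV_B32_dpp: int_amdgcn_mov_dpp reuses $src as its own old value, while the new int_amdgcn_update_dpp lets the caller supply one. Assuming the usual DPP behaviour that a lane whose selected source is unavailable keeps its previous destination value (an assumption, not something this hunk states), a toy C++ model of that fallback:

#include <cstdio>

// Toy model: apply a lane permutation 'sel' to 'src'; a lane with no valid
// source (sel < 0) keeps its previous value from 'oldVals'.
static void applyDpp(int dst[4], const int oldVals[4], const int src[4],
                     const int sel[4]) {
  for (int lane = 0; lane < 4; ++lane)
    dst[lane] = sel[lane] >= 0 ? src[sel[lane]] : oldVals[lane];
}

int main() {
  const int oldVals[4] = {9, 9, 9, 9};
  const int src[4]     = {0, 1, 2, 3};
  const int sel[4]     = {-1, 0, 1, 2};  // shift-like pattern; lane 0 has no source
  int dst[4];
  applyDpp(dst, oldVals, src, sel);
  for (int v : dst) printf("%d ", v);    // 9 0 1 2
  printf("\n");
}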
diff --git a/lib/Target/AMDGPU/VOP2Instructions.td b/lib/Target/AMDGPU/VOP2Instructions.td
index d5acb49b4f39..ef90b68db1a8 100644
--- a/lib/Target/AMDGPU/VOP2Instructions.td
+++ b/lib/Target/AMDGPU/VOP2Instructions.td
@@ -128,35 +128,42 @@ class getVOP2Pat64 <SDPatternOperator node, VOPProfile P> : LetDummies {
multiclass VOP2Inst <string opName,
VOPProfile P,
SDPatternOperator node = null_frag,
- string revOp = opName> {
+ string revOp = opName,
+ bit GFX9Renamed = 0> {
- def _e32 : VOP2_Pseudo <opName, P>,
- Commutable_REV<revOp#"_e32", !eq(revOp, opName)>;
+ let renamedInGFX9 = GFX9Renamed in {
- def _e64 : VOP3_Pseudo <opName, P, getVOP2Pat64<node, P>.ret>,
- Commutable_REV<revOp#"_e64", !eq(revOp, opName)>;
+ def _e32 : VOP2_Pseudo <opName, P>,
+ Commutable_REV<revOp#"_e32", !eq(revOp, opName)>;
- def _sdwa : VOP2_SDWA_Pseudo <opName, P>;
+ def _e64 : VOP3_Pseudo <opName, P, getVOP2Pat64<node, P>.ret>,
+ Commutable_REV<revOp#"_e64", !eq(revOp, opName)>;
+
+ def _sdwa : VOP2_SDWA_Pseudo <opName, P>;
+
+ }
}
multiclass VOP2bInst <string opName,
VOPProfile P,
SDPatternOperator node = null_frag,
string revOp = opName,
+ bit GFX9Renamed = 0,
bit useSGPRInput = !eq(P.NumSrcArgs, 3)> {
-
- let SchedRW = [Write32Bit, WriteSALU] in {
- let Uses = !if(useSGPRInput, [VCC, EXEC], [EXEC]), Defs = [VCC] in {
- def _e32 : VOP2_Pseudo <opName, P>,
- Commutable_REV<revOp#"_e32", !eq(revOp, opName)>;
-
- def _sdwa : VOP2_SDWA_Pseudo <opName, P> {
- let AsmMatchConverter = "cvtSdwaVOP2b";
+ let renamedInGFX9 = GFX9Renamed in {
+ let SchedRW = [Write32Bit, WriteSALU] in {
+ let Uses = !if(useSGPRInput, [VCC, EXEC], [EXEC]), Defs = [VCC] in {
+ def _e32 : VOP2_Pseudo <opName, P>,
+ Commutable_REV<revOp#"_e32", !eq(revOp, opName)>;
+
+ def _sdwa : VOP2_SDWA_Pseudo <opName, P> {
+ let AsmMatchConverter = "cvtSdwaVOP2b";
+ }
}
- }
- def _e64 : VOP3_Pseudo <opName, P, getVOP2Pat64<node, P>.ret>,
- Commutable_REV<revOp#"_e64", !eq(revOp, opName)>;
+ def _e64 : VOP3_Pseudo <opName, P, getVOP2Pat64<node, P>.ret>,
+ Commutable_REV<revOp#"_e64", !eq(revOp, opName)>;
+ }
}
}
@@ -208,10 +215,10 @@ def VOP_MADMK_F32 : VOP_MADMK <f32>;
class VOP_MAC <ValueType vt> : VOPProfile <[vt, vt, vt, vt]> {
let Ins32 = (ins Src0RC32:$src0, Src1RC32:$src1, VGPR_32:$src2);
let Ins64 = getIns64<Src0RC64, Src1RC64, RegisterOperand<VGPR_32>, 3,
- HasModifiers, HasOMod, Src0Mod, Src1Mod, Src2Mod>.ret;
- let InsDPP = (ins Src0ModDPP:$src0_modifiers, Src0DPP:$src0,
+ 0, HasModifiers, HasOMod, Src0Mod, Src1Mod, Src2Mod>.ret;
+ let InsDPP = (ins DstRCDPP:$old,
+ Src0ModDPP:$src0_modifiers, Src0DPP:$src0,
Src1ModDPP:$src1_modifiers, Src1DPP:$src1,
- VGPR_32:$src2, // stub argument
dpp_ctrl:$dpp_ctrl, row_mask:$row_mask,
bank_mask:$bank_mask, bound_ctrl:$bound_ctrl);
@@ -222,7 +229,7 @@ class VOP_MAC <ValueType vt> : VOPProfile <[vt, vt, vt, vt]> {
dst_sel:$dst_sel, dst_unused:$dst_unused,
src0_sel:$src0_sel, src1_sel:$src1_sel);
let Asm32 = getAsm32<1, 2, vt>.ret;
- let Asm64 = getAsm64<1, 2, HasModifiers, HasOMod, vt>.ret;
+ let Asm64 = getAsm64<1, 2, 0, HasModifiers, HasOMod, vt>.ret;
let AsmDPP = getAsmDPP<1, 2, HasModifiers, vt>.ret;
let AsmSDWA = getAsmSDWA<1, 2, vt>.ret;
let AsmSDWA9 = getAsmSDWA9<1, 1, 2, vt>.ret;
@@ -235,13 +242,13 @@ class VOP_MAC <ValueType vt> : VOPProfile <[vt, vt, vt, vt]> {
def VOP_MAC_F16 : VOP_MAC <f16> {
// FIXME: Move 'Asm64' definition to VOP_MAC, and use 'vt'. Currently it gives
// 'not a string initializer' error.
- let Asm64 = getAsm64<1, 2, HasModifiers, HasOMod, f16>.ret;
+ let Asm64 = getAsm64<1, 2, 0, HasModifiers, HasOMod, f16>.ret;
}
def VOP_MAC_F32 : VOP_MAC <f32> {
// FIXME: Move 'Asm64' definition to VOP_MAC, and use 'vt'. Currently it gives
// 'not a string initializer' error.
- let Asm64 = getAsm64<1, 2, HasModifiers, HasOMod, f32>.ret;
+ let Asm64 = getAsm64<1, 2, 0, HasModifiers, HasOMod, f32>.ret;
}
// Write out to vcc or arbitrary SGPR.
@@ -278,12 +285,13 @@ def VOP2b_I32_I1_I32_I32_I1 : VOPProfile<[i32, i32, i32, i1]> {
let InsSDWA = (ins Src0ModSDWA:$src0_modifiers, Src0SDWA:$src0,
Src1ModSDWA:$src1_modifiers, Src1SDWA:$src1,
- clampmod:$clamp, omod:$omod,
+ clampmod:$clamp,
dst_sel:$dst_sel, dst_unused:$dst_unused,
src0_sel:$src0_sel, src1_sel:$src1_sel);
- let InsDPP = (ins Src0Mod:$src0_modifiers, Src0DPP:$src0,
- Src1Mod:$src1_modifiers, Src1DPP:$src1,
+ let InsDPP = (ins DstRCDPP:$old,
+ Src0DPP:$src0,
+ Src1DPP:$src1,
dpp_ctrl:$dpp_ctrl, row_mask:$row_mask,
bank_mask:$bank_mask, bound_ctrl:$bound_ctrl);
let HasExt = 1;
@@ -369,12 +377,20 @@ def V_MADAK_F32 : VOP2_Pseudo <"v_madak_f32", VOP_MADAK_F32, [], "">;
// V_ADD_I32, V_SUB_I32, and V_SUBREV_I32 where renamed to *_U32 in VI,
// but the VI instructions behave the same as the SI versions.
-defm V_ADD_I32 : VOP2bInst <"v_add_i32", VOP2b_I32_I1_I32_I32>;
-defm V_SUB_I32 : VOP2bInst <"v_sub_i32", VOP2b_I32_I1_I32_I32>;
-defm V_SUBREV_I32 : VOP2bInst <"v_subrev_i32", VOP2b_I32_I1_I32_I32, null_frag, "v_sub_i32">;
-defm V_ADDC_U32 : VOP2bInst <"v_addc_u32", VOP2b_I32_I1_I32_I32_I1>;
-defm V_SUBB_U32 : VOP2bInst <"v_subb_u32", VOP2b_I32_I1_I32_I32_I1>;
-defm V_SUBBREV_U32 : VOP2bInst <"v_subbrev_u32", VOP2b_I32_I1_I32_I32_I1, null_frag, "v_subb_u32">;
+defm V_ADD_I32 : VOP2bInst <"v_add_i32", VOP2b_I32_I1_I32_I32, null_frag, "v_add_i32", 1>;
+defm V_SUB_I32 : VOP2bInst <"v_sub_i32", VOP2b_I32_I1_I32_I32, null_frag, "v_sub_i32", 1>;
+defm V_SUBREV_I32 : VOP2bInst <"v_subrev_i32", VOP2b_I32_I1_I32_I32, null_frag, "v_sub_i32", 1>;
+defm V_ADDC_U32 : VOP2bInst <"v_addc_u32", VOP2b_I32_I1_I32_I32_I1, null_frag, "v_addc_u32", 1>;
+defm V_SUBB_U32 : VOP2bInst <"v_subb_u32", VOP2b_I32_I1_I32_I32_I1, null_frag, "v_subb_u32", 1>;
+defm V_SUBBREV_U32 : VOP2bInst <"v_subbrev_u32", VOP2b_I32_I1_I32_I32_I1, null_frag, "v_subb_u32", 1>;
+
+
+let SubtargetPredicate = HasAddNoCarryInsts in {
+defm V_ADD_U32 : VOP2Inst <"v_add_u32", VOP_I32_I32_I32, null_frag, "v_add_u32", 1>;
+defm V_SUB_U32 : VOP2Inst <"v_sub_u32", VOP_I32_I32_I32, null_frag, "v_sub_u32", 1>;
+defm V_SUBREV_U32 : VOP2Inst <"v_subrev_u32", VOP_I32_I32_I32, null_frag, "v_sub_u32", 1>;
+}
+
} // End isCommutable = 1
// These are special and do not read the exec mask.
@@ -399,12 +415,12 @@ defm V_CVT_PK_I16_I32 : VOP2Inst <"v_cvt_pk_i16_i32", VOP_NO_EXT<VOP_I32_I32_I32
} // End SubtargetPredicate = isGCN
-def : Pat<
+def : GCNPat<
(AMDGPUadde i32:$src0, i32:$src1, i1:$src2),
(V_ADDC_U32_e64 $src0, $src1, $src2)
>;
-def : Pat<
+def : GCNPat<
(AMDGPUsube i32:$src0, i32:$src1, i1:$src2),
(V_SUBB_U32_e64 $src0, $src1, $src2)
>;
@@ -460,17 +476,17 @@ defm V_MAC_F16 : VOP2Inst <"v_mac_f16", VOP_MAC_F16>;
// Note: 16-bit instructions produce a 0 result in the high 16-bits.
multiclass Arithmetic_i16_Pats <SDPatternOperator op, Instruction inst> {
-def : Pat<
+def : GCNPat<
(op i16:$src0, i16:$src1),
(inst $src0, $src1)
>;
-def : Pat<
+def : GCNPat<
(i32 (zext (op i16:$src0, i16:$src1))),
(inst $src0, $src1)
>;
-def : Pat<
+def : GCNPat<
(i64 (zext (op i16:$src0, i16:$src1))),
(REG_SEQUENCE VReg_64,
(inst $src0, $src1), sub0,
@@ -481,18 +497,18 @@ def : Pat<
multiclass Bits_OpsRev_i16_Pats <SDPatternOperator op, Instruction inst> {
-def : Pat<
+def : GCNPat<
(op i16:$src0, i16:$src1),
(inst $src1, $src0)
>;
-def : Pat<
+def : GCNPat<
(i32 (zext (op i16:$src0, i16:$src1))),
(inst $src1, $src0)
>;
-def : Pat<
+def : GCNPat<
(i64 (zext (op i16:$src0, i16:$src1))),
(REG_SEQUENCE VReg_64,
(inst $src1, $src0), sub0,
@@ -500,7 +516,7 @@ def : Pat<
>;
}
-class ZExt_i16_i1_Pat <SDNode ext> : Pat <
+class ZExt_i16_i1_Pat <SDNode ext> : GCNPat <
(i16 (ext i1:$src)),
(V_CNDMASK_B32_e64 (i32 0), (i32 1), $src)
>;
@@ -515,17 +531,17 @@ defm : Arithmetic_i16_Pats<smax, V_MAX_I16_e64>;
defm : Arithmetic_i16_Pats<umin, V_MIN_U16_e64>;
defm : Arithmetic_i16_Pats<umax, V_MAX_U16_e64>;
-def : Pat <
+def : GCNPat <
(and i16:$src0, i16:$src1),
(V_AND_B32_e64 $src0, $src1)
>;
-def : Pat <
+def : GCNPat <
(or i16:$src0, i16:$src1),
(V_OR_B32_e64 $src0, $src1)
>;
-def : Pat <
+def : GCNPat <
(xor i16:$src0, i16:$src1),
(V_XOR_B32_e64 $src0, $src1)
>;
@@ -537,7 +553,7 @@ defm : Bits_OpsRev_i16_Pats<sra, V_ASHRREV_I16_e64>;
def : ZExt_i16_i1_Pat<zext>;
def : ZExt_i16_i1_Pat<anyext>;
-def : Pat <
+def : GCNPat <
(i16 (sext i1:$src)),
(V_CNDMASK_B32_e64 (i32 0), (i32 -1), $src)
>;
@@ -545,7 +561,7 @@ def : Pat <
// Undo sub x, c -> add x, -c canonicalization since c is more likely
// an inline immediate than -c.
// TODO: Also do for 64-bit.
-def : Pat<
+def : GCNPat<
(add i16:$src0, (i16 NegSubInlineConst16:$src1)),
(V_SUB_U16_e64 $src0, NegSubInlineConst16:$src1)
>;
@@ -651,14 +667,12 @@ defm V_CVT_PK_I16_I32 : VOP2_Real_e32e64_si <0x31>;
// VI
//===----------------------------------------------------------------------===//
-class VOP2_DPP <bits<6> op, VOP2_Pseudo ps, VOPProfile P = ps.Pfl> :
- VOP_DPP <ps.OpName, P> {
+class VOP2_DPP <bits<6> op, VOP2_Pseudo ps, string OpName = ps.OpName, VOPProfile P = ps.Pfl> :
+ VOP_DPP <OpName, P> {
let Defs = ps.Defs;
let Uses = ps.Uses;
let SchedRW = ps.SchedRW;
let hasSideEffects = ps.hasSideEffects;
- let Constraints = ps.Constraints;
- let DisableEncoding = ps.DisableEncoding;
bits<8> vdst;
bits<8> src1;
@@ -705,12 +719,6 @@ multiclass VOP2_Real_e64only_vi <bits<10> op> {
}
}
-multiclass Base_VOP2be_Real_e32e64_vi <bits<6> op> : VOP2_Real_e32_vi<op> {
- def _e64_vi :
- VOP3_Real<!cast<VOP3_Pseudo>(NAME#"_e64"), SIEncodingFamily.VI>,
- VOP3be_vi <{0, 1, 0, 0, op{5-0}}, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl>;
-}
-
multiclass Base_VOP2_Real_e32e64_vi <bits<6> op> :
VOP2_Real_e32_vi<op>,
VOP2_Real_e64_vi<{0, 1, 0, 0, op{5-0}}>;
@@ -729,13 +737,86 @@ multiclass VOP2_SDWA9_Real <bits<6> op> {
VOP2_SDWA9Ae <op{5-0}, !cast<VOP2_SDWA_Pseudo>(NAME#"_sdwa").Pfl>;
}
-multiclass VOP2be_Real_e32e64_vi <bits<6> op> :
- Base_VOP2be_Real_e32e64_vi<op>, VOP2_SDWA_Real<op>, VOP2_SDWA9_Real<op> {
- // For now left dpp only for asm/dasm
- // TODO: add corresponding pseudo
- def _dpp : VOP2_DPP<op, !cast<VOP2_Pseudo>(NAME#"_e32")>;
+let AssemblerPredicates = [isVIOnly] in {
+
+multiclass VOP2be_Real_e32e64_vi_only <bits<6> op, string OpName, string AsmName> {
+ def _e32_vi :
+ VOP2_Real<!cast<VOP2_Pseudo>(OpName#"_e32"), SIEncodingFamily.VI>,
+ VOP2e<op{5-0}, !cast<VOP2_Pseudo>(OpName#"_e32").Pfl> {
+ VOP2_Pseudo ps = !cast<VOP2_Pseudo>(OpName#"_e32");
+ let AsmString = AsmName # ps.AsmOperands;
+ let DecoderNamespace = "VI";
+ }
+ def _e64_vi :
+ VOP3_Real<!cast<VOP3_Pseudo>(OpName#"_e64"), SIEncodingFamily.VI>,
+ VOP3be_vi <{0, 1, 0, 0, op{5-0}}, !cast<VOP3_Pseudo>(OpName#"_e64").Pfl> {
+ VOP3_Pseudo ps = !cast<VOP3_Pseudo>(OpName#"_e64");
+ let AsmString = AsmName # ps.AsmOperands;
+ let DecoderNamespace = "VI";
+ }
+ def _sdwa_vi :
+ VOP_SDWA_Real <!cast<VOP2_SDWA_Pseudo>(OpName#"_sdwa")>,
+ VOP2_SDWAe <op{5-0}, !cast<VOP2_SDWA_Pseudo>(OpName#"_sdwa").Pfl> {
+ VOP2_SDWA_Pseudo ps = !cast<VOP2_SDWA_Pseudo>(OpName#"_sdwa");
+ let AsmString = AsmName # ps.AsmOperands;
+ }
+ def _dpp :
+ VOP2_DPP<op, !cast<VOP2_Pseudo>(OpName#"_e32"), AsmName>;
+}
+}
+
+let AssemblerPredicates = [isGFX9] in {
+
+multiclass VOP2be_Real_e32e64_gfx9 <bits<6> op, string OpName, string AsmName> {
+ def _e32_gfx9 :
+ VOP2_Real<!cast<VOP2_Pseudo>(OpName#"_e32"), SIEncodingFamily.GFX9>,
+ VOP2e<op{5-0}, !cast<VOP2_Pseudo>(OpName#"_e32").Pfl> {
+ VOP2_Pseudo ps = !cast<VOP2_Pseudo>(OpName#"_e32");
+ let AsmString = AsmName # ps.AsmOperands;
+ let DecoderNamespace = "GFX9";
+ }
+ def _e64_gfx9 :
+ VOP3_Real<!cast<VOP3_Pseudo>(OpName#"_e64"), SIEncodingFamily.GFX9>,
+ VOP3be_vi <{0, 1, 0, 0, op{5-0}}, !cast<VOP3_Pseudo>(OpName#"_e64").Pfl> {
+ VOP3_Pseudo ps = !cast<VOP3_Pseudo>(OpName#"_e64");
+ let AsmString = AsmName # ps.AsmOperands;
+ let DecoderNamespace = "GFX9";
+ }
+ def _sdwa_gfx9 :
+ VOP_SDWA9_Real <!cast<VOP2_SDWA_Pseudo>(OpName#"_sdwa")>,
+ VOP2_SDWA9Ae <op{5-0}, !cast<VOP2_SDWA_Pseudo>(OpName#"_sdwa").Pfl> {
+ VOP2_SDWA_Pseudo ps = !cast<VOP2_SDWA_Pseudo>(OpName#"_sdwa");
+ let AsmString = AsmName # ps.AsmOperands;
+ }
+ def _dpp_gfx9 :
+ VOP2_DPP<op, !cast<VOP2_Pseudo>(OpName#"_e32"), AsmName> {
+ let DecoderNamespace = "SDWA9";
+ }
}
+multiclass VOP2_Real_e32e64_gfx9 <bits<6> op> {
+ def _e32_gfx9 :
+ VOP2_Real<!cast<VOP2_Pseudo>(NAME#"_e32"), SIEncodingFamily.GFX9>,
+ VOP2e<op{5-0}, !cast<VOP2_Pseudo>(NAME#"_e32").Pfl>{
+ let DecoderNamespace = "GFX9";
+ }
+ def _e64_gfx9 :
+ VOP3_Real<!cast<VOP3_Pseudo>(NAME#"_e64"), SIEncodingFamily.GFX9>,
+ VOP3e_vi <{0, 1, 0, 0, op{5-0}}, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl> {
+ let DecoderNamespace = "GFX9";
+ }
+ def _sdwa_gfx9 :
+ VOP_SDWA9_Real <!cast<VOP2_SDWA_Pseudo>(NAME#"_sdwa")>,
+ VOP2_SDWA9Ae <op{5-0}, !cast<VOP2_SDWA_Pseudo>(NAME#"_sdwa").Pfl> {
+ }
+ def _dpp_gfx9 :
+ VOP2_DPP<op, !cast<VOP2_Pseudo>(NAME#"_e32")> {
+ let DecoderNamespace = "SDWA9";
+ }
+}
+
+} // AssemblerPredicates = [isGFX9]
+
multiclass VOP2_Real_e32e64_vi <bits<6> op> :
Base_VOP2_Real_e32e64_vi<op>, VOP2_SDWA_Real<op>, VOP2_SDWA9_Real<op> {
// For now, DPP is left only for asm/disasm
@@ -768,12 +849,24 @@ defm V_XOR_B32 : VOP2_Real_e32e64_vi <0x15>;
defm V_MAC_F32 : VOP2_Real_e32e64_vi <0x16>;
defm V_MADMK_F32 : VOP2_Real_MADK_vi <0x17>;
defm V_MADAK_F32 : VOP2_Real_MADK_vi <0x18>;
-defm V_ADD_I32 : VOP2be_Real_e32e64_vi <0x19>;
-defm V_SUB_I32 : VOP2be_Real_e32e64_vi <0x1a>;
-defm V_SUBREV_I32 : VOP2be_Real_e32e64_vi <0x1b>;
-defm V_ADDC_U32 : VOP2be_Real_e32e64_vi <0x1c>;
-defm V_SUBB_U32 : VOP2be_Real_e32e64_vi <0x1d>;
-defm V_SUBBREV_U32 : VOP2be_Real_e32e64_vi <0x1e>;
+
+defm V_ADD_U32 : VOP2be_Real_e32e64_vi_only <0x19, "V_ADD_I32", "v_add_u32">;
+defm V_SUB_U32 : VOP2be_Real_e32e64_vi_only <0x1a, "V_SUB_I32", "v_sub_u32">;
+defm V_SUBREV_U32 : VOP2be_Real_e32e64_vi_only <0x1b, "V_SUBREV_I32", "v_subrev_u32">;
+defm V_ADDC_U32 : VOP2be_Real_e32e64_vi_only <0x1c, "V_ADDC_U32", "v_addc_u32">;
+defm V_SUBB_U32 : VOP2be_Real_e32e64_vi_only <0x1d, "V_SUBB_U32", "v_subb_u32">;
+defm V_SUBBREV_U32 : VOP2be_Real_e32e64_vi_only <0x1e, "V_SUBBREV_U32", "v_subbrev_u32">;
+
+defm V_ADD_CO_U32 : VOP2be_Real_e32e64_gfx9 <0x19, "V_ADD_I32", "v_add_co_u32">;
+defm V_SUB_CO_U32 : VOP2be_Real_e32e64_gfx9 <0x1a, "V_SUB_I32", "v_sub_co_u32">;
+defm V_SUBREV_CO_U32 : VOP2be_Real_e32e64_gfx9 <0x1b, "V_SUBREV_I32", "v_subrev_co_u32">;
+defm V_ADDC_CO_U32 : VOP2be_Real_e32e64_gfx9 <0x1c, "V_ADDC_U32", "v_addc_co_u32">;
+defm V_SUBB_CO_U32 : VOP2be_Real_e32e64_gfx9 <0x1d, "V_SUBB_U32", "v_subb_co_u32">;
+defm V_SUBBREV_CO_U32 : VOP2be_Real_e32e64_gfx9 <0x1e, "V_SUBBREV_U32", "v_subbrev_co_u32">;
+
+defm V_ADD_U32 : VOP2_Real_e32e64_gfx9 <0x34>;
+defm V_SUB_U32 : VOP2_Real_e32e64_gfx9 <0x35>;
+defm V_SUBREV_U32 : VOP2_Real_e32e64_gfx9 <0x36>;
defm V_READLANE_B32 : VOP32_Real_vi <0x289>;
defm V_WRITELANE_B32 : VOP32_Real_vi <0x28a>;
diff --git a/lib/Target/AMDGPU/VOP3Instructions.td b/lib/Target/AMDGPU/VOP3Instructions.td
index 92ed0706dc01..aedbfa015bf6 100644
--- a/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/lib/Target/AMDGPU/VOP3Instructions.td
@@ -53,6 +53,46 @@ class getVOP3PModPat<VOPProfile P, SDPatternOperator node> {
ret1));
}
+class getVOP3OpSelPat<VOPProfile P, SDPatternOperator node> {
+ list<dag> ret3 = [(set P.DstVT:$vdst,
+ (node (P.Src0VT !if(P.HasClamp, (VOP3OpSel0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp),
+ (VOP3OpSel P.Src0VT:$src0, i32:$src0_modifiers))),
+ (P.Src1VT (VOP3OpSel P.Src1VT:$src1, i32:$src1_modifiers)),
+ (P.Src2VT (VOP3OpSel P.Src2VT:$src2, i32:$src2_modifiers))))];
+
+ list<dag> ret2 = [(set P.DstVT:$vdst,
+ (node !if(P.HasClamp, (P.Src0VT (VOP3OpSel0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp)),
+ (P.Src0VT (VOP3OpSel P.Src0VT:$src0, i32:$src0_modifiers))),
+ (P.Src1VT (VOP3OpSel P.Src1VT:$src1, i32:$src1_modifiers))))];
+
+ list<dag> ret1 = [(set P.DstVT:$vdst,
+ (node (P.Src0VT (VOP3OpSel0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp))))];
+
+ list<dag> ret = !if(!eq(P.NumSrcArgs, 3), ret3,
+ !if(!eq(P.NumSrcArgs, 2), ret2,
+ ret1));
+}
+
+class getVOP3OpSelModPat<VOPProfile P, SDPatternOperator node> {
+ list<dag> ret3 = [(set P.DstVT:$vdst,
+ (node (P.Src0VT !if(P.HasClamp, (VOP3OpSelMods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp),
+ (VOP3OpSelMods P.Src0VT:$src0, i32:$src0_modifiers))),
+ (P.Src1VT (VOP3OpSelMods P.Src1VT:$src1, i32:$src1_modifiers)),
+ (P.Src2VT (VOP3OpSelMods P.Src2VT:$src2, i32:$src2_modifiers))))];
+
+ list<dag> ret2 = [(set P.DstVT:$vdst,
+ (node !if(P.HasClamp, (P.Src0VT (VOP3OpSelMods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp)),
+ (P.Src0VT (VOP3OpSelMods P.Src0VT:$src0, i32:$src0_modifiers))),
+ (P.Src1VT (VOP3OpSelMods P.Src1VT:$src1, i32:$src1_modifiers))))];
+
+ list<dag> ret1 = [(set P.DstVT:$vdst,
+ (node (P.Src0VT (VOP3OpSelMods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp))))];
+
+ list<dag> ret = !if(!eq(P.NumSrcArgs, 3), ret3,
+ !if(!eq(P.NumSrcArgs, 2), ret2,
+ ret1));
+}
+
class getVOP3Pat<VOPProfile P, SDPatternOperator node> {
list<dag> ret3 = [(set P.DstVT:$vdst, (node P.Src0VT:$src0, P.Src1VT:$src1, P.Src2VT:$src2))];
list<dag> ret2 = [(set P.DstVT:$vdst, (node P.Src0VT:$src0, P.Src1VT:$src1))];
@@ -62,10 +102,36 @@ class getVOP3Pat<VOPProfile P, SDPatternOperator node> {
ret1));
}
+class getVOP3ClampPat<VOPProfile P, SDPatternOperator node> {
+ list<dag> ret3 = [(set P.DstVT:$vdst, (node P.Src0VT:$src0, P.Src1VT:$src1, P.Src2VT:$src2, i1:$clamp))];
+ list<dag> ret2 = [(set P.DstVT:$vdst, (node P.Src0VT:$src0, P.Src1VT:$src1, i1:$clamp))];
+ list<dag> ret1 = [(set P.DstVT:$vdst, (node P.Src0VT:$src0, i1:$clamp))];
+ list<dag> ret = !if(!eq(P.NumSrcArgs, 3), ret3,
+ !if(!eq(P.NumSrcArgs, 2), ret2,
+ ret1));
+}
+
class VOP3Inst<string OpName, VOPProfile P, SDPatternOperator node = null_frag, bit VOP3Only = 0> :
VOP3_Pseudo<OpName, P,
- !if(P.HasModifiers, getVOP3ModPat<P, node>.ret, getVOP3Pat<P, node>.ret),
- VOP3Only>;
+ !if(P.HasOpSel,
+ !if(P.HasModifiers,
+ getVOP3OpSelModPat<P, node>.ret,
+ getVOP3OpSelPat<P, node>.ret),
+ !if(P.HasModifiers,
+ getVOP3ModPat<P, node>.ret,
+ !if(P.HasIntClamp,
+ getVOP3ClampPat<P, node>.ret,
+ getVOP3Pat<P, node>.ret))),
+ VOP3Only, 0, P.HasOpSel> {
+
+ let IntClamp = P.HasIntClamp;
+ let AsmMatchConverter =
+ !if(P.HasOpSel,
+ "cvtVOP3OpSel",
+ !if(!or(P.HasModifiers, !or(P.HasOMod, P.HasIntClamp)),
+ "cvtVOP3",
+ ""));
+}
// Special case for v_div_fmas_{f32|f64}, since it seems to be the
// only VOP instruction that implicitly reads VCC.
@@ -87,10 +153,33 @@ class getVOP3VCC<VOPProfile P, SDPatternOperator node> {
(i1 VCC)))];
}
-class VOP3_Profile<VOPProfile P> : VOPProfile<P.ArgVT> {
+class VOP3Features<bit Clamp, bit OpSel> {
+ bit HasClamp = Clamp;
+ bit HasOpSel = OpSel;
+}
+
+def VOP3_REGULAR : VOP3Features<0, 0>;
+def VOP3_CLAMP : VOP3Features<1, 0>;
+def VOP3_OPSEL : VOP3Features<1, 1>;
+
+class VOP3_Profile<VOPProfile P, VOP3Features Features = VOP3_REGULAR> : VOPProfile<P.ArgVT> {
+
+ let HasClamp = !if(Features.HasClamp, 1, P.HasClamp);
+ let HasOpSel = !if(Features.HasOpSel, 1, P.HasOpSel);
+
// FIXME: Hack to stop printing _e64
let Outs64 = (outs DstRC.RegClass:$vdst);
- let Asm64 = " " # P.Asm64;
+ let Asm64 =
+ " " # !if(Features.HasOpSel,
+ getAsmVOP3OpSel<NumSrcArgs,
+ HasIntClamp,
+ HasSrc0FloatMods,
+ HasSrc1FloatMods,
+ HasSrc2FloatMods>.ret,
+ !if(Features.HasClamp,
+ getAsm64<HasDst, NumSrcArgs, HasIntClamp,
+ HasModifiers, HasOMod, DstVT>.ret,
+ P.Asm64));
}
class VOP3b_Profile<ValueType vt> : VOPProfile<[vt, vt, vt, vt]> {
@@ -112,11 +201,75 @@ def VOP3b_F64_I1_F64_F64_F64 : VOP3b_Profile<f64> {
}
def VOP3b_I64_I1_I32_I32_I64 : VOPProfile<[i64, i32, i32, i64]> {
+ let HasClamp = 1;
+
// FIXME: Hack to stop printing _e64
let DstRC = RegisterOperand<VReg_64>;
let Outs64 = (outs DstRC:$vdst, SReg_64:$sdst);
- let Asm64 = " $vdst, $sdst, $src0, $src1, $src2";
+ let Asm64 = " $vdst, $sdst, $src0, $src1, $src2$clamp";
+}
+
+//===----------------------------------------------------------------------===//
+// VOP3 INTERP
+//===----------------------------------------------------------------------===//
+
+class VOP3Interp<string OpName, VOPProfile P> : VOP3_Pseudo<OpName, P> {
+ let AsmMatchConverter = "cvtVOP3Interp";
+}
+
+def VOP3_INTERP : VOPProfile<[f32, f32, i32, untyped]> {
+ let Ins64 = (ins Src0Mod:$src0_modifiers, VRegSrc_32:$src0,
+ Attr:$attr, AttrChan:$attrchan,
+ clampmod:$clamp, omod:$omod);
+
+ let Asm64 = "$vdst, $src0_modifiers, $attr$attrchan$clamp$omod";
+}
+
+def VOP3_INTERP_MOV : VOPProfile<[f32, i32, i32, untyped]> {
+ let Ins64 = (ins InterpSlot:$src0,
+ Attr:$attr, AttrChan:$attrchan,
+ clampmod:$clamp, omod:$omod);
+
+ let Asm64 = "$vdst, $src0, $attr$attrchan$clamp$omod";
+
+ let HasClamp = 1;
+}
+
+class getInterp16Asm <bit HasSrc2, bit HasOMod> {
+ string src2 = !if(HasSrc2, ", $src2_modifiers", "");
+ string omod = !if(HasOMod, "$omod", "");
+ string ret =
+ " $vdst, $src0_modifiers, $attr$attrchan"#src2#"$high$clamp"#omod;
+}
+
+class getInterp16Ins <bit HasSrc2, bit HasOMod,
+ Operand Src0Mod, Operand Src2Mod> {
+ dag ret = !if(HasSrc2,
+ !if(HasOMod,
+ (ins Src0Mod:$src0_modifiers, VRegSrc_32:$src0,
+ Attr:$attr, AttrChan:$attrchan,
+ Src2Mod:$src2_modifiers, VRegSrc_32:$src2,
+ highmod:$high, clampmod:$clamp, omod:$omod),
+ (ins Src0Mod:$src0_modifiers, VRegSrc_32:$src0,
+ Attr:$attr, AttrChan:$attrchan,
+ Src2Mod:$src2_modifiers, VRegSrc_32:$src2,
+ highmod:$high, clampmod:$clamp)
+ ),
+ (ins Src0Mod:$src0_modifiers, VRegSrc_32:$src0,
+ Attr:$attr, AttrChan:$attrchan,
+ highmod:$high, clampmod:$clamp, omod:$omod)
+ );
+}
+
+class VOP3_INTERP16 <list<ValueType> ArgVT> : VOPProfile<ArgVT> {
+
+ let HasOMod = !if(!eq(DstVT.Value, f16.Value), 0, 1);
+ let HasHigh = 1;
+
+ let Outs64 = (outs VGPR_32:$vdst);
+ let Ins64 = getInterp16Ins<HasSrc2, HasOMod, Src0Mod, Src2Mod>.ret;
+ let Asm64 = getInterp16Asm<HasSrc2, HasOMod>.ret;
}
//===----------------------------------------------------------------------===//
@@ -127,8 +280,8 @@ let isCommutable = 1 in {
def V_MAD_LEGACY_F32 : VOP3Inst <"v_mad_legacy_f32", VOP3_Profile<VOP_F32_F32_F32_F32>>;
def V_MAD_F32 : VOP3Inst <"v_mad_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, fmad>;
-def V_MAD_I32_I24 : VOP3Inst <"v_mad_i32_i24", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDGPUmad_i24>;
-def V_MAD_U32_U24 : VOP3Inst <"v_mad_u32_u24", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDGPUmad_u24>;
+def V_MAD_I32_I24 : VOP3Inst <"v_mad_i32_i24", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>>;
+def V_MAD_U32_U24 : VOP3Inst <"v_mad_u32_u24", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>>;
def V_FMA_F32 : VOP3Inst <"v_fma_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, fma>;
def V_FMA_F64 : VOP3Inst <"v_fma_f64", VOP3_Profile<VOP_F64_F64_F64_F64>, fma>;
def V_LERP_U8 : VOP3Inst <"v_lerp_u8", VOP3_Profile<VOP_I32_I32_I32_I32>, int_amdgcn_lerp>;
@@ -188,10 +341,10 @@ def V_MAX3_U32 : VOP3Inst <"v_max3_u32", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDG
def V_MED3_F32 : VOP3Inst <"v_med3_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, AMDGPUfmed3>;
def V_MED3_I32 : VOP3Inst <"v_med3_i32", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDGPUsmed3>;
def V_MED3_U32 : VOP3Inst <"v_med3_u32", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDGPUumed3>;
-def V_SAD_U8 : VOP3Inst <"v_sad_u8", VOP3_Profile<VOP_I32_I32_I32_I32>, int_amdgcn_sad_u8>;
-def V_SAD_HI_U8 : VOP3Inst <"v_sad_hi_u8", VOP3_Profile<VOP_I32_I32_I32_I32>, int_amdgcn_sad_hi_u8>;
-def V_SAD_U16 : VOP3Inst <"v_sad_u16", VOP3_Profile<VOP_I32_I32_I32_I32>, int_amdgcn_sad_u16>;
-def V_SAD_U32 : VOP3Inst <"v_sad_u32", VOP3_Profile<VOP_I32_I32_I32_I32>>;
+def V_SAD_U8 : VOP3Inst <"v_sad_u8", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>>;
+def V_SAD_HI_U8 : VOP3Inst <"v_sad_hi_u8", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>>;
+def V_SAD_U16 : VOP3Inst <"v_sad_u16", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>>;
+def V_SAD_U32 : VOP3Inst <"v_sad_u32", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>>;
def V_CVT_PK_U8_F32 : VOP3Inst<"v_cvt_pk_u8_f32", VOP3_Profile<VOP_I32_F32_I32_I32>, int_amdgcn_cvt_pk_u8_f32>;
def V_DIV_FIXUP_F32 : VOP3Inst <"v_div_fixup_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, AMDGPUdiv_fixup>;
@@ -213,10 +366,10 @@ def V_DIV_SCALE_F64 : VOP3_Pseudo <"v_div_scale_f64", VOP3b_F64_I1_F64_F64_F64,
let AsmMatchConverter = "";
}
-def V_MSAD_U8 : VOP3Inst <"v_msad_u8", VOP3_Profile<VOP_I32_I32_I32_I32>, int_amdgcn_msad_u8>;
+def V_MSAD_U8 : VOP3Inst <"v_msad_u8", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>>;
let Constraints = "@earlyclobber $vdst" in {
-def V_MQSAD_PK_U16_U8 : VOP3Inst <"v_mqsad_pk_u16_u8", VOP3_Profile<VOP_I64_I64_I32_I64>, int_amdgcn_mqsad_pk_u16_u8>;
+def V_MQSAD_PK_U16_U8 : VOP3Inst <"v_mqsad_pk_u16_u8", VOP3_Profile<VOP_I64_I64_I32_I64, VOP3_CLAMP>>;
} // End Constraints = "@earlyclobber $vdst"
def V_TRIG_PREOP_F64 : VOP3Inst <"v_trig_preop_f64", VOP3_Profile<VOP_F64_F64_I32>, AMDGPUtrig_preop> {
@@ -241,13 +394,15 @@ def V_ASHRREV_I64 : VOP3Inst <"v_ashrrev_i64", VOP3_Profile<VOP_I64_I32_I64>>;
let SubtargetPredicate = isCIVI in {
let Constraints = "@earlyclobber $vdst" in {
-def V_QSAD_PK_U16_U8 : VOP3Inst <"v_qsad_pk_u16_u8", VOP3_Profile<VOP_I64_I64_I32_I64>, int_amdgcn_qsad_pk_u16_u8>;
-def V_MQSAD_U32_U8 : VOP3Inst <"v_mqsad_u32_u8", VOP3_Profile<VOP_V4I32_I64_I32_V4I32>, int_amdgcn_mqsad_u32_u8>;
+def V_QSAD_PK_U16_U8 : VOP3Inst <"v_qsad_pk_u16_u8", VOP3_Profile<VOP_I64_I64_I32_I64, VOP3_CLAMP>>;
+def V_MQSAD_U32_U8 : VOP3Inst <"v_mqsad_u32_u8", VOP3_Profile<VOP_V4I32_I64_I32_V4I32, VOP3_CLAMP>>;
} // End Constraints = "@earlyclobber $vdst"
let isCommutable = 1 in {
+let SchedRW = [WriteDouble, WriteSALU] in {
def V_MAD_U64_U32 : VOP3Inst <"v_mad_u64_u32", VOP3b_I64_I1_I32_I32_I64>;
def V_MAD_I64_I32 : VOP3Inst <"v_mad_i64_i32", VOP3b_I64_I1_I32_I32_I64>;
+} // End SchedRW = [WriteDouble, WriteSALU]
} // End isCommutable = 1
} // End SubtargetPredicate = isCIVI
@@ -255,23 +410,42 @@ def V_MAD_I64_I32 : VOP3Inst <"v_mad_i64_i32", VOP3b_I64_I1_I32_I32_I64>;
let SubtargetPredicate = Has16BitInsts in {
-def V_DIV_FIXUP_F16 : VOP3Inst <"v_div_fixup_f16", VOP3_Profile<VOP_F16_F16_F16_F16>, AMDGPUdiv_fixup>;
+let renamedInGFX9 = 1 in {
+def V_DIV_FIXUP_F16 : VOP3Inst <"v_div_fixup_f16", VOP3_Profile<VOP_F16_F16_F16_F16>, AMDGPUdiv_fixup>;
+}
+let SubtargetPredicate = isGFX9 in {
+def V_DIV_FIXUP_F16_gfx9 : VOP3Inst <"v_div_fixup_f16_gfx9", VOP3_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL>>;
+}
let isCommutable = 1 in {
-def V_FMA_F16 : VOP3Inst <"v_fma_f16", VOP3_Profile<VOP_F16_F16_F16_F16>, fma>;
-def V_INTERP_P1LL_F16 : VOP3Inst <"v_interp_p1ll_f16", VOP3_Profile<VOP_F32_F32_F16>>;
-def V_INTERP_P1LV_F16 : VOP3Inst <"v_interp_p1lv_f16", VOP3_Profile<VOP_F32_F32_F16_F16>>;
-def V_INTERP_P2_F16 : VOP3Inst <"v_interp_p2_f16", VOP3_Profile<VOP_F16_F32_F16_F32>>;
-def V_MAD_F16 : VOP3Inst <"v_mad_f16", VOP3_Profile<VOP_F16_F16_F16_F16>, fmad>;
+let renamedInGFX9 = 1 in {
+def V_MAD_F16 : VOP3Inst <"v_mad_f16", VOP3_Profile<VOP_F16_F16_F16_F16>, fmad>;
+def V_MAD_U16 : VOP3Inst <"v_mad_u16", VOP3_Profile<VOP_I16_I16_I16_I16, VOP3_CLAMP>>;
+def V_MAD_I16 : VOP3Inst <"v_mad_i16", VOP3_Profile<VOP_I16_I16_I16_I16, VOP3_CLAMP>>;
+def V_FMA_F16 : VOP3Inst <"v_fma_f16", VOP3_Profile<VOP_F16_F16_F16_F16>, fma>;
+def V_INTERP_P2_F16 : VOP3Interp <"v_interp_p2_f16", VOP3_INTERP16<[f16, f32, i32, f32]>>;
+}
+
+let SubtargetPredicate = isGFX9 in {
+def V_MAD_F16_gfx9 : VOP3Inst <"v_mad_f16_gfx9", VOP3_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL>>;
+def V_MAD_U16_gfx9 : VOP3Inst <"v_mad_u16_gfx9", VOP3_Profile<VOP_I16_I16_I16_I16, VOP3_OPSEL>>;
+def V_MAD_I16_gfx9 : VOP3Inst <"v_mad_i16_gfx9", VOP3_Profile<VOP_I16_I16_I16_I16, VOP3_OPSEL>>;
+def V_FMA_F16_gfx9 : VOP3Inst <"v_fma_f16_gfx9", VOP3_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL>>;
+def V_INTERP_P2_F16_gfx9 : VOP3Interp <"v_interp_p2_f16_gfx9", VOP3_INTERP16<[f16, f32, i32, f32]>>;
+} // End SubtargetPredicate = isGFX9
-def V_MAD_U16 : VOP3Inst <"v_mad_u16", VOP3_Profile<VOP_I16_I16_I16_I16>>;
-def V_MAD_I16 : VOP3Inst <"v_mad_i16", VOP3_Profile<VOP_I16_I16_I16_I16>>;
+def V_INTERP_P1LL_F16 : VOP3Interp <"v_interp_p1ll_f16", VOP3_INTERP16<[f32, f32, i32, untyped]>>;
+def V_INTERP_P1LV_F16 : VOP3Interp <"v_interp_p1lv_f16", VOP3_INTERP16<[f32, f32, i32, f16]>>;
} // End isCommutable = 1
} // End SubtargetPredicate = Has16BitInsts
let SubtargetPredicate = isVI in {
+def V_INTERP_P1_F32_e64 : VOP3Interp <"v_interp_p1_f32", VOP3_INTERP>;
+def V_INTERP_P2_F32_e64 : VOP3Interp <"v_interp_p2_f32", VOP3_INTERP>;
+def V_INTERP_MOV_F32_e64 : VOP3Interp <"v_interp_mov_f32", VOP3_INTERP_MOV>;
+
def V_PERM_B32 : VOP3Inst <"v_perm_b32", VOP3_Profile<VOP_I32_I32_I32_I32>>;
} // End SubtargetPredicate = isVI
@@ -279,20 +453,20 @@ let Predicates = [Has16BitInsts] in {
multiclass Ternary_i16_Pats <SDPatternOperator op1, SDPatternOperator op2,
Instruction inst, SDPatternOperator op3> {
-def : Pat<
+def : GCNPat <
(op2 (op1 i16:$src0, i16:$src1), i16:$src2),
- (inst i16:$src0, i16:$src1, i16:$src2)
+ (inst i16:$src0, i16:$src1, i16:$src2, (i1 0))
>;
-def : Pat<
+def : GCNPat<
(i32 (op3 (op2 (op1 i16:$src0, i16:$src1), i16:$src2))),
- (inst i16:$src0, i16:$src1, i16:$src2)
+ (inst i16:$src0, i16:$src1, i16:$src2, (i1 0))
>;
-def : Pat<
+def : GCNPat<
(i64 (op3 (op2 (op1 i16:$src0, i16:$src1), i16:$src2))),
(REG_SEQUENCE VReg_64,
- (inst i16:$src0, i16:$src1, i16:$src2), sub0,
+ (inst i16:$src0, i16:$src1, i16:$src2, (i1 0)), sub0,
(V_MOV_B32_e32 (i32 0)), sub1)
>;
}
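// Illustrative sketch: given the defm uses shown in the hunk context (e.g.
// Ternary_i16_Pats<mul, add, V_MAD_I16, sext>), the first pattern above is
// roughly equivalent to the following hand-written form; the trailing (i1 0)
// is the integer clamp operand these MAD pseudos gained from the VOP3_CLAMP
// profile, left cleared by the patterns.
def : GCNPat <
  (add (mul i16:$src0, i16:$src1), i16:$src2),
  (V_MAD_I16 i16:$src0, i16:$src1, i16:$src2, (i1 0))
>;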
@@ -303,7 +477,7 @@ defm: Ternary_i16_Pats<mul, add, V_MAD_I16, sext>;
} // End Predicates = [Has16BitInsts]
let SubtargetPredicate = isGFX9 in {
-def V_PACK_B32_F16 : VOP3Inst <"v_pack_b32_f16", VOP3_Profile<VOP_B32_F16_F16>>;
+def V_PACK_B32_F16 : VOP3Inst <"v_pack_b32_f16", VOP3_Profile<VOP_B32_F16_F16, VOP3_OPSEL>>;
def V_LSHL_ADD_U32 : VOP3Inst <"v_lshl_add_u32", VOP3_Profile<VOP_I32_I32_I32_I32>>;
def V_ADD_LSHL_U32 : VOP3Inst <"v_add_lshl_u32", VOP3_Profile<VOP_I32_I32_I32_I32>>;
def V_ADD3_U32 : VOP3Inst <"v_add3_u32", VOP3_Profile<VOP_I32_I32_I32_I32>>;
@@ -313,19 +487,70 @@ def V_OR3_B32 : VOP3Inst <"v_or3_b32", VOP3_Profile<VOP_I32_I32_I32_I32>>;
def V_XAD_U32 : VOP3Inst <"v_xad_u32", VOP3_Profile<VOP_I32_I32_I32_I32>>;
-def V_MED3_F16 : VOP3Inst <"v_med3_f16", VOP3_Profile<VOP_F16_F16_F16_F16>, AMDGPUfmed3>;
-def V_MED3_I16 : VOP3Inst <"v_med3_i16", VOP3_Profile<VOP_I16_I16_I16_I16>, AMDGPUsmed3>;
-def V_MED3_U16 : VOP3Inst <"v_med3_u16", VOP3_Profile<VOP_I16_I16_I16_I16>, AMDGPUumed3>;
+def V_MED3_F16 : VOP3Inst <"v_med3_f16", VOP3_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL>, AMDGPUfmed3>;
+def V_MED3_I16 : VOP3Inst <"v_med3_i16", VOP3_Profile<VOP_I16_I16_I16_I16, VOP3_OPSEL>, AMDGPUsmed3>;
+def V_MED3_U16 : VOP3Inst <"v_med3_u16", VOP3_Profile<VOP_I16_I16_I16_I16, VOP3_OPSEL>, AMDGPUumed3>;
+
+def V_MIN3_F16 : VOP3Inst <"v_min3_f16", VOP3_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL>, AMDGPUfmin3>;
+def V_MIN3_I16 : VOP3Inst <"v_min3_i16", VOP3_Profile<VOP_I16_I16_I16_I16, VOP3_OPSEL>, AMDGPUsmin3>;
+def V_MIN3_U16 : VOP3Inst <"v_min3_u16", VOP3_Profile<VOP_I16_I16_I16_I16, VOP3_OPSEL>, AMDGPUumin3>;
+
+def V_MAX3_F16 : VOP3Inst <"v_max3_f16", VOP3_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL>, AMDGPUfmax3>;
+def V_MAX3_I16 : VOP3Inst <"v_max3_i16", VOP3_Profile<VOP_I16_I16_I16_I16, VOP3_OPSEL>, AMDGPUsmax3>;
+def V_MAX3_U16 : VOP3Inst <"v_max3_u16", VOP3_Profile<VOP_I16_I16_I16_I16, VOP3_OPSEL>, AMDGPUumax3>;
+
+def V_ADD_I16 : VOP3Inst <"v_add_i16", VOP3_Profile<VOP_I16_I16_I16, VOP3_OPSEL>>;
+def V_SUB_I16 : VOP3Inst <"v_sub_i16", VOP3_Profile<VOP_I16_I16_I16, VOP3_OPSEL>>;
-def V_MIN3_F16 : VOP3Inst <"v_min3_f16", VOP3_Profile<VOP_F16_F16_F16_F16>, AMDGPUfmin3>;
-def V_MIN3_I16 : VOP3Inst <"v_min3_i16", VOP3_Profile<VOP_I16_I16_I16_I16>, AMDGPUsmin3>;
-def V_MIN3_U16 : VOP3Inst <"v_min3_u16", VOP3_Profile<VOP_I16_I16_I16_I16>, AMDGPUumin3>;
+def V_MAD_U32_U16 : VOP3Inst <"v_mad_u32_u16", VOP3_Profile<VOP_I32_I16_I16_I32, VOP3_OPSEL>>;
+def V_MAD_I32_I16 : VOP3Inst <"v_mad_i32_i16", VOP3_Profile<VOP_I32_I16_I16_I32, VOP3_OPSEL>>;
-def V_MAX3_F16 : VOP3Inst <"v_max3_f16", VOP3_Profile<VOP_F16_F16_F16_F16>, AMDGPUfmax3>;
-def V_MAX3_I16 : VOP3Inst <"v_max3_i16", VOP3_Profile<VOP_I16_I16_I16_I16>, AMDGPUsmax3>;
-def V_MAX3_U16 : VOP3Inst <"v_max3_u16", VOP3_Profile<VOP_I16_I16_I16_I16>, AMDGPUumax3>;
+def V_CVT_PKNORM_I16_F16 : VOP3Inst <"v_cvt_pknorm_i16_f16", VOP3_Profile<VOP_B32_F16_F16, VOP3_OPSEL>>;
+def V_CVT_PKNORM_U16_F16 : VOP3Inst <"v_cvt_pknorm_u16_f16", VOP3_Profile<VOP_B32_F16_F16, VOP3_OPSEL>>;
+
+def V_ADD_I32_gfx9 : VOP3Inst <"v_add_i32_gfx9", VOP3_Profile<VOP_I32_I32_I32>>;
+def V_SUB_I32_gfx9 : VOP3Inst <"v_sub_i32_gfx9", VOP3_Profile<VOP_I32_I32_I32>>;
} // End SubtargetPredicate = isGFX9
+//===----------------------------------------------------------------------===//
+// Integer Clamp Patterns
+//===----------------------------------------------------------------------===//
+
+class getClampPat<VOPProfile P, SDPatternOperator node> {
+ dag ret3 = (P.DstVT (node P.Src0VT:$src0, P.Src1VT:$src1, P.Src2VT:$src2));
+ dag ret2 = (P.DstVT (node P.Src0VT:$src0, P.Src1VT:$src1));
+ dag ret1 = (P.DstVT (node P.Src0VT:$src0));
+ dag ret = !if(!eq(P.NumSrcArgs, 3), ret3,
+ !if(!eq(P.NumSrcArgs, 2), ret2,
+ ret1));
+}
+
+class getClampRes<VOPProfile P, Instruction inst> {
+ dag ret3 = (inst P.Src0VT:$src0, P.Src1VT:$src1, P.Src2VT:$src2, (i1 0));
+ dag ret2 = (inst P.Src0VT:$src0, P.Src1VT:$src1, (i1 0));
+ dag ret1 = (inst P.Src0VT:$src0, (i1 0));
+ dag ret = !if(!eq(P.NumSrcArgs, 3), ret3,
+ !if(!eq(P.NumSrcArgs, 2), ret2,
+ ret1));
+}
+
+class IntClampPat<VOP3Inst inst, SDPatternOperator node> : GCNPat<
+ getClampPat<inst.Pfl, node>.ret,
+ getClampRes<inst.Pfl, inst>.ret
+>;
+
+def : IntClampPat<V_MAD_I32_I24, AMDGPUmad_i24>;
+def : IntClampPat<V_MAD_U32_U24, AMDGPUmad_u24>;
+
+def : IntClampPat<V_SAD_U8, int_amdgcn_sad_u8>;
+def : IntClampPat<V_SAD_HI_U8, int_amdgcn_sad_hi_u8>;
+def : IntClampPat<V_SAD_U16, int_amdgcn_sad_u16>;
+
+def : IntClampPat<V_MSAD_U8, int_amdgcn_msad_u8>;
+def : IntClampPat<V_MQSAD_PK_U16_U8, int_amdgcn_mqsad_pk_u16_u8>;
+
+def : IntClampPat<V_QSAD_PK_U16_U8, int_amdgcn_qsad_pk_u16_u8>;
+def : IntClampPat<V_MQSAD_U32_U8, int_amdgcn_mqsad_u32_u8>;
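// Illustrative sketch of one instantiation: for a three-source profile,
// IntClampPat<V_MAD_I32_I24, AMDGPUmad_i24> resolves through getClampPat and
// getClampRes above to roughly:
def : GCNPat <
  (i32 (AMDGPUmad_i24 i32:$src0, i32:$src1, i32:$src2)),
  (V_MAD_I32_I24 i32:$src0, i32:$src1, i32:$src2, (i1 0))
>;
// i.e. the plain node is matched and the instruction's integer clamp operand
// is left at 0 by default.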
//===----------------------------------------------------------------------===//
// Target
@@ -443,8 +668,68 @@ multiclass VOP3be_Real_vi<bits<10> op> {
VOP3be_vi <op, !cast<VOP3_Pseudo>(NAME).Pfl>;
}
+multiclass VOP3OpSel_Real_gfx9<bits<10> op> {
+ def _vi : VOP3_Real<!cast<VOP3_Pseudo>(NAME), SIEncodingFamily.VI>,
+ VOP3OpSel_gfx9 <op, !cast<VOP3_Pseudo>(NAME).Pfl>;
+}
+
+multiclass VOP3Interp_Real_vi<bits<10> op> {
+ def _vi : VOP3_Real<!cast<VOP3_Pseudo>(NAME), SIEncodingFamily.VI>,
+ VOP3Interp_vi <op, !cast<VOP3_Pseudo>(NAME).Pfl>;
+}
+
} // End AssemblerPredicates = [isVI], DecoderNamespace = "VI"
+let AssemblerPredicates = [isVIOnly], DecoderNamespace = "VI" in {
+
+multiclass VOP3_F16_Real_vi<bits<10> op> {
+ def _vi : VOP3_Real<!cast<VOP3_Pseudo>(NAME), SIEncodingFamily.VI>,
+ VOP3e_vi <op, !cast<VOP3_Pseudo>(NAME).Pfl>;
+}
+
+multiclass VOP3Interp_F16_Real_vi<bits<10> op> {
+ def _vi : VOP3_Real<!cast<VOP3_Pseudo>(NAME), SIEncodingFamily.VI>,
+ VOP3Interp_vi <op, !cast<VOP3_Pseudo>(NAME).Pfl>;
+}
+
+} // End AssemblerPredicates = [isVIOnly], DecoderNamespace = "VI"
+
+let AssemblerPredicates = [isGFX9], DecoderNamespace = "GFX9" in {
+
+multiclass VOP3_F16_Real_gfx9<bits<10> op, string OpName, string AsmName> {
+ def _vi : VOP3_Real<!cast<VOP3_Pseudo>(OpName), SIEncodingFamily.GFX9>,
+ VOP3e_vi <op, !cast<VOP3_Pseudo>(OpName).Pfl> {
+ VOP3_Pseudo ps = !cast<VOP3_Pseudo>(OpName);
+ let AsmString = AsmName # ps.AsmOperands;
+ }
+}
+
+multiclass VOP3OpSel_F16_Real_gfx9<bits<10> op, string AsmName> {
+ def _vi : VOP3_Real<!cast<VOP3_Pseudo>(NAME), SIEncodingFamily.GFX9>,
+ VOP3OpSel_gfx9 <op, !cast<VOP3_Pseudo>(NAME).Pfl> {
+ VOP3_Pseudo ps = !cast<VOP3_Pseudo>(NAME);
+ let AsmString = AsmName # ps.AsmOperands;
+ }
+}
+
+multiclass VOP3Interp_F16_Real_gfx9<bits<10> op, string OpName, string AsmName> {
+ def _vi : VOP3_Real<!cast<VOP3_Pseudo>(OpName), SIEncodingFamily.GFX9>,
+ VOP3Interp_vi <op, !cast<VOP3_Pseudo>(OpName).Pfl> {
+ VOP3_Pseudo ps = !cast<VOP3_Pseudo>(OpName);
+ let AsmString = AsmName # ps.AsmOperands;
+ }
+}
+
+multiclass VOP3_Real_gfx9<bits<10> op, string AsmName> {
+ def _vi : VOP3_Real<!cast<VOP3_Pseudo>(NAME), SIEncodingFamily.GFX9>,
+ VOP3e_vi <op, !cast<VOP3_Pseudo>(NAME).Pfl> {
+ VOP3_Pseudo ps = !cast<VOP3_Pseudo>(NAME);
+ let AsmString = AsmName # ps.AsmOperands;
+ }
+}
+
+} // End AssemblerPredicates = [isGFX9], DecoderNamespace = "GFX9"
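// Note on the GFX9 Real multiclasses above: each one overrides AsmString (and
// the _F16_ variants also take the pseudo's name explicitly), which is what
// lets a pre-GFX9 pseudo be re-emitted under a different GFX9 mnemonic. For
// example, below V_MAD_F16 is exposed as "v_mad_legacy_f16" while the
// opsel-capable V_MAD_F16_gfx9 pseudo takes the plain "v_mad_f16" spelling.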
+
defm V_MAD_U64_U32 : VOP3be_Real_vi <0x1E8>;
defm V_MAD_I64_I32 : VOP3be_Real_vi <0x1E9>;
@@ -489,18 +774,38 @@ defm V_QSAD_PK_U16_U8 : VOP3_Real_vi <0x1e5>;
defm V_MQSAD_PK_U16_U8 : VOP3_Real_vi <0x1e6>;
defm V_MQSAD_U32_U8 : VOP3_Real_vi <0x1e7>;
-defm V_MAD_F16 : VOP3_Real_vi <0x1ea>;
-defm V_MAD_U16 : VOP3_Real_vi <0x1eb>;
-defm V_MAD_I16 : VOP3_Real_vi <0x1ec>;
-
defm V_PERM_B32 : VOP3_Real_vi <0x1ed>;
-defm V_FMA_F16 : VOP3_Real_vi <0x1ee>;
-defm V_DIV_FIXUP_F16 : VOP3_Real_vi <0x1ef>;
-
-defm V_INTERP_P1LL_F16 : VOP3_Real_vi <0x274>;
-defm V_INTERP_P1LV_F16 : VOP3_Real_vi <0x275>;
-defm V_INTERP_P2_F16 : VOP3_Real_vi <0x276>;
+defm V_MAD_F16 : VOP3_F16_Real_vi <0x1ea>;
+defm V_MAD_U16 : VOP3_F16_Real_vi <0x1eb>;
+defm V_MAD_I16 : VOP3_F16_Real_vi <0x1ec>;
+defm V_FMA_F16 : VOP3_F16_Real_vi <0x1ee>;
+defm V_DIV_FIXUP_F16 : VOP3_F16_Real_vi <0x1ef>;
+defm V_INTERP_P2_F16 : VOP3Interp_F16_Real_vi <0x276>;
+
+defm V_MAD_LEGACY_F16 : VOP3_F16_Real_gfx9 <0x1ea, "V_MAD_F16", "v_mad_legacy_f16">;
+defm V_MAD_LEGACY_U16 : VOP3_F16_Real_gfx9 <0x1eb, "V_MAD_U16", "v_mad_legacy_u16">;
+defm V_MAD_LEGACY_I16 : VOP3_F16_Real_gfx9 <0x1ec, "V_MAD_I16", "v_mad_legacy_i16">;
+defm V_FMA_LEGACY_F16 : VOP3_F16_Real_gfx9 <0x1ee, "V_FMA_F16", "v_fma_legacy_f16">;
+defm V_DIV_FIXUP_LEGACY_F16 : VOP3_F16_Real_gfx9 <0x1ef, "V_DIV_FIXUP_F16", "v_div_fixup_legacy_f16">;
+defm V_INTERP_P2_LEGACY_F16 : VOP3Interp_F16_Real_gfx9 <0x276, "V_INTERP_P2_F16", "v_interp_p2_legacy_f16">;
+
+defm V_MAD_F16_gfx9 : VOP3OpSel_F16_Real_gfx9 <0x203, "v_mad_f16">;
+defm V_MAD_U16_gfx9 : VOP3OpSel_F16_Real_gfx9 <0x204, "v_mad_u16">;
+defm V_MAD_I16_gfx9 : VOP3OpSel_F16_Real_gfx9 <0x205, "v_mad_i16">;
+defm V_FMA_F16_gfx9 : VOP3OpSel_F16_Real_gfx9 <0x206, "v_fma_f16">;
+defm V_DIV_FIXUP_F16_gfx9 : VOP3OpSel_F16_Real_gfx9 <0x207, "v_div_fixup_f16">;
+defm V_INTERP_P2_F16_gfx9 : VOP3Interp_F16_Real_gfx9 <0x277, "V_INTERP_P2_F16_gfx9", "v_interp_p2_f16">;
+
+defm V_ADD_I32_gfx9 : VOP3_Real_gfx9 <0x29c, "v_add_i32">;
+defm V_SUB_I32_gfx9 : VOP3_Real_gfx9 <0x29d, "v_sub_i32">;
+
+defm V_INTERP_P1_F32_e64 : VOP3Interp_Real_vi <0x270>;
+defm V_INTERP_P2_F32_e64 : VOP3Interp_Real_vi <0x271>;
+defm V_INTERP_MOV_F32_e64 : VOP3Interp_Real_vi <0x272>;
+
+defm V_INTERP_P1LL_F16 : VOP3Interp_Real_vi <0x274>;
+defm V_INTERP_P1LV_F16 : VOP3Interp_Real_vi <0x275>;
defm V_ADD_F64 : VOP3_Real_vi <0x280>;
defm V_MUL_F64 : VOP3_Real_vi <0x281>;
defm V_MIN_F64 : VOP3_Real_vi <0x282>;
@@ -527,18 +832,27 @@ defm V_ADD3_U32 : VOP3_Real_vi <0x1ff>;
defm V_LSHL_OR_B32 : VOP3_Real_vi <0x200>;
defm V_AND_OR_B32 : VOP3_Real_vi <0x201>;
defm V_OR3_B32 : VOP3_Real_vi <0x202>;
-defm V_PACK_B32_F16 : VOP3_Real_vi <0x2a0>;
+defm V_PACK_B32_F16 : VOP3OpSel_Real_gfx9 <0x2a0>;
defm V_XAD_U32 : VOP3_Real_vi <0x1f3>;
-defm V_MIN3_F16 : VOP3_Real_vi <0x1f4>;
-defm V_MIN3_I16 : VOP3_Real_vi <0x1f5>;
-defm V_MIN3_U16 : VOP3_Real_vi <0x1f6>;
+defm V_MIN3_F16 : VOP3OpSel_Real_gfx9 <0x1f4>;
+defm V_MIN3_I16 : VOP3OpSel_Real_gfx9 <0x1f5>;
+defm V_MIN3_U16 : VOP3OpSel_Real_gfx9 <0x1f6>;
+
+defm V_MAX3_F16 : VOP3OpSel_Real_gfx9 <0x1f7>;
+defm V_MAX3_I16 : VOP3OpSel_Real_gfx9 <0x1f8>;
+defm V_MAX3_U16 : VOP3OpSel_Real_gfx9 <0x1f9>;
+
+defm V_MED3_F16 : VOP3OpSel_Real_gfx9 <0x1fa>;
+defm V_MED3_I16 : VOP3OpSel_Real_gfx9 <0x1fb>;
+defm V_MED3_U16 : VOP3OpSel_Real_gfx9 <0x1fc>;
+
+defm V_ADD_I16 : VOP3OpSel_Real_gfx9 <0x29e>;
+defm V_SUB_I16 : VOP3OpSel_Real_gfx9 <0x29f>;
-defm V_MAX3_F16 : VOP3_Real_vi <0x1f7>;
-defm V_MAX3_I16 : VOP3_Real_vi <0x1f8>;
-defm V_MAX3_U16 : VOP3_Real_vi <0x1f9>;
+defm V_MAD_U32_U16 : VOP3OpSel_Real_gfx9 <0x1f1>;
+defm V_MAD_I32_I16 : VOP3OpSel_Real_gfx9 <0x1f2>;
-defm V_MED3_F16 : VOP3_Real_vi <0x1fa>;
-defm V_MED3_I16 : VOP3_Real_vi <0x1fb>;
-defm V_MED3_U16 : VOP3_Real_vi <0x1fc>;
+defm V_CVT_PKNORM_I16_F16 : VOP3OpSel_Real_gfx9 <0x299>;
+defm V_CVT_PKNORM_U16_F16 : VOP3OpSel_Real_gfx9 <0x29a>;
diff --git a/lib/Target/AMDGPU/VOP3PInstructions.td b/lib/Target/AMDGPU/VOP3PInstructions.td
index 3becf758aaa3..eeee8b36c175 100644
--- a/lib/Target/AMDGPU/VOP3PInstructions.td
+++ b/lib/Target/AMDGPU/VOP3PInstructions.td
@@ -18,16 +18,25 @@ class VOP3PInst<string OpName, VOPProfile P, SDPatternOperator node = null_frag>
// Non-packed instructions that use the VOP3P encoding.
// VOP3 neg/abs and VOP3P opsel/opsel_hi modifiers are allowed.
-class VOP3_VOP3PInst<string OpName, VOPProfile P, SDPatternOperator node = null_frag> :
+class VOP3_VOP3PInst<string OpName, VOPProfile P, bit UseTiedOutput = 0,
+ SDPatternOperator node = null_frag> :
VOP3P_Pseudo<OpName, P> {
+ // These operands are only sort of f16 operands. Depending on
+ // op_sel_hi, these may be interpreted as f32. The inline immediate
+ // values are really f16 converted to f32, so we treat these as f16
+ // operands.
let InOperandList =
- (ins
- FP32InputMods:$src0_modifiers, VCSrc_f32:$src0,
- FP32InputMods:$src1_modifiers, VCSrc_f32:$src1,
- FP32InputMods:$src2_modifiers, VCSrc_f32:$src2,
- clampmod:$clamp,
- op_sel:$op_sel,
- op_sel_hi:$op_sel_hi);
+ !con(
+ !con(
+ (ins FP16InputMods:$src0_modifiers, VCSrc_f16:$src0,
+ FP16InputMods:$src1_modifiers, VCSrc_f16:$src1,
+ FP16InputMods:$src2_modifiers, VCSrc_f16:$src2,
+ clampmod:$clamp),
+ !if(UseTiedOutput, (ins VGPR_32:$vdst_in), (ins))),
+ (ins op_sel:$op_sel, op_sel_hi:$op_sel_hi));
+
+ let Constraints = !if(UseTiedOutput, "$vdst = $vdst_in", "");
+ let DisableEncoding = !if(UseTiedOutput, "$vdst_in", "");
let AsmOperands =
" $vdst, $src0_modifiers, $src1_modifiers, $src2_modifiers$op_sel$op_sel_hi$clamp";
}
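// Sketch of what the !con chain above produces when UseTiedOutput = 1:
//   (ins FP16InputMods:$src0_modifiers, VCSrc_f16:$src0,
//        FP16InputMods:$src1_modifiers, VCSrc_f16:$src1,
//        FP16InputMods:$src2_modifiers, VCSrc_f16:$src2,
//        clampmod:$clamp, VGPR_32:$vdst_in,
//        op_sel:$op_sel, op_sel_hi:$op_sel_hi)
// together with Constraints = "$vdst = $vdst_in". The tie lets an instruction
// that writes only one 16-bit half of the 32-bit destination VGPR (the mixlo/
// mixhi pseudos defined further down) preserve the other half.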
@@ -59,14 +68,80 @@ def V_PK_LSHLREV_B16 : VOP3PInst<"v_pk_lshlrev_b16", VOP3_Profile<VOP_V2I16_V2I1
def V_PK_ASHRREV_I16 : VOP3PInst<"v_pk_ashrrev_i16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, ashr_rev>;
def V_PK_LSHRREV_B16 : VOP3PInst<"v_pk_lshrrev_b16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, lshr_rev>;
-// XXX - Commutable?
+
+let SubtargetPredicate = HasMadMixInsts in {
// These are VOP3a-like opcodes which accept no omod.
// Size of src arguments (16/32) is controlled by op_sel.
// For 16-bit src arguments, their location (hi/lo) is controlled by op_sel_hi.
-def V_MAD_MIX_F32 : VOP3_VOP3PInst<"v_mad_mix_f32", VOP3_Profile<VOP_F32_V2F16_V2F16_V2F16>>;
-def V_MAD_MIXLO_F16 : VOP3_VOP3PInst<"v_mad_mixlo_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16_V2F16>>;
-def V_MAD_MIXHI_F16 : VOP3_VOP3PInst<"v_mad_mixhi_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16_V2F16>>;
+let isCommutable = 1 in {
+def V_MAD_MIX_F32 : VOP3_VOP3PInst<"v_mad_mix_f32", VOP3_Profile<VOP_F32_F16_F16_F16, VOP3_OPSEL>>;
+
+// Clamp modifier is applied after conversion to f16.
+def V_MAD_MIXLO_F16 : VOP3_VOP3PInst<"v_mad_mixlo_f16", VOP3_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL>, 1>;
+
+let ClampLo = 0, ClampHi = 1 in {
+def V_MAD_MIXHI_F16 : VOP3_VOP3PInst<"v_mad_mixhi_f16", VOP3_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL>, 1>;
+}
+}
+
+def : GCNPat <
+ (f16 (fpround (fmad (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_modifiers)),
+ (f32 (VOP3PMadMixMods f16:$src1, i32:$src1_modifiers)),
+ (f32 (VOP3PMadMixMods f16:$src2, i32:$src2_modifiers))))),
+ (V_MAD_MIXLO_F16 $src0_modifiers, $src0,
+ $src1_modifiers, $src1,
+ $src2_modifiers, $src2,
+ DSTCLAMP.NONE,
+ (i32 (IMPLICIT_DEF)))
+>;
+
+// FIXME: Special-case handling for mixhi (especially for clamp)
+// because dealing with the write to the high half of the register is
+// difficult.
+def : GCNPat <
+ (build_vector f16:$elt0, (fpround (fmad (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_modifiers)),
+ (f32 (VOP3PMadMixMods f16:$src1, i32:$src1_modifiers)),
+ (f32 (VOP3PMadMixMods f16:$src2, i32:$src2_modifiers))))),
+ (v2f16 (V_MAD_MIXHI_F16 $src0_modifiers, $src0,
+ $src1_modifiers, $src1,
+ $src2_modifiers, $src2,
+ DSTCLAMP.NONE,
+ $elt0))
+>;
+
+def : GCNPat <
+ (build_vector
+ f16:$elt0,
+ (AMDGPUclamp (fpround (fmad (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_modifiers)),
+ (f32 (VOP3PMadMixMods f16:$src1, i32:$src1_modifiers)),
+ (f32 (VOP3PMadMixMods f16:$src2, i32:$src2_modifiers)))))),
+ (v2f16 (V_MAD_MIXHI_F16 $src0_modifiers, $src0,
+ $src1_modifiers, $src1,
+ $src2_modifiers, $src2,
+ DSTCLAMP.ENABLE,
+ $elt0))
+>;
+
+def : GCNPat <
+ (AMDGPUclamp (build_vector
+ (fpround (fmad (f32 (VOP3PMadMixMods f16:$lo_src0, i32:$lo_src0_modifiers)),
+ (f32 (VOP3PMadMixMods f16:$lo_src1, i32:$lo_src1_modifiers)),
+ (f32 (VOP3PMadMixMods f16:$lo_src2, i32:$lo_src2_modifiers)))),
+ (fpround (fmad (f32 (VOP3PMadMixMods f16:$hi_src0, i32:$hi_src0_modifiers)),
+ (f32 (VOP3PMadMixMods f16:$hi_src1, i32:$hi_src1_modifiers)),
+ (f32 (VOP3PMadMixMods f16:$hi_src2, i32:$hi_src2_modifiers)))))),
+ (v2f16 (V_MAD_MIXHI_F16 $hi_src0_modifiers, $hi_src0,
+ $hi_src1_modifiers, $hi_src1,
+ $hi_src2_modifiers, $hi_src2,
+ DSTCLAMP.ENABLE,
+ (V_MAD_MIXLO_F16 $lo_src0_modifiers, $lo_src0,
+ $lo_src1_modifiers, $lo_src1,
+ $lo_src2_modifiers, $lo_src2,
+ DSTCLAMP.ENABLE,
+ (i32 (IMPLICIT_DEF)))))
+>;
+} // End SubtargetPredicate = HasMadMixInsts
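// Annotation on the last pattern above: V_MAD_MIXLO_F16 computes and clamps
// the low f16 half, and its result is fed as $vdst_in to V_MAD_MIXHI_F16,
// which computes and clamps the high half, so the clamped v2f16 build_vector
// lands in a single 32-bit VGPR.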
multiclass VOP3P_Real_vi<bits<10> op> {
def _vi : VOP3P_Real<!cast<VOP3P_Pseudo>(NAME), SIEncodingFamily.VI>,
diff --git a/lib/Target/AMDGPU/VOPCInstructions.td b/lib/Target/AMDGPU/VOPCInstructions.td
index b636fc9be431..146870e21531 100644
--- a/lib/Target/AMDGPU/VOPCInstructions.td
+++ b/lib/Target/AMDGPU/VOPCInstructions.td
@@ -607,9 +607,7 @@ defm V_CMPX_CLASS_F16 : VOPCX_CLASS_F16 <"v_cmpx_class_f16">;
// V_ICMPIntrinsic Pattern.
//===----------------------------------------------------------------------===//
-let Predicates = [isGCN] in {
-
-class ICMP_Pattern <PatLeaf cond, Instruction inst, ValueType vt> : Pat <
+class ICMP_Pattern <PatLeaf cond, Instruction inst, ValueType vt> : GCNPat <
(AMDGPUsetcc vt:$src0, vt:$src1, cond),
(inst $src0, $src1)
>;
@@ -636,7 +634,7 @@ def : ICMP_Pattern <COND_SGE, V_CMP_GE_I64_e64, i64>;
def : ICMP_Pattern <COND_SLT, V_CMP_LT_I64_e64, i64>;
def : ICMP_Pattern <COND_SLE, V_CMP_LE_I64_e64, i64>;
-class FCMP_Pattern <PatLeaf cond, Instruction inst, ValueType vt> : Pat <
+class FCMP_Pattern <PatLeaf cond, Instruction inst, ValueType vt> : GCNPat <
(i64 (AMDGPUsetcc (vt (VOP3Mods vt:$src0, i32:$src0_modifiers)),
(vt (VOP3Mods vt:$src1, i32:$src1_modifiers)), cond)),
(inst $src0_modifiers, $src0, $src1_modifiers, $src1,
@@ -671,8 +669,6 @@ def : FCMP_Pattern <COND_UGE, V_CMP_NLT_F64_e64, f64>;
def : FCMP_Pattern <COND_ULT, V_CMP_NGE_F64_e64, f64>;
def : FCMP_Pattern <COND_ULE, V_CMP_NGT_F64_e64, f64>;
-} // End Predicates = [isGCN]
-
//===----------------------------------------------------------------------===//
// Target
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/AMDGPU/VOPInstructions.td b/lib/Target/AMDGPU/VOPInstructions.td
index b47538ba0349..f24ff5ce8dea 100644
--- a/lib/Target/AMDGPU/VOPInstructions.td
+++ b/lib/Target/AMDGPU/VOPInstructions.td
@@ -65,8 +65,13 @@ class VOP3Common <dag outs, dag ins, string asm = "",
}
class VOP3_Pseudo <string opName, VOPProfile P, list<dag> pattern = [],
- bit VOP3Only = 0, bit isVOP3P = 0> :
- InstSI <P.Outs64, !if(!and(isVOP3P, P.IsPacked), P.InsVOP3P, P.Ins64), "", pattern>,
+ bit VOP3Only = 0, bit isVOP3P = 0, bit isVop3OpSel = 0> :
+ InstSI <P.Outs64,
+ !if(isVop3OpSel,
+ P.InsVOP3OpSel,
+ !if(!and(isVOP3P, P.IsPacked), P.InsVOP3P, P.Ins64)),
+ "",
+ pattern>,
VOP <opName>,
SIMCInstr<opName#"_e64", SIEncodingFamily.NONE>,
MnemonicAlias<opName#"_e64", opName> {
@@ -74,9 +79,13 @@ class VOP3_Pseudo <string opName, VOPProfile P, list<dag> pattern = [],
let isPseudo = 1;
let isCodeGenOnly = 1;
let UseNamedOperandTable = 1;
+ let VOP3_OPSEL = isVop3OpSel;
+ let IsPacked = P.IsPacked;
string Mnemonic = opName;
- string AsmOperands = !if(!and(isVOP3P, P.IsPacked), P.AsmVOP3P, P.Asm64);
+ string AsmOperands = !if(isVop3OpSel,
+ P.AsmVOP3OpSel,
+ !if(!and(isVOP3P, P.IsPacked), P.AsmVOP3P, P.Asm64));
let Size = 8;
let mayLoad = 0;
@@ -98,13 +107,17 @@ class VOP3_Pseudo <string opName, VOPProfile P, list<dag> pattern = [],
let VOP3 = 1;
let VALU = 1;
let FPClamp = P.HasFPClamp;
+ let IntClamp = P.HasIntClamp;
+ let ClampLo = P.HasClampLo;
+ let ClampHi = P.HasClampHi;
+
let Uses = [EXEC];
let AsmVariantName = AMDGPUAsmVariants.VOP3;
let AsmMatchConverter =
- !if(!and(P.IsPacked, isVOP3P),
+ !if(isVOP3P,
"cvtVOP3P",
- !if(!or(P.HasModifiers, P.HasOMod),
+ !if(!or(P.HasModifiers, !or(P.HasOMod, P.HasIntClamp)),
"cvtVOP3",
""));
@@ -146,11 +159,11 @@ class VOP3P_Real<VOP3P_Pseudo ps, int EncodingFamily> :
VOP3_Real<ps, EncodingFamily>;
class VOP3a<VOPProfile P> : Enc64 {
- bits<2> src0_modifiers;
+ bits<4> src0_modifiers;
bits<9> src0;
- bits<2> src1_modifiers;
+ bits<3> src1_modifiers;
bits<9> src1;
- bits<2> src2_modifiers;
+ bits<3> src2_modifiers;
bits<9> src2;
bits<1> clamp;
bits<2> omod;
@@ -189,6 +202,32 @@ class VOP3e_vi <bits<10> op, VOPProfile P> : VOP3a_vi <op, P> {
let Inst{7-0} = !if(P.EmitDst, vdst{7-0}, 0);
}
+class VOP3OpSel_gfx9 <bits<10> op, VOPProfile P> : VOP3e_vi <op, P> {
+ let Inst{11} = !if(P.HasSrc0, src0_modifiers{2}, 0);
+ let Inst{12} = !if(P.HasSrc1, src1_modifiers{2}, 0);
+ let Inst{13} = !if(P.HasSrc2, src2_modifiers{2}, 0);
+ let Inst{14} = !if(P.HasDst, src0_modifiers{3}, 0);
+}
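// Bit placement in the op_sel encoding above (annotation): Inst{11}, Inst{12}
// and Inst{13} carry op_sel for src0, src1 and src2 (bit 2 of each source's
// widened modifier field), and Inst{14} carries the destination op_sel, which
// the pseudo stores in src0_modifiers{3}.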
+
+// NB: For V_INTERP* opcodes, src0 is encoded as src1 and vice versa
+class VOP3Interp_vi <bits<10> op, VOPProfile P> : VOP3e_vi <op, P> {
+ bits<2> attrchan;
+ bits<6> attr;
+ bits<1> high;
+
+ let Inst{8} = 0; // No modifiers for src0
+ let Inst{61} = 0;
+
+ let Inst{9} = !if(P.HasSrc0Mods, src0_modifiers{1}, 0);
+ let Inst{62} = !if(P.HasSrc0Mods, src0_modifiers{0}, 0);
+
+ let Inst{37-32} = attr;
+ let Inst{39-38} = attrchan;
+ let Inst{40} = !if(P.HasHigh, high, 0);
+
+ let Inst{49-41} = src0;
+}
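// Field placement in the interp encoding above (annotation): attr, attrchan
// and high occupy Inst{37-32}, Inst{39-38} and Inst{40}, i.e. the VOP3 src0
// slot, while the pseudo's src0 value and modifiers are routed into the src1
// fields (Inst{49-41}, Inst{9} and Inst{62}); the src0 modifier bits Inst{8}
// and Inst{61} are forced to 0, matching the NB comment above.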
+
class VOP3be <VOPProfile P> : Enc64 {
bits<8> vdst;
bits<2> src0_modifiers;
@@ -476,6 +515,8 @@ class VOP_DPP <string OpName, VOPProfile P> :
let AssemblerPredicate = !if(P.HasExt, HasDPP, DisableInst);
let AsmVariantName = !if(P.HasExt, AMDGPUAsmVariants.DPP,
AMDGPUAsmVariants.Disable);
+ let Constraints = !if(P.NumSrcArgs, "$old = $vdst", "");
+ let DisableEncoding = !if(P.NumSrcArgs, "$old", "");
let DecoderNamespace = "DPP";
}
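// Annotation on the added constraint: a DPP operation can leave some lanes of
// the destination unwritten (e.g. lanes whose selected source lane is invalid
// or masked out), so the previous destination value is passed in as $old and
// tied to $vdst, and $old is excluded from the encoding.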