Diffstat (limited to 'llvm/lib/Target/AMDGPU')
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPU.h | 13
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPU.td | 280
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp | 144
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp | 95
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h | 3
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp | 6
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUAttributes.def | 31
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp | 266
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp | 50
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td | 66
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp | 2
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUCombinerHelper.cpp | 3
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUCtorDtorLowering.cpp | 2
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUExportClustering.cpp | 2
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUFixFunctionBitcasts.cpp | 64
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUGISel.td | 29
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.cpp | 11
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.h | 4
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp | 91
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h | 2
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp | 439
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.h | 22
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp | 253
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h | 17
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 401
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h | 11
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUInsertDelayAlu.cpp | 457
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp | 78
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td | 6
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp | 770
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h | 31
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUInstructions.td | 158
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 824
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h | 17
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp | 11
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPULibFunc.h | 4
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp | 4
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp | 7
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp | 38
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp | 27
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp | 15
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.h | 2
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUMIRFormatter.cpp | 9
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUMIRFormatter.h | 2
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp | 2
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp | 50
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h | 20
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUMachineModuleInfo.cpp | 1
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUOpenCLEnqueuedBlockLowering.cpp | 3
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp | 18
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp | 2
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp | 12
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp | 211
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUPromoteKernelArguments.cpp | 64
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp | 661
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h | 7
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUReleaseVGPRs.cpp | 140
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUReplaceLDSUseWithPointer.cpp | 4
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp | 26
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.h | 12
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPURewriteOutArguments.cpp | 146
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td | 168
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUSetWavePriority.cpp | 166
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp | 158
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h | 42
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 88
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h | 6
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp | 54
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h | 21
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDKernelCodeT.h | 2
-rw-r--r--  llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp | 1146
-rw-r--r--  llvm/lib/Target/AMDGPU/BUFInstructions.td | 875
-rw-r--r--  llvm/lib/Target/AMDGPU/DSInstructions.td | 544
-rw-r--r--  llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp | 468
-rw-r--r--  llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h | 87
-rw-r--r--  llvm/lib/Target/AMDGPU/EXPInstructions.td | 79
-rw-r--r--  llvm/lib/Target/AMDGPU/FLATInstructions.td | 1028
-rw-r--r--  llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp | 18
-rw-r--r--  llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp | 901
-rw-r--r--  llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h | 25
-rw-r--r--  llvm/lib/Target/AMDGPU/GCNNSAReassign.cpp | 29
-rw-r--r--  llvm/lib/Target/AMDGPU/GCNProcessors.td | 28
-rw-r--r--  llvm/lib/Target/AMDGPU/GCNRegPressure.h | 2
-rw-r--r--  llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp | 356
-rw-r--r--  llvm/lib/Target/AMDGPU/GCNSchedStrategy.h | 36
-rw-r--r--  llvm/lib/Target/AMDGPU/GCNSubtarget.h | 345
-rw-r--r--  llvm/lib/Target/AMDGPU/LDSDIRInstructions.td | 116
-rw-r--r--  llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.cpp | 6
-rw-r--r--  llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.h | 4
-rw-r--r--  llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp | 29
-rw-r--r--  llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp | 5
-rw-r--r--  llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp | 257
-rw-r--r--  llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h | 17
-rw-r--r--  llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h | 54
-rw-r--r--  llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp | 2
-rw-r--r--  llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h | 2
-rw-r--r--  llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp | 38
-rw-r--r--  llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp | 4
-rw-r--r--  llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCTargetDesc.h | 1
-rw-r--r--  llvm/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp | 165
-rw-r--r--  llvm/lib/Target/AMDGPU/MIMGInstructions.td | 606
-rw-r--r--  llvm/lib/Target/AMDGPU/R600.h | 2
-rw-r--r--  llvm/lib/Target/AMDGPU/R600AsmPrinter.cpp | 4
-rw-r--r--  llvm/lib/Target/AMDGPU/R600ClauseMergePass.cpp | 3
-rw-r--r--  llvm/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp | 1
-rw-r--r--  llvm/lib/Target/AMDGPU/R600EmitClauseMarkers.cpp | 5
-rw-r--r--  llvm/lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp | 2
-rw-r--r--  llvm/lib/Target/AMDGPU/R600FrameLowering.cpp | 1
-rw-r--r--  llvm/lib/Target/AMDGPU/R600ISelLowering.cpp | 175
-rw-r--r--  llvm/lib/Target/AMDGPU/R600InstrInfo.cpp | 19
-rw-r--r--  llvm/lib/Target/AMDGPU/R600InstrInfo.h | 3
-rw-r--r--  llvm/lib/Target/AMDGPU/R600MachineCFGStructurizer.cpp (renamed from llvm/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp) | 152
-rw-r--r--  llvm/lib/Target/AMDGPU/R600Packetizer.cpp | 2
-rw-r--r--  llvm/lib/Target/AMDGPU/R600Subtarget.cpp | 2
-rw-r--r--  llvm/lib/Target/AMDGPU/R600Subtarget.h | 16
-rw-r--r--  llvm/lib/Target/AMDGPU/R600TargetMachine.cpp | 4
-rw-r--r--  llvm/lib/Target/AMDGPU/R600TargetMachine.h | 4
-rw-r--r--  llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp | 58
-rw-r--r--  llvm/lib/Target/AMDGPU/SIDefines.h | 196
-rw-r--r--  llvm/lib/Target/AMDGPU/SIFoldOperands.cpp | 189
-rw-r--r--  llvm/lib/Target/AMDGPU/SIFormMemoryClauses.cpp | 2
-rw-r--r--  llvm/lib/Target/AMDGPU/SIFrameLowering.cpp | 230
-rw-r--r--  llvm/lib/Target/AMDGPU/SIFrameLowering.h | 3
-rw-r--r--  llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 1899
-rw-r--r--  llvm/lib/Target/AMDGPU/SIISelLowering.h | 24
-rw-r--r--  llvm/lib/Target/AMDGPU/SIInsertHardClauses.cpp | 77
-rw-r--r--  llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 354
-rw-r--r--  llvm/lib/Target/AMDGPU/SIInstrFormats.td | 83
-rw-r--r--  llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 665
-rw-r--r--  llvm/lib/Target/AMDGPU/SIInstrInfo.h | 68
-rw-r--r--  llvm/lib/Target/AMDGPU/SIInstrInfo.td | 625
-rw-r--r--  llvm/lib/Target/AMDGPU/SIInstructions.td | 244
-rw-r--r--  llvm/lib/Target/AMDGPU/SILateBranchLowering.cpp | 12
-rw-r--r--  llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp | 842
-rw-r--r--  llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp | 42
-rw-r--r--  llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp | 33
-rw-r--r--  llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp | 16
-rw-r--r--  llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp | 126
-rw-r--r--  llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h | 179
-rw-r--r--  llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp | 56
-rw-r--r--  llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp | 484
-rw-r--r--  llvm/lib/Target/AMDGPU/SIModeRegister.cpp | 17
-rw-r--r--  llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp | 251
-rw-r--r--  llvm/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp | 110
-rw-r--r--  llvm/lib/Target/AMDGPU/SIOptimizeVGPRLiveRange.cpp | 125
-rw-r--r--  llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp | 12
-rw-r--r--  llvm/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp | 21
-rw-r--r--  llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp | 28
-rw-r--r--  llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp | 595
-rw-r--r--  llvm/lib/Target/AMDGPU/SIRegisterInfo.h | 28
-rw-r--r--  llvm/lib/Target/AMDGPU/SIRegisterInfo.td | 127
-rw-r--r--  llvm/lib/Target/AMDGPU/SISchedule.td | 65
-rw-r--r--  llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp | 435
-rw-r--r--  llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp | 42
-rw-r--r--  llvm/lib/Target/AMDGPU/SMInstructions.td | 410
-rw-r--r--  llvm/lib/Target/AMDGPU/SOPInstructions.td | 425
-rw-r--r--  llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp | 314
-rw-r--r--  llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.h | 56
-rw-r--r--  llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp | 684
-rw-r--r--  llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h | 133
-rw-r--r--  llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.h | 38
-rw-r--r--  llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.cpp (renamed from llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.cpp) | 102
-rw-r--r--  llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.h | 51
-rw-r--r--  llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp | 5
-rw-r--r--  llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.h | 4
-rw-r--r--  llvm/lib/Target/AMDGPU/VIInstrFormats.td | 2
-rw-r--r--  llvm/lib/Target/AMDGPU/VINTERPInstructions.td | 180
-rw-r--r--  llvm/lib/Target/AMDGPU/VOP1Instructions.td | 376
-rw-r--r--  llvm/lib/Target/AMDGPU/VOP2Instructions.td | 626
-rw-r--r--  llvm/lib/Target/AMDGPU/VOP3Instructions.td | 453
-rw-r--r--  llvm/lib/Target/AMDGPU/VOP3PInstructions.td | 669
-rw-r--r--  llvm/lib/Target/AMDGPU/VOPCInstructions.td | 873
-rw-r--r--  llvm/lib/Target/AMDGPU/VOPDInstructions.td | 159
-rw-r--r--  llvm/lib/Target/AMDGPU/VOPInstructions.td | 658
174 files changed, 21730 insertions, 7370 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index 11cc1a01d248..c4680cbedadf 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -91,10 +91,6 @@ ModulePass *createAMDGPULowerIntrinsicsPass();
void initializeAMDGPULowerIntrinsicsPass(PassRegistry &);
extern char &AMDGPULowerIntrinsicsID;
-ModulePass *createAMDGPUFixFunctionBitcastsPass();
-void initializeAMDGPUFixFunctionBitcastsPass(PassRegistry &);
-extern char &AMDGPUFixFunctionBitcastsID;
-
ModulePass *createAMDGPUCtorDtorLoweringPass();
void initializeAMDGPUCtorDtorLoweringPass(PassRegistry &);
extern char &AMDGPUCtorDtorLoweringID;
@@ -303,6 +299,12 @@ extern char &SIMemoryLegalizerID;
void initializeSIModeRegisterPass(PassRegistry&);
extern char &SIModeRegisterID;
+void initializeAMDGPUReleaseVGPRsPass(PassRegistry &);
+extern char &AMDGPUReleaseVGPRsID;
+
+void initializeAMDGPUInsertDelayAluPass(PassRegistry &);
+extern char &AMDGPUInsertDelayAluID;
+
void initializeSIInsertHardClausesPass(PassRegistry &);
extern char &SIInsertHardClausesID;
@@ -335,6 +337,9 @@ extern char &GCNNSAReassignID;
void initializeGCNPreRAOptimizationsPass(PassRegistry &);
extern char &GCNPreRAOptimizationsID;
+FunctionPass *createAMDGPUSetWavePriorityPass();
+void initializeAMDGPUSetWavePriorityPass(PassRegistry &);
+
namespace AMDGPU {
enum TargetIndex {
TI_CONSTDATA_START,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index 806c0b18637a..48b5814cd482 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -86,6 +86,12 @@ def FeatureScalarFlatScratchInsts : SubtargetFeature<"scalar-flat-scratch-insts"
"Have s_scratch_* flat memory instructions"
>;
+def FeatureEnableFlatScratch : SubtargetFeature<"enable-flat-scratch",
+ "EnableFlatScratch",
+ "true",
+ "Use scratch_* flat memory instructions to access scratch"
+>;
+
def FeatureAddNoCarryInsts : SubtargetFeature<"add-no-carry-insts",
"AddNoCarryInsts",
"true",
@@ -171,6 +177,12 @@ def FeatureSGPRInitBug : SubtargetFeature<"sgpr-init-bug",
"VI SGPR initialization bug requiring a fixed SGPR allocation size"
>;
+def FeatureUserSGPRInit16Bug : SubtargetFeature<"user-sgpr-init16-bug",
+ "UserSGPRInit16Bug",
+ "true",
+ "Bug requiring at least 16 user+system SGPRs to be enabled"
+>;
+
def FeatureLdsMisalignedBug : SubtargetFeature<"lds-misaligned-bug",
"LDSMisalignedBug",
"true",
@@ -307,12 +319,24 @@ def FeatureGFX90AInsts : SubtargetFeature<"gfx90a-insts",
"Additional instructions for GFX90A+"
>;
+def FeatureGFX940Insts : SubtargetFeature<"gfx940-insts",
+ "GFX940Insts",
+ "true",
+ "Additional instructions for GFX940+"
+>;
+
def FeatureGFX10Insts : SubtargetFeature<"gfx10-insts",
"GFX10Insts",
"true",
"Additional instructions for GFX10+"
>;
+def FeatureGFX11Insts : SubtargetFeature<"gfx11-insts",
+ "GFX11Insts",
+ "true",
+ "Additional instructions for GFX11+"
+>;
+
def FeatureGFX10_3Insts : SubtargetFeature<"gfx10-3-insts",
"GFX10_3Insts",
"true",
@@ -343,6 +367,12 @@ def Feature16BitInsts : SubtargetFeature<"16-bit-insts",
"Has i16/f16 instructions"
>;
+def FeatureTrue16BitInsts : SubtargetFeature<"true16",
+ "HasTrue16BitInsts",
+ "true",
+ "True 16-bit operand instructions"
+>;
+
def FeatureVOP3P : SubtargetFeature<"vop3p",
"HasVOP3PInsts",
"true",
@@ -458,6 +488,12 @@ def FeatureNSAEncoding : SubtargetFeature<"nsa-encoding",
"Support NSA encoding for image instructions"
>;
+def FeatureImageInsts : SubtargetFeature<"image-insts",
+ "HasImageInsts",
+ "true",
+ "Support image instructions"
+>;
+
def FeatureExtendedImageInsts : SubtargetFeature<"extended-image-insts",
"HasExtendedImageInsts",
"true",
@@ -536,6 +572,13 @@ def FeatureDot7Insts : SubtargetFeature<"dot7-insts",
"Has v_dot2_f32_f16, v_dot4_u32_u8, v_dot8_u32_u4 instructions"
>;
+def FeatureDot8Insts : SubtargetFeature<"dot8-insts",
+ "HasDot8Insts",
+ "true",
+ "Has v_dot2_f16_f16, v_dot2_bf16_bf16, v_dot2_f32_bf16, "
+ "v_dot4_i32_iu8, v_dot8_i32_iu4 instructions"
+>;
+
def FeatureMAIInsts : SubtargetFeature<"mai-insts",
"HasMAIInsts",
"true",
@@ -548,11 +591,28 @@ def FeaturePkFmacF16Inst : SubtargetFeature<"pk-fmac-f16-inst",
"Has v_pk_fmac_f16 instruction"
>;
-def FeatureAtomicFaddInsts : SubtargetFeature<"atomic-fadd-insts",
- "HasAtomicFaddInsts",
+def FeatureAtomicFaddRtnInsts : SubtargetFeature<"atomic-fadd-rtn-insts",
+ "HasAtomicFaddRtnInsts",
"true",
- "Has buffer_atomic_add_f32, buffer_atomic_pk_add_f16, global_atomic_add_f32, "
- "global_atomic_pk_add_f16 instructions",
+ "Has buffer_atomic_add_f32 and global_atomic_add_f32 instructions that "
+ "return original value",
+ [FeatureFlatGlobalInsts]
+>;
+
+def FeatureAtomicFaddNoRtnInsts : SubtargetFeature<"atomic-fadd-no-rtn-insts",
+ "HasAtomicFaddNoRtnInsts",
+ "true",
+ "Has buffer_atomic_add_f32 and global_atomic_add_f32 instructions that "
+ "don't return original value",
+ [FeatureFlatGlobalInsts]
+>;
+
+def FeatureAtomicPkFaddNoRtnInsts
+ : SubtargetFeature<"atomic-pk-fadd-no-rtn-insts",
+ "HasAtomicPkFaddNoRtnInsts",
+ "true",
+ "Has buffer_atomic_pk_add_f16 and global_atomic_pk_add_f16 instructions that "
+ "don't return original value",
[FeatureFlatGlobalInsts]
>;
@@ -632,6 +692,12 @@ class SubtargetFeatureNSAMaxSize <int Value> : SubtargetFeature <
def FeatureNSAMaxSize5 : SubtargetFeatureNSAMaxSize<5>;
def FeatureNSAMaxSize13 : SubtargetFeatureNSAMaxSize<13>;
+def FeatureVOPD : SubtargetFeature<"vopd",
+ "HasVOPDInsts",
+ "true",
+ "Has VOPD dual issue wave32 instructions"
+>;
+
//===------------------------------------------------------------===//
// Subtarget Features (options and debugging)
//===------------------------------------------------------------===//
@@ -762,7 +828,7 @@ def FeatureSouthernIslands : GCNSubtargetFeatureGeneration<"SOUTHERN_ISLANDS",
[FeatureFP64, FeatureLocalMemorySize32768, FeatureMIMG_R128,
FeatureWavefrontSize64, FeatureSMemTimeInst, FeatureMadMacF32Insts,
FeatureDsSrc2Insts, FeatureLDSBankCount32, FeatureMovrel,
- FeatureTrigReducedRange, FeatureExtendedImageInsts
+ FeatureTrigReducedRange, FeatureExtendedImageInsts, FeatureImageInsts
]
>;
@@ -772,7 +838,8 @@ def FeatureSeaIslands : GCNSubtargetFeatureGeneration<"SEA_ISLANDS",
FeatureWavefrontSize64, FeatureFlatAddressSpace,
FeatureCIInsts, FeatureMovrel, FeatureTrigReducedRange,
FeatureGFX7GFX8GFX9Insts, FeatureSMemTimeInst, FeatureMadMacF32Insts,
- FeatureDsSrc2Insts, FeatureExtendedImageInsts, FeatureUnalignedBufferAccess
+ FeatureDsSrc2Insts, FeatureExtendedImageInsts, FeatureUnalignedBufferAccess,
+ FeatureImageInsts
]
>;
@@ -787,7 +854,7 @@ def FeatureVolcanicIslands : GCNSubtargetFeatureGeneration<"VOLCANIC_ISLANDS",
FeatureIntClamp, FeatureTrigReducedRange, FeatureGFX8Insts,
FeatureGFX7GFX8GFX9Insts, FeatureSMemTimeInst, FeatureMadMacF32Insts,
FeatureDsSrc2Insts, FeatureExtendedImageInsts, FeatureFastDenormalF32,
- FeatureUnalignedBufferAccess
+ FeatureUnalignedBufferAccess, FeatureImageInsts
]
>;
@@ -824,6 +891,25 @@ def FeatureGFX10 : GCNSubtargetFeatureGeneration<"GFX10",
FeatureVOP3Literal, FeatureDPP8, FeatureExtendedImageInsts,
FeatureNoDataDepHazard, FeaturePkFmacF16Inst,
FeatureGFX10A16, FeatureSMemTimeInst, FeatureFastDenormalF32, FeatureG16,
+ FeatureUnalignedBufferAccess, FeatureUnalignedDSAccess, FeatureImageInsts
+ ]
+>;
+
+def FeatureGFX11 : GCNSubtargetFeatureGeneration<"GFX11",
+ "gfx11",
+ [FeatureFP64, FeatureLocalMemorySize65536, FeatureMIMG_R128,
+ FeatureFlatAddressSpace, Feature16BitInsts,
+ FeatureInv2PiInlineImm, FeatureApertureRegs,
+ FeatureCIInsts, FeatureGFX8Insts, FeatureGFX9Insts, FeatureGFX10Insts,
+ FeatureGFX10_AEncoding, FeatureGFX10_BEncoding, FeatureGFX10_3Insts,
+ FeatureGFX11Insts, FeatureVOP3P, FeatureVOPD, FeatureTrue16BitInsts,
+ FeatureMovrel, FeatureFastFMAF32, FeatureDPP, FeatureIntClamp,
+ FeatureFlatInstOffsets, FeatureFlatGlobalInsts, FeatureFlatScratchInsts,
+ FeatureAddNoCarryInsts, FeatureFmaMixInsts,
+ FeatureNoSdstCMPX, FeatureVscnt,
+ FeatureVOP3Literal, FeatureDPP8, FeatureExtendedImageInsts,
+ FeatureNoDataDepHazard, FeaturePkFmacF16Inst,
+ FeatureGFX10A16, FeatureFastDenormalF32, FeatureG16,
FeatureUnalignedBufferAccess, FeatureUnalignedDSAccess
]
>;
@@ -910,6 +996,7 @@ def FeatureISAVersion9_0_0 : FeatureSet<
FeatureLDSBankCount32,
FeatureDsSrc2Insts,
FeatureExtendedImageInsts,
+ FeatureImageInsts,
FeatureMadMacF32Insts,
FeatureImageGather4D16Bug]>;
@@ -919,6 +1006,7 @@ def FeatureISAVersion9_0_2 : FeatureSet<
FeatureLDSBankCount32,
FeatureDsSrc2Insts,
FeatureExtendedImageInsts,
+ FeatureImageInsts,
FeatureMadMacF32Insts,
FeatureImageGather4D16Bug]>;
@@ -927,6 +1015,7 @@ def FeatureISAVersion9_0_4 : FeatureSet<
FeatureLDSBankCount32,
FeatureDsSrc2Insts,
FeatureExtendedImageInsts,
+ FeatureImageInsts,
FeatureMadMacF32Insts,
FeatureFmaMixInsts,
FeatureImageGather4D16Bug]>;
@@ -938,6 +1027,7 @@ def FeatureISAVersion9_0_6 : FeatureSet<
FeatureLDSBankCount32,
FeatureDsSrc2Insts,
FeatureExtendedImageInsts,
+ FeatureImageInsts,
FeatureMadMacF32Insts,
FeatureDLInsts,
FeatureDot1Insts,
@@ -953,6 +1043,7 @@ def FeatureISAVersion9_0_8 : FeatureSet<
FeatureLDSBankCount32,
FeatureDsSrc2Insts,
FeatureExtendedImageInsts,
+ FeatureImageInsts,
FeatureMadMacF32Insts,
FeatureDLInsts,
FeatureDot1Insts,
@@ -964,7 +1055,8 @@ def FeatureISAVersion9_0_8 : FeatureSet<
FeatureDot7Insts,
FeatureMAIInsts,
FeaturePkFmacF16Inst,
- FeatureAtomicFaddInsts,
+ FeatureAtomicFaddNoRtnInsts,
+ FeatureAtomicPkFaddNoRtnInsts,
FeatureSupportsSRAMECC,
FeatureMFMAInlineLiteralBug,
FeatureImageGather4D16Bug]>;
@@ -975,6 +1067,7 @@ def FeatureISAVersion9_0_9 : FeatureSet<
FeatureLDSBankCount32,
FeatureDsSrc2Insts,
FeatureExtendedImageInsts,
+ FeatureImageInsts,
FeatureMadMacF32Insts,
FeatureImageGather4D16Bug]>;
@@ -995,7 +1088,10 @@ def FeatureISAVersion9_0_A : FeatureSet<
FeaturePackedFP32Ops,
FeatureMAIInsts,
FeaturePkFmacF16Inst,
- FeatureAtomicFaddInsts,
+ FeatureAtomicFaddRtnInsts,
+ FeatureAtomicFaddNoRtnInsts,
+ FeatureAtomicPkFaddNoRtnInsts,
+ FeatureImageInsts,
FeatureMadMacF32Insts,
FeatureSupportsSRAMECC,
FeaturePackedTID,
@@ -1007,9 +1103,36 @@ def FeatureISAVersion9_0_C : FeatureSet<
FeatureLDSBankCount32,
FeatureDsSrc2Insts,
FeatureExtendedImageInsts,
+ FeatureImageInsts,
FeatureMadMacF32Insts,
FeatureImageGather4D16Bug]>;
+def FeatureISAVersion9_4_0 : FeatureSet<
+ [FeatureGFX9,
+ FeatureGFX90AInsts,
+ FeatureGFX940Insts,
+ FeatureFmaMixInsts,
+ FeatureLDSBankCount32,
+ FeatureDLInsts,
+ FeatureDot1Insts,
+ FeatureDot2Insts,
+ FeatureDot3Insts,
+ FeatureDot4Insts,
+ FeatureDot5Insts,
+ FeatureDot6Insts,
+ FeatureDot7Insts,
+ Feature64BitDPP,
+ FeaturePackedFP32Ops,
+ FeatureMAIInsts,
+ FeaturePkFmacF16Inst,
+ FeatureAtomicFaddRtnInsts,
+ FeatureAtomicFaddNoRtnInsts,
+ FeatureAtomicPkFaddNoRtnInsts,
+ FeatureSupportsSRAMECC,
+ FeaturePackedTID,
+ FeatureArchitectedFlatScratch,
+ FullRate64Ops]>;
+
// TODO: Organize more features into groups.
def FeatureGroup {
// Bugs present on gfx10.1.
@@ -1124,6 +1247,33 @@ def FeatureISAVersion10_3_0 : FeatureSet<
FeatureWavefrontSize32,
FeatureShaderCyclesRegister]>;
+def FeatureISAVersion11_Common : FeatureSet<
+ [FeatureGFX11,
+ FeatureLDSBankCount32,
+ FeatureDLInsts,
+ FeatureDot5Insts,
+ FeatureDot7Insts,
+ FeatureDot8Insts,
+ FeatureNSAEncoding,
+ FeatureNSAMaxSize5,
+ FeatureWavefrontSize32,
+ FeatureShaderCyclesRegister,
+ FeatureArchitectedFlatScratch,
+ FeatureAtomicFaddRtnInsts,
+ FeatureAtomicFaddNoRtnInsts,
+ FeatureImageInsts,
+ FeaturePackedTID,
+ FeatureVcmpxPermlaneHazard]>;
+
+// Features for GFX 11.0.0 and 11.0.1
+def FeatureISAVersion11_0 : FeatureSet<
+ !listconcat(FeatureISAVersion11_Common.Features,
+ [FeatureUserSGPRInit16Bug])>;
+
+def FeatureISAVersion11_0_2 : FeatureSet<
+ !listconcat(FeatureISAVersion11_Common.Features,
+ [FeatureUserSGPRInit16Bug])>;
+
//===----------------------------------------------------------------------===//
def AMDGPUInstrInfo : InstrInfo {
@@ -1152,8 +1302,10 @@ def AMDGPUAsmVariants {
int SDWA9_ID = 3;
string DPP = "DPP";
int DPP_ID = 4;
+ string VOP3_DPP = "VOP3_DPP";
+ int VOP3_DPP_ID = 5;
string Disable = "Disable";
- int Disable_ID = 5;
+ int Disable_ID = 6;
}
def DefaultAMDGPUAsmParserVariant : AsmParserVariant {
@@ -1176,12 +1328,16 @@ def SDWA9AsmParserVariant : AsmParserVariant {
let Name = AMDGPUAsmVariants.SDWA9;
}
-
def DPPAsmParserVariant : AsmParserVariant {
let Variant = AMDGPUAsmVariants.DPP_ID;
let Name = AMDGPUAsmVariants.DPP;
}
+def VOP3_DPPAsmParserVariant : AsmParserVariant {
+ let Variant = AMDGPUAsmVariants.VOP3_DPP_ID;
+ let Name = AMDGPUAsmVariants.VOP3_DPP;
+}
+
def AMDGPU : Target {
// Pull in Instruction Info:
let InstructionSet = AMDGPUInstrInfo;
@@ -1190,7 +1346,8 @@ def AMDGPU : Target {
VOP3AsmParserVariant,
SDWAAsmParserVariant,
SDWA9AsmParserVariant,
- DPPAsmParserVariant];
+ DPPAsmParserVariant,
+ VOP3_DPPAsmParserVariant];
let AssemblyWriters = [AMDGPUAsmWriter];
let AllowRegisterRenaming = 1;
}
@@ -1216,6 +1373,12 @@ def isGFX6GFX7GFX10 :
Predicate<"Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS ||"
"Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS ||"
"Subtarget->getGeneration() == AMDGPUSubtarget::GFX10">,
+ AssemblerPredicate<(all_of (not FeatureGCN3Encoding), (not FeatureGFX11Insts))>;
+
+def isGFX6GFX7GFX10Plus :
+ Predicate<"Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS ||"
+ "Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS ||"
+ "Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10">,
AssemblerPredicate<(all_of (not FeatureGCN3Encoding))>;
def isGFX7Only :
@@ -1225,6 +1388,12 @@ def isGFX7Only :
def isGFX7GFX10 :
Predicate<"Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS ||"
"Subtarget->getGeneration() == AMDGPUSubtarget::GFX10">,
+ AssemblerPredicate<(all_of (not FeatureGCN3Encoding), FeatureCIInsts, (not FeatureGFX11Insts))>;
+
+def isGFX7GFX10GFX11 :
+ Predicate<"Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS ||"
+ "Subtarget->getGeneration() == AMDGPUSubtarget::GFX10 ||"
+ "Subtarget->getGeneration() == AMDGPUSubtarget::GFX11">,
AssemblerPredicate<(all_of (not FeatureGCN3Encoding), FeatureCIInsts)>;
def isGFX7GFX8GFX9 :
@@ -1248,6 +1417,21 @@ def isGFX6GFX7GFX8GFX9NotGFX90A :
" Subtarget->getGeneration() == AMDGPUSubtarget::GFX9)">,
AssemblerPredicate<(all_of (not FeatureGFX10Insts), (not FeatureGFX90AInsts))>;
+def isGFX6GFX7GFX8GFX9GFX10 :
+ Predicate<"Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS ||"
+ "Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS ||"
+ "Subtarget->getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS ||"
+ "Subtarget->getGeneration() == AMDGPUSubtarget::GFX9 ||"
+ "Subtarget->getGeneration() == AMDGPUSubtarget::GFX10">,
+ AssemblerPredicate<(all_of (not FeatureGFX11Insts))>;
+
+def isGFX7GFX8GFX9GFX10 :
+ Predicate<"Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS ||"
+ "Subtarget->getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS ||"
+ "Subtarget->getGeneration() == AMDGPUSubtarget::GFX9 ||"
+ "Subtarget->getGeneration() == AMDGPUSubtarget::GFX10">,
+ AssemblerPredicate<(all_of FeatureCIInsts, (not FeatureGFX11Insts))>;
+
def isGFX7Plus :
Predicate<"Subtarget->getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS">,
AssemblerPredicate<(all_of FeatureCIInsts)>;
@@ -1287,18 +1471,37 @@ def isGFX8GFX9NotGFX90A :
AssemblerPredicate<(all_of FeatureGFX8Insts, FeatureGCN3Encoding, (not FeatureGFX90AInsts))>;
def isGFX90AOnly :
- Predicate<"Subtarget->hasGFX90AInsts()">,
- AssemblerPredicate<(all_of FeatureGFX90AInsts)>;
+ Predicate<"Subtarget->hasGFX90AInsts() && !Subtarget->hasGFX940Insts()">,
+ AssemblerPredicate<(all_of FeatureGFX90AInsts, (not FeatureGFX940Insts))>;
def isGFX908orGFX90A :
- Predicate<"Subtarget->hasMAIInsts()">,
- AssemblerPredicate<(all_of FeatureMAIInsts)>;
+ Predicate<"Subtarget->hasMAIInsts() && !Subtarget->hasGFX940Insts()">,
+ AssemblerPredicate<(all_of FeatureMAIInsts, (not FeatureGFX940Insts))>;
+
+def isGFX940Plus :
+ Predicate<"Subtarget->hasGFX940Insts()">,
+ AssemblerPredicate<(all_of FeatureGFX940Insts)>;
+
+def isGFX940GFX11Plus :
+ Predicate<"Subtarget->hasGFX940Insts() ||"
+ "Subtarget->getGeneration() >= AMDGPUSubtarget::GFX11">,
+ AssemblerPredicate<(any_of FeatureGFX940Insts, FeatureGFX11Insts)>;
+
+def isGFX8GFX9NotGFX940 :
+ Predicate<"!Subtarget->hasGFX940Insts() &&"
+ "(Subtarget->getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS ||"
+ " Subtarget->getGeneration() == AMDGPUSubtarget::GFX9)">,
+ AssemblerPredicate<(all_of FeatureGFX8Insts, FeatureGCN3Encoding, (not FeatureGFX940Insts))>;
def isGFX8GFX9 :
Predicate<"Subtarget->getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS ||"
"Subtarget->getGeneration() == AMDGPUSubtarget::GFX9">,
AssemblerPredicate<(all_of FeatureGFX8Insts, FeatureGCN3Encoding)>;
+def isGFX10Only :
+ Predicate<"Subtarget->getGeneration() == AMDGPUSubtarget::GFX10">,
+ AssemblerPredicate<(all_of FeatureGFX10Insts, (not FeatureGFX11Insts))>;
+
def isGFX10Plus :
Predicate<"Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10">,
AssemblerPredicate<(all_of FeatureGFX10Insts)>;
@@ -1308,6 +1511,25 @@ def isGFX10Before1030 :
"!Subtarget->hasGFX10_3Insts()">,
AssemblerPredicate<(all_of FeatureGFX10Insts,(not FeatureGFX10_3Insts))>;
+def isGFX9GFX10 :
+ Predicate<"Subtarget->getGeneration() == AMDGPUSubtarget::GFX9 ||"
+ "Subtarget->getGeneration() == AMDGPUSubtarget::GFX10">,
+ AssemblerPredicate<(all_of FeatureGFX9Insts, (not FeatureGFX11Insts))>;
+
+def isGFX8GFX9GFX10 :
+ Predicate<"Subtarget->getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS ||"
+ "Subtarget->getGeneration() == AMDGPUSubtarget::GFX9 ||"
+ "Subtarget->getGeneration() == AMDGPUSubtarget::GFX10">,
+ AssemblerPredicate<(all_of FeatureGFX8Insts, (not FeatureGFX11Insts))>;
+
+def isGFX11Only :
+ Predicate<"Subtarget->getGeneration() == AMDGPUSubtarget::GFX11">,
+ AssemblerPredicate<(all_of FeatureGFX11Insts)>;
+
+def isGFX11Plus :
+ Predicate<"Subtarget->getGeneration() >= AMDGPUSubtarget::GFX11">,
+ AssemblerPredicate<(all_of FeatureGFX11Insts)>;
+
def HasFlatAddressSpace : Predicate<"Subtarget->hasFlatAddressSpace()">,
AssemblerPredicate<(all_of FeatureFlatAddressSpace)>;
@@ -1321,7 +1543,9 @@ def HasD16LoadStore : Predicate<"Subtarget->hasD16LoadStore()">,
AssemblerPredicate<(all_of FeatureGFX9Insts)>;
def HasFlatScratchSTMode : Predicate<"Subtarget->hasFlatScratchSTMode()">,
- AssemblerPredicate<(any_of FeatureGFX10_3Insts)>;
+ AssemblerPredicate<(any_of FeatureGFX10_3Insts, FeatureGFX940Insts)>;
+def HasFlatScratchSVSMode : Predicate<"Subtarget->hasFlatScratchSVSMode()">,
+ AssemblerPredicate<(any_of FeatureGFX940Insts, FeatureGFX11Insts)>;
def HasGFX10_AEncoding : Predicate<"Subtarget->hasGFX10_AEncoding()">,
AssemblerPredicate<(all_of FeatureGFX10_AEncoding)>;
@@ -1354,6 +1578,11 @@ def NotHasAddNoCarryInsts : Predicate<"!Subtarget->hasAddNoCarry()">;
def Has16BitInsts : Predicate<"Subtarget->has16BitInsts()">,
AssemblerPredicate<(all_of Feature16BitInsts)>;
+
+def HasTrue16BitInsts : Predicate<"Subtarget->hasTrue16BitInsts()">,
+ AssemblerPredicate<(all_of FeatureTrue16BitInsts)>;
+def NotHasTrue16BitInsts : Predicate<"!Subtarget->hasTrue16BitInsts()">;
+
def HasVOP3PInsts : Predicate<"Subtarget->hasVOP3PInsts()">,
AssemblerPredicate<(all_of FeatureVOP3P)>;
@@ -1385,7 +1614,10 @@ def HasPackedFP32Ops : Predicate<"Subtarget->hasPackedFP32Ops()">,
def HasFmaakFmamkF32Insts :
Predicate<"Subtarget->hasFmaakFmamkF32Insts()">,
- AssemblerPredicate<(any_of FeatureGFX10Insts)>;
+ AssemblerPredicate<(any_of FeatureGFX10Insts, FeatureGFX940Insts)>;
+
+def HasImageInsts : Predicate<"Subtarget->hasImageInsts()">,
+ AssemblerPredicate<(all_of FeatureImageInsts)>;
def HasExtendedImageInsts : Predicate<"Subtarget->hasExtendedImageInsts()">,
AssemblerPredicate<(all_of FeatureExtendedImageInsts)>;
@@ -1454,6 +1686,9 @@ def HasDot6Insts : Predicate<"Subtarget->hasDot6Insts()">,
def HasDot7Insts : Predicate<"Subtarget->hasDot7Insts()">,
AssemblerPredicate<(all_of FeatureDot7Insts)>;
+def HasDot8Insts : Predicate<"Subtarget->hasDot8Insts()">,
+ AssemblerPredicate<(all_of FeatureDot8Insts)>;
+
def HasGetWaveIdInst : Predicate<"Subtarget->hasGetWaveIdInst()">,
AssemblerPredicate<(all_of FeatureGetWaveIdInst)>;
@@ -1478,8 +1713,13 @@ def HasMadMacF32Insts : Predicate<"Subtarget->hasMadMacF32Insts()">,
def HasFmaLegacy32 : Predicate<"Subtarget->hasGFX10_3Insts()">,
AssemblerPredicate<(any_of FeatureGFX10_3Insts)>;
-def HasAtomicFaddInsts : Predicate<"Subtarget->hasAtomicFaddInsts()">,
- AssemblerPredicate<(all_of FeatureAtomicFaddInsts)>;
+def HasAtomicFaddRtnInsts : Predicate<"Subtarget->hasAtomicFaddRtnInsts()">,
+ AssemblerPredicate<(all_of FeatureAtomicFaddRtnInsts)>;
+def HasAtomicFaddNoRtnInsts : Predicate<"Subtarget->hasAtomicFaddNoRtnInsts()">,
+ AssemblerPredicate<(all_of FeatureAtomicFaddNoRtnInsts)>;
+def HasAtomicPkFaddNoRtnInsts
+ : Predicate<"Subtarget->hasAtomicPkFaddNoRtnInsts()">,
+ AssemblerPredicate<(all_of FeatureAtomicPkFaddNoRtnInsts)>;
def HasDsSrc2Insts : Predicate<"!Subtarget->hasDsSrc2Insts()">,
AssemblerPredicate<(all_of FeatureDsSrc2Insts)>;
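As context for the feature/predicate plumbing in the hunks above: each SubtargetFeature lowers to a boolean member of the subtarget class generated from this .td file, and the Predicate<> strings are C++ expressions evaluated against that subtarget during instruction selection. A minimal sketch of the assumed shape (the class and member names here are illustrative stand-ins, not copied from GCNSubtarget.h):

```cpp
// Sketch only: how a feature such as "+gfx940-insts" or "+true16" becomes a
// query that the predicates above ("Subtarget->hasGFX940Insts()", etc.) call.
class GCNSubtargetSketch {
  bool GFX940Insts = false;       // set when the feature string enables gfx940-insts
  bool HasTrue16BitInsts = false; // set when the feature string enables true16

public:
  bool hasGFX940Insts() const { return GFX940Insts; }
  bool hasTrue16BitInsts() const { return HasTrue16BitInsts; }
};
```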
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp
index bebf032b5535..74be0336851c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp
@@ -14,12 +14,11 @@
#include "AMDGPU.h"
#include "Utils/AMDGPUBaseInfo.h"
-#include "llvm/ADT/SmallSet.h"
+#include "Utils/AMDGPUMemoryUtils.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/LegacyDivergenceAnalysis.h"
#include "llvm/Analysis/MemorySSA.h"
#include "llvm/IR/InstVisitor.h"
-#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/InitializePasses.h"
#define DEBUG_TYPE "amdgpu-annotate-uniform"
@@ -33,8 +32,18 @@ class AMDGPUAnnotateUniformValues : public FunctionPass,
LegacyDivergenceAnalysis *DA;
MemorySSA *MSSA;
AliasAnalysis *AA;
- DenseMap<Value*, GetElementPtrInst*> noClobberClones;
bool isEntryFunc;
+ bool Changed;
+
+ void setUniformMetadata(Instruction *I) {
+ I->setMetadata("amdgpu.uniform", MDNode::get(I->getContext(), {}));
+ Changed = true;
+ }
+
+ void setNoClobberMetadata(Instruction *I) {
+ I->setMetadata("amdgpu.noclobber", MDNode::get(I->getContext(), {}));
+ Changed = true;
+ }
public:
static char ID;
@@ -54,7 +63,6 @@ public:
void visitBranchInst(BranchInst &I);
void visitLoadInst(LoadInst &I);
- bool isClobberedInFunction(LoadInst * Load);
};
} // End anonymous namespace
@@ -69,88 +77,6 @@ INITIALIZE_PASS_END(AMDGPUAnnotateUniformValues, DEBUG_TYPE,
char AMDGPUAnnotateUniformValues::ID = 0;
-static void setUniformMetadata(Instruction *I) {
- I->setMetadata("amdgpu.uniform", MDNode::get(I->getContext(), {}));
-}
-static void setNoClobberMetadata(Instruction *I) {
- I->setMetadata("amdgpu.noclobber", MDNode::get(I->getContext(), {}));
-}
-
-bool AMDGPUAnnotateUniformValues::isClobberedInFunction(LoadInst *Load) {
- MemorySSAWalker *Walker = MSSA->getWalker();
- SmallVector<MemoryAccess *> WorkList{Walker->getClobberingMemoryAccess(Load)};
- SmallSet<MemoryAccess *, 8> Visited;
- MemoryLocation Loc(MemoryLocation::get(Load));
-
- const auto isReallyAClobber = [this, Load](MemoryDef *Def) -> bool {
- Instruction *DefInst = Def->getMemoryInst();
- LLVM_DEBUG(dbgs() << " Def: " << *DefInst << '\n');
-
- if (isa<FenceInst>(DefInst))
- return false;
-
- if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(DefInst)) {
- switch (II->getIntrinsicID()) {
- case Intrinsic::amdgcn_s_barrier:
- case Intrinsic::amdgcn_wave_barrier:
- return false;
- default:
- break;
- }
- }
-
- // Ignore atomics not aliasing with the original load, any atomic is a
- // universal MemoryDef from MSSA's point of view too, just like a fence.
- const auto checkNoAlias = [this, Load](auto I) -> bool {
- return I && AA->isNoAlias(I->getPointerOperand(),
- Load->getPointerOperand());
- };
-
- if (checkNoAlias(dyn_cast<AtomicCmpXchgInst>(DefInst)) ||
- checkNoAlias(dyn_cast<AtomicRMWInst>(DefInst)))
- return false;
-
- return true;
- };
-
- LLVM_DEBUG(dbgs() << "Checking clobbering of: " << *Load << '\n');
-
- // Start with a nearest dominating clobbering access, it will be either
- // live on entry (nothing to do, load is not clobbered), MemoryDef, or
- // MemoryPhi if several MemoryDefs can define this memory state. In that
- // case add all Defs to WorkList and continue going up and checking all
- // the definitions of this memory location until the root. When all the
- // defs are exhausted and came to the entry state we have no clobber.
- // Along the scan ignore barriers and fences which are considered clobbers
- // by the MemorySSA, but not really writing anything into the memory.
- while (!WorkList.empty()) {
- MemoryAccess *MA = WorkList.pop_back_val();
- if (!Visited.insert(MA).second)
- continue;
-
- if (MSSA->isLiveOnEntryDef(MA))
- continue;
-
- if (MemoryDef *Def = dyn_cast<MemoryDef>(MA)) {
- if (isReallyAClobber(Def)) {
- LLVM_DEBUG(dbgs() << " -> load is clobbered\n");
- return true;
- }
-
- WorkList.push_back(
- Walker->getClobberingMemoryAccess(Def->getDefiningAccess(), Loc));
- continue;
- }
-
- const MemoryPhi *Phi = cast<MemoryPhi>(MA);
- for (auto &Use : Phi->incoming_values())
- WorkList.push_back(cast<MemoryAccess>(&Use));
- }
-
- LLVM_DEBUG(dbgs() << " -> no clobber\n");
- return false;
-}
-
void AMDGPUAnnotateUniformValues::visitBranchInst(BranchInst &I) {
if (DA->isUniform(&I))
setUniformMetadata(&I);
@@ -160,46 +86,18 @@ void AMDGPUAnnotateUniformValues::visitLoadInst(LoadInst &I) {
Value *Ptr = I.getPointerOperand();
if (!DA->isUniform(Ptr))
return;
+ Instruction *PtrI = dyn_cast<Instruction>(Ptr);
+ if (PtrI)
+ setUniformMetadata(PtrI);
+
// We're tracking up to the Function boundaries, and cannot go beyond because
// of FunctionPass restrictions. We can ensure that memory is not clobbered
// for memory operations that are live in to entry points only.
- Instruction *PtrI = dyn_cast<Instruction>(Ptr);
-
- if (!isEntryFunc) {
- if (PtrI)
- setUniformMetadata(PtrI);
+ if (!isEntryFunc)
return;
- }
-
- bool NotClobbered = false;
bool GlobalLoad = I.getPointerAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS;
- if (PtrI)
- NotClobbered = GlobalLoad && !isClobberedInFunction(&I);
- else if (isa<Argument>(Ptr) || isa<GlobalValue>(Ptr)) {
- if (GlobalLoad && !isClobberedInFunction(&I)) {
- NotClobbered = true;
- // Lookup for the existing GEP
- if (noClobberClones.count(Ptr)) {
- PtrI = noClobberClones[Ptr];
- } else {
- // Create GEP of the Value
- Function *F = I.getParent()->getParent();
- Value *Idx = Constant::getIntegerValue(
- Type::getInt32Ty(Ptr->getContext()), APInt(64, 0));
- // Insert GEP at the entry to make it dominate all uses
- PtrI = GetElementPtrInst::Create(I.getType(), Ptr,
- ArrayRef<Value *>(Idx), Twine(""),
- F->getEntryBlock().getFirstNonPHI());
- }
- I.replaceUsesOfWith(Ptr, PtrI);
- }
- }
-
- if (PtrI) {
- setUniformMetadata(PtrI);
- if (NotClobbered)
- setNoClobberMetadata(PtrI);
- }
+ if (GlobalLoad && !AMDGPU::isClobberedInFunction(&I, MSSA, AA))
+ setNoClobberMetadata(&I);
}
bool AMDGPUAnnotateUniformValues::doInitialization(Module &M) {
@@ -215,9 +113,9 @@ bool AMDGPUAnnotateUniformValues::runOnFunction(Function &F) {
AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
isEntryFunc = AMDGPU::isEntryFunctionCC(F.getCallingConv());
+ Changed = false;
visit(F);
- noClobberClones.clear();
- return true;
+ return Changed;
}
FunctionPass *
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
index 6e2984f2a04f..57a4660bc1eb 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -27,6 +27,8 @@
#include "SIMachineFunctionInfo.h"
#include "TargetInfo/AMDGPUTargetInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
+#include "llvm/BinaryFormat/ELF.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/MC/MCAssembler.h"
#include "llvm/MC/MCContext.h"
@@ -34,6 +36,7 @@
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/TargetRegistry.h"
#include "llvm/Support/AMDHSAKernelDescriptor.h"
+#include "llvm/Support/TargetParser.h"
#include "llvm/Target/TargetLoweringObjectFile.h"
#include "llvm/Target/TargetMachine.h"
@@ -111,6 +114,12 @@ AMDGPUTargetStreamer* AMDGPUAsmPrinter::getTargetStreamer() const {
}
void AMDGPUAsmPrinter::emitStartOfAsmFile(Module &M) {
+ IsTargetStreamerInitialized = false;
+}
+
+void AMDGPUAsmPrinter::initTargetStreamer(Module &M) {
+ IsTargetStreamerInitialized = true;
+
// TODO: Which one is called first, emitStartOfAsmFile or
// emitFunctionBodyStart?
if (getTargetStreamer() && !getTargetStreamer()->getTargetID())
@@ -143,6 +152,10 @@ void AMDGPUAsmPrinter::emitStartOfAsmFile(Module &M) {
}
void AMDGPUAsmPrinter::emitEndOfAsmFile(Module &M) {
+ // Init target streamer if it has not yet happened
+ if (!IsTargetStreamerInitialized)
+ initTargetStreamer(M);
+
// Following code requires TargetStreamer to be present.
if (!getTargetStreamer())
return;
@@ -234,8 +247,8 @@ void AMDGPUAsmPrinter::emitFunctionBodyEnd() {
auto &ObjectFileInfo = *Context.getObjectFileInfo();
auto &ReadOnlySection = *ObjectFileInfo.getReadOnlySection();
- Streamer.PushSection();
- Streamer.SwitchSection(&ReadOnlySection);
+ Streamer.pushSection();
+ Streamer.switchSection(&ReadOnlySection);
// CP microcode requires the kernel descriptor to be allocated on 64 byte
// alignment.
@@ -256,7 +269,7 @@ void AMDGPUAsmPrinter::emitFunctionBodyEnd() {
CurrentProgramInfo.FlatUsed),
CurrentProgramInfo.VCCUsed, CurrentProgramInfo.FlatUsed);
- Streamer.PopSection();
+ Streamer.popSection();
}
void AMDGPUAsmPrinter::emitFunctionEntryLabel() {
@@ -319,7 +332,7 @@ void AMDGPUAsmPrinter::emitGlobalVariable(const GlobalVariable *GV) {
const DataLayout &DL = GV->getParent()->getDataLayout();
uint64_t Size = DL.getTypeAllocSize(GV->getValueType());
- Align Alignment = GV->getAlign().getValueOr(Align(4));
+ Align Alignment = GV->getAlign().value_or(Align(4));
emitVisibility(GVSym, GV->getVisibility(), !GV->isDeclaration());
emitLinkage(GV, GVSym);
@@ -339,7 +352,7 @@ bool AMDGPUAsmPrinter::doFinalization(Module &M) {
if ((AMDGPU::isGFX10Plus(STI) || AMDGPU::isGFX90A(STI)) &&
(STI.getTargetTriple().getOS() == Triple::AMDHSA ||
STI.getTargetTriple().getOS() == Triple::AMDPAL)) {
- OutStreamer->SwitchSection(getObjFileLowering().getTextSection());
+ OutStreamer->switchSection(getObjFileLowering().getTextSection());
getTargetStreamer()->EmitCodeEnd(STI);
}
@@ -381,7 +394,7 @@ uint16_t AMDGPUAsmPrinter::getAmdhsaKernelCodeProperties(
KernelCodeProperties |=
amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR;
}
- if (MFI.hasQueuePtr()) {
+ if (MFI.hasQueuePtr() && AMDGPU::getAmdhsaCodeObjectVersion() < 5) {
KernelCodeProperties |=
amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR;
}
@@ -437,6 +450,11 @@ amdhsa::kernel_descriptor_t AMDGPUAsmPrinter::getAmdhsaKernelDescriptor(
}
bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
+ // Init target streamer lazily on the first function so that previous passes
+ // can set metadata.
+ if (!IsTargetStreamerInitialized)
+ initTargetStreamer(*MF.getFunction().getParent());
+
ResourceUsage = &getAnalysis<AMDGPUResourceUsageAnalysis>();
CurrentProgramInfo = SIProgramInfo();
@@ -454,7 +472,7 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
if (!STM.isAmdHsaOS() && !STM.isAmdPalOS()) {
MCSectionELF *ConfigSection =
Context.getELFSection(".AMDGPU.config", ELF::SHT_PROGBITS, 0);
- OutStreamer->SwitchSection(ConfigSection);
+ OutStreamer->switchSection(ConfigSection);
}
if (MFI->isModuleEntryFunction()) {
@@ -491,7 +509,7 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
if (isVerbose()) {
MCSectionELF *CommentSection =
Context.getELFSection(".AMDGPU.csdata", ELF::SHT_PROGBITS, 0);
- OutStreamer->SwitchSection(CommentSection);
+ OutStreamer->switchSection(CommentSection);
if (!MFI->isEntryFunction()) {
OutStreamer->emitRawComment(" Function info:", false);
@@ -590,7 +608,7 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
if (DumpCodeInstEmitter) {
- OutStreamer->SwitchSection(
+ OutStreamer->switchSection(
Context.getELFSection(".AMDGPU.disasm", ELF::SHT_PROGBITS, 0));
for (size_t i = 0; i < DisasmLines.size(); ++i) {
@@ -677,7 +695,7 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
ProgInfo.DynamicCallStack = Info.HasDynamicallySizedStack || Info.HasRecursion;
const uint64_t MaxScratchPerWorkitem =
- GCNSubtarget::MaxWaveScratchSize / STM.getWavefrontSize();
+ STM.getMaxWaveScratchSize() / STM.getWavefrontSize();
if (ProgInfo.ScratchSize > MaxScratchPerWorkitem) {
DiagnosticInfoStackSize DiagStackSize(MF.getFunction(),
ProgInfo.ScratchSize,
@@ -857,22 +875,18 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
LDSAlignShift = 9;
}
- unsigned LDSSpillSize =
- MFI->getLDSWaveSpillSize() * MFI->getMaxFlatWorkGroupSize();
-
- ProgInfo.LDSSize = MFI->getLDSSize() + LDSSpillSize;
+ ProgInfo.LDSSize = MFI->getLDSSize();
ProgInfo.LDSBlocks =
alignTo(ProgInfo.LDSSize, 1ULL << LDSAlignShift) >> LDSAlignShift;
- // Scratch is allocated in 256 dword blocks.
- unsigned ScratchAlignShift = 10;
+ // Scratch is allocated in 64-dword or 256-dword blocks.
+ unsigned ScratchAlignShift =
+ STM.getGeneration() >= AMDGPUSubtarget::GFX11 ? 8 : 10;
// We need to program the hardware with the amount of scratch memory that
// is used by the entire wave. ProgInfo.ScratchSize is the amount of
// scratch memory used per thread.
- ProgInfo.ScratchBlocks =
- alignTo(ProgInfo.ScratchSize * STM.getWavefrontSize(),
- 1ULL << ScratchAlignShift) >>
- ScratchAlignShift;
+ ProgInfo.ScratchBlocks = divideCeil(
+ ProgInfo.ScratchSize * STM.getWavefrontSize(), 1ULL << ScratchAlignShift);
if (getIsaVersion(getGlobalSTI()->getCPU()).Major >= 10) {
ProgInfo.WgpMode = STM.isCuModeEnabled() ? 0 : 1;
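A quick worked sketch of the TMPRING_SIZE math in the hunk above (the helper name, parameters, and constants below are illustrative, derived from the shift values in this patch; no such helper exists in the tree): per-wave scratch is rounded up to whole hardware blocks, 256 dwords (1024 bytes) before GFX11 and 64 dwords (256 bytes) on GFX11+.

```cpp
#include <cstdint>

// Hypothetical helper mirroring the divideCeil() call above.
// Pre-GFX11: ScratchAlignShift = 10 -> 1024-byte (256-dword) blocks.
// GFX11+:    ScratchAlignShift = 8  ->  256-byte (64-dword) blocks.
unsigned computeScratchBlocks(uint64_t ScratchBytesPerThread,
                              unsigned WavefrontSize, bool IsGFX11Plus) {
  const uint64_t BlockSize = IsGFX11Plus ? 256 : 1024;
  const uint64_t PerWaveBytes = ScratchBytesPerThread * WavefrontSize;
  return static_cast<unsigned>((PerWaveBytes + BlockSize - 1) / BlockSize);
}

// Example: 100 bytes of scratch per thread in a wave64 kernel is 6400 bytes
// per wave -> 7 blocks before GFX11 (7 * 1024 >= 6400), 25 blocks on GFX11.
```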
@@ -886,8 +900,14 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
else if (MFI->hasWorkItemIDY())
TIDIGCompCnt = 1;
+ // The private segment wave byte offset is the last of the system SGPRs. We
+ // initially assumed it was allocated, and may have used it. It shouldn't harm
+ // anything to disable it if we know the stack isn't used here. We may still
+ // have emitted code reading it to initialize scratch, but if that's unused
+ // reading garbage should be OK.
+ const bool EnablePrivateSegment = ProgInfo.ScratchBlocks > 0;
ProgInfo.ComputePGMRSrc2 =
- S_00B84C_SCRATCH_EN(ProgInfo.ScratchBlocks > 0) |
+ S_00B84C_SCRATCH_EN(EnablePrivateSegment) |
S_00B84C_USER_SGPR(MFI->getNumUserSGPRs()) |
// For AMDHSA, TRAP_HANDLER must be zero, as it is populated by the CP.
S_00B84C_TRAP_HANDLER(STM.isAmdHsaOS() ? 0 : STM.isTrapHandlerEnabled()) |
@@ -931,6 +951,7 @@ static unsigned getRsrcReg(CallingConv::ID CallConv) {
void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF,
const SIProgramInfo &CurrentProgramInfo) {
const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+ const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
unsigned RsrcReg = getRsrcReg(MF.getFunction().getCallingConv());
if (AMDGPU::isCompute(MF.getFunction().getCallingConv())) {
@@ -942,7 +963,10 @@ void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF,
OutStreamer->emitInt32(CurrentProgramInfo.ComputePGMRSrc2);
OutStreamer->emitInt32(R_00B860_COMPUTE_TMPRING_SIZE);
- OutStreamer->emitInt32(S_00B860_WAVESIZE(CurrentProgramInfo.ScratchBlocks));
+ OutStreamer->emitInt32(
+ STM.getGeneration() >= AMDGPUSubtarget::GFX11
+ ? S_00B860_WAVESIZE_GFX11Plus(CurrentProgramInfo.ScratchBlocks)
+ : S_00B860_WAVESIZE_PreGFX11(CurrentProgramInfo.ScratchBlocks));
// TODO: Should probably note flat usage somewhere. SC emits a "FlatPtr32 =
// 0" comment but I don't see a corresponding field in the register spec.
@@ -951,14 +975,18 @@ void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF,
OutStreamer->emitIntValue(S_00B028_VGPRS(CurrentProgramInfo.VGPRBlocks) |
S_00B028_SGPRS(CurrentProgramInfo.SGPRBlocks), 4);
OutStreamer->emitInt32(R_0286E8_SPI_TMPRING_SIZE);
- OutStreamer->emitIntValue(
- S_0286E8_WAVESIZE(CurrentProgramInfo.ScratchBlocks), 4);
+ OutStreamer->emitInt32(
+ STM.getGeneration() >= AMDGPUSubtarget::GFX11
+ ? S_0286E8_WAVESIZE_GFX11Plus(CurrentProgramInfo.ScratchBlocks)
+ : S_0286E8_WAVESIZE_PreGFX11(CurrentProgramInfo.ScratchBlocks));
}
if (MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS) {
OutStreamer->emitInt32(R_00B02C_SPI_SHADER_PGM_RSRC2_PS);
- OutStreamer->emitInt32(
- S_00B02C_EXTRA_LDS_SIZE(CurrentProgramInfo.LDSBlocks));
+ unsigned ExtraLDSSize = STM.getGeneration() >= AMDGPUSubtarget::GFX11
+ ? divideCeil(CurrentProgramInfo.LDSBlocks, 2)
+ : CurrentProgramInfo.LDSBlocks;
+ OutStreamer->emitInt32(S_00B02C_EXTRA_LDS_SIZE(ExtraLDSSize));
OutStreamer->emitInt32(R_0286CC_SPI_PS_INPUT_ENA);
OutStreamer->emitInt32(MFI->getPSInputEnable());
OutStreamer->emitInt32(R_0286D0_SPI_PS_INPUT_ADDR);
@@ -984,6 +1012,13 @@ void AMDGPUAsmPrinter::EmitPALMetadata(const MachineFunction &MF,
MD->setEntryPoint(CC, MF.getFunction().getName());
MD->setNumUsedVgprs(CC, CurrentProgramInfo.NumVGPRsForWavesPerEU);
+
+ // Only set AGPRs for supported devices
+ const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
+ if (STM.hasMAIInsts()) {
+ MD->setNumUsedAgprs(CC, CurrentProgramInfo.NumAccVGPR);
+ }
+
MD->setNumUsedSgprs(CC, CurrentProgramInfo.NumSGPRsForWavesPerEU);
MD->setRsrc1(CC, CurrentProgramInfo.getPGMRSrc1(CC));
if (AMDGPU::isCompute(CC)) {
@@ -995,12 +1030,14 @@ void AMDGPUAsmPrinter::EmitPALMetadata(const MachineFunction &MF,
// ScratchSize is in bytes, 16 aligned.
MD->setScratchSize(CC, alignTo(CurrentProgramInfo.ScratchSize, 16));
if (MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS) {
- MD->setRsrc2(CC, S_00B02C_EXTRA_LDS_SIZE(CurrentProgramInfo.LDSBlocks));
+ unsigned ExtraLDSSize = STM.getGeneration() >= AMDGPUSubtarget::GFX11
+ ? divideCeil(CurrentProgramInfo.LDSBlocks, 2)
+ : CurrentProgramInfo.LDSBlocks;
+ MD->setRsrc2(CC, S_00B02C_EXTRA_LDS_SIZE(ExtraLDSSize));
MD->setSpiPsInputEna(MFI->getPSInputEnable());
MD->setSpiPsInputAddr(MFI->getPSInputAddr());
}
- const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
if (STM.isWave32())
MD->setWave32(MF.getFunction().getCallingConv());
}
@@ -1067,7 +1104,7 @@ void AMDGPUAsmPrinter::getAmdKernelCode(amd_kernel_code_t &Out,
if (MFI->hasDispatchPtr())
Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR;
- if (MFI->hasQueuePtr())
+ if (MFI->hasQueuePtr() && AMDGPU::getAmdhsaCodeObjectVersion() < 5)
Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR;
if (MFI->hasKernargSegmentPtr())
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
index d5c60aa3be7d..ddda2cf107b1 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
@@ -77,6 +77,8 @@ private:
const MachineFunction &MF,
const SIProgramInfo &PI) const;
+ void initTargetStreamer(Module &M);
+
public:
explicit AMDGPUAsmPrinter(TargetMachine &TM,
std::unique_ptr<MCStreamer> Streamer);
@@ -132,6 +134,7 @@ protected:
std::vector<std::string> DisasmLines, HexLines;
size_t DisasmLineMaxLen;
+ bool IsTargetStreamerInitialized;
};
} // end namespace llvm
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
index 1e2cf3890d0a..3ccfd9dde269 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
@@ -311,6 +311,12 @@ Value *AMDGPUAtomicOptimizer::buildReduction(IRBuilder<> &B,
if (ST->isWave32())
return V;
+ if (ST->hasPermLane64()) {
+ // Reduce across the upper and lower 32 lanes.
+ return buildNonAtomicBinOp(
+ B, Op, V, B.CreateIntrinsic(Intrinsic::amdgcn_permlane64, {}, V));
+ }
+
// Pick an arbitrary lane from 0..31 and an arbitrary lane from 32..63 and
// combine them with a scalar operation.
Function *ReadLane =
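A conceptual model of the new permlane64 fast path above (plain C++, not compiler code; the function name is made up): the instruction swaps values between the lower and upper 32 lanes of a wave64, so combining each lane's value with its swapped counterpart folds a 64-lane reduction into a 32-lane one in a single step.

```cpp
#include <array>
#include <cstdint>

// Illustrative lane-shuffle model: every lane receives the value held by the
// lane 32 apart, i.e. the lower and upper halves of the wave swap values.
std::array<uint32_t, 64> permlane64(const std::array<uint32_t, 64> &V) {
  std::array<uint32_t, 64> Out{};
  for (unsigned Lane = 0; Lane < 64; ++Lane)
    Out[Lane] = V[(Lane + 32) % 64];
  return Out;
}
```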
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAttributes.def b/llvm/lib/Target/AMDGPU/AMDGPUAttributes.def
new file mode 100644
index 000000000000..0a2cf3874245
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAttributes.def
@@ -0,0 +1,31 @@
+//===--- AMDGPUAttributes.def ---------------------------------*- C++ -*---===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains descriptions of the various function attributes
+// that indicate *absence* of the corresponding implicit kernel
+// arguments.
+//
+//===----------------------------------------------------------------------===//
+
+// NOTE: NO INCLUDE GUARD DESIRED!
+
+AMDGPU_ATTRIBUTE(DISPATCH_PTR, "amdgpu-no-dispatch-ptr")
+AMDGPU_ATTRIBUTE(QUEUE_PTR, "amdgpu-no-queue-ptr")
+AMDGPU_ATTRIBUTE(DISPATCH_ID, "amdgpu-no-dispatch-id")
+AMDGPU_ATTRIBUTE(IMPLICIT_ARG_PTR, "amdgpu-no-implicitarg-ptr")
+AMDGPU_ATTRIBUTE(MULTIGRID_SYNC_ARG, "amdgpu-no-multigrid-sync-arg")
+AMDGPU_ATTRIBUTE(HOSTCALL_PTR, "amdgpu-no-hostcall-ptr")
+AMDGPU_ATTRIBUTE(HEAP_PTR, "amdgpu-no-heap-ptr")
+AMDGPU_ATTRIBUTE(WORKGROUP_ID_X, "amdgpu-no-workgroup-id-x")
+AMDGPU_ATTRIBUTE(WORKGROUP_ID_Y, "amdgpu-no-workgroup-id-y")
+AMDGPU_ATTRIBUTE(WORKGROUP_ID_Z, "amdgpu-no-workgroup-id-z")
+AMDGPU_ATTRIBUTE(WORKITEM_ID_X, "amdgpu-no-workitem-id-x")
+AMDGPU_ATTRIBUTE(WORKITEM_ID_Y, "amdgpu-no-workitem-id-y")
+AMDGPU_ATTRIBUTE(WORKITEM_ID_Z, "amdgpu-no-workitem-id-z")
+
+#undef AMDGPU_ATTRIBUTE
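For readers unfamiliar with the .def "X-macro" idiom used by this new file, a minimal sketch of how a consumer expands it (this is the pattern the AMDGPUAttributor.cpp hunks below follow; the table name AttrTable here is made up for illustration):

```cpp
#include <utility>

// First expansion: one enumerator per .def entry, used as a bit position.
#define AMDGPU_ATTRIBUTE(Name, Str) Name##_POS,
enum ImplicitArgumentPositions {
#include "AMDGPUAttributes.def"
  LAST_ARG_POS
};

// Second expansion: a parallel mask/string table. The .def file #undefs the
// macro at the end, so each consumer redefines it before including again.
#define AMDGPU_ATTRIBUTE(Name, Str) {1u << Name##_POS, Str},
static constexpr std::pair<unsigned, const char *> AttrTable[] = {
#include "AMDGPUAttributes.def"
};
```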
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
index b4ebc7d7d75f..8de0d7e6bff1 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
@@ -12,6 +12,7 @@
#include "AMDGPU.h"
#include "GCNSubtarget.h"
+#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"
@@ -22,37 +23,25 @@
using namespace llvm;
-enum ImplicitArgumentMask {
- NOT_IMPLICIT_INPUT = 0,
+#define AMDGPU_ATTRIBUTE(Name, Str) Name##_POS,
+
+enum ImplicitArgumentPositions {
+ #include "AMDGPUAttributes.def"
+ LAST_ARG_POS
+};
- // SGPRs
- DISPATCH_PTR = 1 << 0,
- QUEUE_PTR = 1 << 1,
- DISPATCH_ID = 1 << 2,
- IMPLICIT_ARG_PTR = 1 << 3,
- WORKGROUP_ID_X = 1 << 4,
- WORKGROUP_ID_Y = 1 << 5,
- WORKGROUP_ID_Z = 1 << 6,
+#define AMDGPU_ATTRIBUTE(Name, Str) Name = 1 << Name##_POS,
- // VGPRS:
- WORKITEM_ID_X = 1 << 7,
- WORKITEM_ID_Y = 1 << 8,
- WORKITEM_ID_Z = 1 << 9,
- ALL_ARGUMENT_MASK = (1 << 10) - 1
+enum ImplicitArgumentMask {
+ NOT_IMPLICIT_INPUT = 0,
+ #include "AMDGPUAttributes.def"
+ ALL_ARGUMENT_MASK = (1 << LAST_ARG_POS) - 1
};
+#define AMDGPU_ATTRIBUTE(Name, Str) {Name, Str},
static constexpr std::pair<ImplicitArgumentMask,
StringLiteral> ImplicitAttrs[] = {
- {DISPATCH_PTR, "amdgpu-no-dispatch-ptr"},
- {QUEUE_PTR, "amdgpu-no-queue-ptr"},
- {DISPATCH_ID, "amdgpu-no-dispatch-id"},
- {IMPLICIT_ARG_PTR, "amdgpu-no-implicitarg-ptr"},
- {WORKGROUP_ID_X, "amdgpu-no-workgroup-id-x"},
- {WORKGROUP_ID_Y, "amdgpu-no-workgroup-id-y"},
- {WORKGROUP_ID_Z, "amdgpu-no-workgroup-id-z"},
- {WORKITEM_ID_X, "amdgpu-no-workitem-id-x"},
- {WORKITEM_ID_Y, "amdgpu-no-workitem-id-y"},
- {WORKITEM_ID_Z, "amdgpu-no-workitem-id-z"}
+ #include "AMDGPUAttributes.def"
};
// We do not need to note the x workitem or workgroup id because they are always
@@ -61,7 +50,9 @@ static constexpr std::pair<ImplicitArgumentMask,
// TODO: We should not add the attributes if the known compile time workgroup
// size is 1 for y/z.
static ImplicitArgumentMask
-intrinsicToAttrMask(Intrinsic::ID ID, bool &NonKernelOnly, bool &IsQueuePtr) {
+intrinsicToAttrMask(Intrinsic::ID ID, bool &NonKernelOnly, bool &NeedsImplicit,
+ bool HasApertureRegs, bool SupportsGetDoorBellID) {
+ unsigned CodeObjectVersion = AMDGPU::getAmdhsaCodeObjectVersion();
switch (ID) {
case Intrinsic::amdgcn_workitem_id_x:
NonKernelOnly = true;
@@ -87,13 +78,23 @@ intrinsicToAttrMask(Intrinsic::ID ID, bool &NonKernelOnly, bool &IsQueuePtr) {
return DISPATCH_ID;
case Intrinsic::amdgcn_implicitarg_ptr:
return IMPLICIT_ARG_PTR;
+ // Need queue_ptr anyway. But under V5, we also need implicitarg_ptr to access
+ // queue_ptr.
case Intrinsic::amdgcn_queue_ptr:
+ NeedsImplicit = (CodeObjectVersion == 5);
+ return QUEUE_PTR;
case Intrinsic::amdgcn_is_shared:
case Intrinsic::amdgcn_is_private:
- // TODO: Does not require queue ptr on gfx9+
+ if (HasApertureRegs)
+ return NOT_IMPLICIT_INPUT;
+ // Under V5, we need implicitarg_ptr + offsets to access private_base or
+ // shared_base. For pre-V5, however, we need to access them through queue_ptr +
+ // offsets.
+ return CodeObjectVersion == 5 ? IMPLICIT_ARG_PTR : QUEUE_PTR;
case Intrinsic::trap:
- case Intrinsic::debugtrap:
- IsQueuePtr = true;
+ if (SupportsGetDoorBellID) // GetDoorbellID support implemented since V4.
+ return CodeObjectVersion >= 4 ? NOT_IMPLICIT_INPUT : QUEUE_PTR;
+ NeedsImplicit = (CodeObjectVersion == 5); // Need implicitarg_ptr under V5.
return QUEUE_PTR;
default:
return NOT_IMPLICIT_INPUT;
@@ -114,7 +115,7 @@ static bool isDSAddress(const Constant *C) {
/// Returns true if the function requires the implicit argument be passed
/// regardless of the function contents.
-static bool funcRequiresImplicitArgPtr(const Function &F) {
+static bool funcRequiresHostcallPtr(const Function &F) {
// Sanitizers require the hostcall buffer passed in the implicit arguments.
return F.hasFnAttribute(Attribute::SanitizeAddress) ||
F.hasFnAttribute(Attribute::SanitizeThread) ||
@@ -140,6 +141,12 @@ public:
return ST.hasApertureRegs();
}
+ /// Check if the subtarget supports GetDoorbellID.
+ bool supportsGetDoorbellID(Function &F) {
+ const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
+ return ST.supportsGetDoorbellID();
+ }
+
std::pair<unsigned, unsigned> getFlatWorkGroupSizes(const Function &F) {
const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
return ST.getFlatWorkGroupSizes(F);
@@ -152,7 +159,7 @@ public:
}
private:
- /// Check if the ConstantExpr \p CE requires queue ptr attribute.
+ /// Check if the ConstantExpr \p CE requires the queue pointer.
static bool visitConstExpr(const ConstantExpr *CE) {
if (CE->getOpcode() == Instruction::AddrSpaceCast) {
unsigned SrcAS = CE->getOperand(0)->getType()->getPointerAddressSpace();
@@ -186,7 +193,7 @@ private:
}
public:
- /// Returns true if \p Fn needs a queue ptr attribute because of \p C.
+ /// Returns true if \p Fn needs the queue pointer because of \p C.
bool needsQueuePtr(const Constant *C, Function &Fn) {
bool IsNonEntryFunc = !AMDGPU::isEntryFunctionCC(Fn.getCallingConv());
bool HasAperture = hasApertureRegs(Fn);
@@ -205,7 +212,7 @@ public:
}
private:
- /// Used to determine if the Constant needs a queue ptr attribute.
+ /// Used to determine if the Constant needs the queue pointer.
DenseMap<const Constant *, uint8_t> ConstantStatus;
};
@@ -353,12 +360,15 @@ struct AAAMDAttributesFunction : public AAAMDAttributes {
// If the function requires the implicit arg pointer due to sanitizers,
// assume it's needed even if explicitly marked as not requiring it.
- const bool NeedsImplicit = funcRequiresImplicitArgPtr(*F);
- if (NeedsImplicit)
+ const bool NeedsHostcall = funcRequiresHostcallPtr(*F);
+ if (NeedsHostcall) {
removeAssumedBits(IMPLICIT_ARG_PTR);
+ removeAssumedBits(HOSTCALL_PTR);
+ }
for (auto Attr : ImplicitAttrs) {
- if (NeedsImplicit && Attr.first == IMPLICIT_ARG_PTR)
+ if (NeedsHostcall &&
+ (Attr.first == IMPLICIT_ARG_PTR || Attr.first == HOSTCALL_PTR))
continue;
if (F->hasFnAttribute(Attr.second))
@@ -388,9 +398,11 @@ struct AAAMDAttributesFunction : public AAAMDAttributes {
return indicatePessimisticFixpoint();
bool IsNonEntryFunc = !AMDGPU::isEntryFunctionCC(F->getCallingConv());
- auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());
- bool NeedsQueuePtr = false;
+ bool NeedsImplicit = false;
+ auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());
+ bool HasApertureRegs = InfoCache.hasApertureRegs(*F);
+ bool SupportsGetDoorbellID = InfoCache.supportsGetDoorbellID(*F);
for (Function *Callee : AAEdges.getOptimisticEdges()) {
Intrinsic::ID IID = Callee->getIntrinsicID();
@@ -403,20 +415,87 @@ struct AAAMDAttributesFunction : public AAAMDAttributes {
bool NonKernelOnly = false;
ImplicitArgumentMask AttrMask =
- intrinsicToAttrMask(IID, NonKernelOnly, NeedsQueuePtr);
+ intrinsicToAttrMask(IID, NonKernelOnly, NeedsImplicit,
+ HasApertureRegs, SupportsGetDoorbellID);
if (AttrMask != NOT_IMPLICIT_INPUT) {
if ((IsNonEntryFunc || !NonKernelOnly))
removeAssumedBits(AttrMask);
}
}
- // If we found that we need amdgpu-queue-ptr, nothing else to do.
- if (NeedsQueuePtr) {
+ // Need implicitarg_ptr to access queue_ptr, private_base, and shared_base.
+ if (NeedsImplicit)
+ removeAssumedBits(IMPLICIT_ARG_PTR);
+
+ if (isAssumed(QUEUE_PTR) && checkForQueuePtr(A)) {
+ // Under V5, we need implicitarg_ptr + offsets to access private_base or
+ // shared_base. We do not actually need queue_ptr.
+ if (AMDGPU::getAmdhsaCodeObjectVersion() == 5)
+ removeAssumedBits(IMPLICIT_ARG_PTR);
+ else
+ removeAssumedBits(QUEUE_PTR);
+ }
+
+ if (funcRetrievesMultigridSyncArg(A)) {
+ assert(!isAssumed(IMPLICIT_ARG_PTR) &&
+ "multigrid_sync_arg needs implicitarg_ptr");
+ removeAssumedBits(MULTIGRID_SYNC_ARG);
+ }
+
+ if (funcRetrievesHostcallPtr(A)) {
+ assert(!isAssumed(IMPLICIT_ARG_PTR) && "hostcall needs implicitarg_ptr");
+ removeAssumedBits(HOSTCALL_PTR);
+ }
+
+ if (funcRetrievesHeapPtr(A)) {
+ assert(!isAssumed(IMPLICIT_ARG_PTR) && "heap_ptr needs implicitarg_ptr");
+ removeAssumedBits(HEAP_PTR);
+ }
+
+ if (isAssumed(QUEUE_PTR) && funcRetrievesQueuePtr(A)) {
+ assert(!isAssumed(IMPLICIT_ARG_PTR) && "queue_ptr needs implicitarg_ptr");
removeAssumedBits(QUEUE_PTR);
- return getAssumed() != OrigAssumed ? ChangeStatus::CHANGED :
- ChangeStatus::UNCHANGED;
}
+ return getAssumed() != OrigAssumed ? ChangeStatus::CHANGED
+ : ChangeStatus::UNCHANGED;
+ }
+
+ ChangeStatus manifest(Attributor &A) override {
+ SmallVector<Attribute, 8> AttrList;
+ LLVMContext &Ctx = getAssociatedFunction()->getContext();
+
+ for (auto Attr : ImplicitAttrs) {
+ if (isKnown(Attr.first))
+ AttrList.push_back(Attribute::get(Ctx, Attr.second));
+ }
+
+ return IRAttributeManifest::manifestAttrs(A, getIRPosition(), AttrList,
+ /* ForceReplace */ true);
+ }
+
+ const std::string getAsStr() const override {
+ std::string Str;
+ raw_string_ostream OS(Str);
+ OS << "AMDInfo[";
+ for (auto Attr : ImplicitAttrs)
+ OS << ' ' << Attr.second;
+ OS << " ]";
+ return OS.str();
+ }
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override {}
+
+private:
+ bool checkForQueuePtr(Attributor &A) {
+ Function *F = getAssociatedFunction();
+ bool IsNonEntryFunc = !AMDGPU::isEntryFunctionCC(F->getCallingConv());
+
+ auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());
+
+ bool NeedsQueuePtr = false;
+
auto CheckAddrSpaceCasts = [&](Instruction &I) {
unsigned SrcAS = static_cast<AddrSpaceCastInst &>(I).getSrcAddressSpace();
if (castRequiresQueuePtr(SrcAS)) {
@@ -431,7 +510,7 @@ struct AAAMDAttributesFunction : public AAAMDAttributes {
// `checkForAllInstructions` is much cheaper than going through all
// instructions, try it first.
- // amdgpu-queue-ptr is not needed if aperture regs is present.
+ // The queue pointer is not needed if aperture registers are present.
if (!HasApertureRegs) {
bool UsedAssumedInformation = false;
A.checkForAllInstructions(CheckAddrSpaceCasts, *this,
@@ -439,61 +518,79 @@ struct AAAMDAttributesFunction : public AAAMDAttributes {
UsedAssumedInformation);
}
- // If we found that we need amdgpu-queue-ptr, nothing else to do.
- if (NeedsQueuePtr) {
- removeAssumedBits(QUEUE_PTR);
- return getAssumed() != OrigAssumed ? ChangeStatus::CHANGED :
- ChangeStatus::UNCHANGED;
- }
+ // If we found that we need the queue pointer, nothing else to do.
+ if (NeedsQueuePtr)
+ return true;
- if (!IsNonEntryFunc && HasApertureRegs) {
- return getAssumed() != OrigAssumed ? ChangeStatus::CHANGED :
- ChangeStatus::UNCHANGED;
- }
+ if (!IsNonEntryFunc && HasApertureRegs)
+ return false;
for (BasicBlock &BB : *F) {
for (Instruction &I : BB) {
for (const Use &U : I.operands()) {
if (const auto *C = dyn_cast<Constant>(U)) {
- if (InfoCache.needsQueuePtr(C, *F)) {
- removeAssumedBits(QUEUE_PTR);
- return getAssumed() != OrigAssumed ? ChangeStatus::CHANGED :
- ChangeStatus::UNCHANGED;
- }
+ if (InfoCache.needsQueuePtr(C, *F))
+ return true;
}
}
}
}
- return getAssumed() != OrigAssumed ? ChangeStatus::CHANGED :
- ChangeStatus::UNCHANGED;
+ return false;
}
- ChangeStatus manifest(Attributor &A) override {
- SmallVector<Attribute, 8> AttrList;
- LLVMContext &Ctx = getAssociatedFunction()->getContext();
+ bool funcRetrievesMultigridSyncArg(Attributor &A) {
+ auto Pos = llvm::AMDGPU::getMultigridSyncArgImplicitArgPosition();
+ AAPointerInfo::OffsetAndSize OAS(Pos, 8);
+ return funcRetrievesImplicitKernelArg(A, OAS);
+ }
- for (auto Attr : ImplicitAttrs) {
- if (isKnown(Attr.first))
- AttrList.push_back(Attribute::get(Ctx, Attr.second));
- }
+ bool funcRetrievesHostcallPtr(Attributor &A) {
+ auto Pos = llvm::AMDGPU::getHostcallImplicitArgPosition();
+ AAPointerInfo::OffsetAndSize OAS(Pos, 8);
+ return funcRetrievesImplicitKernelArg(A, OAS);
+ }
- return IRAttributeManifest::manifestAttrs(A, getIRPosition(), AttrList,
- /* ForceReplace */ true);
+ bool funcRetrievesHeapPtr(Attributor &A) {
+ if (AMDGPU::getAmdhsaCodeObjectVersion() != 5)
+ return false;
+ AAPointerInfo::OffsetAndSize OAS(AMDGPU::ImplicitArg::HEAP_PTR_OFFSET, 8);
+ return funcRetrievesImplicitKernelArg(A, OAS);
}
- const std::string getAsStr() const override {
- std::string Str;
- raw_string_ostream OS(Str);
- OS << "AMDInfo[";
- for (auto Attr : ImplicitAttrs)
- OS << ' ' << Attr.second;
- OS << " ]";
- return OS.str();
+ bool funcRetrievesQueuePtr(Attributor &A) {
+ if (AMDGPU::getAmdhsaCodeObjectVersion() != 5)
+ return false;
+ AAPointerInfo::OffsetAndSize OAS(AMDGPU::ImplicitArg::QUEUE_PTR_OFFSET, 8);
+ return funcRetrievesImplicitKernelArg(A, OAS);
}
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override {}
+ bool funcRetrievesImplicitKernelArg(Attributor &A,
+ AAPointerInfo::OffsetAndSize OAS) {
+ // Check whether any call in this function retrieves the given implicit
+ // kernel argument through the implicitarg_ptr intrinsic. The argument is
+ // considered unused only if every use of implicitarg_ptr is a load that
+ // clearly does not read any byte of the argument's range. We check this by
+ // tracing all the uses of each call to the implicitarg_ptr intrinsic.
+ auto DoesNotLeadToKernelArgLoc = [&](Instruction &I) {
+ auto &Call = cast<CallBase>(I);
+ if (Call.getIntrinsicID() != Intrinsic::amdgcn_implicitarg_ptr)
+ return true;
+
+ const auto &PointerInfoAA = A.getAAFor<AAPointerInfo>(
+ *this, IRPosition::callsite_returned(Call), DepClassTy::REQUIRED);
+
+ return PointerInfoAA.forallInterferingAccesses(
+ OAS, [](const AAPointerInfo::Access &Acc, bool IsExact) {
+ return Acc.getRemoteInst()->isDroppable();
+ });
+ };
+
+ bool UsedAssumedInformation = false;
+ return !A.checkForAllCallLikeInstructions(DoesNotLeadToKernelArgLoc, *this,
+ UsedAssumedInformation);
+ }
};
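
A rough sketch of the range-overlap idea behind funcRetrievesHostcallPtr and the related helpers follows. The offset and size are hypothetical placeholders (the real positions come from getHostcallImplicitArgPosition() and the other AMDGPU helpers), and the droppable-access and assumed-information handling done through AAPointerInfo is omitted.

#include <cstdint>
#include <cstdio>

// Hypothetical byte range of one implicit kernel argument within the
// implicitarg_ptr block.
constexpr uint64_t ArgOffset = 24;
constexpr uint64_t ArgSize = 8;

// A load from implicitarg_ptr "retrieves" the argument if the byte range it
// reads overlaps [ArgOffset, ArgOffset + ArgSize).
static bool retrievesArg(uint64_t LoadOffset, uint64_t LoadSize) {
  return LoadOffset < ArgOffset + ArgSize && ArgOffset < LoadOffset + LoadSize;
}

int main() {
  std::printf("%d\n", retrievesArg(0, 8));  // 0: reads bytes 0..7 only
  std::printf("%d\n", retrievesArg(24, 8)); // 1: reads the argument itself
}
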
AAAMDAttributes &AAAMDAttributes::createForPosition(const IRPosition &IRP,
@@ -646,9 +743,14 @@ public:
AMDGPUInformationCache InfoCache(M, AG, Allocator, nullptr, *TM);
DenseSet<const char *> Allowed(
{&AAAMDAttributes::ID, &AAUniformWorkGroupSize::ID,
- &AAAMDFlatWorkGroupSize::ID, &AACallEdges::ID});
+ &AAAMDFlatWorkGroupSize::ID, &AACallEdges::ID, &AAPointerInfo::ID});
+
+ AttributorConfig AC(CGUpdater);
+ AC.Allowed = &Allowed;
+ AC.IsModulePass = true;
+ AC.DefaultInitializeLiveInternals = false;
- Attributor A(Functions, InfoCache, CGUpdater, &Allowed);
+ Attributor A(Functions, InfoCache, AC);
for (Function &F : M) {
if (!F.isIntrinsic()) {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
index cd084fd5440a..fd812eb676ef 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
@@ -21,6 +21,7 @@
#include "llvm/CodeGen/Analysis.h"
#include "llvm/CodeGen/FunctionLoweringInfo.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#define DEBUG_TYPE "amdgpu-call-lowering"
@@ -349,7 +350,6 @@ bool AMDGPUCallLowering::lowerReturn(MachineIRBuilder &B, const Value *Val,
FunctionLoweringInfo &FLI) const {
MachineFunction &MF = B.getMF();
- MachineRegisterInfo &MRI = MF.getRegInfo();
SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
MFI->setIfReturnsVoid(!Val);
@@ -365,40 +365,15 @@ bool AMDGPUCallLowering::lowerReturn(MachineIRBuilder &B, const Value *Val,
return true;
}
- auto const &ST = MF.getSubtarget<GCNSubtarget>();
-
- unsigned ReturnOpc = 0;
- if (IsShader)
- ReturnOpc = AMDGPU::SI_RETURN_TO_EPILOG;
- else if (CC == CallingConv::AMDGPU_Gfx)
- ReturnOpc = AMDGPU::S_SETPC_B64_return_gfx;
- else
- ReturnOpc = AMDGPU::S_SETPC_B64_return;
-
+ unsigned ReturnOpc =
+ IsShader ? AMDGPU::SI_RETURN_TO_EPILOG : AMDGPU::SI_RETURN;
auto Ret = B.buildInstrNoInsert(ReturnOpc);
- Register ReturnAddrVReg;
- if (ReturnOpc == AMDGPU::S_SETPC_B64_return) {
- ReturnAddrVReg = MRI.createVirtualRegister(&AMDGPU::CCR_SGPR_64RegClass);
- Ret.addUse(ReturnAddrVReg);
- } else if (ReturnOpc == AMDGPU::S_SETPC_B64_return_gfx) {
- ReturnAddrVReg =
- MRI.createVirtualRegister(&AMDGPU::Gfx_CCR_SGPR_64RegClass);
- Ret.addUse(ReturnAddrVReg);
- }
if (!FLI.CanLowerReturn)
insertSRetStores(B, Val->getType(), VRegs, FLI.DemoteRegister);
else if (!lowerReturnVal(B, Val, VRegs, Ret))
return false;
- if (ReturnOpc == AMDGPU::S_SETPC_B64_return ||
- ReturnOpc == AMDGPU::S_SETPC_B64_return_gfx) {
- const SIRegisterInfo *TRI = ST.getRegisterInfo();
- Register LiveInReturn = MF.addLiveIn(TRI->getReturnAddressReg(MF),
- &AMDGPU::SGPR_64RegClass);
- B.buildCopy(ReturnAddrVReg, LiveInReturn);
- }
-
// TODO: Handle CalleeSavedRegsViaCopy.
B.insertInstr(Ret);
@@ -479,7 +454,7 @@ static void allocateHSAUserSGPRs(CCState &CCInfo,
CCInfo.AllocateReg(DispatchPtrReg);
}
- if (Info.hasQueuePtr()) {
+ if (Info.hasQueuePtr() && AMDGPU::getAmdhsaCodeObjectVersion() < 5) {
Register QueuePtrReg = Info.addQueuePtr(TRI);
MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);
CCInfo.AllocateReg(QueuePtrReg);
@@ -523,7 +498,7 @@ bool AMDGPUCallLowering::lowerFormalArgumentsKernel(
const SITargetLowering &TLI = *getTLI<SITargetLowering>();
const DataLayout &DL = F.getParent()->getDataLayout();
- Info->allocateModuleLDSGlobal(F.getParent());
+ Info->allocateModuleLDSGlobal(F);
SmallVector<CCValAssign, 16> ArgLocs;
CCState CCInfo(F.getCallingConv(), F.isVarArg(), MF, ArgLocs, F.getContext());
@@ -543,9 +518,8 @@ bool AMDGPUCallLowering::lowerFormalArgumentsKernel(
if (AllocSize == 0)
continue;
- MaybeAlign ABIAlign = IsByRef ? Arg.getParamAlign() : None;
- if (!ABIAlign)
- ABIAlign = DL.getABITypeAlign(ArgTy);
+ MaybeAlign ParamAlign = IsByRef ? Arg.getParamAlign() : None;
+ Align ABIAlign = DL.getValueOrABITypeAlignment(ParamAlign, ArgTy);
uint64_t ArgOffset = alignTo(ExplicitArgOffset, ABIAlign) + BaseOffset;
ExplicitArgOffset = alignTo(ExplicitArgOffset, ABIAlign) + AllocSize;
@@ -608,19 +582,11 @@ bool AMDGPUCallLowering::lowerFormalArguments(
const SIRegisterInfo *TRI = Subtarget.getRegisterInfo();
const DataLayout &DL = F.getParent()->getDataLayout();
- Info->allocateModuleLDSGlobal(F.getParent());
+ Info->allocateModuleLDSGlobal(F);
SmallVector<CCValAssign, 16> ArgLocs;
CCState CCInfo(CC, F.isVarArg(), MF, ArgLocs, F.getContext());
- if (!IsEntryFunc) {
- Register ReturnAddrReg = TRI->getReturnAddressReg(MF);
- Register LiveInReturn = MF.addLiveIn(ReturnAddrReg,
- &AMDGPU::SGPR_64RegClass);
- MBB.addLiveIn(ReturnAddrReg);
- B.buildCopy(LiveInReturn, ReturnAddrReg);
- }
-
if (Info->hasImplicitBufferPtr()) {
Register ImplicitBufferPtrReg = Info->addImplicitBufferPtr(*TRI);
MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td b/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td
index 1682d43ae671..b6c66077675f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td
@@ -148,53 +148,32 @@ def CSR_AMDGPU_VGPRs : CalleeSavedRegs<
(sequence "VGPR%u", 248, 255))
>;
-def CSR_AMDGPU_AGPRs_32_255 : CalleeSavedRegs<
+def CSR_AMDGPU_AGPRs : CalleeSavedRegs<
(sequence "AGPR%u", 32, 255)
>;
-def CSR_AMDGPU_SGPRs_32_105 : CalleeSavedRegs<
- (sequence "SGPR%u", 32, 105)
+def CSR_AMDGPU_SGPRs : CalleeSavedRegs<
+ (sequence "SGPR%u", 30, 105)
>;
-def CSR_AMDGPU_SI_Gfx_SGPRs_4_29 : CalleeSavedRegs<
- (sequence "SGPR%u", 4, 29)
+def CSR_AMDGPU_SI_Gfx_SGPRs : CalleeSavedRegs<
+ (add (sequence "SGPR%u", 4, 31), (sequence "SGPR%u", 64, 105))
>;
-def CSR_AMDGPU_SI_Gfx_SGPRs_64_105 : CalleeSavedRegs<
- (sequence "SGPR%u", 64, 105)
+def CSR_AMDGPU : CalleeSavedRegs<
+ (add CSR_AMDGPU_VGPRs, CSR_AMDGPU_SGPRs)
>;
-// Just to get the regmask, not for calling convention purposes.
-def CSR_AMDGPU_AllVGPRs : CalleeSavedRegs<
- (sequence "VGPR%u", 0, 255)
->;
-
-def CSR_AMDGPU_AllAGPRs : CalleeSavedRegs<
- (sequence "AGPR%u", 0, 255)
->;
-def CSR_AMDGPU_AllVectorRegs : CalleeSavedRegs<
- (add CSR_AMDGPU_AllVGPRs, CSR_AMDGPU_AllAGPRs)
->;
-
-// Just to get the regmask, not for calling convention purposes.
-def CSR_AMDGPU_AllAllocatableSRegs : CalleeSavedRegs<
- (add (sequence "SGPR%u", 0, 105), VCC_LO, VCC_HI)
->;
-
-def CSR_AMDGPU_HighRegs : CalleeSavedRegs<
- (add CSR_AMDGPU_VGPRs, CSR_AMDGPU_SGPRs_32_105)
->;
-
-def CSR_AMDGPU_HighRegs_With_AGPRs : CalleeSavedRegs<
- (add CSR_AMDGPU_HighRegs, CSR_AMDGPU_AGPRs_32_255)
+def CSR_AMDGPU_GFX90AInsts : CalleeSavedRegs<
+ (add CSR_AMDGPU, CSR_AMDGPU_AGPRs)
>;
def CSR_AMDGPU_SI_Gfx : CalleeSavedRegs<
- (add CSR_AMDGPU_VGPRs, CSR_AMDGPU_SI_Gfx_SGPRs_4_29, CSR_AMDGPU_SI_Gfx_SGPRs_64_105)
+ (add CSR_AMDGPU_VGPRs, CSR_AMDGPU_SI_Gfx_SGPRs)
>;
-def CSR_AMDGPU_SI_Gfx_With_AGPRs : CalleeSavedRegs<
- (add CSR_AMDGPU_SI_Gfx, CSR_AMDGPU_AGPRs_32_255)
+def CSR_AMDGPU_SI_Gfx_GFX90AInsts : CalleeSavedRegs<
+ (add CSR_AMDGPU_SI_Gfx, CSR_AMDGPU_AGPRs)
>;
def CSR_AMDGPU_NoRegs : CalleeSavedRegs<(add)>;
@@ -233,3 +212,24 @@ def CC_AMDGPU : CallingConv<[
"AMDGPUSubtarget::SOUTHERN_ISLANDS && State.getCallingConv() == CallingConv::C",
CCDelegateTo<CC_AMDGPU_Func>>
]>;
+
+// Trivial class to denote when a def is used only to get a RegMask, i.e.
+// SaveList is ignored and the def is not used as part of any calling
+// convention.
+class RegMask<dag mask> : CalleeSavedRegs<mask>;
+
+def AMDGPU_AllVGPRs : RegMask<
+ (sequence "VGPR%u", 0, 255)
+>;
+
+def AMDGPU_AllAGPRs : RegMask<
+ (sequence "AGPR%u", 0, 255)
+>;
+
+def AMDGPU_AllVectorRegs : RegMask<
+ (add AMDGPU_AllVGPRs, AMDGPU_AllAGPRs)
+>;
+
+def AMDGPU_AllAllocatableSRegs : RegMask<
+ (add (sequence "SGPR%u", 0, 105), VCC_LO, VCC_HI)
+>;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
index 1920684d8f1f..94d7844e8a32 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
@@ -877,7 +877,7 @@ static Value* getMulHu(IRBuilder<> &Builder, Value *LHS, Value *RHS) {
return getMul64(Builder, LHS, RHS).second;
}
-/// Figure out how many bits are really needed for this ddivision. \p AtLeast is
+/// Figure out how many bits are really needed for this division. \p AtLeast is
/// an optimization hint to bypass the second ComputeNumSignBits call if we the
/// first one is insufficient. Returns -1 on failure.
int AMDGPUCodeGenPrepare::getDivNumBits(BinaryOperator &I,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCombinerHelper.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCombinerHelper.cpp
index e79ff9b597c9..c16d8ee51a7a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCombinerHelper.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCombinerHelper.cpp
@@ -373,7 +373,8 @@ void AMDGPUCombinerHelper::applyFoldableFneg(MachineInstr &MI,
replaceRegWith(MRI, Dst, NegatedMatchInfo);
// Recreate non negated value for other uses of old MatchInfoDst
- Builder.setInstrAndDebugLoc(MI);
+ auto NextInst = ++MatchInfo->getIterator();
+ Builder.setInstrAndDebugLoc(*NextInst);
Builder.buildFNeg(MatchInfoDst, NegatedMatchInfo, MI.getFlags());
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCtorDtorLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCtorDtorLowering.cpp
index 04bf623bfa46..8fcf669041b9 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCtorDtorLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCtorDtorLowering.cpp
@@ -50,7 +50,7 @@ public:
}
bool createInitOrFiniKernel(Module &M, GlobalVariable *GV, bool IsCtor) {
- if (!GV)
+ if (!GV || !GV->hasInitializer())
return false;
ConstantArray *GA = dyn_cast<ConstantArray>(GV->getInitializer());
if (!GA || GA->getNumOperands() == 0)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUExportClustering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUExportClustering.cpp
index bed0707f3aa7..8236ff609f85 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUExportClustering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUExportClustering.cpp
@@ -22,7 +22,7 @@ namespace {
class ExportClustering : public ScheduleDAGMutation {
public:
- ExportClustering() {}
+ ExportClustering() = default;
void apply(ScheduleDAGInstrs *DAG) override;
};
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUFixFunctionBitcasts.cpp b/llvm/lib/Target/AMDGPU/AMDGPUFixFunctionBitcasts.cpp
deleted file mode 100644
index ea6c6d0fd212..000000000000
--- a/llvm/lib/Target/AMDGPU/AMDGPUFixFunctionBitcasts.cpp
+++ /dev/null
@@ -1,64 +0,0 @@
-//===-- AMDGPUFixFunctionBitcasts.cpp - Fix function bitcasts -------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// Promote indirect (bitcast) calls to direct calls when they are statically
-/// known to be direct. Required when InstCombine is not run (e.g. at OptNone)
-/// because AMDGPU does not support indirect calls.
-///
-//===----------------------------------------------------------------------===//
-
-#include "AMDGPU.h"
-#include "llvm/IR/InstVisitor.h"
-#include "llvm/Pass.h"
-#include "llvm/Transforms/Utils/CallPromotionUtils.h"
-
-using namespace llvm;
-
-#define DEBUG_TYPE "amdgpu-fix-function-bitcasts"
-
-namespace {
-class AMDGPUFixFunctionBitcasts final
- : public ModulePass,
- public InstVisitor<AMDGPUFixFunctionBitcasts> {
-
- bool runOnModule(Module &M) override;
-
- bool Modified;
-
-public:
- void visitCallBase(CallBase &CB) {
- if (CB.getCalledFunction())
- return;
- auto *Callee =
- dyn_cast<Function>(CB.getCalledOperand()->stripPointerCasts());
- if (Callee && isLegalToPromote(CB, Callee)) {
- promoteCall(CB, Callee);
- Modified = true;
- }
- }
-
- static char ID;
- AMDGPUFixFunctionBitcasts() : ModulePass(ID) {}
-};
-} // End anonymous namespace
-
-char AMDGPUFixFunctionBitcasts::ID = 0;
-char &llvm::AMDGPUFixFunctionBitcastsID = AMDGPUFixFunctionBitcasts::ID;
-INITIALIZE_PASS(AMDGPUFixFunctionBitcasts, DEBUG_TYPE,
- "Fix function bitcasts for AMDGPU", false, false)
-
-ModulePass *llvm::createAMDGPUFixFunctionBitcastsPass() {
- return new AMDGPUFixFunctionBitcasts();
-}
-
-bool AMDGPUFixFunctionBitcasts::runOnModule(Module &M) {
- Modified = false;
- visit(M);
- return Modified;
-}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
index 7fd94a977be7..5747fc0ca8e6 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
@@ -47,10 +47,30 @@ def gi_vop3pmods :
GIComplexOperandMatcher<s32, "selectVOP3PMods">,
GIComplexPatternEquiv<VOP3PMods>;
+def gi_vop3pmodsdot :
+ GIComplexOperandMatcher<s32, "selectVOP3PModsDOT">,
+ GIComplexPatternEquiv<VOP3PModsDOT>;
+
+def gi_dotiuvop3pmods :
+ GIComplexOperandMatcher<s32, "selectDotIUVOP3PMods">,
+ GIComplexPatternEquiv<DotIUVOP3PMods>;
+
+def gi_wmmaopselvop3pmods :
+ GIComplexOperandMatcher<s32, "selectWMMAOpSelVOP3PMods">,
+ GIComplexPatternEquiv<WMMAOpSelVOP3PMods>;
+
def gi_vop3opselmods :
GIComplexOperandMatcher<s32, "selectVOP3OpSelMods">,
GIComplexPatternEquiv<VOP3OpSelMods>;
+def gi_vinterpmods :
+ GIComplexOperandMatcher<s32, "selectVINTERPMods">,
+ GIComplexPatternEquiv<VINTERPMods>;
+
+def gi_vinterpmods_hi :
+ GIComplexOperandMatcher<s32, "selectVINTERPModsHi">,
+ GIComplexPatternEquiv<VINTERPModsHi>;
+
// FIXME: Why do we have both VOP3OpSel and VOP3OpSelMods?
def gi_vop3opsel :
GIComplexOperandMatcher<s32, "selectVOP3OpSelMods">,
@@ -93,6 +113,10 @@ def gi_flat_scratch_saddr :
GIComplexOperandMatcher<s32, "selectScratchSAddr">,
GIComplexPatternEquiv<ScratchSAddr>;
+def gi_flat_scratch_svaddr :
+ GIComplexOperandMatcher<s32, "selectScratchSVAddr">,
+ GIComplexPatternEquiv<ScratchSVAddr>;
+
def gi_ds_1addr_1offset :
GIComplexOperandMatcher<s32, "selectDS1Addr1Offset">,
GIComplexPatternEquiv<DS1Addr1Offset>;
@@ -123,7 +147,7 @@ def gi_smrd_buffer_imm32 :
// Separate load nodes are defined to glue m0 initialization in
// SelectionDAG. The GISel selector can just insert m0 initialization
-// directly before before selecting a glue-less load, so hide this
+// directly before selecting a glue-less load, so hide this
// distinction.
def : GINodeEquiv<G_LOAD, AMDGPUld_glue> {
@@ -222,6 +246,9 @@ def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_FMAX, SIbuffer_atomic_fmax>;
def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_CMPSWAP, SIbuffer_atomic_cmpswap>;
def : GINodeEquiv<G_AMDGPU_S_BUFFER_LOAD, SIsbuffer_load>;
+def : GINodeEquiv<G_FPTRUNC_ROUND_UPWARD, SIfptrunc_round_upward>;
+def : GINodeEquiv<G_FPTRUNC_ROUND_DOWNWARD, SIfptrunc_round_downward>;
+
class GISelSop2Pat <
SDPatternOperator node,
Instruction inst,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.cpp b/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.cpp
index cabdc6998011..1bbdc39a7a5e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.cpp
@@ -7,8 +7,10 @@
//===----------------------------------------------------------------------===//
#include "AMDGPUGlobalISelUtils.h"
+#include "GCNSubtarget.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/IR/Constants.h"
+#include "llvm/Support/LowLevelTypeImpl.h"
using namespace llvm;
using namespace MIPatternMatch;
@@ -66,3 +68,12 @@ bool AMDGPU::isLegalVOP3PShuffleMask(ArrayRef<int> Mask) {
return true;
return (Mask[0] & 2) == (Mask[1] & 2);
}
+
+bool AMDGPU::hasAtomicFaddRtnForTy(const GCNSubtarget &Subtarget,
+ const LLT &Ty) {
+ if (Ty == LLT::scalar(32))
+ return Subtarget.hasAtomicFaddRtnInsts();
+ if (Ty == LLT::fixed_vector(2, 16) || Ty == LLT::scalar(64))
+ return Subtarget.hasGFX90AInsts();
+ return false;
+}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.h b/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.h
index 14d3a3fb7997..5c600d059b7a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.h
@@ -16,6 +16,8 @@
namespace llvm {
class MachineRegisterInfo;
+class GCNSubtarget;
+class LLT;
namespace AMDGPU {
@@ -24,7 +26,7 @@ std::pair<Register, unsigned>
getBaseWithConstantOffset(MachineRegisterInfo &MRI, Register Reg);
bool isLegalVOP3PShuffleMask(ArrayRef<int> Mask);
-
+bool hasAtomicFaddRtnForTy(const GCNSubtarget &Subtarget, const LLT &Ty);
}
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
index f5018e3a19ac..6fa44ffcbfaa 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
@@ -400,17 +400,15 @@ void MetadataStreamerV2::emitHiddenKernelArgs(const Function &Func,
auto Int8PtrTy = Type::getInt8PtrTy(Func.getContext(),
AMDGPUAS::GLOBAL_ADDRESS);
- // Emit "printf buffer" argument if printf is used, otherwise emit dummy
- // "none" argument.
if (HiddenArgNumBytes >= 32) {
+ // We forbid the use of features requiring hostcall when compiling OpenCL
+ // before code object V5, which makes the mutual exclusion between the
+ // "printf buffer" and "hostcall buffer" here sound.
if (Func.getParent()->getNamedMetadata("llvm.printf.fmts"))
emitKernelArg(DL, Int8PtrTy, Align(8), ValueKind::HiddenPrintfBuffer);
- else if (Func.getParent()->getFunction("__ockl_hostcall_internal")) {
- // The printf runtime binding pass should have ensured that hostcall and
- // printf are not used in the same module.
- assert(!Func.getParent()->getNamedMetadata("llvm.printf.fmts"));
+ else if (!Func.hasFnAttribute("amdgpu-no-hostcall-ptr"))
emitKernelArg(DL, Int8PtrTy, Align(8), ValueKind::HiddenHostcallBuffer);
- } else
+ else
emitKernelArg(DL, Int8PtrTy, Align(8), ValueKind::HiddenNone);
}
@@ -427,8 +425,12 @@ void MetadataStreamerV2::emitHiddenKernelArgs(const Function &Func,
}
// Emit the pointer argument for multi-grid object.
- if (HiddenArgNumBytes >= 56)
- emitKernelArg(DL, Int8PtrTy, Align(8), ValueKind::HiddenMultiGridSyncArg);
+ if (HiddenArgNumBytes >= 56) {
+ if (!Func.hasFnAttribute("amdgpu-no-multigrid-sync-arg"))
+ emitKernelArg(DL, Int8PtrTy, Align(8), ValueKind::HiddenMultiGridSyncArg);
+ else
+ emitKernelArg(DL, Int8PtrTy, Align(8), ValueKind::HiddenNone);
+ }
}
bool MetadataStreamerV2::emitTo(AMDGPUTargetStreamer &TargetStreamer) {
@@ -803,6 +805,8 @@ void MetadataStreamerV3::emitHiddenKernelArgs(const MachineFunction &MF,
auto &DL = M->getDataLayout();
auto Int64Ty = Type::getInt64Ty(Func.getContext());
+ Offset = alignTo(Offset, ST.getAlignmentForImplicitArgPtr());
+
if (HiddenArgNumBytes >= 8)
emitKernelArg(DL, Int64Ty, Align(8), "hidden_global_offset_x", Offset,
Args);
@@ -816,19 +820,17 @@ void MetadataStreamerV3::emitHiddenKernelArgs(const MachineFunction &MF,
auto Int8PtrTy =
Type::getInt8PtrTy(Func.getContext(), AMDGPUAS::GLOBAL_ADDRESS);
- // Emit "printf buffer" argument if printf is used, emit "hostcall buffer"
- // if "hostcall" module flag is set, otherwise emit dummy "none" argument.
if (HiddenArgNumBytes >= 32) {
+ // We forbid the use of features requiring hostcall when compiling OpenCL
+ // before code object V5, which makes the mutual exclusion between the
+ // "printf buffer" and "hostcall buffer" here sound.
if (M->getNamedMetadata("llvm.printf.fmts"))
emitKernelArg(DL, Int8PtrTy, Align(8), "hidden_printf_buffer", Offset,
Args);
- else if (M->getModuleFlag("amdgpu_hostcall")) {
- // The printf runtime binding pass should have ensured that hostcall and
- // printf are not used in the same module.
- assert(!M->getNamedMetadata("llvm.printf.fmts"));
+ else if (!Func.hasFnAttribute("amdgpu-no-hostcall-ptr"))
emitKernelArg(DL, Int8PtrTy, Align(8), "hidden_hostcall_buffer", Offset,
Args);
- } else
+ else
emitKernelArg(DL, Int8PtrTy, Align(8), "hidden_none", Offset, Args);
}
@@ -847,9 +849,14 @@ void MetadataStreamerV3::emitHiddenKernelArgs(const MachineFunction &MF,
}
// Emit the pointer argument for multi-grid object.
- if (HiddenArgNumBytes >= 56)
- emitKernelArg(DL, Int8PtrTy, Align(8), "hidden_multigrid_sync_arg", Offset,
- Args);
+ if (HiddenArgNumBytes >= 56) {
+ if (!Func.hasFnAttribute("amdgpu-no-multigrid-sync-arg")) {
+ emitKernelArg(DL, Int8PtrTy, Align(8), "hidden_multigrid_sync_arg", Offset,
+ Args);
+ } else {
+ emitKernelArg(DL, Int8PtrTy, Align(8), "hidden_none", Offset, Args);
+ }
+ }
}
msgpack::MapDocNode
@@ -876,6 +883,12 @@ MetadataStreamerV3::getHSAKernelProps(const MachineFunction &MF,
Kern.getDocument()->getNode(STM.getWavefrontSize());
Kern[".sgpr_count"] = Kern.getDocument()->getNode(ProgramInfo.NumSGPR);
Kern[".vgpr_count"] = Kern.getDocument()->getNode(ProgramInfo.NumVGPR);
+
+ // Only add AGPR count to metadata for supported devices
+ if (STM.hasMAIInsts()) {
+ Kern[".agpr_count"] = Kern.getDocument()->getNode(ProgramInfo.NumAccVGPR);
+ }
+
Kern[".max_flat_workgroup_size"] =
Kern.getDocument()->getNode(MFI.getMaxFlatWorkGroupSize());
Kern[".sgpr_spill_count"] =
@@ -971,13 +984,20 @@ void MetadataStreamerV5::emitHiddenKernelArgs(const MachineFunction &MF,
msgpack::ArrayDocNode Args) {
auto &Func = MF.getFunction();
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+
+ // No implicit kernel argument is used.
+ if (ST.getImplicitArgNumBytes(Func) == 0)
+ return;
+
const Module *M = Func.getParent();
auto &DL = M->getDataLayout();
+ const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
auto Int64Ty = Type::getInt64Ty(Func.getContext());
auto Int32Ty = Type::getInt32Ty(Func.getContext());
auto Int16Ty = Type::getInt16Ty(Func.getContext());
+ Offset = alignTo(Offset, ST.getAlignmentForImplicitArgPtr());
emitKernelArg(DL, Int32Ty, Align(4), "hidden_block_count_x", Offset, Args);
emitKernelArg(DL, Int32Ty, Align(4), "hidden_block_count_y", Offset, Args);
emitKernelArg(DL, Int32Ty, Align(4), "hidden_block_count_z", Offset, Args);
@@ -1008,40 +1028,49 @@ void MetadataStreamerV5::emitHiddenKernelArgs(const MachineFunction &MF,
if (M->getNamedMetadata("llvm.printf.fmts")) {
emitKernelArg(DL, Int8PtrTy, Align(8), "hidden_printf_buffer", Offset,
Args);
- } else
+ } else {
Offset += 8; // Skipped.
+ }
- if (M->getModuleFlag("amdgpu_hostcall")) {
+ if (!Func.hasFnAttribute("amdgpu-no-hostcall-ptr")) {
emitKernelArg(DL, Int8PtrTy, Align(8), "hidden_hostcall_buffer", Offset,
Args);
- } else
+ } else {
Offset += 8; // Skipped.
+ }
- emitKernelArg(DL, Int8PtrTy, Align(8), "hidden_multigrid_sync_arg", Offset,
+ if (!Func.hasFnAttribute("amdgpu-no-multigrid-sync-arg")) {
+ emitKernelArg(DL, Int8PtrTy, Align(8), "hidden_multigrid_sync_arg", Offset,
Args);
+ } else {
+ Offset += 8; // Skipped.
+ }
- // Ignore temporarily until it is implemented.
- // emitKernelArg(DL, Int8PtrTy, Align(8), "hidden_heap_v1", Offset, Args);
- Offset += 8;
+ if (!Func.hasFnAttribute("amdgpu-no-heap-ptr"))
+ emitKernelArg(DL, Int8PtrTy, Align(8), "hidden_heap_v1", Offset, Args);
+ else
+ Offset += 8; // Skipped.
if (Func.hasFnAttribute("calls-enqueue-kernel")) {
emitKernelArg(DL, Int8PtrTy, Align(8), "hidden_default_queue", Offset,
Args);
emitKernelArg(DL, Int8PtrTy, Align(8), "hidden_completion_action", Offset,
Args);
- } else
+ } else {
Offset += 16; // Skipped.
+ }
Offset += 72; // Reserved.
- // hidden_private_base and hidden_shared_base are only used by GFX8.
- if (ST.getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS) {
+ // hidden_private_base and hidden_shared_base are only used when the
+ // subtarget does not have aperture registers.
+ if (!ST.hasApertureRegs()) {
emitKernelArg(DL, Int32Ty, Align(4), "hidden_private_base", Offset, Args);
emitKernelArg(DL, Int32Ty, Align(4), "hidden_shared_base", Offset, Args);
- } else
+ } else {
Offset += 8; // Skipped.
+ }
- const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
if (MFI.hasQueuePtr())
emitKernelArg(DL, Int8PtrTy, Align(8), "hidden_queue_ptr", Offset, Args);
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h
index bcf7fc449094..9b22d1f4d1b1 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h
@@ -42,7 +42,7 @@ namespace HSAMD {
class MetadataStreamer {
public:
- virtual ~MetadataStreamer(){};
+ virtual ~MetadataStreamer() = default;
virtual bool emitTo(AMDGPUTargetStreamer &TargetStreamer) = 0;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
new file mode 100644
index 000000000000..5c507ef70a8c
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
@@ -0,0 +1,439 @@
+//===--- AMDGPUIGroupLP.cpp - AMDGPU IGroupLP ------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// \file This file defines a set of schedule DAG mutations that can be used to
+// override default scheduler behavior to enforce specific scheduling patterns.
+// They should be used in cases where runtime performance considerations such as
+// inter-wavefront interactions mean that compile-time heuristics cannot
+// predict the optimal instruction ordering, or in kernels where optimum
+// instruction scheduling is important enough to warrant manual intervention.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPUIGroupLP.h"
+#include "AMDGPUTargetMachine.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "SIInstrInfo.h"
+#include "SIMachineFunctionInfo.h"
+#include "llvm/ADT/BitmaskEnum.h"
+#include "llvm/CodeGen/MachineScheduler.h"
+#include "llvm/CodeGen/TargetOpcodes.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "machine-scheduler"
+
+namespace {
+
+static cl::opt<bool>
+ EnableIGroupLP("amdgpu-igrouplp",
+ cl::desc("Enable construction of Instruction Groups and "
+ "their ordering for scheduling"),
+ cl::init(false));
+
+static cl::opt<Optional<unsigned>>
+ VMEMGroupMaxSize("amdgpu-igrouplp-vmem-group-size", cl::init(None),
+ cl::Hidden,
+ cl::desc("The maximum number of instructions to include "
+ "in VMEM group."));
+
+static cl::opt<Optional<unsigned>>
+ MFMAGroupMaxSize("amdgpu-igrouplp-mfma-group-size", cl::init(None),
+ cl::Hidden,
+ cl::desc("The maximum number of instructions to include "
+ "in MFMA group."));
+
+static cl::opt<Optional<unsigned>>
+ LDRGroupMaxSize("amdgpu-igrouplp-ldr-group-size", cl::init(None),
+ cl::Hidden,
+ cl::desc("The maximum number of instructions to include "
+ "in lds/gds read group."));
+
+static cl::opt<Optional<unsigned>>
+ LDWGroupMaxSize("amdgpu-igrouplp-ldw-group-size", cl::init(None),
+ cl::Hidden,
+ cl::desc("The maximum number of instructions to include "
+ "in lds/gds write group."));
+
+typedef function_ref<bool(const MachineInstr &, const SIInstrInfo *)>
+ CanAddMIFn;
+
+// Classify instructions into groups to enable fine-tuned control over the
+// scheduler. These groups may be more specific than current SchedModel
+// instruction classes.
+class SchedGroup {
+private:
+ // Function that returns true if a non-bundle MI may be inserted into this
+ // group.
+ const CanAddMIFn canAddMI;
+
+ // Maximum number of SUnits that can be added to this group.
+ Optional<unsigned> MaxSize;
+
+ // Collection of SUnits that are classified as members of this group.
+ SmallVector<SUnit *, 32> Collection;
+
+ ScheduleDAGInstrs *DAG;
+
+ void tryAddEdge(SUnit *A, SUnit *B) {
+ if (A != B && DAG->canAddEdge(B, A)) {
+ DAG->addEdge(B, SDep(A, SDep::Artificial));
+ LLVM_DEBUG(dbgs() << "Adding edge...\n"
+ << "from: SU(" << A->NodeNum << ") " << *A->getInstr()
+ << "to: SU(" << B->NodeNum << ") " << *B->getInstr());
+ }
+ }
+
+public:
+ // Add DAG dependencies between all SUnits in this SchedGroup and this SU. If
+ // MakePred is true, SU will be a predecessor of the SUnits in this
+ // SchedGroup, otherwise SU will be a successor.
+ void link(SUnit &SU, bool MakePred = false) {
+ for (auto A : Collection) {
+ SUnit *B = &SU;
+ if (MakePred)
+ std::swap(A, B);
+
+ tryAddEdge(A, B);
+ }
+ }
+
+ // Add DAG dependencies between all SUnits in this SchedGroup and this SU. Use
+ // the predicate to determine whether SU should be a predecessor (P = true)
+ // or a successor (P = false) of this SchedGroup.
+ void link(SUnit &SU, function_ref<bool(const SUnit *A, const SUnit *B)> P) {
+ for (auto A : Collection) {
+ SUnit *B = &SU;
+ if (P(A, B))
+ std::swap(A, B);
+
+ tryAddEdge(A, B);
+ }
+ }
+
+ // Add DAG dependencies such that SUnits in this group shall be ordered
+ // before SUnits in OtherGroup.
+ void link(SchedGroup &OtherGroup) {
+ for (auto B : OtherGroup.Collection)
+ link(*B);
+ }
+
+ // Returns true if no more instructions may be added to this group.
+ bool isFull() { return MaxSize && Collection.size() >= *MaxSize; }
+
+ // Returns true if SU can be added to this SchedGroup.
+ bool canAddSU(SUnit &SU, const SIInstrInfo *TII) {
+ if (isFull())
+ return false;
+
+ MachineInstr &MI = *SU.getInstr();
+ if (MI.getOpcode() != TargetOpcode::BUNDLE)
+ return canAddMI(MI, TII);
+
+ // Special case for bundled MIs.
+ const MachineBasicBlock *MBB = MI.getParent();
+ MachineBasicBlock::instr_iterator B = MI.getIterator(), E = ++B;
+ while (E != MBB->end() && E->isBundledWithPred())
+ ++E;
+
+ // Return true if all of the bundled MIs can be added to this group.
+ return std::all_of(
+ B, E, [this, TII](MachineInstr &MI) { return canAddMI(MI, TII); });
+ }
+
+ void add(SUnit &SU) { Collection.push_back(&SU); }
+
+ SchedGroup(CanAddMIFn canAddMI, Optional<unsigned> MaxSize,
+ ScheduleDAGInstrs *DAG)
+ : canAddMI(canAddMI), MaxSize(MaxSize), DAG(DAG) {}
+};
+
+bool isMFMASGMember(const MachineInstr &MI, const SIInstrInfo *TII) {
+ return TII->isMFMA(MI);
+}
+
+bool isVALUSGMember(const MachineInstr &MI, const SIInstrInfo *TII) {
+ return TII->isVALU(MI) && !TII->isMFMA(MI);
+}
+
+bool isSALUSGMember(const MachineInstr &MI, const SIInstrInfo *TII) {
+ return TII->isSALU(MI);
+}
+
+bool isVMEMSGMember(const MachineInstr &MI, const SIInstrInfo *TII) {
+ return TII->isVMEM(MI) || (TII->isFLAT(MI) && !TII->isDS(MI));
+}
+
+bool isVMEMReadSGMember(const MachineInstr &MI, const SIInstrInfo *TII) {
+ return MI.mayLoad() &&
+ (TII->isVMEM(MI) || (TII->isFLAT(MI) && !TII->isDS(MI)));
+}
+
+bool isVMEMWriteSGMember(const MachineInstr &MI, const SIInstrInfo *TII) {
+ return MI.mayStore() &&
+ (TII->isVMEM(MI) || (TII->isFLAT(MI) && !TII->isDS(MI)));
+}
+
+bool isDSWriteSGMember(const MachineInstr &MI, const SIInstrInfo *TII) {
+ return MI.mayStore() && TII->isDS(MI);
+}
+
+bool isDSReadSGMember(const MachineInstr &MI, const SIInstrInfo *TII) {
+ return MI.mayLoad() && TII->isDS(MI);
+}
+
+class IGroupLPDAGMutation : public ScheduleDAGMutation {
+public:
+ const SIInstrInfo *TII;
+ ScheduleDAGMI *DAG;
+
+ IGroupLPDAGMutation() = default;
+ void apply(ScheduleDAGInstrs *DAGInstrs) override;
+};
+
+// DAG mutation that coordinates with the SCHED_BARRIER instruction and
+// corresponding builtin. The mutation adds edges from specific instruction
+// classes determined by the SCHED_BARRIER mask so that they cannot be
+// scheduled around the SCHED_BARRIER.
+class SchedBarrierDAGMutation : public ScheduleDAGMutation {
+private:
+ const SIInstrInfo *TII;
+
+ ScheduleDAGMI *DAG;
+
+ // Components of the mask that determines which instructions may not be
+ // scheduled across the SCHED_BARRIER.
+ enum class SchedBarrierMasks {
+ NONE = 0u,
+ ALU = 1u << 0,
+ VALU = 1u << 1,
+ SALU = 1u << 2,
+ MFMA = 1u << 3,
+ VMEM = 1u << 4,
+ VMEM_READ = 1u << 5,
+ VMEM_WRITE = 1u << 6,
+ DS = 1u << 7,
+ DS_READ = 1u << 8,
+ DS_WRITE = 1u << 9,
+ LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ DS_WRITE)
+ };
+
+ // Cache SchedGroups of each type if we have multiple SCHED_BARRIERs in a
+ // region.
+ //
+ std::unique_ptr<SchedGroup> MFMASchedGroup = nullptr;
+ std::unique_ptr<SchedGroup> VALUSchedGroup = nullptr;
+ std::unique_ptr<SchedGroup> SALUSchedGroup = nullptr;
+ std::unique_ptr<SchedGroup> VMEMReadSchedGroup = nullptr;
+ std::unique_ptr<SchedGroup> VMEMWriteSchedGroup = nullptr;
+ std::unique_ptr<SchedGroup> DSWriteSchedGroup = nullptr;
+ std::unique_ptr<SchedGroup> DSReadSchedGroup = nullptr;
+
+ // Use a SCHED_BARRIER's mask to identify instruction SchedGroups that should
+ // not be reordered across the SCHED_BARRIER.
+ void getSchedGroupsFromMask(int32_t Mask,
+ SmallVectorImpl<SchedGroup *> &SchedGroups);
+
+ // Add DAG edges that enforce SCHED_BARRIER ordering.
+ void addSchedBarrierEdges(SUnit &SU);
+
+ // Classify instructions and add them to the SchedGroup.
+ void initSchedGroup(SchedGroup *SG);
+
+ // Remove all existing edges from a SCHED_BARRIER.
+ void resetSchedBarrierEdges(SUnit &SU);
+
+public:
+ void apply(ScheduleDAGInstrs *DAGInstrs) override;
+
+ SchedBarrierDAGMutation() = default;
+};
+
+void IGroupLPDAGMutation::apply(ScheduleDAGInstrs *DAGInstrs) {
+ const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget<GCNSubtarget>();
+ TII = ST.getInstrInfo();
+ DAG = static_cast<ScheduleDAGMI *>(DAGInstrs);
+ const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel();
+ if (!TSchedModel || DAG->SUnits.empty())
+ return;
+
+ LLVM_DEBUG(dbgs() << "Applying IGroupLPDAGMutation...\n");
+
+ // The order of InstructionGroups in this vector defines the
+ // order in which edges will be added. In other words, given the
+ // present ordering, we will try to make each VMEMRead instruction
+ // a predecessor of each DSRead instruction, and so on.
+ SmallVector<SchedGroup, 4> PipelineOrderGroups = {
+ SchedGroup(isVMEMSGMember, VMEMGroupMaxSize, DAG),
+ SchedGroup(isDSReadSGMember, LDRGroupMaxSize, DAG),
+ SchedGroup(isMFMASGMember, MFMAGroupMaxSize, DAG),
+ SchedGroup(isDSWriteSGMember, LDWGroupMaxSize, DAG)};
+
+ for (SUnit &SU : DAG->SUnits) {
+ LLVM_DEBUG(dbgs() << "Checking Node"; DAG->dumpNode(SU));
+ for (auto &SG : PipelineOrderGroups)
+ if (SG.canAddSU(SU, TII))
+ SG.add(SU);
+ }
+
+ for (unsigned i = 0; i < PipelineOrderGroups.size() - 1; i++) {
+ auto &GroupA = PipelineOrderGroups[i];
+ for (unsigned j = i + 1; j < PipelineOrderGroups.size(); j++) {
+ auto &GroupB = PipelineOrderGroups[j];
+ GroupA.link(GroupB);
+ }
+ }
+}
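
As a toy illustration of the pairwise linking above: every member of an earlier pipeline group becomes an artificial predecessor of every member of each later group. The instruction names are made up, and the DAG legality check (canAddEdge) and SDep construction are replaced by a print.

#include <cstdio>
#include <string>
#include <utility>
#include <vector>

int main() {
  // Hypothetical group contents, in the same pipeline order as above.
  std::vector<std::pair<std::string, std::vector<std::string>>> Groups = {
      {"VMEM", {"load0", "load1"}},
      {"DS_READ", {"ds_read0"}},
      {"MFMA", {"mfma0"}},
      {"DS_WRITE", {"ds_write0"}}};

  for (size_t I = 0; I + 1 < Groups.size(); ++I)
    for (size_t J = I + 1; J < Groups.size(); ++J)
      for (const auto &A : Groups[I].second)
        for (const auto &B : Groups[J].second)
          std::printf("edge: %s -> %s\n", A.c_str(), B.c_str());
}
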
+
+void SchedBarrierDAGMutation::apply(ScheduleDAGInstrs *DAGInstrs) {
+ const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel();
+ if (!TSchedModel || DAGInstrs->SUnits.empty())
+ return;
+
+ LLVM_DEBUG(dbgs() << "Applying SchedBarrierDAGMutation...\n");
+
+ const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget<GCNSubtarget>();
+ TII = ST.getInstrInfo();
+ DAG = static_cast<ScheduleDAGMI *>(DAGInstrs);
+ for (auto &SU : DAG->SUnits)
+ if (SU.getInstr()->getOpcode() == AMDGPU::SCHED_BARRIER)
+ addSchedBarrierEdges(SU);
+}
+
+void SchedBarrierDAGMutation::addSchedBarrierEdges(SUnit &SchedBarrier) {
+ MachineInstr &MI = *SchedBarrier.getInstr();
+ assert(MI.getOpcode() == AMDGPU::SCHED_BARRIER);
+ // Remove all existing edges from the SCHED_BARRIER that were added due to the
+ // instruction having side effects.
+ resetSchedBarrierEdges(SchedBarrier);
+ SmallVector<SchedGroup *, 4> SchedGroups;
+ int32_t Mask = MI.getOperand(0).getImm();
+ getSchedGroupsFromMask(Mask, SchedGroups);
+ for (auto SG : SchedGroups)
+ SG->link(
+ SchedBarrier, (function_ref<bool(const SUnit *A, const SUnit *B)>)[](
+ const SUnit *A, const SUnit *B) {
+ return A->NodeNum > B->NodeNum;
+ });
+}
+
+void SchedBarrierDAGMutation::getSchedGroupsFromMask(
+ int32_t Mask, SmallVectorImpl<SchedGroup *> &SchedGroups) {
+ SchedBarrierMasks SBMask = (SchedBarrierMasks)Mask;
+ // See IntrinsicsAMDGPU.td for an explanation of these masks and their
+ // mappings.
+ //
+ if ((SBMask & SchedBarrierMasks::VALU) == SchedBarrierMasks::NONE &&
+ (SBMask & SchedBarrierMasks::ALU) == SchedBarrierMasks::NONE) {
+ if (!VALUSchedGroup) {
+ VALUSchedGroup = std::make_unique<SchedGroup>(isVALUSGMember, None, DAG);
+ initSchedGroup(VALUSchedGroup.get());
+ }
+
+ SchedGroups.push_back(VALUSchedGroup.get());
+ }
+
+ if ((SBMask & SchedBarrierMasks::SALU) == SchedBarrierMasks::NONE &&
+ (SBMask & SchedBarrierMasks::ALU) == SchedBarrierMasks::NONE) {
+ if (!SALUSchedGroup) {
+ SALUSchedGroup = std::make_unique<SchedGroup>(isSALUSGMember, None, DAG);
+ initSchedGroup(SALUSchedGroup.get());
+ }
+
+ SchedGroups.push_back(SALUSchedGroup.get());
+ }
+
+ if ((SBMask & SchedBarrierMasks::MFMA) == SchedBarrierMasks::NONE &&
+ (SBMask & SchedBarrierMasks::ALU) == SchedBarrierMasks::NONE) {
+ if (!MFMASchedGroup) {
+ MFMASchedGroup = std::make_unique<SchedGroup>(isMFMASGMember, None, DAG);
+ initSchedGroup(MFMASchedGroup.get());
+ }
+
+ SchedGroups.push_back(MFMASchedGroup.get());
+ }
+
+ if ((SBMask & SchedBarrierMasks::VMEM_READ) == SchedBarrierMasks::NONE &&
+ (SBMask & SchedBarrierMasks::VMEM) == SchedBarrierMasks::NONE) {
+ if (!VMEMReadSchedGroup) {
+ VMEMReadSchedGroup =
+ std::make_unique<SchedGroup>(isVMEMReadSGMember, None, DAG);
+ initSchedGroup(VMEMReadSchedGroup.get());
+ }
+
+ SchedGroups.push_back(VMEMReadSchedGroup.get());
+ }
+
+ if ((SBMask & SchedBarrierMasks::VMEM_WRITE) == SchedBarrierMasks::NONE &&
+ (SBMask & SchedBarrierMasks::VMEM) == SchedBarrierMasks::NONE) {
+ if (!VMEMWriteSchedGroup) {
+ VMEMWriteSchedGroup =
+ std::make_unique<SchedGroup>(isVMEMWriteSGMember, None, DAG);
+ initSchedGroup(VMEMWriteSchedGroup.get());
+ }
+
+ SchedGroups.push_back(VMEMWriteSchedGroup.get());
+ }
+
+ if ((SBMask & SchedBarrierMasks::DS_READ) == SchedBarrierMasks::NONE &&
+ (SBMask & SchedBarrierMasks::DS) == SchedBarrierMasks::NONE) {
+ if (!DSReadSchedGroup) {
+ DSReadSchedGroup =
+ std::make_unique<SchedGroup>(isDSReadSGMember, None, DAG);
+ initSchedGroup(DSReadSchedGroup.get());
+ }
+
+ SchedGroups.push_back(DSReadSchedGroup.get());
+ }
+
+ if ((SBMask & SchedBarrierMasks::DS_WRITE) == SchedBarrierMasks::NONE &&
+ (SBMask & SchedBarrierMasks::DS) == SchedBarrierMasks::NONE) {
+ if (!DSWriteSchedGroup) {
+ DSWriteSchedGroup =
+ std::make_unique<SchedGroup>(isDSWriteSGMember, None, DAG);
+ initSchedGroup(DSWriteSchedGroup.get());
+ }
+
+ SchedGroups.push_back(DSWriteSchedGroup.get());
+ }
+}
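
A rough standalone illustration of the mask handling above: a set bit means the corresponding class may still be scheduled across the barrier, so only classes whose own bit and whose parent-class bit are both clear receive blocking edges. The bit layout copies the SchedBarrierMasks enum; the SchedGroup machinery is reduced to collecting class names.

#include <cstdint>
#include <cstdio>
#include <string>
#include <vector>

// Same bit layout as SchedBarrierMasks above.
enum : uint32_t {
  ALU = 1u << 0, VALU = 1u << 1, SALU = 1u << 2, MFMA = 1u << 3,
  VMEM = 1u << 4, VMEM_READ = 1u << 5, VMEM_WRITE = 1u << 6,
  DS = 1u << 7, DS_READ = 1u << 8, DS_WRITE = 1u << 9
};

// A class is pinned (receives blocking edges) when neither its own bit nor
// its parent-class bit is set in the mask.
static std::vector<std::string> pinnedClasses(uint32_t Mask) {
  std::vector<std::string> Pinned;
  auto Blocked = [&](uint32_t Own, uint32_t Parent) {
    return !(Mask & Own) && !(Mask & Parent);
  };
  if (Blocked(VALU, ALU))        Pinned.push_back("VALU");
  if (Blocked(SALU, ALU))        Pinned.push_back("SALU");
  if (Blocked(MFMA, ALU))        Pinned.push_back("MFMA");
  if (Blocked(VMEM_READ, VMEM))  Pinned.push_back("VMEM_READ");
  if (Blocked(VMEM_WRITE, VMEM)) Pinned.push_back("VMEM_WRITE");
  if (Blocked(DS_READ, DS))      Pinned.push_back("DS_READ");
  if (Blocked(DS_WRITE, DS))     Pinned.push_back("DS_WRITE");
  return Pinned;
}

int main() {
  // Mask = VMEM: vector memory accesses may cross the SCHED_BARRIER, every
  // other class stays on its side of it.
  for (const auto &C : pinnedClasses(VMEM))
    std::printf("%s\n", C.c_str());
}
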
+
+void SchedBarrierDAGMutation::initSchedGroup(SchedGroup *SG) {
+ assert(SG);
+ for (auto &SU : DAG->SUnits)
+ if (SG->canAddSU(SU, TII))
+ SG->add(SU);
+}
+
+void SchedBarrierDAGMutation::resetSchedBarrierEdges(SUnit &SU) {
+ assert(SU.getInstr()->getOpcode() == AMDGPU::SCHED_BARRIER);
+ for (auto &P : SU.Preds)
+ SU.removePred(P);
+
+ for (auto &S : SU.Succs) {
+ for (auto &SP : S.getSUnit()->Preds) {
+ if (SP.getSUnit() == &SU) {
+ S.getSUnit()->removePred(SP);
+ }
+ }
+ }
+}
+
+} // namespace
+
+namespace llvm {
+
+std::unique_ptr<ScheduleDAGMutation> createIGroupLPDAGMutation() {
+ return EnableIGroupLP ? std::make_unique<IGroupLPDAGMutation>() : nullptr;
+}
+
+std::unique_ptr<ScheduleDAGMutation> createSchedBarrierDAGMutation() {
+ return std::make_unique<SchedBarrierDAGMutation>();
+}
+
+} // end namespace llvm
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.h b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.h
new file mode 100644
index 000000000000..aeb1bbad3705
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.h
@@ -0,0 +1,22 @@
+//===- AMDGPUIGroupLP.h - AMDGPU IGroupLP -----------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUMFMAIGROUPLP_H
+#define LLVM_LIB_TARGET_AMDGPU_AMDGPUMFMAIGROUPLP_H
+
+#include "llvm/CodeGen/ScheduleDAGMutation.h"
+#include <memory>
+
+namespace llvm {
+
+std::unique_ptr<ScheduleDAGMutation> createIGroupLPDAGMutation();
+std::unique_ptr<ScheduleDAGMutation> createSchedBarrierDAGMutation();
+
+} // namespace llvm
+
+#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUMFMAIGROUPLP_H
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index 8236e6672247..b00df27f5fd3 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -13,7 +13,9 @@
#include "AMDGPUISelDAGToDAG.h"
#include "AMDGPU.h"
+#include "AMDGPUSubtarget.h"
#include "AMDGPUTargetMachine.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "MCTargetDesc/R600MCTargetDesc.h"
#include "R600RegisterInfo.h"
#include "SIMachineFunctionInfo.h"
@@ -679,9 +681,6 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) {
case ISD::FMA:
SelectFMAD_FMA(N);
return;
- case AMDGPUISD::ATOMIC_CMP_SWAP:
- SelectATOMIC_CMP_SWAP(N);
- return;
case AMDGPUISD::CVT_PKRTZ_F16_F32:
case AMDGPUISD::CVT_PKNORM_I16_F32:
case AMDGPUISD::CVT_PKNORM_U16_F32:
@@ -1008,7 +1007,12 @@ void AMDGPUDAGToDAGISel::SelectDIV_SCALE(SDNode *N) {
void AMDGPUDAGToDAGISel::SelectMAD_64_32(SDNode *N) {
SDLoc SL(N);
bool Signed = N->getOpcode() == AMDGPUISD::MAD_I64_I32;
- unsigned Opc = Signed ? AMDGPU::V_MAD_I64_I32_e64 : AMDGPU::V_MAD_U64_U32_e64;
+ unsigned Opc;
+ if (Subtarget->getGeneration() == AMDGPUSubtarget::GFX11)
+ Opc = Signed ? AMDGPU::V_MAD_I64_I32_gfx11_e64
+ : AMDGPU::V_MAD_U64_U32_gfx11_e64;
+ else
+ Opc = Signed ? AMDGPU::V_MAD_I64_I32_e64 : AMDGPU::V_MAD_U64_U32_e64;
SDValue Clamp = CurDAG->getTargetConstant(0, SL, MVT::i1);
SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2),
@@ -1021,7 +1025,12 @@ void AMDGPUDAGToDAGISel::SelectMAD_64_32(SDNode *N) {
void AMDGPUDAGToDAGISel::SelectMUL_LOHI(SDNode *N) {
SDLoc SL(N);
bool Signed = N->getOpcode() == ISD::SMUL_LOHI;
- unsigned Opc = Signed ? AMDGPU::V_MAD_I64_I32_e64 : AMDGPU::V_MAD_U64_U32_e64;
+ unsigned Opc;
+ if (Subtarget->getGeneration() == AMDGPUSubtarget::GFX11)
+ Opc = Signed ? AMDGPU::V_MAD_I64_I32_gfx11_e64
+ : AMDGPU::V_MAD_U64_U32_gfx11_e64;
+ else
+ Opc = Signed ? AMDGPU::V_MAD_I64_I32_e64 : AMDGPU::V_MAD_U64_U32_e64;
SDValue Zero = CurDAG->getTargetConstant(0, SL, MVT::i64);
SDValue Clamp = CurDAG->getTargetConstant(0, SL, MVT::i1);
@@ -1798,6 +1807,82 @@ bool AMDGPUDAGToDAGISel::SelectScratchSAddr(SDNode *Parent, SDValue Addr,
return true;
}
+// Check whether the flat scratch SVS swizzle bug affects this access.
+bool AMDGPUDAGToDAGISel::checkFlatScratchSVSSwizzleBug(
+ SDValue VAddr, SDValue SAddr, uint64_t ImmOffset) const {
+ if (!Subtarget->hasFlatScratchSVSSwizzleBug())
+ return false;
+
+ // The bug affects the swizzling of SVS accesses if there is any carry out
+ // from the two low order bits (i.e. from bit 1 into bit 2) when adding
+ // voffset to (soffset + inst_offset).
+ KnownBits VKnown = CurDAG->computeKnownBits(VAddr);
+ KnownBits SKnown = KnownBits::computeForAddSub(
+ true, false, CurDAG->computeKnownBits(SAddr),
+ KnownBits::makeConstant(APInt(32, ImmOffset)));
+ uint64_t VMax = VKnown.getMaxValue().getZExtValue();
+ uint64_t SMax = SKnown.getMaxValue().getZExtValue();
+ return (VMax & 3) + (SMax & 3) >= 4;
+}
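
To make the carry condition concrete, here is a tiny standalone check using example values; it mirrors the (VMax & 3) + (SMax & 3) >= 4 test above, where VMax and SMax are the known maximum values of voffset and soffset + inst_offset.

#include <cstdint>
#include <cstdio>

// Returns true if adding the two values could carry out of bit 1 (into
// bit 2), which is the case the swizzle-bug check has to reject.
static bool mayCarryIntoBit2(uint64_t VMax, uint64_t SMax) {
  return (VMax & 3) + (SMax & 3) >= 4;
}

int main() {
  // voffset ends in 0b10 and soffset+imm ends in 0b01: 2 + 1 == 3 < 4, so no
  // carry can leave the low two bits and the access is unaffected.
  std::printf("%d\n", mayCarryIntoBit2(0b10, 0b01)); // 0
  // voffset may end in 0b11: 3 + 1 == 4, so a carry from bit 1 into bit 2 is
  // possible and the selector must not form the SVS addressing mode.
  std::printf("%d\n", mayCarryIntoBit2(0b11, 0b01)); // 1
}
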
+
+bool AMDGPUDAGToDAGISel::SelectScratchSVAddr(SDNode *N, SDValue Addr,
+ SDValue &VAddr, SDValue &SAddr,
+ SDValue &Offset) const {
+ int64_t ImmOffset = 0;
+
+ SDValue LHS, RHS;
+ if (isBaseWithConstantOffset64(Addr, LHS, RHS)) {
+ int64_t COffsetVal = cast<ConstantSDNode>(RHS)->getSExtValue();
+ const SIInstrInfo *TII = Subtarget->getInstrInfo();
+
+ if (TII->isLegalFLATOffset(COffsetVal, AMDGPUAS::PRIVATE_ADDRESS, true)) {
+ Addr = LHS;
+ ImmOffset = COffsetVal;
+ } else if (!LHS->isDivergent() && COffsetVal > 0) {
+ SDLoc SL(N);
+ // saddr + large_offset -> saddr + (vaddr = large_offset & ~MaxOffset) +
+ // (large_offset & MaxOffset);
+ int64_t SplitImmOffset, RemainderOffset;
+ std::tie(SplitImmOffset, RemainderOffset)
+ = TII->splitFlatOffset(COffsetVal, AMDGPUAS::PRIVATE_ADDRESS, true);
+
+ if (isUInt<32>(RemainderOffset)) {
+ SDNode *VMov = CurDAG->getMachineNode(
+ AMDGPU::V_MOV_B32_e32, SL, MVT::i32,
+ CurDAG->getTargetConstant(RemainderOffset, SDLoc(), MVT::i32));
+ VAddr = SDValue(VMov, 0);
+ SAddr = LHS;
+ if (checkFlatScratchSVSSwizzleBug(VAddr, SAddr, SplitImmOffset))
+ return false;
+ Offset = CurDAG->getTargetConstant(SplitImmOffset, SDLoc(), MVT::i16);
+ return true;
+ }
+ }
+ }
+
+ if (Addr.getOpcode() != ISD::ADD)
+ return false;
+
+ LHS = Addr.getOperand(0);
+ RHS = Addr.getOperand(1);
+
+ if (!LHS->isDivergent() && RHS->isDivergent()) {
+ SAddr = LHS;
+ VAddr = RHS;
+ } else if (!RHS->isDivergent() && LHS->isDivergent()) {
+ SAddr = RHS;
+ VAddr = LHS;
+ } else {
+ return false;
+ }
+
+ if (checkFlatScratchSVSSwizzleBug(VAddr, SAddr, ImmOffset))
+ return false;
+ SAddr = SelectSAddrFI(CurDAG, SAddr);
+ Offset = CurDAG->getTargetConstant(ImmOffset, SDLoc(), MVT::i16);
+ return true;
+}
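
A simplified sketch of the large-offset split described in the comment above: the constant is divided into a part that fits the instruction's immediate field and a remainder that is materialized into a VGPR. The maximum immediate here is hypothetical; the real split, including negative offsets and per-subtarget ranges, is done by splitFlatOffset.

#include <cstdint>
#include <cstdio>
#include <utility>

// Split Offset into (imm, remainder) for a power-of-two-minus-one MaxImm,
// i.e. imm = Offset & MaxImm and remainder = Offset & ~MaxImm.
static std::pair<int64_t, int64_t> splitOffset(int64_t Offset, int64_t MaxImm) {
  int64_t Imm = Offset % (MaxImm + 1);
  return {Imm, Offset - Imm};
}

int main() {
  // e.g. a 10000-byte offset with a hypothetical 4095-byte immediate range:
  // imm = 1808, remainder = 8192 (put into a VGPR with V_MOV_B32).
  auto Split = splitOffset(10000, 4095);
  std::printf("imm=%lld remainder=%lld\n", (long long)Split.first,
              (long long)Split.second);
}
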
+
bool AMDGPUDAGToDAGISel::SelectSMRDOffset(SDValue ByteOffsetNode,
SDValue &Offset, bool &Imm) const {
ConstantSDNode *C = dyn_cast<ConstantSDNode>(ByteOffsetNode);
@@ -2224,70 +2309,6 @@ void AMDGPUDAGToDAGISel::SelectFMAD_FMA(SDNode *N) {
}
}
-// This is here because there isn't a way to use the generated sub0_sub1 as the
-// subreg index to EXTRACT_SUBREG in tablegen.
-void AMDGPUDAGToDAGISel::SelectATOMIC_CMP_SWAP(SDNode *N) {
- MemSDNode *Mem = cast<MemSDNode>(N);
- unsigned AS = Mem->getAddressSpace();
- if (AS == AMDGPUAS::FLAT_ADDRESS) {
- SelectCode(N);
- return;
- }
-
- MVT VT = N->getSimpleValueType(0);
- bool Is32 = (VT == MVT::i32);
- SDLoc SL(N);
-
- MachineSDNode *CmpSwap = nullptr;
- if (Subtarget->hasAddr64()) {
- SDValue SRsrc, VAddr, SOffset, Offset;
-
- if (SelectMUBUFAddr64(Mem->getBasePtr(), SRsrc, VAddr, SOffset, Offset)) {
- unsigned Opcode = Is32 ? AMDGPU::BUFFER_ATOMIC_CMPSWAP_ADDR64_RTN :
- AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_ADDR64_RTN;
- SDValue CmpVal = Mem->getOperand(2);
- SDValue CPol = CurDAG->getTargetConstant(AMDGPU::CPol::GLC, SL, MVT::i32);
-
- // XXX - Do we care about glue operands?
-
- SDValue Ops[] = {CmpVal, VAddr, SRsrc, SOffset, Offset, CPol,
- Mem->getChain()};
-
- CmpSwap = CurDAG->getMachineNode(Opcode, SL, Mem->getVTList(), Ops);
- }
- }
-
- if (!CmpSwap) {
- SDValue SRsrc, SOffset, Offset;
- if (SelectMUBUFOffset(Mem->getBasePtr(), SRsrc, SOffset, Offset)) {
- unsigned Opcode = Is32 ? AMDGPU::BUFFER_ATOMIC_CMPSWAP_OFFSET_RTN :
- AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_OFFSET_RTN;
-
- SDValue CmpVal = Mem->getOperand(2);
- SDValue CPol = CurDAG->getTargetConstant(AMDGPU::CPol::GLC, SL, MVT::i32);
- SDValue Ops[] = {CmpVal, SRsrc, SOffset, Offset, CPol, Mem->getChain()};
-
- CmpSwap = CurDAG->getMachineNode(Opcode, SL, Mem->getVTList(), Ops);
- }
- }
-
- if (!CmpSwap) {
- SelectCode(N);
- return;
- }
-
- MachineMemOperand *MMO = Mem->getMemOperand();
- CurDAG->setNodeMemRefs(CmpSwap, {MMO});
-
- unsigned SubReg = Is32 ? AMDGPU::sub0 : AMDGPU::sub0_sub1;
- SDValue Extract
- = CurDAG->getTargetExtractSubreg(SubReg, SL, VT, SDValue(CmpSwap, 0));
-
- ReplaceUses(SDValue(N, 0), Extract);
- ReplaceUses(SDValue(N, 1), SDValue(CmpSwap, 1));
- CurDAG->RemoveDeadNode(N);
-}
-
void AMDGPUDAGToDAGISel::SelectDSAppendConsume(SDNode *N, unsigned IntrID) {
// The address is assumed to be uniform, so if it ends up in a VGPR, it will
// be copied to an SGPR with readfirstlane.
@@ -2587,6 +2608,30 @@ bool AMDGPUDAGToDAGISel::SelectVOP3NoMods(SDValue In, SDValue &Src) const {
return true;
}
+bool AMDGPUDAGToDAGISel::SelectVINTERPModsImpl(SDValue In, SDValue &Src,
+ SDValue &SrcMods,
+ bool OpSel) const {
+ unsigned Mods;
+ if (SelectVOP3ModsImpl(In, Src, Mods, /* AllowAbs */ false)) {
+ if (OpSel)
+ Mods |= SISrcMods::OP_SEL_0;
+ SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
+ return true;
+ }
+
+ return false;
+}
+
+bool AMDGPUDAGToDAGISel::SelectVINTERPMods(SDValue In, SDValue &Src,
+ SDValue &SrcMods) const {
+ return SelectVINTERPModsImpl(In, Src, SrcMods, /* OpSel */ false);
+}
+
+bool AMDGPUDAGToDAGISel::SelectVINTERPModsHi(SDValue In, SDValue &Src,
+ SDValue &SrcMods) const {
+ return SelectVINTERPModsImpl(In, Src, SrcMods, /* OpSel */ true);
+}
+
bool AMDGPUDAGToDAGISel::SelectVOP3Mods0(SDValue In, SDValue &Src,
SDValue &SrcMods, SDValue &Clamp,
SDValue &Omod) const {
@@ -2619,7 +2664,7 @@ bool AMDGPUDAGToDAGISel::SelectVOP3OMods(SDValue In, SDValue &Src,
}
bool AMDGPUDAGToDAGISel::SelectVOP3PMods(SDValue In, SDValue &Src,
- SDValue &SrcMods) const {
+ SDValue &SrcMods, bool IsDOT) const {
unsigned Mods = 0;
Src = In;
@@ -2628,7 +2673,8 @@ bool AMDGPUDAGToDAGISel::SelectVOP3PMods(SDValue In, SDValue &Src,
Src = Src.getOperand(0);
}
- if (Src.getOpcode() == ISD::BUILD_VECTOR) {
+ if (Src.getOpcode() == ISD::BUILD_VECTOR &&
+ (!IsDOT || !Subtarget->hasDOTOpSelHazard())) {
unsigned VecMods = Mods;
SDValue Lo = stripBitcast(Src.getOperand(0));
@@ -2716,6 +2762,40 @@ bool AMDGPUDAGToDAGISel::SelectVOP3PMods(SDValue In, SDValue &Src,
return true;
}
+bool AMDGPUDAGToDAGISel::SelectVOP3PModsDOT(SDValue In, SDValue &Src,
+ SDValue &SrcMods) const {
+ return SelectVOP3PMods(In, Src, SrcMods, true);
+}
+
+bool AMDGPUDAGToDAGISel::SelectDotIUVOP3PMods(SDValue In, SDValue &Src) const {
+ const ConstantSDNode *C = cast<ConstantSDNode>(In);
+  // The literal i1 value set in the intrinsic represents SrcMods for the next
+  // operand: 1 promotes packed values to signed, 0 treats them as unsigned.
+ assert(C->getAPIntValue().getBitWidth() == 1 && "expected i1 value");
+
+ unsigned Mods = SISrcMods::OP_SEL_1;
+ unsigned SrcSign = C->getAPIntValue().getZExtValue();
+ if (SrcSign == 1)
+ Mods ^= SISrcMods::NEG;
+
+ Src = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
+ return true;
+}
+
+bool AMDGPUDAGToDAGISel::SelectWMMAOpSelVOP3PMods(SDValue In,
+ SDValue &Src) const {
+ const ConstantSDNode *C = cast<ConstantSDNode>(In);
+ assert(C->getAPIntValue().getBitWidth() == 1 && "expected i1 value");
+
+ unsigned Mods = SISrcMods::OP_SEL_1;
+ unsigned SrcVal = C->getAPIntValue().getZExtValue();
+ if (SrcVal == 1)
+ Mods |= SISrcMods::OP_SEL_0;
+
+ Src = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
+ return true;
+}
+
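
SelectDotIUVOP3PMods and SelectWMMAOpSelVOP3PMods both fold a literal i1 intrinsic operand into a source-modifiers immediate, in the first case toggling negation to mark the packed values as signed and in the second selecting the high half via op_sel. A minimal sketch of that folding; the flag values below are placeholders for illustration, not the actual SISrcMods encoding:

#include <cstdint>

enum SrcModFlags : uint32_t { // illustrative values only
  MOD_NEG = 1u << 0,
  MOD_OP_SEL_0 = 1u << 2,
  MOD_OP_SEL_1 = 1u << 3
};

// dot-IU variant: the i1 means "treat packed values as signed" -> toggle NEG.
uint32_t dotIUMods(bool Signed) {
  uint32_t Mods = MOD_OP_SEL_1;
  if (Signed)
    Mods ^= MOD_NEG;
  return Mods;
}

// WMMA op_sel variant: the i1 selects the high half -> set OP_SEL_0.
uint32_t wmmaOpSelMods(bool High) {
  uint32_t Mods = MOD_OP_SEL_1;
  if (High)
    Mods |= MOD_OP_SEL_0;
  return Mods;
}
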
bool AMDGPUDAGToDAGISel::SelectVOP3OpSel(SDValue In, SDValue &Src,
SDValue &SrcMods) const {
Src = In;
@@ -2840,7 +2920,7 @@ bool AMDGPUDAGToDAGISel::isVGPRImm(const SDNode * N) const {
}
}
}
- // If "AllUsesAcceptSReg == false" so far we haven't suceeded
+ // If "AllUsesAcceptSReg == false" so far we haven't succeeded
    // commuting the current user. This means we have at least one use
    // that strictly requires a VGPR. Thus, we will not attempt to commute
// other user instructions.
@@ -2854,26 +2934,15 @@ bool AMDGPUDAGToDAGISel::isVGPRImm(const SDNode * N) const {
bool AMDGPUDAGToDAGISel::isUniformLoad(const SDNode * N) const {
auto Ld = cast<LoadSDNode>(N);
- return Ld->getAlignment() >= 4 &&
- (
- (
- (
- Ld->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
- Ld->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT
- )
- &&
- !N->isDivergent()
- )
- ||
- (
- Subtarget->getScalarizeGlobalBehavior() &&
- Ld->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS &&
- Ld->isSimple() &&
- !N->isDivergent() &&
- static_cast<const SITargetLowering *>(
- getTargetLowering())->isMemOpHasNoClobberedMemOperand(N)
- )
- );
+ return Ld->getAlign() >= Align(4) &&
+ (((Ld->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
+ Ld->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
+ !N->isDivergent()) ||
+ (Subtarget->getScalarizeGlobalBehavior() &&
+ Ld->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS &&
+ Ld->isSimple() && !N->isDivergent() &&
+ static_cast<const SITargetLowering *>(getTargetLowering())
+ ->isMemOpHasNoClobberedMemOperand(N)));
}
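
Reflowed this way, isUniformLoad reads as: the load must be at least dword-aligned and have a uniform address, and it must read either the constant address spaces or, when global scalarization is enabled, a simple global load that the no-clobber analysis proves is not overwritten. A condensed sketch of that shape, with stand-in fields in place of the SelectionDAG and subtarget queries:

// Stand-in descriptor; fields replace the MemSDNode/subtarget queries above.
struct LoadDesc {
  unsigned AlignBytes;
  bool DivergentAddress;
  bool ConstantAddrSpace;     // CONSTANT_ADDRESS or CONSTANT_ADDRESS_32BIT
  bool GlobalAddrSpace;
  bool SimpleAndNotClobbered; // stand-in for the no-clobber analysis
};

bool isUniformLoadSketch(const LoadDesc &L, bool ScalarizeGlobal) {
  if (L.AlignBytes < 4 || L.DivergentAddress)
    return false;
  return L.ConstantAddrSpace ||
         (ScalarizeGlobal && L.GlobalAddrSpace && L.SimpleAndNotClobbered);
}
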
void AMDGPUDAGToDAGISel::PostprocessISelDAG() {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
index d638d9877a9b..862be9dc5568 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
@@ -188,6 +188,10 @@ private:
SDValue &VOffset, SDValue &Offset) const;
bool SelectScratchSAddr(SDNode *N, SDValue Addr, SDValue &SAddr,
SDValue &Offset) const;
+ bool checkFlatScratchSVSSwizzleBug(SDValue VAddr, SDValue SAddr,
+ uint64_t ImmOffset) const;
+ bool SelectScratchSVAddr(SDNode *N, SDValue Addr, SDValue &VAddr,
+ SDValue &SAddr, SDValue &Offset) const;
bool SelectSMRDOffset(SDValue ByteOffsetNode, SDValue &Offset,
bool &Imm) const;
@@ -214,10 +218,20 @@ private:
bool SelectVOP3NoMods0(SDValue In, SDValue &Src, SDValue &SrcMods,
SDValue &Clamp, SDValue &Omod) const;
+ bool SelectVINTERPModsImpl(SDValue In, SDValue &Src, SDValue &SrcMods,
+ bool OpSel) const;
+ bool SelectVINTERPMods(SDValue In, SDValue &Src, SDValue &SrcMods) const;
+ bool SelectVINTERPModsHi(SDValue In, SDValue &Src, SDValue &SrcMods) const;
+
bool SelectVOP3OMods(SDValue In, SDValue &Src, SDValue &Clamp,
SDValue &Omod) const;
- bool SelectVOP3PMods(SDValue In, SDValue &Src, SDValue &SrcMods) const;
+ bool SelectVOP3PMods(SDValue In, SDValue &Src, SDValue &SrcMods,
+ bool IsDOT = false) const;
+ bool SelectVOP3PModsDOT(SDValue In, SDValue &Src, SDValue &SrcMods) const;
+
+ bool SelectDotIUVOP3PMods(SDValue In, SDValue &Src) const;
+ bool SelectWMMAOpSelVOP3PMods(SDValue In, SDValue &Src) const;
bool SelectVOP3OpSel(SDValue In, SDValue &Src, SDValue &SrcMods) const;
@@ -245,7 +259,6 @@ private:
bool isCBranchSCC(const SDNode *N) const;
void SelectBRCOND(SDNode *N);
void SelectFMAD_FMA(SDNode *N);
- void SelectATOMIC_CMP_SWAP(SDNode *N);
void SelectDSAppendConsume(SDNode *N, unsigned IntrID);
void SelectDS_GWS(SDNode *N, unsigned IntrID);
void SelectInterpP1F16(SDNode *N);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index b9d0655feef7..ef7929012597 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -19,6 +19,7 @@
#include "GCNSubtarget.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/Analysis.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/Support/CommandLine.h"
@@ -127,49 +128,27 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
// There are no 64-bit extloads. These should be done as a 32-bit extload and
// an extension to 64-bit.
- for (MVT VT : MVT::integer_valuetypes()) {
- setLoadExtAction(ISD::EXTLOAD, MVT::i64, VT, Expand);
- setLoadExtAction(ISD::SEXTLOAD, MVT::i64, VT, Expand);
- setLoadExtAction(ISD::ZEXTLOAD, MVT::i64, VT, Expand);
- }
+ for (MVT VT : MVT::integer_valuetypes())
+ setLoadExtAction({ISD::EXTLOAD, ISD::SEXTLOAD, ISD::ZEXTLOAD}, MVT::i64, VT,
+ Expand);
for (MVT VT : MVT::integer_valuetypes()) {
if (VT == MVT::i64)
continue;
- setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
- setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i8, Legal);
- setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i16, Legal);
- setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i32, Expand);
-
- setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote);
- setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i8, Legal);
- setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i16, Legal);
- setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i32, Expand);
-
- setLoadExtAction(ISD::EXTLOAD, VT, MVT::i1, Promote);
- setLoadExtAction(ISD::EXTLOAD, VT, MVT::i8, Legal);
- setLoadExtAction(ISD::EXTLOAD, VT, MVT::i16, Legal);
- setLoadExtAction(ISD::EXTLOAD, VT, MVT::i32, Expand);
+ for (auto Op : {ISD::SEXTLOAD, ISD::ZEXTLOAD, ISD::EXTLOAD}) {
+ setLoadExtAction(Op, VT, MVT::i1, Promote);
+ setLoadExtAction(Op, VT, MVT::i8, Legal);
+ setLoadExtAction(Op, VT, MVT::i16, Legal);
+ setLoadExtAction(Op, VT, MVT::i32, Expand);
+ }
}
- for (MVT VT : MVT::integer_fixedlen_vector_valuetypes()) {
- setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i8, Expand);
- setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i8, Expand);
- setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v2i8, Expand);
- setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i8, Expand);
- setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i8, Expand);
- setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v4i8, Expand);
- setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i16, Expand);
- setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i16, Expand);
- setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v2i16, Expand);
- setLoadExtAction(ISD::EXTLOAD, VT, MVT::v3i16, Expand);
- setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v3i16, Expand);
- setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v3i16, Expand);
- setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i16, Expand);
- setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i16, Expand);
- setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v4i16, Expand);
- }
+ for (MVT VT : MVT::integer_fixedlen_vector_valuetypes())
+ for (auto MemVT :
+ {MVT::v2i8, MVT::v4i8, MVT::v2i16, MVT::v3i16, MVT::v4i16})
+ setLoadExtAction({ISD::SEXTLOAD, ISD::ZEXTLOAD, ISD::EXTLOAD}, VT, MemVT,
+ Expand);
setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2f16, Expand);
@@ -304,229 +283,125 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
setTruncStoreAction(MVT::v16i64, MVT::v16i8, Expand);
setTruncStoreAction(MVT::v16i64, MVT::v16i1, Expand);
- setOperationAction(ISD::Constant, MVT::i32, Legal);
- setOperationAction(ISD::Constant, MVT::i64, Legal);
- setOperationAction(ISD::ConstantFP, MVT::f32, Legal);
- setOperationAction(ISD::ConstantFP, MVT::f64, Legal);
+ setOperationAction(ISD::Constant, {MVT::i32, MVT::i64}, Legal);
+ setOperationAction(ISD::ConstantFP, {MVT::f32, MVT::f64}, Legal);
- setOperationAction(ISD::BR_JT, MVT::Other, Expand);
- setOperationAction(ISD::BRIND, MVT::Other, Expand);
+ setOperationAction({ISD::BR_JT, ISD::BRIND}, MVT::Other, Expand);
// This is totally unsupported, just custom lower to produce an error.
setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom);
// Library functions. These default to Expand, but we have instructions
// for them.
- setOperationAction(ISD::FCEIL, MVT::f32, Legal);
- setOperationAction(ISD::FEXP2, MVT::f32, Legal);
- setOperationAction(ISD::FPOW, MVT::f32, Legal);
- setOperationAction(ISD::FLOG2, MVT::f32, Legal);
- setOperationAction(ISD::FABS, MVT::f32, Legal);
- setOperationAction(ISD::FFLOOR, MVT::f32, Legal);
- setOperationAction(ISD::FRINT, MVT::f32, Legal);
- setOperationAction(ISD::FTRUNC, MVT::f32, Legal);
- setOperationAction(ISD::FMINNUM, MVT::f32, Legal);
- setOperationAction(ISD::FMAXNUM, MVT::f32, Legal);
+ setOperationAction({ISD::FCEIL, ISD::FEXP2, ISD::FPOW, ISD::FLOG2, ISD::FABS,
+ ISD::FFLOOR, ISD::FRINT, ISD::FTRUNC, ISD::FMINNUM,
+ ISD::FMAXNUM},
+ MVT::f32, Legal);
- setOperationAction(ISD::FROUND, MVT::f32, Custom);
- setOperationAction(ISD::FROUND, MVT::f64, Custom);
+ setOperationAction(ISD::FROUND, {MVT::f32, MVT::f64}, Custom);
- setOperationAction(ISD::FLOG, MVT::f32, Custom);
- setOperationAction(ISD::FLOG10, MVT::f32, Custom);
- setOperationAction(ISD::FEXP, MVT::f32, Custom);
+ setOperationAction({ISD::FLOG, ISD::FLOG10, ISD::FEXP}, MVT::f32, Custom);
+ setOperationAction(ISD::FNEARBYINT, {MVT::f32, MVT::f64}, Custom);
- setOperationAction(ISD::FNEARBYINT, MVT::f32, Custom);
- setOperationAction(ISD::FNEARBYINT, MVT::f64, Custom);
-
- setOperationAction(ISD::FREM, MVT::f16, Custom);
- setOperationAction(ISD::FREM, MVT::f32, Custom);
- setOperationAction(ISD::FREM, MVT::f64, Custom);
+ setOperationAction(ISD::FREM, {MVT::f16, MVT::f32, MVT::f64}, Custom);
// Expand to fneg + fadd.
setOperationAction(ISD::FSUB, MVT::f64, Expand);
- setOperationAction(ISD::CONCAT_VECTORS, MVT::v3i32, Custom);
- setOperationAction(ISD::CONCAT_VECTORS, MVT::v3f32, Custom);
- setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i32, Custom);
- setOperationAction(ISD::CONCAT_VECTORS, MVT::v4f32, Custom);
- setOperationAction(ISD::CONCAT_VECTORS, MVT::v5i32, Custom);
- setOperationAction(ISD::CONCAT_VECTORS, MVT::v5f32, Custom);
- setOperationAction(ISD::CONCAT_VECTORS, MVT::v6i32, Custom);
- setOperationAction(ISD::CONCAT_VECTORS, MVT::v6f32, Custom);
- setOperationAction(ISD::CONCAT_VECTORS, MVT::v7i32, Custom);
- setOperationAction(ISD::CONCAT_VECTORS, MVT::v7f32, Custom);
- setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i32, Custom);
- setOperationAction(ISD::CONCAT_VECTORS, MVT::v8f32, Custom);
- setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2f16, Custom);
- setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2i16, Custom);
- setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4f16, Custom);
- setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4i16, Custom);
- setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2f32, Custom);
- setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2i32, Custom);
- setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v3f32, Custom);
- setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v3i32, Custom);
- setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4f32, Custom);
- setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4i32, Custom);
- setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v5f32, Custom);
- setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v5i32, Custom);
- setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v6f32, Custom);
- setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v6i32, Custom);
- setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v7f32, Custom);
- setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v7i32, Custom);
- setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8f32, Custom);
- setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8i32, Custom);
- setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v16f32, Custom);
- setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v16i32, Custom);
- setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v32f32, Custom);
- setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v32i32, Custom);
- setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2f64, Custom);
- setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2i64, Custom);
- setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v3f64, Custom);
- setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v3i64, Custom);
- setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4f64, Custom);
- setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4i64, Custom);
- setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8f64, Custom);
- setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8i64, Custom);
- setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v16f64, Custom);
- setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v16i64, Custom);
+ setOperationAction(ISD::CONCAT_VECTORS,
+ {MVT::v3i32, MVT::v3f32, MVT::v4i32, MVT::v4f32,
+ MVT::v5i32, MVT::v5f32, MVT::v6i32, MVT::v6f32,
+ MVT::v7i32, MVT::v7f32, MVT::v8i32, MVT::v8f32},
+ Custom);
+ setOperationAction(
+ ISD::EXTRACT_SUBVECTOR,
+ {MVT::v2f16, MVT::v2i16, MVT::v4f16, MVT::v4i16, MVT::v2f32,
+ MVT::v2i32, MVT::v3f32, MVT::v3i32, MVT::v4f32, MVT::v4i32,
+ MVT::v5f32, MVT::v5i32, MVT::v6f32, MVT::v6i32, MVT::v7f32,
+ MVT::v7i32, MVT::v8f32, MVT::v8i32, MVT::v16f16, MVT::v16i16,
+ MVT::v16f32, MVT::v16i32, MVT::v32f32, MVT::v32i32, MVT::v2f64,
+ MVT::v2i64, MVT::v3f64, MVT::v3i64, MVT::v4f64, MVT::v4i64,
+ MVT::v8f64, MVT::v8i64, MVT::v16f64, MVT::v16i64},
+ Custom);
setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
- setOperationAction(ISD::FP_TO_FP16, MVT::f64, Custom);
- setOperationAction(ISD::FP_TO_FP16, MVT::f32, Custom);
+ setOperationAction(ISD::FP_TO_FP16, {MVT::f64, MVT::f32}, Custom);
const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 };
for (MVT VT : ScalarIntVTs) {
// These should use [SU]DIVREM, so set them to expand
- setOperationAction(ISD::SDIV, VT, Expand);
- setOperationAction(ISD::UDIV, VT, Expand);
- setOperationAction(ISD::SREM, VT, Expand);
- setOperationAction(ISD::UREM, VT, Expand);
+ setOperationAction({ISD::SDIV, ISD::UDIV, ISD::SREM, ISD::UREM}, VT,
+ Expand);
// GPU does not have divrem function for signed or unsigned.
- setOperationAction(ISD::SDIVREM, VT, Custom);
- setOperationAction(ISD::UDIVREM, VT, Custom);
+ setOperationAction({ISD::SDIVREM, ISD::UDIVREM}, VT, Custom);
// GPU does not have [S|U]MUL_LOHI functions as a single instruction.
- setOperationAction(ISD::SMUL_LOHI, VT, Expand);
- setOperationAction(ISD::UMUL_LOHI, VT, Expand);
+ setOperationAction({ISD::SMUL_LOHI, ISD::UMUL_LOHI}, VT, Expand);
- setOperationAction(ISD::BSWAP, VT, Expand);
- setOperationAction(ISD::CTTZ, VT, Expand);
- setOperationAction(ISD::CTLZ, VT, Expand);
+ setOperationAction({ISD::BSWAP, ISD::CTTZ, ISD::CTLZ}, VT, Expand);
// AMDGPU uses ADDC/SUBC/ADDE/SUBE
- setOperationAction(ISD::ADDC, VT, Legal);
- setOperationAction(ISD::SUBC, VT, Legal);
- setOperationAction(ISD::ADDE, VT, Legal);
- setOperationAction(ISD::SUBE, VT, Legal);
+ setOperationAction({ISD::ADDC, ISD::SUBC, ISD::ADDE, ISD::SUBE}, VT, Legal);
}
// The hardware supports 32-bit FSHR, but not FSHL.
setOperationAction(ISD::FSHR, MVT::i32, Legal);
// The hardware supports 32-bit ROTR, but not ROTL.
- setOperationAction(ISD::ROTL, MVT::i32, Expand);
- setOperationAction(ISD::ROTL, MVT::i64, Expand);
+ setOperationAction(ISD::ROTL, {MVT::i32, MVT::i64}, Expand);
setOperationAction(ISD::ROTR, MVT::i64, Expand);
- setOperationAction(ISD::MULHU, MVT::i16, Expand);
- setOperationAction(ISD::MULHS, MVT::i16, Expand);
+ setOperationAction({ISD::MULHU, ISD::MULHS}, MVT::i16, Expand);
- setOperationAction(ISD::MUL, MVT::i64, Expand);
- setOperationAction(ISD::MULHU, MVT::i64, Expand);
- setOperationAction(ISD::MULHS, MVT::i64, Expand);
- setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
- setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);
- setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
- setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
+ setOperationAction({ISD::MUL, ISD::MULHU, ISD::MULHS}, MVT::i64, Expand);
+ setOperationAction(
+ {ISD::UINT_TO_FP, ISD::SINT_TO_FP, ISD::FP_TO_SINT, ISD::FP_TO_UINT},
+ MVT::i64, Custom);
setOperationAction(ISD::SELECT_CC, MVT::i64, Expand);
- setOperationAction(ISD::SMIN, MVT::i32, Legal);
- setOperationAction(ISD::UMIN, MVT::i32, Legal);
- setOperationAction(ISD::SMAX, MVT::i32, Legal);
- setOperationAction(ISD::UMAX, MVT::i32, Legal);
+ setOperationAction({ISD::SMIN, ISD::UMIN, ISD::SMAX, ISD::UMAX}, MVT::i32,
+ Legal);
- setOperationAction(ISD::CTTZ, MVT::i64, Custom);
- setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Custom);
- setOperationAction(ISD::CTLZ, MVT::i64, Custom);
- setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom);
+ setOperationAction(
+ {ISD::CTTZ, ISD::CTTZ_ZERO_UNDEF, ISD::CTLZ, ISD::CTLZ_ZERO_UNDEF},
+ MVT::i64, Custom);
static const MVT::SimpleValueType VectorIntTypes[] = {
MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32, MVT::v6i32, MVT::v7i32};
for (MVT VT : VectorIntTypes) {
// Expand the following operations for the current type by default.
- setOperationAction(ISD::ADD, VT, Expand);
- setOperationAction(ISD::AND, VT, Expand);
- setOperationAction(ISD::FP_TO_SINT, VT, Expand);
- setOperationAction(ISD::FP_TO_UINT, VT, Expand);
- setOperationAction(ISD::MUL, VT, Expand);
- setOperationAction(ISD::MULHU, VT, Expand);
- setOperationAction(ISD::MULHS, VT, Expand);
- setOperationAction(ISD::OR, VT, Expand);
- setOperationAction(ISD::SHL, VT, Expand);
- setOperationAction(ISD::SRA, VT, Expand);
- setOperationAction(ISD::SRL, VT, Expand);
- setOperationAction(ISD::ROTL, VT, Expand);
- setOperationAction(ISD::ROTR, VT, Expand);
- setOperationAction(ISD::SUB, VT, Expand);
- setOperationAction(ISD::SINT_TO_FP, VT, Expand);
- setOperationAction(ISD::UINT_TO_FP, VT, Expand);
- setOperationAction(ISD::SDIV, VT, Expand);
- setOperationAction(ISD::UDIV, VT, Expand);
- setOperationAction(ISD::SREM, VT, Expand);
- setOperationAction(ISD::UREM, VT, Expand);
- setOperationAction(ISD::SMUL_LOHI, VT, Expand);
- setOperationAction(ISD::UMUL_LOHI, VT, Expand);
- setOperationAction(ISD::SDIVREM, VT, Expand);
- setOperationAction(ISD::UDIVREM, VT, Expand);
- setOperationAction(ISD::SELECT, VT, Expand);
- setOperationAction(ISD::VSELECT, VT, Expand);
- setOperationAction(ISD::SELECT_CC, VT, Expand);
- setOperationAction(ISD::XOR, VT, Expand);
- setOperationAction(ISD::BSWAP, VT, Expand);
- setOperationAction(ISD::CTPOP, VT, Expand);
- setOperationAction(ISD::CTTZ, VT, Expand);
- setOperationAction(ISD::CTLZ, VT, Expand);
- setOperationAction(ISD::VECTOR_SHUFFLE, VT, Expand);
- setOperationAction(ISD::SETCC, VT, Expand);
+ setOperationAction({ISD::ADD, ISD::AND, ISD::FP_TO_SINT,
+ ISD::FP_TO_UINT, ISD::MUL, ISD::MULHU,
+ ISD::MULHS, ISD::OR, ISD::SHL,
+ ISD::SRA, ISD::SRL, ISD::ROTL,
+ ISD::ROTR, ISD::SUB, ISD::SINT_TO_FP,
+ ISD::UINT_TO_FP, ISD::SDIV, ISD::UDIV,
+ ISD::SREM, ISD::UREM, ISD::SMUL_LOHI,
+ ISD::UMUL_LOHI, ISD::SDIVREM, ISD::UDIVREM,
+ ISD::SELECT, ISD::VSELECT, ISD::SELECT_CC,
+ ISD::XOR, ISD::BSWAP, ISD::CTPOP,
+ ISD::CTTZ, ISD::CTLZ, ISD::VECTOR_SHUFFLE,
+ ISD::SETCC},
+ VT, Expand);
}
static const MVT::SimpleValueType FloatVectorTypes[] = {
MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32, MVT::v6f32, MVT::v7f32};
for (MVT VT : FloatVectorTypes) {
- setOperationAction(ISD::FABS, VT, Expand);
- setOperationAction(ISD::FMINNUM, VT, Expand);
- setOperationAction(ISD::FMAXNUM, VT, Expand);
- setOperationAction(ISD::FADD, VT, Expand);
- setOperationAction(ISD::FCEIL, VT, Expand);
- setOperationAction(ISD::FCOS, VT, Expand);
- setOperationAction(ISD::FDIV, VT, Expand);
- setOperationAction(ISD::FEXP2, VT, Expand);
- setOperationAction(ISD::FEXP, VT, Expand);
- setOperationAction(ISD::FLOG2, VT, Expand);
- setOperationAction(ISD::FREM, VT, Expand);
- setOperationAction(ISD::FLOG, VT, Expand);
- setOperationAction(ISD::FLOG10, VT, Expand);
- setOperationAction(ISD::FPOW, VT, Expand);
- setOperationAction(ISD::FFLOOR, VT, Expand);
- setOperationAction(ISD::FTRUNC, VT, Expand);
- setOperationAction(ISD::FMUL, VT, Expand);
- setOperationAction(ISD::FMA, VT, Expand);
- setOperationAction(ISD::FRINT, VT, Expand);
- setOperationAction(ISD::FNEARBYINT, VT, Expand);
- setOperationAction(ISD::FSQRT, VT, Expand);
- setOperationAction(ISD::FSIN, VT, Expand);
- setOperationAction(ISD::FSUB, VT, Expand);
- setOperationAction(ISD::FNEG, VT, Expand);
- setOperationAction(ISD::VSELECT, VT, Expand);
- setOperationAction(ISD::SELECT_CC, VT, Expand);
- setOperationAction(ISD::FCOPYSIGN, VT, Expand);
- setOperationAction(ISD::VECTOR_SHUFFLE, VT, Expand);
- setOperationAction(ISD::SETCC, VT, Expand);
- setOperationAction(ISD::FCANONICALIZE, VT, Expand);
+ setOperationAction(
+ {ISD::FABS, ISD::FMINNUM, ISD::FMAXNUM, ISD::FADD,
+ ISD::FCEIL, ISD::FCOS, ISD::FDIV, ISD::FEXP2,
+ ISD::FEXP, ISD::FLOG2, ISD::FREM, ISD::FLOG,
+ ISD::FLOG10, ISD::FPOW, ISD::FFLOOR, ISD::FTRUNC,
+ ISD::FMUL, ISD::FMA, ISD::FRINT, ISD::FNEARBYINT,
+ ISD::FSQRT, ISD::FSIN, ISD::FSUB, ISD::FNEG,
+ ISD::VSELECT, ISD::SELECT_CC, ISD::FCOPYSIGN, ISD::VECTOR_SHUFFLE,
+ ISD::SETCC, ISD::FCANONICALIZE},
+ VT, Expand);
}
// This causes using an unrolled select operation rather than expansion with
@@ -590,26 +465,16 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
if (AMDGPUBypassSlowDiv)
addBypassSlowDiv(64, 32);
- setTargetDAGCombine(ISD::BITCAST);
- setTargetDAGCombine(ISD::SHL);
- setTargetDAGCombine(ISD::SRA);
- setTargetDAGCombine(ISD::SRL);
- setTargetDAGCombine(ISD::TRUNCATE);
- setTargetDAGCombine(ISD::MUL);
- setTargetDAGCombine(ISD::SMUL_LOHI);
- setTargetDAGCombine(ISD::UMUL_LOHI);
- setTargetDAGCombine(ISD::MULHU);
- setTargetDAGCombine(ISD::MULHS);
- setTargetDAGCombine(ISD::SELECT);
- setTargetDAGCombine(ISD::SELECT_CC);
- setTargetDAGCombine(ISD::STORE);
- setTargetDAGCombine(ISD::FADD);
- setTargetDAGCombine(ISD::FSUB);
- setTargetDAGCombine(ISD::FNEG);
- setTargetDAGCombine(ISD::FABS);
- setTargetDAGCombine(ISD::AssertZext);
- setTargetDAGCombine(ISD::AssertSext);
- setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);
+ setTargetDAGCombine({ISD::BITCAST, ISD::SHL,
+ ISD::SRA, ISD::SRL,
+ ISD::TRUNCATE, ISD::MUL,
+ ISD::SMUL_LOHI, ISD::UMUL_LOHI,
+ ISD::MULHU, ISD::MULHS,
+ ISD::SELECT, ISD::SELECT_CC,
+ ISD::STORE, ISD::FADD,
+ ISD::FSUB, ISD::FNEG,
+ ISD::FABS, ISD::AssertZext,
+ ISD::AssertSext, ISD::INTRINSIC_WO_CHAIN});
}
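
The constructor changes above lean on setter overloads that take brace-initialized lists of opcodes and/or value types, so nests of per-opcode calls collapse into single statements. A standalone sketch of the same batching pattern; the class, names, and signatures are stand-ins, not the LLVM overloads:

#include <initializer_list>
#include <map>
#include <utility>

enum class Action { Legal, Expand, Custom, Promote };

class LegalizeTable {
  std::map<std::pair<int, int>, Action> Table; // (opcode, value type) -> action
public:
  // One call covers the cross product of the listed opcodes and value types.
  void setOperationAction(std::initializer_list<int> Ops,
                          std::initializer_list<int> VTs, Action A) {
    for (int Op : Ops)
      for (int VT : VTs)
        Table[{Op, VT}] = A;
  }
};
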
bool AMDGPUTargetLowering::mayIgnoreSignedZero(SDValue Op) const {
@@ -785,11 +650,11 @@ bool AMDGPUTargetLowering::shouldReduceLoadWidth(SDNode *N,
unsigned AS = MN->getAddressSpace();
// Do not shrink an aligned scalar load to sub-dword.
// Scalar engine cannot do sub-dword loads.
- if (OldSize >= 32 && NewSize < 32 && MN->getAlignment() >= 4 &&
+ if (OldSize >= 32 && NewSize < 32 && MN->getAlign() >= Align(4) &&
(AS == AMDGPUAS::CONSTANT_ADDRESS ||
AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
- (isa<LoadSDNode>(N) &&
- AS == AMDGPUAS::GLOBAL_ADDRESS && MN->isInvariant())) &&
+ (isa<LoadSDNode>(N) && AS == AMDGPUAS::GLOBAL_ADDRESS &&
+ MN->isInvariant())) &&
AMDGPUInstrInfo::isUniformMMO(MN->getMemOperand()))
return false;
@@ -855,6 +720,8 @@ bool AMDGPUTargetLowering::isSDNodeAlwaysUniform(const SDNode *N) const {
AMDGPUAS::CONSTANT_ADDRESS_32BIT)
return true;
return false;
+ case AMDGPUISD::SETCC: // ballot-style instruction
+ return true;
}
return false;
}
@@ -1072,10 +939,9 @@ void AMDGPUTargetLowering::analyzeFormalArgumentsCompute(
const bool IsByRef = Arg.hasByRefAttr();
Type *BaseArgTy = Arg.getType();
Type *MemArgTy = IsByRef ? Arg.getParamByRefType() : BaseArgTy;
- MaybeAlign Alignment = IsByRef ? Arg.getParamAlign() : None;
- if (!Alignment)
- Alignment = DL.getABITypeAlign(MemArgTy);
- MaxAlign = max(Alignment, MaxAlign);
+ Align Alignment = DL.getValueOrABITypeAlignment(
+ IsByRef ? Arg.getParamAlign() : None, MemArgTy);
+ MaxAlign = std::max(Alignment, MaxAlign);
uint64_t AllocSize = DL.getTypeAllocSize(MemArgTy);
uint64_t ArgOffset = alignTo(ExplicitArgOffset, Alignment) + ExplicitOffset;
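
The argument offset above is first rounded up to the argument's alignment with alignTo and then biased by the explicit offset. For a power-of-two alignment, alignTo is the usual round-up-to-multiple computation, sketched here for reference:

#include <cstdint>

constexpr uint64_t alignToPow2(uint64_t Offset, uint64_t Align) {
  return (Offset + Align - 1) & ~(Align - 1);
}
static_assert(alignToPow2(13, 8) == 16, "13 rounds up to 16 for 8-byte alignment");
static_assert(alignToPow2(16, 8) == 16, "already aligned offsets are unchanged");
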
@@ -1415,6 +1281,11 @@ SDValue AMDGPUTargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op,
(Start == 0 || Start == 4))
return Op;
+ if (((SrcVT == MVT::v16f16 && VT == MVT::v8f16) ||
+ (SrcVT == MVT::v16i16 && VT == MVT::v8i16)) &&
+ (Start == 0 || Start == 8))
+ return Op;
+
DAG.ExtractVectorElements(Op.getOperand(0), Args, Start,
VT.getVectorNumElements());
@@ -1589,8 +1460,8 @@ SDValue AMDGPUTargetLowering::SplitVectorLoad(const SDValue Op,
std::tie(Lo, Hi) = splitVector(Op, SL, LoVT, HiVT, DAG);
unsigned Size = LoMemVT.getStoreSize();
- unsigned BaseAlign = Load->getAlignment();
- unsigned HiAlign = MinAlign(BaseAlign, Size);
+ Align BaseAlign = Load->getAlign();
+ Align HiAlign = commonAlignment(BaseAlign, Size);
SDValue LoLoad = DAG.getExtLoad(Load->getExtensionType(), SL, LoVT,
Load->getChain(), BasePtr, SrcValue, LoMemVT,
@@ -1628,13 +1499,13 @@ SDValue AMDGPUTargetLowering::WidenOrSplitVectorLoad(SDValue Op,
EVT MemVT = Load->getMemoryVT();
SDLoc SL(Op);
const MachinePointerInfo &SrcValue = Load->getMemOperand()->getPointerInfo();
- unsigned BaseAlign = Load->getAlignment();
+ Align BaseAlign = Load->getAlign();
unsigned NumElements = MemVT.getVectorNumElements();
// Widen from vec3 to vec4 when the load is at least 8-byte aligned
// or 16-byte fully dereferenceable. Otherwise, split the vector load.
if (NumElements != 3 ||
- (BaseAlign < 8 &&
+ (BaseAlign < Align(8) &&
!SrcValue.isDereferenceable(16, *DAG.getContext(), DAG.getDataLayout())))
return SplitVectorLoad(Op, DAG);
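
Both the split-load and split-store paths now derive the high half's alignment with commonAlignment(BaseAlign, Size): a pointer at Base + Size is only guaranteed the largest power of two that divides both the base alignment and the byte offset. A plain-integer sketch of that computation:

#include <cstdint>

// Largest power of two dividing both values; commonAlignment applies this to
// the base alignment and the byte offset of the high half.
constexpr uint64_t minAlignSketch(uint64_t A, uint64_t B) {
  uint64_t C = A | B;
  return C & -C; // lowest set bit
}
static_assert(minAlignSketch(16, 8) == 8, "high half at +8 keeps 8-byte alignment");
static_assert(minAlignSketch(16, 12) == 4, "high half at +12 drops to 4-byte alignment");
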
@@ -1681,9 +1552,9 @@ SDValue AMDGPUTargetLowering::SplitVectorStore(SDValue Op,
SDValue HiPtr = DAG.getObjectPtrOffset(SL, BasePtr, LoMemVT.getStoreSize());
const MachinePointerInfo &SrcValue = Store->getMemOperand()->getPointerInfo();
- unsigned BaseAlign = Store->getAlignment();
+ Align BaseAlign = Store->getAlign();
unsigned Size = LoMemVT.getStoreSize();
- unsigned HiAlign = MinAlign(BaseAlign, Size);
+ Align HiAlign = commonAlignment(BaseAlign, Size);
SDValue LoStore =
DAG.getTruncStore(Chain, SL, Lo, BasePtr, SrcValue, LoMemVT, BaseAlign,
@@ -3003,12 +2874,11 @@ SDValue AMDGPUTargetLowering::performLoadCombine(SDNode *N,
// the bytes again are not eliminated in the case of an unaligned copy.
if (!allowsMisalignedMemoryAccesses(
VT, AS, Alignment, LN->getMemOperand()->getFlags(), &IsFast)) {
- SDValue Ops[2];
-
if (VT.isVector())
- std::tie(Ops[0], Ops[1]) = scalarizeVectorLoad(LN, DAG);
- else
- std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(LN, DAG);
+ return SplitVectorLoad(SDValue(LN, 0), DAG);
+
+ SDValue Ops[2];
+ std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(LN, DAG);
return DAG.getMergeValues(Ops, SDLoc(N));
}
@@ -3059,7 +2929,7 @@ SDValue AMDGPUTargetLowering::performStoreCombine(SDNode *N,
if (!allowsMisalignedMemoryAccesses(
VT, AS, Alignment, SN->getMemOperand()->getFlags(), &IsFast)) {
if (VT.isVector())
- return scalarizeVectorStore(SN, DAG);
+ return SplitVectorStore(SDValue(SN, 0), DAG);
return expandUnalignedStore(SN, DAG);
}
@@ -3281,8 +3151,9 @@ SDValue AMDGPUTargetLowering::performSrlCombine(SDNode *N,
// this improves the ability to match BFE patterns in isel.
if (LHS.getOpcode() == ISD::AND) {
if (auto *Mask = dyn_cast<ConstantSDNode>(LHS.getOperand(1))) {
- if (Mask->getAPIntValue().isShiftedMask() &&
- Mask->getAPIntValue().countTrailingZeros() == ShiftAmt) {
+ unsigned MaskIdx, MaskLen;
+ if (Mask->getAPIntValue().isShiftedMask(MaskIdx, MaskLen) &&
+ MaskIdx == ShiftAmt) {
return DAG.getNode(
ISD::AND, SL, VT,
DAG.getNode(ISD::SRL, SL, VT, LHS.getOperand(0), N->getOperand(1)),
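
The combine above switches to the APInt::isShiftedMask overload that also reports where the contiguous mask starts and how long it is, and it now fires only when the mask begins exactly at the shift amount. An equivalent check on a plain 64-bit integer, as a reference sketch (uses the C++20 <bit> header):

#include <bit>
#include <cstdint>

// True if V is a single contiguous run of ones; MaskIdx receives the index of
// the lowest set bit and MaskLen the length of the run.
bool isShiftedMaskSketch(uint64_t V, unsigned &MaskIdx, unsigned &MaskLen) {
  if (V == 0)
    return false;
  uint64_t Shifted = V >> std::countr_zero(V);
  if ((Shifted & (Shifted + 1)) != 0) // not of the form 0...01...1
    return false;
  MaskIdx = unsigned(std::countr_zero(V));
  MaskLen = unsigned(std::popcount(V));
  return true;
}
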
@@ -4380,10 +4251,14 @@ uint32_t AMDGPUTargetLowering::getImplicitParameterOffset(
uint64_t ArgOffset = alignTo(MFI->getExplicitKernArgSize(), Alignment) +
ExplicitArgOffset;
switch (Param) {
- case GRID_DIM:
+ case FIRST_IMPLICIT:
return ArgOffset;
- case GRID_OFFSET:
- return ArgOffset + 4;
+ case PRIVATE_BASE:
+ return ArgOffset + AMDGPU::ImplicitArg::PRIVATE_BASE_OFFSET;
+ case SHARED_BASE:
+ return ArgOffset + AMDGPU::ImplicitArg::SHARED_BASE_OFFSET;
+ case QUEUE_PTR:
+ return ArgOffset + AMDGPU::ImplicitArg::QUEUE_PTR_OFFSET;
}
llvm_unreachable("unexpected implicit parameter type");
}
@@ -4405,7 +4280,6 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(TC_RETURN)
NODE_NAME_CASE(TRAP)
NODE_NAME_CASE(RET_FLAG)
- NODE_NAME_CASE(RET_GFX_FLAG)
NODE_NAME_CASE(RETURN_TO_EPILOG)
NODE_NAME_CASE(ENDPGM)
NODE_NAME_CASE(DWORDADDR)
@@ -4485,6 +4359,8 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(CONST_DATA_PTR)
NODE_NAME_CASE(PC_ADD_REL_OFFSET)
NODE_NAME_CASE(LDS)
+ NODE_NAME_CASE(FPTRUNC_ROUND_UPWARD)
+ NODE_NAME_CASE(FPTRUNC_ROUND_DOWNWARD)
NODE_NAME_CASE(DUMMY_CHAIN)
case AMDGPUISD::FIRST_MEM_OPCODE_NUMBER: break;
NODE_NAME_CASE(LOAD_D16_HI)
@@ -4580,6 +4456,19 @@ SDValue AMDGPUTargetLowering::getRecipEstimate(SDValue Operand,
return SDValue();
}
+static unsigned workitemIntrinsicDim(unsigned ID) {
+ switch (ID) {
+ case Intrinsic::amdgcn_workitem_id_x:
+ return 0;
+ case Intrinsic::amdgcn_workitem_id_y:
+ return 1;
+ case Intrinsic::amdgcn_workitem_id_z:
+ return 2;
+ default:
+ llvm_unreachable("not a workitem intrinsic");
+ }
+}
+
void AMDGPUTargetLowering::computeKnownBitsForTargetNode(
const SDValue Op, KnownBits &Known,
const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const {
@@ -4716,6 +4605,14 @@ void AMDGPUTargetLowering::computeKnownBitsForTargetNode(
Known.Zero.setHighBits(Size - ST.getWavefrontSizeLog2());
break;
}
+ case Intrinsic::amdgcn_workitem_id_x:
+ case Intrinsic::amdgcn_workitem_id_y:
+ case Intrinsic::amdgcn_workitem_id_z: {
+ unsigned MaxValue = Subtarget->getMaxWorkitemID(
+ DAG.getMachineFunction().getFunction(), workitemIntrinsicDim(IID));
+ Known.Zero.setHighBits(countLeadingZeros(MaxValue));
+ break;
+ }
default:
break;
}
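
The new workitem-id cases record that every bit above the highest possible bit of the dimension's maximum value is zero, which is exactly what Known.Zero.setHighBits(countLeadingZeros(MaxValue)) expresses. A small reference sketch of that bit count (C++20 <bit>):

#include <bit>
#include <cstdint>

// If a 32-bit value is known to be <= MaxValue, its top countl_zero(MaxValue)
// bits are known zero. E.g. MaxValue = 1023 -> the top 22 bits are zero.
unsigned knownZeroHighBits(uint32_t MaxValue) {
  return unsigned(std::countl_zero(MaxValue)); // countl_zero(0) == 32
}
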
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
index b41506157b68..73081483f1c3 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -320,8 +320,9 @@ public:
enum ImplicitParameter {
FIRST_IMPLICIT,
- GRID_DIM = FIRST_IMPLICIT,
- GRID_OFFSET,
+ PRIVATE_BASE,
+ SHARED_BASE,
+ QUEUE_PTR,
};
/// Helper function that returns the byte offset of the given
@@ -367,9 +368,6 @@ enum NodeType : unsigned {
// Return with values from a non-entry function.
RET_FLAG,
- // Return with values from a non-entry function (AMDGPU_Gfx CC).
- RET_GFX_FLAG,
-
DWORDADDR,
FRACT,
@@ -483,6 +481,9 @@ enum NodeType : unsigned {
CONST_DATA_PTR,
PC_ADD_REL_OFFSET,
LDS,
+ FPTRUNC_ROUND_UPWARD,
+ FPTRUNC_ROUND_DOWNWARD,
+
DUMMY_CHAIN,
FIRST_MEM_OPCODE_NUMBER = ISD::FIRST_TARGET_MEMORY_OPCODE,
LOAD_D16_HI,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInsertDelayAlu.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInsertDelayAlu.cpp
new file mode 100644
index 000000000000..c9cdbc89f3a4
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInsertDelayAlu.cpp
@@ -0,0 +1,457 @@
+//===- AMDGPUInsertDelayAlu.cpp - Insert s_delay_alu instructions ---------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// Insert s_delay_alu instructions to avoid stalls on GFX11+.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "GCNSubtarget.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "SIInstrInfo.h"
+#include "llvm/ADT/SetVector.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "amdgpu-insert-delay-alu"
+
+namespace {
+
+class AMDGPUInsertDelayAlu : public MachineFunctionPass {
+public:
+ static char ID;
+
+ const SIInstrInfo *SII;
+ const TargetRegisterInfo *TRI;
+
+ TargetSchedModel SchedModel;
+
+ AMDGPUInsertDelayAlu() : MachineFunctionPass(ID) {}
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+ // Return true if MI waits for all outstanding VALU instructions to complete.
+ static bool instructionWaitsForVALU(const MachineInstr &MI) {
+ // These instruction types wait for VA_VDST==0 before issuing.
+ const uint64_t VA_VDST_0 = SIInstrFlags::DS | SIInstrFlags::EXP |
+ SIInstrFlags::FLAT | SIInstrFlags::MIMG |
+ SIInstrFlags::MTBUF | SIInstrFlags::MUBUF;
+ if (MI.getDesc().TSFlags & VA_VDST_0)
+ return true;
+ if (MI.getOpcode() == AMDGPU::S_SENDMSG_RTN_B32 ||
+ MI.getOpcode() == AMDGPU::S_SENDMSG_RTN_B64)
+ return true;
+ if (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
+ (MI.getOperand(0).getImm() & 0xf000) == 0)
+ return true;
+ return false;
+ }
+
+ // Types of delay that can be encoded in an s_delay_alu instruction.
+ enum DelayType { VALU, TRANS, SALU, OTHER };
+
+ // Get the delay type for an instruction with the specified TSFlags.
+ static DelayType getDelayType(uint64_t TSFlags) {
+ if (TSFlags & SIInstrFlags::TRANS)
+ return TRANS;
+ if (TSFlags & SIInstrFlags::VALU)
+ return VALU;
+ if (TSFlags & SIInstrFlags::SALU)
+ return SALU;
+ return OTHER;
+ }
+
+ // Information about the last instruction(s) that wrote to a particular
+ // regunit. In straight-line code there will only be one such instruction, but
+ // when control flow converges we merge the delay information from each path
+ // to represent the union of the worst-case delays of each type.
+ struct DelayInfo {
+ // One larger than the maximum number of (non-TRANS) VALU instructions we
+ // can encode in an s_delay_alu instruction.
+ static const unsigned VALU_MAX = 5;
+
+ // One larger than the maximum number of TRANS instructions we can encode in
+ // an s_delay_alu instruction.
+ static const unsigned TRANS_MAX = 4;
+
+ // If it was written by a (non-TRANS) VALU, remember how many clock cycles
+ // are left until it completes, and how many other (non-TRANS) VALU we have
+ // seen since it was issued.
+ uint8_t VALUCycles = 0;
+ uint8_t VALUNum = VALU_MAX;
+
+ // If it was written by a TRANS, remember how many clock cycles are left
+ // until it completes, and how many other TRANS we have seen since it was
+ // issued.
+ uint8_t TRANSCycles = 0;
+ uint8_t TRANSNum = TRANS_MAX;
+ // Also remember how many other (non-TRANS) VALU we have seen since it was
+ // issued. When an instruction depends on both a prior TRANS and a prior
+ // non-TRANS VALU, this is used to decide whether to encode a wait for just
+ // one or both of them.
+ uint8_t TRANSNumVALU = VALU_MAX;
+
+ // If it was written by an SALU, remember how many clock cycles are left
+ // until it completes.
+ uint8_t SALUCycles = 0;
+
+ DelayInfo() = default;
+
+ DelayInfo(DelayType Type, unsigned Cycles) {
+ switch (Type) {
+ default:
+ llvm_unreachable("unexpected type");
+ case VALU:
+ VALUCycles = Cycles;
+ VALUNum = 0;
+ break;
+ case TRANS:
+ TRANSCycles = Cycles;
+ TRANSNum = 0;
+ TRANSNumVALU = 0;
+ break;
+ case SALU:
+ SALUCycles = Cycles;
+ break;
+ }
+ }
+
+ bool operator==(const DelayInfo &RHS) const {
+ return VALUCycles == RHS.VALUCycles && VALUNum == RHS.VALUNum &&
+ TRANSCycles == RHS.TRANSCycles && TRANSNum == RHS.TRANSNum &&
+ TRANSNumVALU == RHS.TRANSNumVALU && SALUCycles == RHS.SALUCycles;
+ }
+
+ bool operator!=(const DelayInfo &RHS) const { return !(*this == RHS); }
+
+ // Merge another DelayInfo into this one, to represent the union of the
+ // worst-case delays of each type.
+ void merge(const DelayInfo &RHS) {
+ VALUCycles = std::max(VALUCycles, RHS.VALUCycles);
+ VALUNum = std::min(VALUNum, RHS.VALUNum);
+ TRANSCycles = std::max(TRANSCycles, RHS.TRANSCycles);
+ TRANSNum = std::min(TRANSNum, RHS.TRANSNum);
+ TRANSNumVALU = std::min(TRANSNumVALU, RHS.TRANSNumVALU);
+ SALUCycles = std::max(SALUCycles, RHS.SALUCycles);
+ }
+
+  // Update this DelayInfo after issuing an instruction of the given Type,
+  // where Cycles is the number of cycles the instruction takes to issue.
+  // Return true if there is no longer any useful delay info.
+ bool advance(DelayType Type, unsigned Cycles) {
+ bool Erase = true;
+
+ VALUNum += (Type == VALU);
+ if (VALUNum >= VALU_MAX || VALUCycles <= Cycles) {
+ // Forget about the VALU instruction. It was too far back or has
+ // definitely completed by now.
+ VALUNum = VALU_MAX;
+ VALUCycles = 0;
+ } else {
+ VALUCycles -= Cycles;
+ Erase = false;
+ }
+
+ TRANSNum += (Type == TRANS);
+ TRANSNumVALU += (Type == VALU);
+ if (TRANSNum >= TRANS_MAX || TRANSCycles <= Cycles) {
+ // Forget about any TRANS instruction. It was too far back or has
+ // definitely completed by now.
+ TRANSNum = TRANS_MAX;
+ TRANSNumVALU = VALU_MAX;
+ TRANSCycles = 0;
+ } else {
+ TRANSCycles -= Cycles;
+ Erase = false;
+ }
+
+ if (SALUCycles <= Cycles) {
+ // Forget about any SALU instruction. It has definitely completed by
+ // now.
+ SALUCycles = 0;
+ } else {
+ SALUCycles -= Cycles;
+ Erase = false;
+ }
+
+ return Erase;
+ }
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+ void dump() const {
+ if (VALUCycles)
+ dbgs() << " VALUCycles=" << (int)VALUCycles;
+ if (VALUNum < VALU_MAX)
+ dbgs() << " VALUNum=" << (int)VALUNum;
+ if (TRANSCycles)
+ dbgs() << " TRANSCycles=" << (int)TRANSCycles;
+ if (TRANSNum < TRANS_MAX)
+ dbgs() << " TRANSNum=" << (int)TRANSNum;
+ if (TRANSNumVALU < VALU_MAX)
+ dbgs() << " TRANSNumVALU=" << (int)TRANSNumVALU;
+ if (SALUCycles)
+ dbgs() << " SALUCycles=" << (int)SALUCycles;
+ }
+#endif
+ };
+
+ // A map from regunits to the delay info for that regunit.
+ struct DelayState : DenseMap<unsigned, DelayInfo> {
+ // Merge another DelayState into this one by merging the delay info for each
+ // regunit.
+ void merge(const DelayState &RHS) {
+ for (const auto &KV : RHS) {
+ iterator It;
+ bool Inserted;
+ std::tie(It, Inserted) = insert(KV);
+ if (!Inserted)
+ It->second.merge(KV.second);
+ }
+ }
+
+ // Advance the delay info for each regunit, erasing any that are no longer
+ // useful.
+ void advance(DelayType Type, unsigned Cycles) {
+ iterator Next;
+ for (auto I = begin(), E = end(); I != E; I = Next) {
+ Next = std::next(I);
+ if (I->second.advance(Type, Cycles))
+ erase(I);
+ }
+ }
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+ void dump(const TargetRegisterInfo *TRI) const {
+ if (empty()) {
+ dbgs() << " empty\n";
+ return;
+ }
+
+ // Dump DelayInfo for each RegUnit in numerical order.
+ SmallVector<const_iterator, 8> Order;
+ Order.reserve(size());
+ for (const_iterator I = begin(), E = end(); I != E; ++I)
+ Order.push_back(I);
+ llvm::sort(Order, [](const const_iterator &A, const const_iterator &B) {
+ return A->first < B->first;
+ });
+ for (const_iterator I : Order) {
+ dbgs() << " " << printRegUnit(I->first, TRI);
+ I->second.dump();
+ dbgs() << "\n";
+ }
+ }
+#endif
+ };
+
+ // The saved delay state at the end of each basic block.
+ DenseMap<MachineBasicBlock *, DelayState> BlockState;
+
+ // Emit an s_delay_alu instruction if necessary before MI.
+ MachineInstr *emitDelayAlu(MachineInstr &MI, DelayInfo Delay,
+ MachineInstr *LastDelayAlu) {
+ unsigned Imm = 0;
+
+ // Wait for a TRANS instruction.
+ if (Delay.TRANSNum < DelayInfo::TRANS_MAX)
+ Imm |= 4 + Delay.TRANSNum;
+
+ // Wait for a VALU instruction (if it's more recent than any TRANS
+ // instruction that we're also waiting for).
+ if (Delay.VALUNum < DelayInfo::VALU_MAX &&
+ Delay.VALUNum <= Delay.TRANSNumVALU) {
+ if (Imm & 0xf)
+ Imm |= Delay.VALUNum << 7;
+ else
+ Imm |= Delay.VALUNum;
+ }
+
+ // Wait for an SALU instruction.
+ if (Delay.SALUCycles) {
+ if (Imm & 0x780) {
+ // We have already encoded a VALU and a TRANS delay. There's no room in
+ // the encoding for an SALU delay as well, so just drop it.
+ } else if (Imm & 0xf) {
+ Imm |= (Delay.SALUCycles + 8) << 7;
+ } else {
+ Imm |= Delay.SALUCycles + 8;
+ }
+ }
+
+ // Don't emit the s_delay_alu instruction if there's nothing to wait for.
+ if (!Imm)
+ return LastDelayAlu;
+
+ // If we only need to wait for one instruction, try encoding it in the last
+ // s_delay_alu that we emitted.
+ if (!(Imm & 0x780) && LastDelayAlu) {
+ unsigned Skip = 0;
+ for (auto I = MachineBasicBlock::instr_iterator(LastDelayAlu),
+ E = MachineBasicBlock::instr_iterator(MI);
+ ++I != E;) {
+ if (!I->isBundle() && !I->isMetaInstruction())
+ ++Skip;
+ }
+ if (Skip < 6) {
+ MachineOperand &Op = LastDelayAlu->getOperand(0);
+ unsigned LastImm = Op.getImm();
+ assert((LastImm & ~0xf) == 0 &&
+ "Remembered an s_delay_alu with no room for another delay!");
+ LastImm |= Imm << 7 | Skip << 4;
+ Op.setImm(LastImm);
+ return nullptr;
+ }
+ }
+
+ auto &MBB = *MI.getParent();
+ MachineInstr *DelayAlu =
+ BuildMI(MBB, MI, DebugLoc(), SII->get(AMDGPU::S_DELAY_ALU)).addImm(Imm);
+ // Remember the s_delay_alu for next time if there is still room in it to
+ // encode another delay.
+ return (Imm & 0x780) ? nullptr : DelayAlu;
+ }
+
+ bool runOnMachineBasicBlock(MachineBasicBlock &MBB, bool Emit) {
+ DelayState State;
+ for (auto *Pred : MBB.predecessors())
+ State.merge(BlockState[Pred]);
+
+ LLVM_DEBUG(dbgs() << " State at start of " << printMBBReference(MBB)
+ << "\n";
+ State.dump(TRI););
+
+ bool Changed = false;
+ MachineInstr *LastDelayAlu = nullptr;
+
+ // Iterate over the contents of bundles, but don't emit any instructions
+ // inside a bundle.
+ for (auto &MI : MBB.instrs()) {
+ if (MI.isBundle() || MI.isMetaInstruction())
+ continue;
+
+ // Ignore some more instructions that do not generate any code.
+ switch (MI.getOpcode()) {
+ case AMDGPU::SI_RETURN_TO_EPILOG:
+ continue;
+ }
+
+ DelayType Type = getDelayType(MI.getDesc().TSFlags);
+
+ if (instructionWaitsForVALU(MI)) {
+ // Forget about all outstanding VALU delays.
+ State = DelayState();
+ } else if (Type != OTHER) {
+ DelayInfo Delay;
+ // TODO: Scan implicit uses too?
+ for (const auto &Op : MI.explicit_uses()) {
+ if (Op.isReg()) {
+            // One of the operands of the writelane is also the output
+            // operand; waiting on it would insert a redundant delay, so
+            // ignore this operand.
+ if (MI.getOpcode() == AMDGPU::V_WRITELANE_B32 && Op.isTied())
+ continue;
+ for (MCRegUnitIterator UI(Op.getReg(), TRI); UI.isValid(); ++UI) {
+ auto It = State.find(*UI);
+ if (It != State.end()) {
+ Delay.merge(It->second);
+ State.erase(*UI);
+ }
+ }
+ }
+ }
+ if (Emit && !MI.isBundledWithPred()) {
+ // TODO: For VALU->SALU delays should we use s_delay_alu or s_nop or
+ // just ignore them?
+ LastDelayAlu = emitDelayAlu(MI, Delay, LastDelayAlu);
+ }
+ }
+
+ if (Type != OTHER) {
+ // TODO: Scan implicit defs too?
+ for (const auto &Op : MI.defs()) {
+ unsigned Latency = SchedModel.computeOperandLatency(
+ &MI, MI.getOperandNo(&Op), nullptr, 0);
+ for (MCRegUnitIterator UI(Op.getReg(), TRI); UI.isValid(); ++UI)
+ State[*UI] = DelayInfo(Type, Latency);
+ }
+ }
+
+ // Advance by the number of cycles it takes to issue this instruction.
+ // TODO: Use a more advanced model that accounts for instructions that
+ // take multiple cycles to issue on a particular pipeline.
+ unsigned Cycles = SIInstrInfo::getNumWaitStates(MI);
+ // TODO: In wave64 mode, double the number of cycles for VALU and VMEM
+ // instructions on the assumption that they will usually have to be issued
+ // twice?
+ State.advance(Type, Cycles);
+
+ LLVM_DEBUG(dbgs() << " State after " << MI; State.dump(TRI););
+ }
+
+ if (Emit) {
+ assert(State == BlockState[&MBB] &&
+ "Basic block state should not have changed on final pass!");
+ } else if (State != BlockState[&MBB]) {
+ BlockState[&MBB] = std::move(State);
+ Changed = true;
+ }
+ return Changed;
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override {
+ if (skipFunction(MF.getFunction()))
+ return false;
+
+ LLVM_DEBUG(dbgs() << "AMDGPUInsertDelayAlu running on " << MF.getName()
+ << "\n");
+
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+ if (!ST.hasDelayAlu())
+ return false;
+
+ SII = ST.getInstrInfo();
+ TRI = ST.getRegisterInfo();
+
+ SchedModel.init(&ST);
+
+ // Calculate the delay state for each basic block, iterating until we reach
+ // a fixed point.
+ SetVector<MachineBasicBlock *> WorkList;
+ for (auto &MBB : reverse(MF))
+ WorkList.insert(&MBB);
+ while (!WorkList.empty()) {
+ auto &MBB = *WorkList.pop_back_val();
+ bool Changed = runOnMachineBasicBlock(MBB, false);
+ if (Changed)
+ WorkList.insert(MBB.succ_begin(), MBB.succ_end());
+ }
+
+ LLVM_DEBUG(dbgs() << "Final pass over all BBs\n");
+
+ // Make one last pass over all basic blocks to emit s_delay_alu
+ // instructions.
+ bool Changed = false;
+ for (auto &MBB : MF)
+ Changed |= runOnMachineBasicBlock(MBB, true);
+ return Changed;
+ }
+};
+
+} // namespace
+
+char AMDGPUInsertDelayAlu::ID = 0;
+
+char &llvm::AMDGPUInsertDelayAluID = AMDGPUInsertDelayAlu::ID;
+
+INITIALIZE_PASS(AMDGPUInsertDelayAlu, DEBUG_TYPE, "AMDGPU Insert Delay ALU",
+ false, false)
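
The pass seeds a worklist with every block, recomputes a block's delay state from its predecessors' saved states, and requeues successors whenever a block's out-state changes, iterating to a fixed point before a final emitting pass. A standalone sketch of that driver loop; Block, State, and transfer are stand-ins, not the MachineFunction types used above:

#include <algorithm>
#include <map>
#include <set>
#include <vector>

struct Block { std::vector<Block *> Preds, Succs; };
using State = std::map<int, int>; // regunit -> worst-case delay (illustrative)

// Per-block simulation; a real pass would model each instruction here.
State transfer(const Block &, State In) { return In; }

void solveToFixedPoint(const std::vector<Block *> &Blocks,
                       std::map<const Block *, State> &Out) {
  std::set<Block *> WorkList(Blocks.begin(), Blocks.end());
  while (!WorkList.empty()) {
    Block *B = *WorkList.begin();
    WorkList.erase(WorkList.begin());
    State In;
    for (Block *P : B->Preds)           // merge: element-wise worst case
      for (const auto &KV : Out[P])
        In[KV.first] = std::max(In[KV.first], KV.second);
    State NewOut = transfer(*B, std::move(In));
    if (NewOut != Out[B]) {
      Out[B] = std::move(NewOut);
      WorkList.insert(B->Succs.begin(), B->Succs.end());
    }
  }
}
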
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
index 4f1d700bcd84..695093322a01 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
@@ -110,33 +110,42 @@ static Value *convertTo16Bit(Value &V, InstCombiner::BuilderTy &Builder) {
llvm_unreachable("Should never be called!");
}
-/// Applies Function(II.Args, II.ArgTys) and replaces the intrinsic call with
-/// the modified arguments.
+/// Applies Func(OldIntr.Args, OldIntr.ArgTys), creates an intrinsic call with
+/// the modified arguments (based on OldIntr), and replaces InstToReplace with
+/// the newly created intrinsic call.
static Optional<Instruction *> modifyIntrinsicCall(
- IntrinsicInst &II, unsigned NewIntr, InstCombiner &IC,
+ IntrinsicInst &OldIntr, Instruction &InstToReplace, unsigned NewIntr,
+ InstCombiner &IC,
std::function<void(SmallVectorImpl<Value *> &, SmallVectorImpl<Type *> &)>
Func) {
SmallVector<Type *, 4> ArgTys;
- if (!Intrinsic::getIntrinsicSignature(II.getCalledFunction(), ArgTys))
+ if (!Intrinsic::getIntrinsicSignature(OldIntr.getCalledFunction(), ArgTys))
return None;
- SmallVector<Value *, 8> Args(II.args());
+ SmallVector<Value *, 8> Args(OldIntr.args());
// Modify arguments and types
Func(Args, ArgTys);
- Function *I = Intrinsic::getDeclaration(II.getModule(), NewIntr, ArgTys);
+ Function *I = Intrinsic::getDeclaration(OldIntr.getModule(), NewIntr, ArgTys);
CallInst *NewCall = IC.Builder.CreateCall(I, Args);
- NewCall->takeName(&II);
- NewCall->copyMetadata(II);
+ NewCall->takeName(&OldIntr);
+ NewCall->copyMetadata(OldIntr);
if (isa<FPMathOperator>(NewCall))
- NewCall->copyFastMathFlags(&II);
+ NewCall->copyFastMathFlags(&OldIntr);
// Erase and replace uses
- if (!II.getType()->isVoidTy())
- IC.replaceInstUsesWith(II, NewCall);
- return IC.eraseInstFromFunction(II);
+ if (!InstToReplace.getType()->isVoidTy())
+ IC.replaceInstUsesWith(InstToReplace, NewCall);
+
+ bool RemoveOldIntr = &OldIntr != &InstToReplace;
+
+ auto RetValue = IC.eraseInstFromFunction(InstToReplace);
+ if (RemoveOldIntr)
+ IC.eraseInstFromFunction(OldIntr);
+
+ return RetValue;
}
static Optional<Instruction *>
@@ -153,7 +162,7 @@ simplifyAMDGCNImageIntrinsic(const GCNSubtarget *ST,
AMDGPU::getImageDimIntrinsicByBaseOpcode(LZMappingInfo->LZ,
ImageDimIntr->Dim);
return modifyIntrinsicCall(
- II, NewImageDimIntr->Intr, IC, [&](auto &Args, auto &ArgTys) {
+ II, II, NewImageDimIntr->Intr, IC, [&](auto &Args, auto &ArgTys) {
Args.erase(Args.begin() + ImageDimIntr->LodIndex);
});
}
@@ -170,7 +179,7 @@ simplifyAMDGCNImageIntrinsic(const GCNSubtarget *ST,
AMDGPU::getImageDimIntrinsicByBaseOpcode(MIPMappingInfo->NONMIP,
ImageDimIntr->Dim);
return modifyIntrinsicCall(
- II, NewImageDimIntr->Intr, IC, [&](auto &Args, auto &ArgTys) {
+ II, II, NewImageDimIntr->Intr, IC, [&](auto &Args, auto &ArgTys) {
Args.erase(Args.begin() + ImageDimIntr->MipIndex);
});
}
@@ -187,7 +196,7 @@ simplifyAMDGCNImageIntrinsic(const GCNSubtarget *ST,
AMDGPU::getImageDimIntrinsicByBaseOpcode(BiasMappingInfo->NoBias,
ImageDimIntr->Dim);
return modifyIntrinsicCall(
- II, NewImageDimIntr->Intr, IC, [&](auto &Args, auto &ArgTys) {
+ II, II, NewImageDimIntr->Intr, IC, [&](auto &Args, auto &ArgTys) {
Args.erase(Args.begin() + ImageDimIntr->BiasIndex);
ArgTys.erase(ArgTys.begin() + ImageDimIntr->BiasTyArg);
});
@@ -205,13 +214,41 @@ simplifyAMDGCNImageIntrinsic(const GCNSubtarget *ST,
AMDGPU::getImageDimIntrinsicByBaseOpcode(
OffsetMappingInfo->NoOffset, ImageDimIntr->Dim);
return modifyIntrinsicCall(
- II, NewImageDimIntr->Intr, IC, [&](auto &Args, auto &ArgTys) {
+ II, II, NewImageDimIntr->Intr, IC, [&](auto &Args, auto &ArgTys) {
Args.erase(Args.begin() + ImageDimIntr->OffsetIndex);
});
}
}
}
+ // Try to use D16
+ if (ST->hasD16Images()) {
+
+ const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
+ AMDGPU::getMIMGBaseOpcodeInfo(ImageDimIntr->BaseOpcode);
+
+ if (BaseOpcode->HasD16) {
+
+      // If the only use of the image intrinsic is an fptrunc (with conversion
+      // to half), then both the fptrunc and the image intrinsic are replaced
+      // with an image intrinsic carrying the D16 flag.
+ if (II.hasOneUse()) {
+ Instruction *User = II.user_back();
+
+ if (User->getOpcode() == Instruction::FPTrunc &&
+ User->getType()->getScalarType()->isHalfTy()) {
+
+ return modifyIntrinsicCall(II, *User, ImageDimIntr->Intr, IC,
+ [&](auto &Args, auto &ArgTys) {
+ // Change return type of image intrinsic.
+ // Set it to return type of fptrunc.
+ ArgTys[0] = User->getType();
+ });
+ }
+ }
+ }
+ }
+
// Try to use A16 or G16
if (!ST->hasA16() && !ST->hasG16())
return None;
@@ -263,7 +300,7 @@ simplifyAMDGCNImageIntrinsic(const GCNSubtarget *ST,
: Type::getInt16Ty(II.getContext());
return modifyIntrinsicCall(
- II, II.getIntrinsicID(), IC, [&](auto &Args, auto &ArgTys) {
+ II, II, II.getIntrinsicID(), IC, [&](auto &Args, auto &ArgTys) {
ArgTys[ImageDimIntr->GradientTyArg] = CoordType;
if (!OnlyDerivatives) {
ArgTys[ImageDimIntr->CoordTyArg] = CoordType;
@@ -584,6 +621,7 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
return IC.replaceInstUsesWith(II, RightShift);
}
case Intrinsic::amdgcn_exp:
+ case Intrinsic::amdgcn_exp_row:
case Intrinsic::amdgcn_exp_compr: {
ConstantInt *En = cast<ConstantInt>(II.getArgOperand(1));
unsigned EnBits = En->getZExtValue();
@@ -882,6 +920,12 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
return IC.replaceOperand(II, 0, UndefValue::get(VDstIn->getType()));
}
+ case Intrinsic::amdgcn_permlane64:
+ // A constant value is trivially uniform.
+ if (Constant *C = dyn_cast<Constant>(II.getArgOperand(0))) {
+ return IC.replaceInstUsesWith(II, C);
+ }
+ break;
case Intrinsic::amdgcn_readfirstlane:
case Intrinsic::amdgcn_readlane: {
// A constant value is trivially uniform.
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
index 391dc8428539..23b8fcf75f16 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
@@ -355,11 +355,7 @@ def AMDGPUendpgm : SDNode<"AMDGPUISD::ENDPGM", SDTNone,
def AMDGPUreturn_to_epilog : SDNode<"AMDGPUISD::RETURN_TO_EPILOG", SDTNone,
[SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>;
-def AMDGPUret_flag : SDNode<"AMDGPUISD::RET_FLAG", SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>,
- [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]
->;
-
-def AMDGPUret_gfx_flag : SDNode<"AMDGPUISD::RET_GFX_FLAG", SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>,
+def AMDGPUret_flag : SDNode<"AMDGPUISD::RET_FLAG", SDTNone,
[SDNPHasChain, SDNPOptInGlue, SDNPVariadic]
>;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index b7d0f0580cda..3f242fdb6d8e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -23,6 +23,7 @@
#include "llvm/CodeGen/GlobalISel/InstructionSelectorImpl.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
@@ -80,8 +81,11 @@ bool AMDGPUInstructionSelector::isVCC(Register Reg,
RegClassOrBank.dyn_cast<const TargetRegisterClass*>();
if (RC) {
const LLT Ty = MRI.getType(Reg);
- return RC->hasSuperClassEq(TRI.getBoolRC()) &&
- Ty.isValid() && Ty.getSizeInBits() == 1;
+ if (!Ty.isValid() || Ty.getSizeInBits() != 1)
+ return false;
+ // G_TRUNC s1 result is never vcc.
+ return MRI.getVRegDef(Reg)->getOpcode() != AMDGPU::G_TRUNC &&
+ RC->hasSuperClassEq(TRI.getBoolRC());
}
const RegisterBank *RB = RegClassOrBank.get<const RegisterBank *>();
@@ -91,7 +95,7 @@ bool AMDGPUInstructionSelector::isVCC(Register Reg,
bool AMDGPUInstructionSelector::constrainCopyLikeIntrin(MachineInstr &MI,
unsigned NewOpc) const {
MI.setDesc(TII.get(NewOpc));
- MI.RemoveOperand(1); // Remove intrinsic ID.
+ MI.removeOperand(1); // Remove intrinsic ID.
MI.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
MachineOperand &Dst = MI.getOperand(0);
@@ -216,7 +220,7 @@ bool AMDGPUInstructionSelector::selectPHI(MachineInstr &I) const {
}
const RegisterBank &RB = *RegClassOrBank.get<const RegisterBank *>();
- DefRC = TRI.getRegClassForTypeOnBank(DefTy, RB, *MRI);
+ DefRC = TRI.getRegClassForTypeOnBank(DefTy, RB);
if (!DefRC) {
LLVM_DEBUG(dbgs() << "PHI operand has unexpected size/bank\n");
return false;
@@ -454,6 +458,24 @@ bool AMDGPUInstructionSelector::selectG_UADDO_USUBO_UADDE_USUBE(
return true;
}
+bool AMDGPUInstructionSelector::selectG_AMDGPU_MAD_64_32(
+ MachineInstr &I) const {
+ MachineBasicBlock *BB = I.getParent();
+ MachineFunction *MF = BB->getParent();
+ const bool IsUnsigned = I.getOpcode() == AMDGPU::G_AMDGPU_MAD_U64_U32;
+
+ unsigned Opc;
+ if (Subtarget->getGeneration() == AMDGPUSubtarget::GFX11)
+ Opc = IsUnsigned ? AMDGPU::V_MAD_U64_U32_gfx11_e64
+ : AMDGPU::V_MAD_I64_I32_gfx11_e64;
+ else
+ Opc = IsUnsigned ? AMDGPU::V_MAD_U64_U32_e64 : AMDGPU::V_MAD_I64_I32_e64;
+ I.setDesc(TII.get(Opc));
+ I.addOperand(*MF, MachineOperand::CreateImm(0));
+ I.addImplicitDefUseOperands(*MF);
+ return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
+}
+
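
The G_AMDGPU_MAD_U64_U32 / G_AMDGPU_MAD_I64_I32 selection above targets a multiply-add whose two 32-bit factors widen to a 64-bit product before a 64-bit addend is applied. Ignoring the instruction's extra outputs, the unsigned semantics reduce to:

#include <cstdint>

// d = a * b + c with a full 64-bit product; the signed form sign-extends a and b.
uint64_t mad_u64_u32(uint32_t A, uint32_t B, uint64_t C) {
  return uint64_t(A) * uint64_t(B) + C;
}
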
// TODO: We should probably legalize these to only using 32-bit results.
bool AMDGPUInstructionSelector::selectG_EXTRACT(MachineInstr &I) const {
MachineBasicBlock *BB = I.getParent();
@@ -481,7 +503,7 @@ bool AMDGPUInstructionSelector::selectG_EXTRACT(MachineInstr &I) const {
const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, *MRI, TRI);
const TargetRegisterClass *SrcRC =
- TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank, *MRI);
+ TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank);
if (!SrcRC)
return false;
unsigned SubReg = SIRegisterInfo::getSubRegFromChannel(Offset / 32,
@@ -514,7 +536,7 @@ bool AMDGPUInstructionSelector::selectG_MERGE_VALUES(MachineInstr &MI) const {
const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
const unsigned DstSize = DstTy.getSizeInBits();
const TargetRegisterClass *DstRC =
- TRI.getRegClassForSizeOnBank(DstSize, *DstBank, *MRI);
+ TRI.getRegClassForSizeOnBank(DstSize, *DstBank);
if (!DstRC)
return false;
@@ -556,7 +578,7 @@ bool AMDGPUInstructionSelector::selectG_UNMERGE_VALUES(MachineInstr &MI) const {
const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, *MRI, TRI);
const TargetRegisterClass *SrcRC =
- TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank, *MRI);
+ TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank);
if (!SrcRC || !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI))
return false;
@@ -630,7 +652,7 @@ bool AMDGPUInstructionSelector::selectG_BUILD_VECTOR_TRUNC(
MachineInstr *Src1Def = getDefIgnoringCopies(Src1, *MRI);
if (Src1Def && Src1Def->getOpcode() == AMDGPU::G_IMPLICIT_DEF) {
MI.setDesc(TII.get(AMDGPU::COPY));
- MI.RemoveOperand(2);
+ MI.removeOperand(2);
return RBI.constrainGenericRegister(Dst, AMDGPU::SReg_32RegClass, *MRI) &&
RBI.constrainGenericRegister(Src0, AMDGPU::SReg_32RegClass, *MRI);
}
@@ -643,6 +665,8 @@ bool AMDGPUInstructionSelector::selectG_BUILD_VECTOR_TRUNC(
//
// (build_vector_trunc (lshr_oneuse $src0, 16), (lshr_oneuse $src1, 16)
// => (S_PACK_HH_B32_B16 $src0, $src1)
+ // (build_vector_trunc (lshr_oneuse SReg_32:$src0, 16), $src1)
+ // => (S_PACK_HL_B32_B16 $src0, $src1)
// (build_vector_trunc $src0, (lshr_oneuse SReg_32:$src1, 16))
// => (S_PACK_LH_B32_B16 $src0, $src1)
// (build_vector_trunc $src0, $src1)
@@ -662,14 +686,20 @@ bool AMDGPUInstructionSelector::selectG_BUILD_VECTOR_TRUNC(
} else if (Shift1) {
Opc = AMDGPU::S_PACK_LH_B32_B16;
MI.getOperand(2).setReg(ShiftSrc1);
- } else if (Shift0 && ConstSrc1 && ConstSrc1->Value == 0) {
- // build_vector_trunc (lshr $src0, 16), 0 -> s_lshr_b32 $src0, 16
- auto MIB = BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_LSHR_B32), Dst)
- .addReg(ShiftSrc0)
- .addImm(16);
+ } else if (Shift0) {
+ if (ConstSrc1 && ConstSrc1->Value == 0) {
+ // build_vector_trunc (lshr $src0, 16), 0 -> s_lshr_b32 $src0, 16
+ auto MIB = BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_LSHR_B32), Dst)
+ .addReg(ShiftSrc0)
+ .addImm(16);
- MI.eraseFromParent();
- return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
+ MI.eraseFromParent();
+ return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
+ }
+ if (STI.hasSPackHL()) {
+ Opc = AMDGPU::S_PACK_HL_B32_B16;
+ MI.getOperand(1).setReg(ShiftSrc0);
+ }
}
MI.setDesc(TII.get(Opc));
@@ -722,16 +752,16 @@ bool AMDGPUInstructionSelector::selectG_INSERT(MachineInstr &I) const {
const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
const TargetRegisterClass *DstRC =
- TRI.getRegClassForSizeOnBank(DstSize, *DstBank, *MRI);
+ TRI.getRegClassForSizeOnBank(DstSize, *DstBank);
if (!DstRC)
return false;
const RegisterBank *Src0Bank = RBI.getRegBank(Src0Reg, *MRI, TRI);
const RegisterBank *Src1Bank = RBI.getRegBank(Src1Reg, *MRI, TRI);
const TargetRegisterClass *Src0RC =
- TRI.getRegClassForSizeOnBank(DstSize, *Src0Bank, *MRI);
+ TRI.getRegClassForSizeOnBank(DstSize, *Src0Bank);
const TargetRegisterClass *Src1RC =
- TRI.getRegClassForSizeOnBank(InsSize, *Src1Bank, *MRI);
+ TRI.getRegClassForSizeOnBank(InsSize, *Src1Bank);
// Deal with weird cases where the class only partially supports the subreg
// index.
@@ -970,6 +1000,13 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC(MachineInstr &I) const {
return selectGroupStaticSize(I);
case Intrinsic::returnaddress:
return selectReturnAddress(I);
+ case Intrinsic::amdgcn_smfmac_f32_16x16x32_f16:
+ case Intrinsic::amdgcn_smfmac_f32_32x32x16_f16:
+ case Intrinsic::amdgcn_smfmac_f32_16x16x32_bf16:
+ case Intrinsic::amdgcn_smfmac_f32_32x32x16_bf16:
+ case Intrinsic::amdgcn_smfmac_i32_16x16x64_i8:
+ case Intrinsic::amdgcn_smfmac_i32_32x32x32_i8:
+ return selectSMFMACIntrin(I);
default:
return selectImpl(I, *CoverageInfo);
}
@@ -1142,7 +1179,7 @@ bool AMDGPUInstructionSelector::selectBallot(MachineInstr &I) const {
Optional<ValueAndVReg> Arg =
getIConstantVRegValWithLookThrough(I.getOperand(2).getReg(), *MRI);
- if (Arg.hasValue()) {
+ if (Arg) {
const int64_t Value = Arg.getValue().Value.getSExtValue();
if (Value == 0) {
unsigned Opcode = Is64 ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
@@ -1164,8 +1201,7 @@ bool AMDGPUInstructionSelector::selectBallot(MachineInstr &I) const {
bool AMDGPUInstructionSelector::selectRelocConstant(MachineInstr &I) const {
Register DstReg = I.getOperand(0).getReg();
const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
- const TargetRegisterClass *DstRC =
- TRI.getRegClassForSizeOnBank(32, *DstBank, *MRI);
+ const TargetRegisterClass *DstRC = TRI.getRegClassForSizeOnBank(32, *DstBank);
if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
return false;
@@ -1300,12 +1336,14 @@ bool AMDGPUInstructionSelector::selectDSOrderedIntrinsic(
unsigned ShaderType = SIInstrInfo::getDSShaderTypeValue(*MF);
unsigned Offset0 = OrderedCountIndex << 2;
- unsigned Offset1 = WaveRelease | (WaveDone << 1) | (ShaderType << 2) |
- (Instruction << 4);
+ unsigned Offset1 = WaveRelease | (WaveDone << 1) | (Instruction << 4);
if (STI.getGeneration() >= AMDGPUSubtarget::GFX10)
Offset1 |= (CountDw - 1) << 6;
+ if (STI.getGeneration() < AMDGPUSubtarget::GFX11)
+ Offset1 |= ShaderType << 2;
+
unsigned Offset = Offset0 | (Offset1 << 8);
Register M0Val = MI.getOperand(2).getReg();
@@ -1424,23 +1462,7 @@ bool AMDGPUInstructionSelector::selectDSGWSIntrinsic(MachineInstr &MI,
if (HasVSrc) {
Register VSrc = MI.getOperand(1).getReg();
-
- if (STI.needsAlignedVGPRs()) {
- // Add implicit aligned super-reg to force alignment on the data operand.
- Register Undef = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
- BuildMI(*MBB, &*MIB, DL, TII.get(AMDGPU::IMPLICIT_DEF), Undef);
- Register NewVR =
- MRI->createVirtualRegister(&AMDGPU::VReg_64_Align2RegClass);
- BuildMI(*MBB, &*MIB, DL, TII.get(AMDGPU::REG_SEQUENCE), NewVR)
- .addReg(VSrc, 0, MI.getOperand(1).getSubReg())
- .addImm(AMDGPU::sub0)
- .addReg(Undef)
- .addImm(AMDGPU::sub1);
- MIB.addReg(NewVR, 0, AMDGPU::sub0);
- MIB.addReg(NewVR, RegState::Implicit);
- } else {
- MIB.addReg(VSrc);
- }
+ MIB.addReg(VSrc);
if (!RBI.constrainGenericRegister(VSrc, AMDGPU::VGPR_32RegClass, *MRI))
return false;
@@ -1449,6 +1471,8 @@ bool AMDGPUInstructionSelector::selectDSGWSIntrinsic(MachineInstr &MI,
MIB.addImm(ImmOffset)
.cloneMemRefs(MI);
+ TII.enforceOperandRCAlignment(*MIB, AMDGPU::OpName::data0);
+
MI.eraseFromParent();
return true;
}
@@ -1523,6 +1547,7 @@ bool AMDGPUInstructionSelector::selectImageIntrinsic(
const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim);
unsigned IntrOpcode = Intr->BaseOpcode;
const bool IsGFX10Plus = AMDGPU::isGFX10Plus(STI);
+ const bool IsGFX11Plus = AMDGPU::isGFX11Plus(STI);
const unsigned ArgOffset = MI.getNumExplicitDefs() + 1;
@@ -1627,7 +1652,7 @@ bool AMDGPUInstructionSelector::selectImageIntrinsic(
}
// The legalizer preprocessed the intrinsic arguments. If we aren't using
- // NSA, these should have beeen packed into a single value in the first
+ // NSA, these should have been packed into a single value in the first
// address register
const bool UseNSA = NumVAddrRegs != 1 && NumVAddrDwords == NumVAddrRegs;
if (UseNSA && !STI.hasFeature(AMDGPU::FeatureNSAEncoding)) {
@@ -1639,13 +1664,29 @@ bool AMDGPUInstructionSelector::selectImageIntrinsic(
++NumVDataDwords;
int Opcode = -1;
- if (IsGFX10Plus) {
+ if (IsGFX11Plus) {
+ Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
+ UseNSA ? AMDGPU::MIMGEncGfx11NSA
+ : AMDGPU::MIMGEncGfx11Default,
+ NumVDataDwords, NumVAddrDwords);
+ } else if (IsGFX10Plus) {
Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
UseNSA ? AMDGPU::MIMGEncGfx10NSA
: AMDGPU::MIMGEncGfx10Default,
NumVDataDwords, NumVAddrDwords);
} else {
- if (STI.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
+ if (Subtarget->hasGFX90AInsts()) {
+ Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx90a,
+ NumVDataDwords, NumVAddrDwords);
+ if (Opcode == -1) {
+ LLVM_DEBUG(
+ dbgs()
+ << "requested image instruction is not supported on this GPU\n");
+ return false;
+ }
+ }
+ if (Opcode == -1 &&
+ STI.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx8,
NumVDataDwords, NumVAddrDwords);
if (Opcode == -1)
@@ -1703,7 +1744,13 @@ bool AMDGPUInstructionSelector::selectImageIntrinsic(
if (IsGFX10Plus)
MIB.addImm(IsA16 ? -1 : 0);
- MIB.addImm(TFE); // tfe
+ if (!Subtarget->hasGFX90AInsts()) {
+ MIB.addImm(TFE); // tfe
+ } else if (TFE) {
+ LLVM_DEBUG(dbgs() << "TFE is not supported on this GPU\n");
+ return false;
+ }
+
MIB.addImm(LWE); // lwe
if (!IsGFX10Plus)
MIB.addImm(DimInfo->DA ? -1 : 0);
@@ -1743,7 +1790,9 @@ bool AMDGPUInstructionSelector::selectImageIntrinsic(
}
MI.eraseFromParent();
- return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
+ constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
+ TII.enforceOperandRCAlignment(*MIB, AMDGPU::OpName::vaddr);
+ return true;
}
bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
@@ -1770,10 +1819,22 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
return selectSBarrier(I);
case Intrinsic::amdgcn_global_atomic_fadd:
return selectGlobalAtomicFadd(I, I.getOperand(2), I.getOperand(3));
- default: {
- return selectImpl(I, *CoverageInfo);
- }
+ case Intrinsic::amdgcn_raw_buffer_load_lds:
+ case Intrinsic::amdgcn_struct_buffer_load_lds:
+ return selectBufferLoadLds(I);
+ case Intrinsic::amdgcn_global_load_lds:
+ return selectGlobalLoadLds(I);
+ case Intrinsic::amdgcn_exp_compr:
+ if (!STI.hasCompressedExport()) {
+ Function &F = I.getMF()->getFunction();
+ DiagnosticInfoUnsupported NoFpRet(
+ F, "intrinsic not supported on subtarget", I.getDebugLoc(), DS_Error);
+ F.getContext().diagnose(NoFpRet);
+ return false;
+ }
+ break;
}
+ return selectImpl(I, *CoverageInfo);
}
bool AMDGPUInstructionSelector::selectG_SELECT(MachineInstr &I) const {
@@ -1872,10 +1933,10 @@ bool AMDGPUInstructionSelector::selectG_TRUNC(MachineInstr &I) const {
unsigned DstSize = DstTy.getSizeInBits();
unsigned SrcSize = SrcTy.getSizeInBits();
- const TargetRegisterClass *SrcRC
- = TRI.getRegClassForSizeOnBank(SrcSize, *SrcRB, *MRI);
- const TargetRegisterClass *DstRC
- = TRI.getRegClassForSizeOnBank(DstSize, *DstRB, *MRI);
+ const TargetRegisterClass *SrcRC =
+ TRI.getRegClassForSizeOnBank(SrcSize, *SrcRB);
+ const TargetRegisterClass *DstRC =
+ TRI.getRegClassForSizeOnBank(DstSize, *DstRB);
if (!SrcRC || !DstRC)
return false;
@@ -2014,10 +2075,10 @@ bool AMDGPUInstructionSelector::selectG_SZA_EXT(MachineInstr &I) const {
return selectCOPY(I);
const TargetRegisterClass *SrcRC =
- TRI.getRegClassForTypeOnBank(SrcTy, *SrcBank, *MRI);
+ TRI.getRegClassForTypeOnBank(SrcTy, *SrcBank);
const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
const TargetRegisterClass *DstRC =
- TRI.getRegClassForSizeOnBank(DstSize, *DstBank, *MRI);
+ TRI.getRegClassForSizeOnBank(DstSize, *DstBank);
Register UndefReg = MRI->createVirtualRegister(SrcRC);
BuildMI(MBB, I, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefReg);
@@ -2384,65 +2445,6 @@ bool AMDGPUInstructionSelector::selectG_LOAD_STORE_ATOMICRMW(
return selectImpl(I, *CoverageInfo);
}
-// TODO: No rtn optimization.
-bool AMDGPUInstructionSelector::selectG_AMDGPU_ATOMIC_CMPXCHG(
- MachineInstr &MI) const {
- Register PtrReg = MI.getOperand(1).getReg();
- const LLT PtrTy = MRI->getType(PtrReg);
- if (PtrTy.getAddressSpace() == AMDGPUAS::FLAT_ADDRESS ||
- STI.useFlatForGlobal())
- return selectImpl(MI, *CoverageInfo);
-
- Register DstReg = MI.getOperand(0).getReg();
- const LLT Ty = MRI->getType(DstReg);
- const bool Is64 = Ty.getSizeInBits() == 64;
- const unsigned SubReg = Is64 ? AMDGPU::sub0_sub1 : AMDGPU::sub0;
- Register TmpReg = MRI->createVirtualRegister(
- Is64 ? &AMDGPU::VReg_128RegClass : &AMDGPU::VReg_64RegClass);
-
- const DebugLoc &DL = MI.getDebugLoc();
- MachineBasicBlock *BB = MI.getParent();
-
- Register VAddr, RSrcReg, SOffset;
- int64_t Offset = 0;
-
- unsigned Opcode;
- if (selectMUBUFOffsetImpl(MI.getOperand(1), RSrcReg, SOffset, Offset)) {
- Opcode = Is64 ? AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_OFFSET_RTN :
- AMDGPU::BUFFER_ATOMIC_CMPSWAP_OFFSET_RTN;
- } else if (selectMUBUFAddr64Impl(MI.getOperand(1), VAddr,
- RSrcReg, SOffset, Offset)) {
- Opcode = Is64 ? AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_ADDR64_RTN :
- AMDGPU::BUFFER_ATOMIC_CMPSWAP_ADDR64_RTN;
- } else
- return selectImpl(MI, *CoverageInfo);
-
- auto MIB = BuildMI(*BB, &MI, DL, TII.get(Opcode), TmpReg)
- .addReg(MI.getOperand(2).getReg());
-
- if (VAddr)
- MIB.addReg(VAddr);
-
- MIB.addReg(RSrcReg);
- if (SOffset)
- MIB.addReg(SOffset);
- else
- MIB.addImm(0);
-
- MIB.addImm(Offset);
- MIB.addImm(AMDGPU::CPol::GLC);
- MIB.cloneMemRefs(MI);
-
- BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), DstReg)
- .addReg(TmpReg, RegState::Kill, SubReg);
-
- MI.eraseFromParent();
-
- MRI->setRegClass(
- DstReg, Is64 ? &AMDGPU::VReg_64RegClass : &AMDGPU::VGPR_32RegClass);
- return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
-}
-
static bool isVCmpResult(Register Reg, MachineRegisterInfo &MRI) {
if (Reg.isPhysical())
return false;
@@ -2551,7 +2553,7 @@ bool AMDGPUInstructionSelector::selectG_PTRMASK(MachineInstr &I) const {
// Try to avoid emitting a bit operation when we only need to touch half of
// the 64-bit pointer.
- APInt MaskOnes = KnownBits->getKnownOnes(MaskReg).zextOrSelf(64);
+ APInt MaskOnes = KnownBits->getKnownOnes(MaskReg).zext(64);
const APInt MaskHi32 = APInt::getHighBitsSet(64, 32);
const APInt MaskLo32 = APInt::getLowBitsSet(64, 32);
@@ -2571,12 +2573,10 @@ bool AMDGPUInstructionSelector::selectG_PTRMASK(MachineInstr &I) const {
const TargetRegisterClass &RegRC
= IsVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;
- const TargetRegisterClass *DstRC = TRI.getRegClassForTypeOnBank(Ty, *DstRB,
- *MRI);
- const TargetRegisterClass *SrcRC = TRI.getRegClassForTypeOnBank(Ty, *SrcRB,
- *MRI);
+ const TargetRegisterClass *DstRC = TRI.getRegClassForTypeOnBank(Ty, *DstRB);
+ const TargetRegisterClass *SrcRC = TRI.getRegClassForTypeOnBank(Ty, *SrcRB);
const TargetRegisterClass *MaskRC =
- TRI.getRegClassForTypeOnBank(MaskTy, *MaskRB, *MRI);
+ TRI.getRegClassForTypeOnBank(MaskTy, *MaskRB);
if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
@@ -2689,10 +2689,10 @@ bool AMDGPUInstructionSelector::selectG_EXTRACT_VECTOR_ELT(
if (IdxRB->getID() != AMDGPU::SGPRRegBankID)
return false;
- const TargetRegisterClass *SrcRC = TRI.getRegClassForTypeOnBank(SrcTy, *SrcRB,
- *MRI);
- const TargetRegisterClass *DstRC = TRI.getRegClassForTypeOnBank(DstTy, *DstRB,
- *MRI);
+ const TargetRegisterClass *SrcRC =
+ TRI.getRegClassForTypeOnBank(SrcTy, *SrcRB);
+ const TargetRegisterClass *DstRC =
+ TRI.getRegClassForTypeOnBank(DstTy, *DstRB);
if (!SrcRC || !DstRC)
return false;
if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
@@ -2771,10 +2771,10 @@ bool AMDGPUInstructionSelector::selectG_INSERT_VECTOR_ELT(
if (IdxRB->getID() != AMDGPU::SGPRRegBankID)
return false;
- const TargetRegisterClass *VecRC = TRI.getRegClassForTypeOnBank(VecTy, *VecRB,
- *MRI);
- const TargetRegisterClass *ValRC = TRI.getRegClassForTypeOnBank(ValTy, *ValRB,
- *MRI);
+ const TargetRegisterClass *VecRC =
+ TRI.getRegClassForTypeOnBank(VecTy, *VecRB);
+ const TargetRegisterClass *ValRC =
+ TRI.getRegClassForTypeOnBank(ValTy, *ValRB);
if (!RBI.constrainGenericRegister(VecReg, *VecRC, *MRI) ||
!RBI.constrainGenericRegister(DstReg, *VecRC, *MRI) ||
@@ -2867,7 +2867,6 @@ bool AMDGPUInstructionSelector::selectG_SHUFFLE_VECTOR(
return false;
assert(ShufMask.size() == 2);
- assert(STI.hasSDWA() && "no target has VOP3P but not SDWA");
MachineBasicBlock *MBB = MI.getParent();
const DebugLoc &DL = MI.getDebugLoc();
@@ -2924,17 +2923,28 @@ bool AMDGPUInstructionSelector::selectG_SHUFFLE_VECTOR(
}
} else if (Mask[0] == 0 && Mask[1] == 0) {
if (IsVALU) {
- // Write low half of the register into the high half.
- MachineInstr *MovSDWA =
- BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_MOV_B32_sdwa), DstReg)
- .addImm(0) // $src0_modifiers
- .addReg(SrcVec) // $src0
- .addImm(0) // $clamp
- .addImm(AMDGPU::SDWA::WORD_1) // $dst_sel
- .addImm(AMDGPU::SDWA::UNUSED_PRESERVE) // $dst_unused
- .addImm(AMDGPU::SDWA::WORD_0) // $src0_sel
- .addReg(SrcVec, RegState::Implicit);
- MovSDWA->tieOperands(0, MovSDWA->getNumOperands() - 1);
+ if (STI.hasSDWA()) {
+ // Write low half of the register into the high half.
+ MachineInstr *MovSDWA =
+ BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_MOV_B32_sdwa), DstReg)
+ .addImm(0) // $src0_modifiers
+ .addReg(SrcVec) // $src0
+ .addImm(0) // $clamp
+ .addImm(AMDGPU::SDWA::WORD_1) // $dst_sel
+ .addImm(AMDGPU::SDWA::UNUSED_PRESERVE) // $dst_unused
+ .addImm(AMDGPU::SDWA::WORD_0) // $src0_sel
+ .addReg(SrcVec, RegState::Implicit);
+ MovSDWA->tieOperands(0, MovSDWA->getNumOperands() - 1);
+ } else {
+ Register TmpReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_AND_B32_e32), TmpReg)
+ .addImm(0xFFFF)
+ .addReg(SrcVec);
+ BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_LSHL_OR_B32_e64), DstReg)
+ .addReg(TmpReg)
+ .addImm(16)
+ .addReg(TmpReg);
+ }
} else {
BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_PACK_LL_B32_B16), DstReg)
.addReg(SrcVec)
@@ -2942,17 +2952,28 @@ bool AMDGPUInstructionSelector::selectG_SHUFFLE_VECTOR(
}
} else if (Mask[0] == 1 && Mask[1] == 1) {
if (IsVALU) {
- // Write high half of the register into the low half.
- MachineInstr *MovSDWA =
- BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_MOV_B32_sdwa), DstReg)
- .addImm(0) // $src0_modifiers
- .addReg(SrcVec) // $src0
- .addImm(0) // $clamp
- .addImm(AMDGPU::SDWA::WORD_0) // $dst_sel
- .addImm(AMDGPU::SDWA::UNUSED_PRESERVE) // $dst_unused
- .addImm(AMDGPU::SDWA::WORD_1) // $src0_sel
- .addReg(SrcVec, RegState::Implicit);
- MovSDWA->tieOperands(0, MovSDWA->getNumOperands() - 1);
+ if (STI.hasSDWA()) {
+ // Write high half of the register into the low half.
+ MachineInstr *MovSDWA =
+ BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_MOV_B32_sdwa), DstReg)
+ .addImm(0) // $src0_modifiers
+ .addReg(SrcVec) // $src0
+ .addImm(0) // $clamp
+ .addImm(AMDGPU::SDWA::WORD_0) // $dst_sel
+ .addImm(AMDGPU::SDWA::UNUSED_PRESERVE) // $dst_unused
+ .addImm(AMDGPU::SDWA::WORD_1) // $src0_sel
+ .addReg(SrcVec, RegState::Implicit);
+ MovSDWA->tieOperands(0, MovSDWA->getNumOperands() - 1);
+ } else {
+ Register TmpReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_LSHRREV_B32_e64), TmpReg)
+ .addImm(16)
+ .addReg(SrcVec);
+ BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_LSHL_OR_B32_e64), DstReg)
+ .addReg(TmpReg)
+ .addImm(16)
+ .addReg(TmpReg);
+ }
} else {
BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_PACK_HH_B32_B16), DstReg)
.addReg(SrcVec)
@@ -2965,13 +2986,19 @@ bool AMDGPUInstructionSelector::selectG_SHUFFLE_VECTOR(
.addReg(SrcVec)
.addImm(16);
} else {
- Register TmpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
- BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHR_B32), TmpReg)
- .addReg(SrcVec)
- .addImm(16);
- BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_PACK_LL_B32_B16), DstReg)
- .addReg(TmpReg)
- .addReg(SrcVec);
+ if (STI.hasSPackHL()) {
+ BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_PACK_HL_B32_B16), DstReg)
+ .addReg(SrcVec)
+ .addReg(SrcVec);
+ } else {
+ Register TmpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
+ BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHR_B32), TmpReg)
+ .addReg(SrcVec)
+ .addImm(16);
+ BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_PACK_LL_B32_B16), DstReg)
+ .addReg(TmpReg)
+ .addReg(SrcVec);
+ }
}
} else
llvm_unreachable("all shuffle masks should be handled");
@@ -2982,13 +3009,15 @@ bool AMDGPUInstructionSelector::selectG_SHUFFLE_VECTOR(
bool AMDGPUInstructionSelector::selectAMDGPU_BUFFER_ATOMIC_FADD(
MachineInstr &MI) const {
- if (STI.hasGFX90AInsts())
+ const Register DefReg = MI.getOperand(0).getReg();
+ LLT DefTy = MRI->getType(DefReg);
+ if (AMDGPU::hasAtomicFaddRtnForTy(STI, DefTy))
return selectImpl(MI, *CoverageInfo);
MachineBasicBlock *MBB = MI.getParent();
const DebugLoc &DL = MI.getDebugLoc();
- if (!MRI->use_nodbg_empty(MI.getOperand(0).getReg())) {
+ if (!MRI->use_nodbg_empty(DefReg)) {
Function &F = MBB->getParent()->getFunction();
DiagnosticInfoUnsupported
NoFpRet(F, "return versions of fp atomics not supported",
@@ -3105,9 +3134,236 @@ bool AMDGPUInstructionSelector::selectGlobalAtomicFadd(
return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
}
+bool AMDGPUInstructionSelector::selectBufferLoadLds(MachineInstr &MI) const {
+ unsigned Opc;
+ unsigned Size = MI.getOperand(3).getImm();
+
+ // The struct intrinsic variants add one additional operand over raw.
+ const bool HasVIndex = MI.getNumOperands() == 9;
+ Register VIndex;
+ int OpOffset = 0;
+ if (HasVIndex) {
+ VIndex = MI.getOperand(4).getReg();
+ OpOffset = 1;
+ }
+
+ Register VOffset = MI.getOperand(4 + OpOffset).getReg();
+ Optional<ValueAndVReg> MaybeVOffset =
+ getIConstantVRegValWithLookThrough(VOffset, *MRI);
+ const bool HasVOffset = !MaybeVOffset || MaybeVOffset->Value.getZExtValue();
+
+ switch (Size) {
+ default:
+ return false;
+ case 1:
+ Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_BOTHEN
+ : AMDGPU::BUFFER_LOAD_UBYTE_LDS_IDXEN
+ : HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFEN
+ : AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFSET;
+ break;
+ case 2:
+ Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_BOTHEN
+ : AMDGPU::BUFFER_LOAD_USHORT_LDS_IDXEN
+ : HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFEN
+ : AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFSET;
+ break;
+ case 4:
+ Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_BOTHEN
+ : AMDGPU::BUFFER_LOAD_DWORD_LDS_IDXEN
+ : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFEN
+ : AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFSET;
+ break;
+ }
+
+ MachineBasicBlock *MBB = MI.getParent();
+ const DebugLoc &DL = MI.getDebugLoc();
+ BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
+ .add(MI.getOperand(2));
+
+ auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc));
+
+ if (HasVIndex && HasVOffset) {
+ Register IdxReg = MRI->createVirtualRegister(TRI.getVGPR64Class());
+ BuildMI(*MBB, &*MIB, DL, TII.get(AMDGPU::REG_SEQUENCE), IdxReg)
+ .addReg(VIndex)
+ .addImm(AMDGPU::sub0)
+ .addReg(VOffset)
+ .addImm(AMDGPU::sub1);
+
+ MIB.addReg(IdxReg);
+ } else if (HasVIndex) {
+ MIB.addReg(VIndex);
+ } else if (HasVOffset) {
+ MIB.addReg(VOffset);
+ }
+
+ MIB.add(MI.getOperand(1)); // rsrc
+ MIB.add(MI.getOperand(5 + OpOffset)); // soffset
+ MIB.add(MI.getOperand(6 + OpOffset)); // imm offset
+ unsigned Aux = MI.getOperand(7 + OpOffset).getImm();
+ MIB.addImm(Aux & AMDGPU::CPol::ALL); // cpol
+ MIB.addImm((Aux >> 3) & 1); // swz
+
+ MachineMemOperand *LoadMMO = *MI.memoperands_begin();
+ MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
+ LoadPtrI.Offset = MI.getOperand(6 + OpOffset).getImm();
+ MachinePointerInfo StorePtrI = LoadPtrI;
+ StorePtrI.V = nullptr;
+ StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
+
+ auto F = LoadMMO->getFlags() &
+ ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad);
+ LoadMMO = MF->getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad,
+ Size, LoadMMO->getBaseAlign());
+
+ MachineMemOperand *StoreMMO =
+ MF->getMachineMemOperand(StorePtrI, F | MachineMemOperand::MOStore,
+ sizeof(int32_t), LoadMMO->getBaseAlign());
+
+ MIB.setMemRefs({LoadMMO, StoreMMO});
+
+ MI.eraseFromParent();
+ return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
+}
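// Note on the operands built above: the selected BUFFER_LOAD_*_LDS
// instruction both reads from the buffer and writes to LDS, so the original
// single memory operand is split into a load MMO for the buffer access (Size
// bytes) and a store MMO for the 4-byte LDS write, attached via setMemRefs.
// The intrinsic's aux immediate is likewise split: its low bits carry the
// cache-policy (cpol) value and bit 3 the swizzle enable (swz).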
+
+/// Match a zero extend from a 32-bit value to 64-bits.
+static Register matchZeroExtendFromS32(MachineRegisterInfo &MRI, Register Reg) {
+ Register ZExtSrc;
+ if (mi_match(Reg, MRI, m_GZExt(m_Reg(ZExtSrc))))
+ return MRI.getType(ZExtSrc) == LLT::scalar(32) ? ZExtSrc : Register();
+
+ // Match legalized form %zext = G_MERGE_VALUES (s32 %x), (s32 0)
+ const MachineInstr *Def = getDefIgnoringCopies(Reg, MRI);
+ if (Def->getOpcode() != AMDGPU::G_MERGE_VALUES)
+ return false;
+
+ if (mi_match(Def->getOperand(2).getReg(), MRI, m_ZeroInt())) {
+ return Def->getOperand(1).getReg();
+ }
+
+ return Register();
+}
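// For example (MIR shapes are illustrative), both of the following yield %x
// as the matched 32-bit source:
//   %zext:_(s64) = G_ZEXT %x:_(s32)
//   %zero:_(s32) = G_CONSTANT i32 0
//   %zext:_(s64) = G_MERGE_VALUES %x:_(s32), %zero:_(s32)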
+
+bool AMDGPUInstructionSelector::selectGlobalLoadLds(MachineInstr &MI) const {
+ unsigned Opc;
+ unsigned Size = MI.getOperand(3).getImm();
+
+ switch (Size) {
+ default:
+ return false;
+ case 1:
+ Opc = AMDGPU::GLOBAL_LOAD_LDS_UBYTE;
+ break;
+ case 2:
+ Opc = AMDGPU::GLOBAL_LOAD_LDS_USHORT;
+ break;
+ case 4:
+ Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORD;
+ break;
+ }
+
+ MachineBasicBlock *MBB = MI.getParent();
+ const DebugLoc &DL = MI.getDebugLoc();
+ BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
+ .add(MI.getOperand(2));
+
+ Register Addr = MI.getOperand(1).getReg();
+ Register VOffset;
+ // Try to split SAddr and VOffset. Global and LDS pointers share the same
+ // immediate offset, so we cannot use a regular SelectGlobalSAddr().
+ if (!isSGPR(Addr)) {
+ auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
+ if (isSGPR(AddrDef->Reg)) {
+ Addr = AddrDef->Reg;
+ } else if (AddrDef->MI->getOpcode() == AMDGPU::G_PTR_ADD) {
+ Register SAddr =
+ getSrcRegIgnoringCopies(AddrDef->MI->getOperand(1).getReg(), *MRI);
+ if (SAddr && isSGPR(SAddr)) {
+ Register PtrBaseOffset = AddrDef->MI->getOperand(2).getReg();
+ if (Register Off = matchZeroExtendFromS32(*MRI, PtrBaseOffset)) {
+ Addr = SAddr;
+ VOffset = Off;
+ }
+ }
+ }
+ }
+
+ if (isSGPR(Addr)) {
+ Opc = AMDGPU::getGlobalSaddrOp(Opc);
+ if (!VOffset) {
+ VOffset = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_MOV_B32_e32), VOffset)
+ .addImm(0);
+ }
+ }
+
+ auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc))
+ .addReg(Addr);
+
+ if (isSGPR(Addr))
+ MIB.addReg(VOffset);
+
+ MIB.add(MI.getOperand(4)) // offset
+ .add(MI.getOperand(5)); // cpol
+
+ MachineMemOperand *LoadMMO = *MI.memoperands_begin();
+ MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
+ LoadPtrI.Offset = MI.getOperand(4).getImm();
+ MachinePointerInfo StorePtrI = LoadPtrI;
+ LoadPtrI.AddrSpace = AMDGPUAS::GLOBAL_ADDRESS;
+ StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
+ auto F = LoadMMO->getFlags() &
+ ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad);
+ LoadMMO = MF->getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad,
+ Size, LoadMMO->getBaseAlign());
+ MachineMemOperand *StoreMMO =
+ MF->getMachineMemOperand(StorePtrI, F | MachineMemOperand::MOStore,
+ sizeof(int32_t), Align(4));
+
+ MIB.setMemRefs({LoadMMO, StoreMMO});
+
+ MI.eraseFromParent();
+ return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
+}
+
bool AMDGPUInstructionSelector::selectBVHIntrinsic(MachineInstr &MI) const{
MI.setDesc(TII.get(MI.getOperand(1).getImm()));
- MI.RemoveOperand(1);
+ MI.removeOperand(1);
+ MI.addImplicitDefUseOperands(*MI.getParent()->getParent());
+ return true;
+}
+
+bool AMDGPUInstructionSelector::selectSMFMACIntrin(MachineInstr &MI) const {
+ unsigned Opc;
+ switch (MI.getIntrinsicID()) {
+ case Intrinsic::amdgcn_smfmac_f32_16x16x32_f16:
+ Opc = AMDGPU::V_SMFMAC_F32_16X16X32_F16_e64;
+ break;
+ case Intrinsic::amdgcn_smfmac_f32_32x32x16_f16:
+ Opc = AMDGPU::V_SMFMAC_F32_32X32X16_F16_e64;
+ break;
+ case Intrinsic::amdgcn_smfmac_f32_16x16x32_bf16:
+ Opc = AMDGPU::V_SMFMAC_F32_16X16X32_BF16_e64;
+ break;
+ case Intrinsic::amdgcn_smfmac_f32_32x32x16_bf16:
+ Opc = AMDGPU::V_SMFMAC_F32_32X32X16_BF16_e64;
+ break;
+ case Intrinsic::amdgcn_smfmac_i32_16x16x64_i8:
+ Opc = AMDGPU::V_SMFMAC_I32_16X16X64_I8_e64;
+ break;
+ case Intrinsic::amdgcn_smfmac_i32_32x32x32_i8:
+ Opc = AMDGPU::V_SMFMAC_I32_32X32X32_I8_e64;
+ break;
+ default:
+ llvm_unreachable("unhandled smfmac intrinsic");
+ }
+
+ auto VDst_In = MI.getOperand(4);
+
+ MI.setDesc(TII.get(Opc));
+ MI.removeOperand(4); // VDst_In
+ MI.removeOperand(1); // Intrinsic ID
+ MI.addOperand(VDst_In); // Readd VDst_In to the end
MI.addImplicitDefUseOperands(*MI.getParent()->getParent());
return true;
}
@@ -3166,6 +3422,9 @@ bool AMDGPUInstructionSelector::select(MachineInstr &I) {
case TargetOpcode::G_UADDE:
case TargetOpcode::G_USUBE:
return selectG_UADDO_USUBO_UADDE_USUBE(I);
+ case AMDGPU::G_AMDGPU_MAD_U64_U32:
+ case AMDGPU::G_AMDGPU_MAD_I64_I32:
+ return selectG_AMDGPU_MAD_64_32(I);
case TargetOpcode::G_INTTOPTR:
case TargetOpcode::G_BITCAST:
case TargetOpcode::G_PTRTOINT:
@@ -3226,8 +3485,6 @@ bool AMDGPUInstructionSelector::select(MachineInstr &I) {
case AMDGPU::G_AMDGPU_ATOMIC_FMIN:
case AMDGPU::G_AMDGPU_ATOMIC_FMAX:
return selectG_LOAD_STORE_ATOMICRMW(I);
- case AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG:
- return selectG_AMDGPU_ATOMIC_CMPXCHG(I);
case TargetOpcode::G_SELECT:
return selectG_SELECT(I);
case TargetOpcode::G_TRUNC:
@@ -3286,9 +3543,8 @@ AMDGPUInstructionSelector::selectVCSRC(MachineOperand &Root) const {
}
-std::pair<Register, unsigned>
-AMDGPUInstructionSelector::selectVOP3ModsImpl(MachineOperand &Root,
- bool AllowAbs) const {
+std::pair<Register, unsigned> AMDGPUInstructionSelector::selectVOP3ModsImpl(
+ MachineOperand &Root, bool AllowAbs, bool OpSel, bool ForceVGPR) const {
Register Src = Root.getReg();
Register OrigSrc = Src;
unsigned Mods = 0;
@@ -3305,7 +3561,10 @@ AMDGPUInstructionSelector::selectVOP3ModsImpl(MachineOperand &Root,
Mods |= SISrcMods::ABS;
}
- if (Mods != 0 &&
+ if (OpSel)
+ Mods |= SISrcMods::OP_SEL_0;
+
+ if ((Mods != 0 || ForceVGPR) &&
RBI.getRegBank(Src, *MRI, TRI)->getID() != AMDGPU::VGPRRegBankID) {
MachineInstr *UseMI = Root.getParent();
@@ -3407,7 +3666,7 @@ AMDGPUInstructionSelector::selectVOP3NoMods(MachineOperand &Root) const {
std::pair<Register, unsigned>
AMDGPUInstructionSelector::selectVOP3PModsImpl(
- Register Src, const MachineRegisterInfo &MRI) const {
+ Register Src, const MachineRegisterInfo &MRI, bool IsDOT) const {
unsigned Mods = 0;
MachineInstr *MI = MRI.getVRegDef(Src);
@@ -3421,6 +3680,7 @@ AMDGPUInstructionSelector::selectVOP3PModsImpl(
}
// TODO: Match op_sel through g_build_vector_trunc and g_shuffle_vector.
+ (void)IsDOT; // DOTs do not use OPSEL on gfx940+, check ST.hasDOTOpSelHazard()
// Packed instructions do not have abs modifiers.
Mods |= SISrcMods::OP_SEL_1;
@@ -3444,6 +3704,50 @@ AMDGPUInstructionSelector::selectVOP3PMods(MachineOperand &Root) const {
}
InstructionSelector::ComplexRendererFns
+AMDGPUInstructionSelector::selectVOP3PModsDOT(MachineOperand &Root) const {
+ MachineRegisterInfo &MRI
+ = Root.getParent()->getParent()->getParent()->getRegInfo();
+
+ Register Src;
+ unsigned Mods;
+ std::tie(Src, Mods) = selectVOP3PModsImpl(Root.getReg(), MRI, true);
+
+ return {{
+ [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
+ }};
+}
+
+InstructionSelector::ComplexRendererFns
+AMDGPUInstructionSelector::selectDotIUVOP3PMods(MachineOperand &Root) const {
+  // A literal i1 value set in the intrinsic represents SrcMods for the next operand.
+ // Value is in Imm operand as i1 sign extended to int64_t.
+ // 1(-1) promotes packed values to signed, 0 treats them as unsigned.
+ assert((Root.isImm() && (Root.getImm() == -1 || Root.getImm() == 0)) &&
+ "expected i1 value");
+ unsigned Mods = SISrcMods::OP_SEL_1;
+ if (Root.getImm() == -1)
+ Mods ^= SISrcMods::NEG;
+ return {{
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
+ }};
+}
+
+InstructionSelector::ComplexRendererFns
+AMDGPUInstructionSelector::selectWMMAOpSelVOP3PMods(
+ MachineOperand &Root) const {
+ assert((Root.isImm() && (Root.getImm() == -1 || Root.getImm() == 0)) &&
+ "expected i1 value");
+ unsigned Mods = SISrcMods::OP_SEL_1;
+ if (Root.getImm() != 0)
+ Mods |= SISrcMods::OP_SEL_0;
+
+ return {{
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
+ }};
+}
+
+InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectVOP3Mods_nnan(MachineOperand &Root) const {
Register Src;
unsigned Mods;
@@ -3467,6 +3771,36 @@ AMDGPUInstructionSelector::selectVOP3OpSelMods(MachineOperand &Root) const {
}
InstructionSelector::ComplexRendererFns
+AMDGPUInstructionSelector::selectVINTERPMods(MachineOperand &Root) const {
+ Register Src;
+ unsigned Mods;
+ std::tie(Src, Mods) = selectVOP3ModsImpl(Root,
+ /* AllowAbs */ false,
+ /* OpSel */ false,
+ /* ForceVGPR */ true);
+
+ return {{
+ [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
+ }};
+}
+
+InstructionSelector::ComplexRendererFns
+AMDGPUInstructionSelector::selectVINTERPModsHi(MachineOperand &Root) const {
+ Register Src;
+ unsigned Mods;
+ std::tie(Src, Mods) = selectVOP3ModsImpl(Root,
+ /* AllowAbs */ false,
+ /* OpSel */ true,
+ /* ForceVGPR */ true);
+
+ return {{
+ [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
+ }};
+}
+
+InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectSmrdImm(MachineOperand &Root) const {
SmallVector<GEPInfo, 4> AddrInfo;
getAddrModeInfo(*Root.getParent(), *MRI, AddrInfo);
@@ -3594,24 +3928,6 @@ AMDGPUInstructionSelector::selectScratchOffset(MachineOperand &Root) const {
}};
}
-/// Match a zero extend from a 32-bit value to 64-bits.
-static Register matchZeroExtendFromS32(MachineRegisterInfo &MRI, Register Reg) {
- Register ZExtSrc;
- if (mi_match(Reg, MRI, m_GZExt(m_Reg(ZExtSrc))))
- return MRI.getType(ZExtSrc) == LLT::scalar(32) ? ZExtSrc : Register();
-
- // Match legalized form %zext = G_MERGE_VALUES (s32 %x), (s32 0)
- const MachineInstr *Def = getDefIgnoringCopies(Reg, MRI);
- if (Def->getOpcode() != AMDGPU::G_MERGE_VALUES)
- return false;
-
- if (mi_match(Def->getOperand(2).getReg(), MRI, m_ZeroInt())) {
- return Def->getOperand(1).getReg();
- }
-
- return Register();
-}
-
// Match (64-bit SGPR base) + (zext vgpr offset) + sext(imm offset)
InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root) const {
@@ -3631,9 +3947,6 @@ AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root) const {
ImmOffset = ConstOffset;
} else {
auto PtrBaseDef = getDefSrcRegIgnoringCopies(PtrBase, *MRI);
- if (!PtrBaseDef)
- return None;
-
if (isSGPR(PtrBaseDef->Reg)) {
if (ConstOffset > 0) {
// Offset is too large.
@@ -3679,11 +3992,8 @@ AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root) const {
}
}
- auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
- if (!AddrDef)
- return None;
-
// Match the variable offset.
+ auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
if (AddrDef->MI->getOpcode() == AMDGPU::G_PTR_ADD) {
// Look through the SGPR->VGPR copy.
Register SAddr =
@@ -3749,9 +4059,6 @@ AMDGPUInstructionSelector::selectScratchSAddr(MachineOperand &Root) const {
}
auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
- if (!AddrDef)
- return None;
-
if (AddrDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX) {
int FI = AddrDef->MI->getOperand(1).getIndex();
return {{
@@ -3768,8 +4075,7 @@ AMDGPUInstructionSelector::selectScratchSAddr(MachineOperand &Root) const {
auto LHSDef = getDefSrcRegIgnoringCopies(LHS, *MRI);
auto RHSDef = getDefSrcRegIgnoringCopies(RHS, *MRI);
- if (LHSDef && RHSDef &&
- LHSDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX &&
+ if (LHSDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX &&
isSGPR(RHSDef->Reg)) {
int FI = LHSDef->MI->getOperand(1).getIndex();
MachineInstr &I = *Root.getParent();
@@ -3792,6 +4098,74 @@ AMDGPUInstructionSelector::selectScratchSAddr(MachineOperand &Root) const {
}};
}
+// Check whether the flat scratch SVS swizzle bug affects this access.
+bool AMDGPUInstructionSelector::checkFlatScratchSVSSwizzleBug(
+ Register VAddr, Register SAddr, uint64_t ImmOffset) const {
+ if (!Subtarget->hasFlatScratchSVSSwizzleBug())
+ return false;
+
+ // The bug affects the swizzling of SVS accesses if there is any carry out
+ // from the two low order bits (i.e. from bit 1 into bit 2) when adding
+ // voffset to (soffset + inst_offset).
+ auto VKnown = KnownBits->getKnownBits(VAddr);
+ auto SKnown = KnownBits::computeForAddSub(
+ true, false, KnownBits->getKnownBits(SAddr),
+ KnownBits::makeConstant(APInt(32, ImmOffset)));
+ uint64_t VMax = VKnown.getMaxValue().getZExtValue();
+ uint64_t SMax = SKnown.getMaxValue().getZExtValue();
+ return (VMax & 3) + (SMax & 3) >= 4;
+}
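// Worked example for the check above: if the known maximum of VAddr ends in
// 0b11 (VMax & 3 == 3) and the known maximum of SAddr + ImmOffset ends in
// 0b01 (SMax & 3 == 1), then 3 + 1 >= 4, so a carry out of bit 1 cannot be
// ruled out and the access is conservatively reported as affected.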
+
+InstructionSelector::ComplexRendererFns
+AMDGPUInstructionSelector::selectScratchSVAddr(MachineOperand &Root) const {
+ Register Addr = Root.getReg();
+ Register PtrBase;
+ int64_t ConstOffset;
+ int64_t ImmOffset = 0;
+
+ // Match the immediate offset first, which canonically is moved as low as
+ // possible.
+ std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(Addr, *MRI);
+
+ if (ConstOffset != 0 &&
+ TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::PRIVATE_ADDRESS, true)) {
+ Addr = PtrBase;
+ ImmOffset = ConstOffset;
+ }
+
+ auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
+ if (AddrDef->MI->getOpcode() != AMDGPU::G_PTR_ADD)
+ return None;
+
+ Register RHS = AddrDef->MI->getOperand(2).getReg();
+ if (RBI.getRegBank(RHS, *MRI, TRI)->getID() != AMDGPU::VGPRRegBankID)
+ return None;
+
+ Register LHS = AddrDef->MI->getOperand(1).getReg();
+ auto LHSDef = getDefSrcRegIgnoringCopies(LHS, *MRI);
+
+ if (checkFlatScratchSVSSwizzleBug(RHS, LHS, ImmOffset))
+ return None;
+
+ if (LHSDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX) {
+ int FI = LHSDef->MI->getOperand(1).getIndex();
+ return {{
+ [=](MachineInstrBuilder &MIB) { MIB.addReg(RHS); }, // vaddr
+ [=](MachineInstrBuilder &MIB) { MIB.addFrameIndex(FI); }, // saddr
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset
+ }};
+ }
+
+ if (!isSGPR(LHS))
+ return None;
+
+ return {{
+ [=](MachineInstrBuilder &MIB) { MIB.addReg(RHS); }, // vaddr
+ [=](MachineInstrBuilder &MIB) { MIB.addReg(LHS); }, // saddr
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset
+ }};
+}
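// Sketch of the decomposition above (names illustrative): given
//   %fi:_(p5)   = G_FRAME_INDEX %stack.0
//   %addr:_(p5) = G_PTR_ADD %fi, %voff:vgpr(s32)
// the renderers produce vaddr = %voff, saddr = frame index %stack.0, and
// offset = whatever legal constant was folded off the address beforehand,
// provided the swizzle-bug check above does not reject the pair.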
+
InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectMUBUFScratchOffen(MachineOperand &Root) const {
MachineInstr *MI = Root.getParent();
@@ -3856,7 +4230,7 @@ AMDGPUInstructionSelector::selectMUBUFScratchOffen(MachineOperand &Root) const {
MIB.addReg(Info->getScratchRSrcReg());
},
[=](MachineInstrBuilder &MIB) { // vaddr
- if (FI.hasValue())
+ if (FI)
MIB.addFrameIndex(FI.getValue());
else
MIB.addReg(VAddr);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
index 42095332d11a..22672ba59e76 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
@@ -97,6 +97,7 @@ private:
bool selectG_AND_OR_XOR(MachineInstr &I) const;
bool selectG_ADD_SUB(MachineInstr &I) const;
bool selectG_UADDO_USUBO_UADDE_USUBE(MachineInstr &I) const;
+ bool selectG_AMDGPU_MAD_64_32(MachineInstr &I) const;
bool selectG_EXTRACT(MachineInstr &I) const;
bool selectG_MERGE_VALUES(MachineInstr &I) const;
bool selectG_UNMERGE_VALUES(MachineInstr &I) const;
@@ -133,7 +134,6 @@ private:
void initM0(MachineInstr &I) const;
bool selectG_LOAD_STORE_ATOMICRMW(MachineInstr &I) const;
- bool selectG_AMDGPU_ATOMIC_CMPXCHG(MachineInstr &I) const;
bool selectG_SELECT(MachineInstr &I) const;
bool selectG_BRCOND(MachineInstr &I) const;
bool selectG_GLOBAL_VALUE(MachineInstr &I) const;
@@ -144,11 +144,15 @@ private:
bool selectAMDGPU_BUFFER_ATOMIC_FADD(MachineInstr &I) const;
bool selectGlobalAtomicFadd(MachineInstr &I, MachineOperand &AddrOp,
MachineOperand &DataOp) const;
+ bool selectBufferLoadLds(MachineInstr &MI) const;
+ bool selectGlobalLoadLds(MachineInstr &MI) const;
bool selectBVHIntrinsic(MachineInstr &I) const;
+ bool selectSMFMACIntrin(MachineInstr &I) const;
bool selectWaveAddress(MachineInstr &I) const;
- std::pair<Register, unsigned> selectVOP3ModsImpl(MachineOperand &Root,
- bool AllowAbs = true) const;
+ std::pair<Register, unsigned>
+ selectVOP3ModsImpl(MachineOperand &Root, bool AllowAbs = true,
+ bool OpSel = false, bool ForceVGPR = false) const;
InstructionSelector::ComplexRendererFns
selectVCSRC(MachineOperand &Root) const;
@@ -173,15 +177,30 @@ private:
selectVOP3Mods_nnan(MachineOperand &Root) const;
std::pair<Register, unsigned>
- selectVOP3PModsImpl(Register Src, const MachineRegisterInfo &MRI) const;
+ selectVOP3PModsImpl(Register Src, const MachineRegisterInfo &MRI,
+ bool IsDOT = false) const;
InstructionSelector::ComplexRendererFns
selectVOP3PMods(MachineOperand &Root) const;
InstructionSelector::ComplexRendererFns
+ selectVOP3PModsDOT(MachineOperand &Root) const;
+
+ InstructionSelector::ComplexRendererFns
+ selectDotIUVOP3PMods(MachineOperand &Root) const;
+
+ InstructionSelector::ComplexRendererFns
+ selectWMMAOpSelVOP3PMods(MachineOperand &Root) const;
+
+ InstructionSelector::ComplexRendererFns
selectVOP3OpSelMods(MachineOperand &Root) const;
InstructionSelector::ComplexRendererFns
+ selectVINTERPMods(MachineOperand &Root) const;
+ InstructionSelector::ComplexRendererFns
+ selectVINTERPModsHi(MachineOperand &Root) const;
+
+ InstructionSelector::ComplexRendererFns
selectSmrdImm(MachineOperand &Root) const;
InstructionSelector::ComplexRendererFns
selectSmrdImm32(MachineOperand &Root) const;
@@ -203,6 +222,10 @@ private:
InstructionSelector::ComplexRendererFns
selectScratchSAddr(MachineOperand &Root) const;
+ bool checkFlatScratchSVSSwizzleBug(Register VAddr, Register SAddr,
+ uint64_t ImmOffset) const;
+ InstructionSelector::ComplexRendererFns
+ selectScratchSVAddr(MachineOperand &Root) const;
InstructionSelector::ComplexRendererFns
selectMUBUFScratchOffen(MachineOperand &Root) const;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
index 7d3dbfd7e851..31012915457b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
@@ -40,7 +40,7 @@ class AMDGPUInst <dag outs, dag ins, string asm = "",
// instructions to not match without killing the whole decode process. It is
// mainly used for ARM, but Tablegen expects this field to exist or it fails
// to build the decode table.
- field bits<64> SoftFail = 0;
+ field bits<96> SoftFail = 0;
let DecoderNamespace = Namespace;
@@ -87,6 +87,17 @@ class PredConcat<list<Predicate> lst, Predicate pred> {
!listconcat([pred], !filter(item, lst, !ne(item, pred)));
}
+// Add a Register to the list if it does not already exist
+class RegAppend<list<Register> lst, Register reg> {
+ list<Register> ret =
+ !listconcat([reg], !filter(item, lst, !ne(item, reg)));
+}
+// Get the union of two Register lists
+class RegListUnion<list<Register> lstA, list<Register> lstB> {
+ list<Register> ret =
+ !foldl(lstA, lstB, temp, item, RegAppend<temp, item>.ret);
+}
+
class PredicateControl {
Predicate SubtargetPredicate = TruePredicate;
Predicate AssemblerPredicate = TruePredicate;
@@ -444,34 +455,28 @@ def load_#as : PatFrag<(ops node:$ptr), (unindexedload node:$ptr)> {
let IsNonExtLoad = 1;
}
-def extloadi8_#as : PatFrag<(ops node:$ptr), (extload node:$ptr)> {
+def extloadi8_#as : PatFrag<(ops node:$ptr), (extloadi8 node:$ptr)> {
let IsLoad = 1;
- let MemoryVT = i8;
}
-def extloadi16_#as : PatFrag<(ops node:$ptr), (extload node:$ptr)> {
+def extloadi16_#as : PatFrag<(ops node:$ptr), (extloadi16 node:$ptr)> {
let IsLoad = 1;
- let MemoryVT = i16;
}
-def sextloadi8_#as : PatFrag<(ops node:$ptr), (sextload node:$ptr)> {
+def sextloadi8_#as : PatFrag<(ops node:$ptr), (sextloadi8 node:$ptr)> {
let IsLoad = 1;
- let MemoryVT = i8;
}
-def sextloadi16_#as : PatFrag<(ops node:$ptr), (sextload node:$ptr)> {
+def sextloadi16_#as : PatFrag<(ops node:$ptr), (sextloadi16 node:$ptr)> {
let IsLoad = 1;
- let MemoryVT = i16;
}
-def zextloadi8_#as : PatFrag<(ops node:$ptr), (zextload node:$ptr)> {
+def zextloadi8_#as : PatFrag<(ops node:$ptr), (zextloadi8 node:$ptr)> {
let IsLoad = 1;
- let MemoryVT = i8;
}
-def zextloadi16_#as : PatFrag<(ops node:$ptr), (zextload node:$ptr)> {
+def zextloadi16_#as : PatFrag<(ops node:$ptr), (zextloadi16 node:$ptr)> {
let IsLoad = 1;
- let MemoryVT = i16;
}
def atomic_load_8_#as : PatFrag<(ops node:$ptr), (atomic_load_8 node:$ptr)> {
@@ -498,17 +503,15 @@ def atomic_load_64_#as : PatFrag<(ops node:$ptr), (atomic_load_64 node:$ptr)> {
foreach as = [ "global", "flat", "local", "private", "region" ] in {
-let AddressSpaces = !cast<AddressSpaceList>("StoreAddress_"#as).AddrSpaces in {
+let IsStore = 1, AddressSpaces = !cast<AddressSpaceList>("StoreAddress_"#as).AddrSpaces in {
def store_#as : PatFrag<(ops node:$val, node:$ptr),
(unindexedstore node:$val, node:$ptr)> {
- let IsStore = 1;
let IsTruncStore = 0;
}
// truncstore fragments.
def truncstore_#as : PatFrag<(ops node:$val, node:$ptr),
(unindexedstore node:$val, node:$ptr)> {
- let IsStore = 1;
let IsTruncStore = 1;
}
@@ -517,90 +520,133 @@ def truncstore_#as : PatFrag<(ops node:$val, node:$ptr),
// unnecessary check that the memory size is less than the value type
// in the generated matcher table.
def truncstorei8_#as : PatFrag<(ops node:$val, node:$ptr),
- (truncstore node:$val, node:$ptr)> {
- let IsStore = 1;
- let MemoryVT = i8;
-}
-
+ (truncstorei8 node:$val, node:$ptr)>;
def truncstorei16_#as : PatFrag<(ops node:$val, node:$ptr),
- (truncstore node:$val, node:$ptr)> {
- let IsStore = 1;
- let MemoryVT = i16;
-}
+ (truncstorei16 node:$val, node:$ptr)>;
def store_hi16_#as : StoreHi16 <truncstorei16, i16>;
def truncstorei8_hi16_#as : StoreHi16<truncstorei8, i8>;
def truncstorei16_hi16_#as : StoreHi16<truncstorei16, i16>;
-defm atomic_store_#as : binary_atomic_op<atomic_store>;
+} // End let IsStore = 1, AddressSpaces = ...
-} // End let AddressSpaces
+let IsAtomic = 1, AddressSpaces = !cast<AddressSpaceList>("StoreAddress_"#as).AddrSpaces in {
+def atomic_store_8_#as : PatFrag<(ops node:$ptr, node:$val),
+ (atomic_store_8 node:$ptr, node:$val)>;
+def atomic_store_16_#as : PatFrag<(ops node:$ptr, node:$val),
+ (atomic_store_16 node:$ptr, node:$val)>;
+def atomic_store_32_#as : PatFrag<(ops node:$ptr, node:$val),
+ (atomic_store_32 node:$ptr, node:$val)>;
+def atomic_store_64_#as : PatFrag<(ops node:$ptr, node:$val),
+ (atomic_store_64 node:$ptr, node:$val)>;
+}
} // End foreach as
+// TODO: Add GISelPredicateCode for the ret and noret PatFrags once
+// GlobalISelEmitter allows pattern matches where src and dst def count
+// mismatch.
+
+multiclass ret_noret_op {
+ let PredicateCode = [{ return !(SDValue(N, 0).use_empty()); }],
+ GISelPredicateCode = [{ return true; }] in {
+ def "_ret" : PatFrag<(ops node:$ptr, node:$data),
+ (!cast<SDPatternOperator>(NAME) node:$ptr, node:$data)>;
+ }
+
+ let PredicateCode = [{ return (SDValue(N, 0).use_empty()); }],
+ GISelPredicateCode = [{ return false; }] in {
+ def "_noret" : PatFrag<(ops node:$ptr, node:$data),
+ (!cast<SDPatternOperator>(NAME) node:$ptr, node:$data)>;
+ }
+}
+
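// For illustration, each defm below, e.g.
//   defm int_amdgcn_flat_atomic_fadd : ret_noret_op;
// expands to two PatFrags, int_amdgcn_flat_atomic_fadd_ret and
// int_amdgcn_flat_atomic_fadd_noret, wrapping the same operator and differing
// only in whether the node's result has uses (and, for GlobalISel, in the
// constant GISelPredicateCode noted in the TODO above).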
+defm int_amdgcn_flat_atomic_fadd : ret_noret_op;
+defm int_amdgcn_flat_atomic_fadd_v2bf16 : ret_noret_op;
+defm int_amdgcn_flat_atomic_fmin : ret_noret_op;
+defm int_amdgcn_flat_atomic_fmax : ret_noret_op;
+defm int_amdgcn_global_atomic_fadd : ret_noret_op;
+defm int_amdgcn_global_atomic_fadd_v2bf16 : ret_noret_op;
+defm int_amdgcn_global_atomic_fmin : ret_noret_op;
+defm int_amdgcn_global_atomic_fmax : ret_noret_op;
+defm int_amdgcn_ds_fadd_v2bf16 : ret_noret_op;
multiclass ret_noret_binary_atomic_op<SDNode atomic_op, bit IsInt = 1> {
+ let PredicateCode = [{ return (SDValue(N, 0).use_empty()); }],
+ GISelPredicateCode = [{ return false; }] in {
+ defm "_noret" : binary_atomic_op<atomic_op, IsInt>;
+ }
+
+ let PredicateCode = [{ return !(SDValue(N, 0).use_empty()); }],
+ GISelPredicateCode = [{ return true; }] in {
+ defm "_ret" : binary_atomic_op<atomic_op, IsInt>;
+ }
+}
+
+multiclass ret_noret_ternary_atomic_op<SDNode atomic_op> {
+ let PredicateCode = [{ return (SDValue(N, 0).use_empty()); }],
+ GISelPredicateCode = [{ return false; }] in {
+ defm "_noret" : ternary_atomic_op<atomic_op>;
+ }
+
+ let PredicateCode = [{ return !(SDValue(N, 0).use_empty()); }],
+ GISelPredicateCode = [{ return true; }] in {
+ defm "_ret" : ternary_atomic_op<atomic_op>;
+ }
+}
+
+multiclass binary_atomic_op_all_as<SDNode atomic_op, bit IsInt = 1> {
foreach as = [ "global", "flat", "constant", "local", "private", "region" ] in {
let AddressSpaces = !cast<AddressSpaceList>("LoadAddress_"#as).AddrSpaces in {
defm "_"#as : binary_atomic_op<atomic_op, IsInt>;
-
- let PredicateCode = [{return (SDValue(N, 0).use_empty());}] in {
- defm "_"#as#"_noret" : binary_atomic_op<atomic_op, IsInt>;
- }
-
- let PredicateCode = [{return !(SDValue(N, 0).use_empty());}] in {
- defm "_"#as#"_ret" : binary_atomic_op<atomic_op, IsInt>;
- }
+ defm "_"#as : ret_noret_binary_atomic_op<atomic_op, IsInt>;
}
}
}
-defm atomic_swap : ret_noret_binary_atomic_op<atomic_swap>;
-defm atomic_load_add : ret_noret_binary_atomic_op<atomic_load_add>;
-defm atomic_load_and : ret_noret_binary_atomic_op<atomic_load_and>;
-defm atomic_load_max : ret_noret_binary_atomic_op<atomic_load_max>;
-defm atomic_load_min : ret_noret_binary_atomic_op<atomic_load_min>;
-defm atomic_load_or : ret_noret_binary_atomic_op<atomic_load_or>;
-defm atomic_load_sub : ret_noret_binary_atomic_op<atomic_load_sub>;
-defm atomic_load_umax : ret_noret_binary_atomic_op<atomic_load_umax>;
-defm atomic_load_umin : ret_noret_binary_atomic_op<atomic_load_umin>;
-defm atomic_load_xor : ret_noret_binary_atomic_op<atomic_load_xor>;
-defm atomic_load_fadd : ret_noret_binary_atomic_op<atomic_load_fadd, 0>;
+defm atomic_swap : binary_atomic_op_all_as<atomic_swap>;
+defm atomic_load_add : binary_atomic_op_all_as<atomic_load_add>;
+defm atomic_load_and : binary_atomic_op_all_as<atomic_load_and>;
+defm atomic_load_max : binary_atomic_op_all_as<atomic_load_max>;
+defm atomic_load_min : binary_atomic_op_all_as<atomic_load_min>;
+defm atomic_load_or : binary_atomic_op_all_as<atomic_load_or>;
+defm atomic_load_sub : binary_atomic_op_all_as<atomic_load_sub>;
+defm atomic_load_umax : binary_atomic_op_all_as<atomic_load_umax>;
+defm atomic_load_umin : binary_atomic_op_all_as<atomic_load_umin>;
+defm atomic_load_xor : binary_atomic_op_all_as<atomic_load_xor>;
+defm atomic_load_fadd : binary_atomic_op_all_as<atomic_load_fadd, 0>;
let MemoryVT = v2f16 in
-defm atomic_load_fadd_v2f16 : ret_noret_binary_atomic_op<atomic_load_fadd, 0>;
-defm AMDGPUatomic_cmp_swap : ret_noret_binary_atomic_op<AMDGPUatomic_cmp_swap>;
+defm atomic_load_fadd_v2f16 : binary_atomic_op_all_as<atomic_load_fadd, 0>;
+defm AMDGPUatomic_cmp_swap : binary_atomic_op_all_as<AMDGPUatomic_cmp_swap>;
def load_align8_local : PatFrag<(ops node:$ptr), (load_local node:$ptr)>,
- Aligned<8> {
+ Aligned<8> {
let IsLoad = 1;
- let IsNonExtLoad = 1;
}
def load_align16_local : PatFrag<(ops node:$ptr), (load_local node:$ptr)>,
Aligned<16> {
let IsLoad = 1;
- let IsNonExtLoad = 1;
}
def store_align8_local: PatFrag<(ops node:$val, node:$ptr),
(store_local node:$val, node:$ptr)>, Aligned<8> {
let IsStore = 1;
- let IsTruncStore = 0;
}
def store_align16_local: PatFrag<(ops node:$val, node:$ptr),
(store_local node:$val, node:$ptr)>, Aligned<16> {
let IsStore = 1;
- let IsTruncStore = 0;
}
let AddressSpaces = StoreAddress_local.AddrSpaces in {
defm atomic_cmp_swap_local : ternary_atomic_op<atomic_cmp_swap>;
-defm atomic_cmp_swap_local_m0 : ternary_atomic_op<atomic_cmp_swap_glue>;
+defm atomic_cmp_swap_local : ret_noret_ternary_atomic_op<atomic_cmp_swap>;
+defm atomic_cmp_swap_local_m0 : ret_noret_ternary_atomic_op<atomic_cmp_swap_glue>;
}
let AddressSpaces = StoreAddress_region.AddrSpaces in {
-defm atomic_cmp_swap_region : ternary_atomic_op<atomic_cmp_swap>;
-defm atomic_cmp_swap_region_m0 : ternary_atomic_op<atomic_cmp_swap_glue>;
+defm atomic_cmp_swap_region : ret_noret_ternary_atomic_op<atomic_cmp_swap>;
+defm atomic_cmp_swap_region_m0 : ret_noret_ternary_atomic_op<atomic_cmp_swap_glue>;
}
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index 645d05aa9238..01a3e78ea48c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -26,6 +26,7 @@
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
+#include "llvm/IR/IntrinsicsR600.h"
#define DEBUG_TYPE "amdgpu-legalinfo"
@@ -134,7 +135,6 @@ static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
static LLT getBitcastRegisterType(const LLT Ty) {
const unsigned Size = Ty.getSizeInBits();
- LLT CoercedTy;
if (Size <= 32) {
// <2 x s8> -> s16
// <4 x s8> -> s32
@@ -530,13 +530,22 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) {
// Full set of gfx9 features.
- getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
+ getActionDefinitionsBuilder({G_ADD, G_SUB})
.legalFor({S32, S16, V2S16})
+ .clampMaxNumElementsStrict(0, S16, 2)
+ .scalarize(0)
.minScalar(0, S16)
+ .widenScalarToNextMultipleOf(0, 32)
+ .maxScalar(0, S32);
+
+ getActionDefinitionsBuilder(G_MUL)
+ .legalFor({S32, S16, V2S16})
.clampMaxNumElementsStrict(0, S16, 2)
+ .scalarize(0)
+ .minScalar(0, S16)
.widenScalarToNextMultipleOf(0, 32)
- .maxScalar(0, S32)
- .scalarize(0);
+ .custom();
+ assert(ST.hasMad64_32());
getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
.legalFor({S32, S16, V2S16}) // Clamp modifier
@@ -546,13 +555,21 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
.widenScalarToNextPow2(0, 32)
.lower();
} else if (ST.has16BitInsts()) {
- getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
+ getActionDefinitionsBuilder({G_ADD, G_SUB})
.legalFor({S32, S16})
.minScalar(0, S16)
.widenScalarToNextMultipleOf(0, 32)
.maxScalar(0, S32)
.scalarize(0);
+ getActionDefinitionsBuilder(G_MUL)
+ .legalFor({S32, S16})
+ .scalarize(0)
+ .minScalar(0, S16)
+ .widenScalarToNextMultipleOf(0, 32)
+ .custom();
+ assert(ST.hasMad64_32());
+
// Technically the saturating operations require clamp bit support, but this
// was introduced at the same time as 16-bit operations.
getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
@@ -569,12 +586,23 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
.scalarize(0)
.lower();
} else {
- getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
+ getActionDefinitionsBuilder({G_ADD, G_SUB})
.legalFor({S32})
.widenScalarToNextMultipleOf(0, 32)
.clampScalar(0, S32, S32)
.scalarize(0);
+ auto &Mul = getActionDefinitionsBuilder(G_MUL)
+ .legalFor({S32})
+ .scalarize(0)
+ .minScalar(0, S32)
+ .widenScalarToNextMultipleOf(0, 32);
+
+ if (ST.hasMad64_32())
+ Mul.custom();
+ else
+ Mul.maxScalar(0, S32);
+
if (ST.hasIntClamp()) {
getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
.legalFor({S32}) // Clamp modifier.
@@ -632,7 +660,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
.legalFor({{S32, S1}, {S32, S32}})
.minScalar(0, S32)
- // TODO: .scalarize(0)
+ .scalarize(0)
.lower();
getActionDefinitionsBuilder(G_BITCAST)
@@ -767,13 +795,24 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
.narrowScalarFor({{S64, S16}}, changeTo(0, S32))
.scalarize(0);
- getActionDefinitionsBuilder(G_FSUB)
+ auto &FSubActions = getActionDefinitionsBuilder(G_FSUB);
+ if (ST.has16BitInsts()) {
+ FSubActions
+ // Use actual fsub instruction
+ .legalFor({S32, S16})
+ // Must use fadd + fneg
+ .lowerFor({S64, V2S16});
+ } else {
+ FSubActions
// Use actual fsub instruction
.legalFor({S32})
// Must use fadd + fneg
- .lowerFor({S64, S16, V2S16})
- .scalarize(0)
- .clampScalar(0, S32, S64);
+ .lowerFor({S64, S16, V2S16});
+ }
+
+ FSubActions
+ .scalarize(0)
+ .clampScalar(0, S32, S64);
// Whether this is legal depends on the floating point mode for the function.
auto &FMad = getActionDefinitionsBuilder(G_FMAD);
@@ -839,6 +878,11 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
.scalarize(0)
.lower();
+ getActionDefinitionsBuilder(G_INTRINSIC_FPTRUNC_ROUND)
+ .customFor({S16, S32})
+ .scalarize(0)
+ .lower();
+
// Lower roundeven into G_FRINT
getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN})
.scalarize(0)
@@ -1292,6 +1336,8 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
if (ST.hasGFX90AInsts())
Atomic.legalFor({{S64, LocalPtr}});
+ if (ST.hasGFX940Insts())
+ Atomic.legalFor({{V2S16, LocalPtr}});
}
if (ST.hasAtomicFaddInsts())
Atomic.legalFor({{S32, GlobalPtr}});
@@ -1505,7 +1551,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
.clampMaxNumElements(1, S16, 2) // TODO: Make 4?
.clampMaxNumElements(0, S16, 64);
- // TODO: Don't fully scalarize v2s16 pieces? Or combine out thosse
+ // TODO: Don't fully scalarize v2s16 pieces? Or combine out those
// pre-legalize.
if (ST.hasVOP3PInsts()) {
getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
@@ -1756,9 +1802,13 @@ bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
return legalizeFFloor(MI, MRI, B);
case TargetOpcode::G_BUILD_VECTOR:
return legalizeBuildVector(MI, MRI, B);
+ case TargetOpcode::G_MUL:
+ return legalizeMul(Helper, MI);
case TargetOpcode::G_CTLZ:
case TargetOpcode::G_CTTZ:
return legalizeCTLZ_CTTZ(MI, MRI, B);
+ case TargetOpcode::G_INTRINSIC_FPTRUNC_ROUND:
+ return legalizeFPTruncRound(MI, B);
default:
return false;
}
@@ -1801,6 +1851,39 @@ Register AMDGPULegalizerInfo::getSegmentAperture(
return B.buildShl(S32, GetReg, ShiftAmt).getReg(0);
}
+ // TODO: can we be smarter about machine pointer info?
+ MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
+ Register LoadAddr = MRI.createGenericVirtualRegister(
+ LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
+ // For code object version 5, private_base and shared_base are passed through
+ // implicit kernargs.
+ if (AMDGPU::getAmdhsaCodeObjectVersion() == 5) {
+ AMDGPUTargetLowering::ImplicitParameter Param =
+ AS == AMDGPUAS::LOCAL_ADDRESS ? AMDGPUTargetLowering::SHARED_BASE
+ : AMDGPUTargetLowering::PRIVATE_BASE;
+ uint64_t Offset =
+ ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param);
+
+ Register KernargPtrReg = MRI.createGenericVirtualRegister(
+ LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
+
+ if (!loadInputValue(KernargPtrReg, B,
+ AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
+ return Register();
+
+ MachineMemOperand *MMO = MF.getMachineMemOperand(
+ PtrInfo,
+ MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
+ MachineMemOperand::MOInvariant,
+ LLT::scalar(32), commonAlignment(Align(64), Offset));
+
+ // Pointer address
+ B.buildPtrAdd(LoadAddr, KernargPtrReg,
+ B.buildConstant(LLT::scalar(64), Offset).getReg(0));
+ // Load address
+ return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
+ }
+
Register QueuePtr = MRI.createGenericVirtualRegister(
LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
@@ -1811,17 +1894,14 @@ Register AMDGPULegalizerInfo::getSegmentAperture(
// private_segment_aperture_base_hi.
uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
- // TODO: can we be smarter about machine pointer info?
- MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
MachineMemOperand *MMO = MF.getMachineMemOperand(
PtrInfo,
MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
MachineMemOperand::MOInvariant,
LLT::scalar(32), commonAlignment(Align(64), StructOffset));
- Register LoadAddr;
-
- B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
+ B.buildPtrAdd(LoadAddr, QueuePtr,
+ B.buildConstant(LLT::scalar(64), StructOffset).getReg(0));
return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
}
@@ -1872,31 +1952,9 @@ bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
return true;
}
- if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
- // Truncate.
- B.buildExtract(Dst, Src, 0);
- MI.eraseFromParent();
- return true;
- }
-
- if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
- const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
- uint32_t AddrHiVal = Info->get32BitAddressHighBits();
-
- // FIXME: This is a bit ugly due to creating a merge of 2 pointers to
- // another. Merge operands are required to be the same type, but creating an
- // extra ptrtoint would be kind of pointless.
- auto HighAddr = B.buildConstant(
- LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
- B.buildMerge(Dst, {Src, HighAddr});
- MI.eraseFromParent();
- return true;
- }
-
- if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
- assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
- DestAS == AMDGPUAS::PRIVATE_ADDRESS);
-
+ if (SrcAS == AMDGPUAS::FLAT_ADDRESS &&
+ (DestAS == AMDGPUAS::LOCAL_ADDRESS ||
+ DestAS == AMDGPUAS::PRIVATE_ADDRESS)) {
if (isKnownNonNull(Src, MRI, TM, SrcAS)) {
// Extract low 32-bits of the pointer.
B.buildExtract(Dst, Src, 0);
@@ -1920,37 +1978,70 @@ bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
return true;
}
- if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
- return false;
+ if (DestAS == AMDGPUAS::FLAT_ADDRESS &&
+ (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
+ SrcAS == AMDGPUAS::PRIVATE_ADDRESS)) {
+ if (!ST.hasFlatAddressSpace())
+ return false;
- if (!ST.hasFlatAddressSpace())
- return false;
+ Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
+ if (!ApertureReg.isValid())
+ return false;
- Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
- if (!ApertureReg.isValid())
- return false;
+ // Coerce the type of the low half of the result so we can use merge_values.
+ Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
+
+ // TODO: Should we allow mismatched types but matching sizes in merges to
+ // avoid the ptrtoint?
+ auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg});
+
+ if (isKnownNonNull(Src, MRI, TM, SrcAS)) {
+ B.buildCopy(Dst, BuildPtr);
+ MI.eraseFromParent();
+ return true;
+ }
+
+ auto SegmentNull = B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
+ auto FlatNull = B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
- // Coerce the type of the low half of the result so we can use merge_values.
- Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
+ auto CmpRes = B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src,
+ SegmentNull.getReg(0));
- // TODO: Should we allow mismatched types but matching sizes in merges to
- // avoid the ptrtoint?
- auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg});
+ B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
- if (isKnownNonNull(Src, MRI, TM, SrcAS)) {
- B.buildCopy(Dst, BuildPtr);
MI.eraseFromParent();
return true;
}
- auto SegmentNull = B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
- auto FlatNull = B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
+ if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
+ SrcTy.getSizeInBits() == 64) {
+ // Truncate.
+ B.buildExtract(Dst, Src, 0);
+ MI.eraseFromParent();
+ return true;
+ }
+
+ if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
+ DstTy.getSizeInBits() == 64) {
+ const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
+ uint32_t AddrHiVal = Info->get32BitAddressHighBits();
- auto CmpRes =
- B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0));
+ // FIXME: This is a bit ugly due to creating a merge of 2 pointers to
+ // another. Merge operands are required to be the same type, but creating an
+ // extra ptrtoint would be kind of pointless.
+ auto HighAddr = B.buildConstant(
+ LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
+ B.buildMerge(Dst, {Src, HighAddr});
+ MI.eraseFromParent();
+ return true;
+ }
- B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
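+  // Any cast not handled above is invalid; diagnose it and fold the result to
+  // undef so compilation can continue.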
+ DiagnosticInfoUnsupported InvalidAddrSpaceCast(
+ MF.getFunction(), "invalid addrspacecast", B.getDebugLoc());
+ LLVMContext &Ctx = MF.getFunction().getContext();
+ Ctx.diagnose(InvalidAddrSpaceCast);
+ B.buildUndef(Dst);
MI.eraseFromParent();
return true;
}
@@ -2811,6 +2902,298 @@ bool AMDGPULegalizerInfo::legalizeBuildVector(
return true;
}
+// Build a big integer multiply or multiply-add using MAD_64_32 instructions.
+//
+// Source and accumulation registers must all be 32-bits.
+//
+// TODO: When the multiply is uniform, we should produce a code sequence
+// that is better suited to instruction selection on the SALU. Instead of
+// the outer loop going over parts of the result, the outer loop should go
+// over parts of one of the factors. This should result in instruction
+// selection that makes full use of S_ADDC_U32 instructions.
+void AMDGPULegalizerInfo::buildMultiply(
+ LegalizerHelper &Helper, MutableArrayRef<Register> Accum,
+ ArrayRef<Register> Src0, ArrayRef<Register> Src1,
+ bool UsePartialMad64_32, bool SeparateOddAlignedProducts) const {
+ // Use (possibly empty) vectors of S1 registers to represent the set of
+ // carries from one pair of positions to the next.
+ using Carry = SmallVector<Register, 2>;
+
+ MachineIRBuilder &B = Helper.MIRBuilder;
+
+ const LLT S1 = LLT::scalar(1);
+ const LLT S32 = LLT::scalar(32);
+ const LLT S64 = LLT::scalar(64);
+
+ Register Zero32;
+ Register Zero64;
+
+ auto getZero32 = [&]() -> Register {
+ if (!Zero32)
+ Zero32 = B.buildConstant(S32, 0).getReg(0);
+ return Zero32;
+ };
+ auto getZero64 = [&]() -> Register {
+ if (!Zero64)
+ Zero64 = B.buildConstant(S64, 0).getReg(0);
+ return Zero64;
+ };
+
+ // Merge the given carries into the 32-bit LocalAccum, which is modified
+ // in-place.
+ //
+ // Returns the carry-out, which is a single S1 register or null.
+ auto mergeCarry =
+ [&](Register &LocalAccum, const Carry &CarryIn) -> Register {
+ if (CarryIn.empty())
+ return Register();
+
+ bool HaveCarryOut = true;
+ Register CarryAccum;
+ if (CarryIn.size() == 1) {
+ if (!LocalAccum) {
+ LocalAccum = B.buildZExt(S32, CarryIn[0]).getReg(0);
+ return Register();
+ }
+
+ CarryAccum = getZero32();
+ } else {
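+      // Several carries: fold all but the last into a running 32-bit sum; the
+      // final uadde below adds that sum, LocalAccum and the last carry.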
+ CarryAccum = B.buildZExt(S32, CarryIn[0]).getReg(0);
+ for (unsigned i = 1; i + 1 < CarryIn.size(); ++i) {
+ CarryAccum =
+ B.buildUAdde(S32, S1, CarryAccum, getZero32(), CarryIn[i])
+ .getReg(0);
+ }
+
+ if (!LocalAccum) {
+ LocalAccum = getZero32();
+ HaveCarryOut = false;
+ }
+ }
+
+ auto Add =
+ B.buildUAdde(S32, S1, CarryAccum, LocalAccum, CarryIn.back());
+ LocalAccum = Add.getReg(0);
+ return HaveCarryOut ? Add.getReg(1) : Register();
+ };
+
+ // Build a multiply-add chain to compute
+ //
+ // LocalAccum + (partial products at DstIndex)
+ // + (opportunistic subset of CarryIn)
+ //
+ // LocalAccum is an array of one or two 32-bit registers that are updated
+ // in-place. The incoming registers may be null.
+ //
+ // In some edge cases, carry-ins can be consumed "for free". In that case,
+ // the consumed carry bits are removed from CarryIn in-place.
+ auto buildMadChain =
+ [&](MutableArrayRef<Register> LocalAccum, unsigned DstIndex, Carry &CarryIn)
+ -> Carry {
+ assert((DstIndex + 1 < Accum.size() && LocalAccum.size() == 2) ||
+ (DstIndex + 1 >= Accum.size() && LocalAccum.size() == 1));
+
+ Carry CarryOut;
+ unsigned j0 = 0;
+
+ // Use plain 32-bit multiplication for the most significant part of the
+ // result by default.
+ if (LocalAccum.size() == 1 &&
+ (!UsePartialMad64_32 || !CarryIn.empty())) {
+ do {
+ unsigned j1 = DstIndex - j0;
+ auto Mul = B.buildMul(S32, Src0[j0], Src1[j1]);
+ if (!LocalAccum[0]) {
+ LocalAccum[0] = Mul.getReg(0);
+ } else {
+ if (CarryIn.empty()) {
+ LocalAccum[0] = B.buildAdd(S32, LocalAccum[0], Mul).getReg(0);
+ } else {
+ LocalAccum[0] =
+ B.buildUAdde(S32, S1, LocalAccum[0], Mul, CarryIn.back())
+ .getReg(0);
+ CarryIn.pop_back();
+ }
+ }
+ ++j0;
+ } while (j0 <= DstIndex && (!UsePartialMad64_32 || !CarryIn.empty()));
+ }
+
+ // Build full 64-bit multiplies.
+ if (j0 <= DstIndex) {
+ bool HaveSmallAccum = false;
+ Register Tmp;
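+      // Tmp is the 64-bit accumulator fed into the first MAD_64_32 below.
+      // HaveSmallAccum means it fits in 32 bits, so that MAD cannot carry out.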
+
+ if (LocalAccum[0]) {
+ if (LocalAccum.size() == 1) {
+ Tmp = B.buildAnyExt(S64, LocalAccum[0]).getReg(0);
+ HaveSmallAccum = true;
+ } else if (LocalAccum[1]) {
+ Tmp = B.buildMerge(S64, LocalAccum).getReg(0);
+ HaveSmallAccum = false;
+ } else {
+ Tmp = B.buildZExt(S64, LocalAccum[0]).getReg(0);
+ HaveSmallAccum = true;
+ }
+ } else {
+ assert(LocalAccum.size() == 1 || !LocalAccum[1]);
+ Tmp = getZero64();
+ HaveSmallAccum = true;
+ }
+
+ do {
+ unsigned j1 = DstIndex - j0;
+ auto Mad = B.buildInstr(AMDGPU::G_AMDGPU_MAD_U64_U32, {S64, S1},
+ {Src0[j0], Src1[j1], Tmp});
+ Tmp = Mad.getReg(0);
+ if (!HaveSmallAccum)
+ CarryOut.push_back(Mad.getReg(1));
+ HaveSmallAccum = false;
+ ++j0;
+ } while (j0 <= DstIndex);
+
+ auto Unmerge = B.buildUnmerge(S32, Tmp);
+ LocalAccum[0] = Unmerge.getReg(0);
+ if (LocalAccum.size() > 1)
+ LocalAccum[1] = Unmerge.getReg(1);
+ }
+
+ return CarryOut;
+ };
+
+ // Outer multiply loop, iterating over destination parts from least
+ // significant to most significant parts.
+ //
+ // The columns of the following diagram correspond to the destination parts
+ // affected by one iteration of the outer loop (ignoring boundary
+ // conditions).
+ //
+ // Dest index relative to 2 * i: 1 0 -1
+  //                                      ------
+ // Carries from previous iteration: e o
+ // Even-aligned partial product sum: E E .
+ // Odd-aligned partial product sum: O O
+ //
+ // 'o' is OddCarry, 'e' is EvenCarry.
+ // EE and OO are computed from partial products via buildMadChain and use
+ // accumulation where possible and appropriate.
+ //
+ Register SeparateOddCarry;
+ Carry EvenCarry;
+ Carry OddCarry;
+
+ for (unsigned i = 0; i <= Accum.size() / 2; ++i) {
+ Carry OddCarryIn = std::move(OddCarry);
+ Carry EvenCarryIn = std::move(EvenCarry);
+ OddCarry.clear();
+ EvenCarry.clear();
+
+ // Partial products at offset 2 * i.
+ if (2 * i < Accum.size()) {
+ auto LocalAccum = Accum.drop_front(2 * i).take_front(2);
+ EvenCarry = buildMadChain(LocalAccum, 2 * i, EvenCarryIn);
+ }
+
+ // Partial products at offset 2 * i - 1.
+ if (i > 0) {
+ if (!SeparateOddAlignedProducts) {
+ auto LocalAccum = Accum.drop_front(2 * i - 1).take_front(2);
+ OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);
+ } else {
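+        // Compute the odd-aligned chain into scratch registers and add it to
+        // the accumulator afterwards, so the MAD accumulator is free to live
+        // in an even-aligned register pair (see SeparateOddAlignedProducts).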
+ bool IsHighest = 2 * i >= Accum.size();
+ Register SeparateOddOut[2];
+ auto LocalAccum = makeMutableArrayRef(SeparateOddOut)
+ .take_front(IsHighest ? 1 : 2);
+ OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);
+
+ MachineInstr *Lo;
+
+ if (i == 1) {
+ if (!IsHighest)
+ Lo = B.buildUAddo(S32, S1, Accum[2 * i - 1], SeparateOddOut[0]);
+ else
+ Lo = B.buildAdd(S32, Accum[2 * i - 1], SeparateOddOut[0]);
+ } else {
+ Lo = B.buildUAdde(S32, S1, Accum[2 * i - 1], SeparateOddOut[0],
+ SeparateOddCarry);
+ }
+ Accum[2 * i - 1] = Lo->getOperand(0).getReg();
+
+ if (!IsHighest) {
+ auto Hi = B.buildUAdde(S32, S1, Accum[2 * i], SeparateOddOut[1],
+ Lo->getOperand(1).getReg());
+ Accum[2 * i] = Hi.getReg(0);
+ SeparateOddCarry = Hi.getReg(1);
+ }
+ }
+ }
+
+ // Add in the carries from the previous iteration
+ if (i > 0) {
+ if (Register CarryOut = mergeCarry(Accum[2 * i - 1], OddCarryIn))
+ EvenCarryIn.push_back(CarryOut);
+
+ if (2 * i < Accum.size()) {
+ if (Register CarryOut = mergeCarry(Accum[2 * i], EvenCarryIn))
+ OddCarry.push_back(CarryOut);
+ }
+ }
+ }
+}
+
+// Custom narrowing of wide multiplies using wide multiply-add instructions.
+//
+// TODO: If the multiply is followed by an addition, we should attempt to
+// integrate it to make better use of V_MAD_U64_U32's multiply-add capabilities.
+bool AMDGPULegalizerInfo::legalizeMul(LegalizerHelper &Helper,
+ MachineInstr &MI) const {
+ assert(ST.hasMad64_32());
+ assert(MI.getOpcode() == TargetOpcode::G_MUL);
+
+ MachineIRBuilder &B = Helper.MIRBuilder;
+ MachineRegisterInfo &MRI = *B.getMRI();
+
+ Register DstReg = MI.getOperand(0).getReg();
+ Register Src0 = MI.getOperand(1).getReg();
+ Register Src1 = MI.getOperand(2).getReg();
+
+ LLT Ty = MRI.getType(DstReg);
+ assert(Ty.isScalar());
+
+ unsigned Size = Ty.getSizeInBits();
+ unsigned NumParts = Size / 32;
+ assert((Size % 32) == 0);
+ assert(NumParts >= 2);
+
+ // Whether to use MAD_64_32 for partial products whose high half is
+ // discarded. This avoids some ADD instructions but risks false dependency
+ // stalls on some subtargets in some cases.
+ const bool UsePartialMad64_32 = ST.getGeneration() < AMDGPUSubtarget::GFX10;
+
+ // Whether to compute odd-aligned partial products separately. This is
+ // advisable on subtargets where the accumulator of MAD_64_32 must be placed
+ // in an even-aligned VGPR.
+ const bool SeparateOddAlignedProducts = ST.hasFullRate64Ops();
+
+ LLT S32 = LLT::scalar(32);
+ SmallVector<Register, 2> Src0Parts, Src1Parts;
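+  // Split both factors into 32-bit limbs; buildMultiply fills AccumRegs with
+  // the corresponding limbs of the product.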
+ for (unsigned i = 0; i < NumParts; ++i) {
+ Src0Parts.push_back(MRI.createGenericVirtualRegister(S32));
+ Src1Parts.push_back(MRI.createGenericVirtualRegister(S32));
+ }
+ B.buildUnmerge(Src0Parts, Src0);
+ B.buildUnmerge(Src1Parts, Src1);
+
+ SmallVector<Register, 2> AccumRegs(NumParts);
+ buildMultiply(Helper, AccumRegs, Src0Parts, Src1Parts, UsePartialMad64_32,
+ SeparateOddAlignedProducts);
+
+ B.buildMerge(DstReg, AccumRegs);
+ MI.eraseFromParent();
+ return true;
+}
+
// Legalize ctlz/cttz to ffbh/ffbl instead of the default legalization to
// ctlz/cttz_zero_undef. This allows us to fix up the result for the zero input
// case with a single min instruction instead of a compare+select.
@@ -2954,6 +3337,89 @@ bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
return true;
}
+static bool replaceWithConstant(MachineIRBuilder &B, MachineInstr &MI,
+ int64_t C) {
+ B.buildConstant(MI.getOperand(0).getReg(), C);
+ MI.eraseFromParent();
+ return true;
+}
+
+bool AMDGPULegalizerInfo::legalizeWorkitemIDIntrinsic(
+ MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
+ unsigned Dim, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
+ unsigned MaxID = ST.getMaxWorkitemID(B.getMF().getFunction(), Dim);
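+  // A dimension whose maximum workitem ID is 0 always yields 0.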
+ if (MaxID == 0)
+ return replaceWithConstant(B, MI, 0);
+
+ const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
+ const ArgDescriptor *Arg;
+ const TargetRegisterClass *ArgRC;
+ LLT ArgTy;
+ std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
+
+ Register DstReg = MI.getOperand(0).getReg();
+ if (!Arg) {
+ // It's undefined behavior if a function marked with the amdgpu-no-*
+ // attributes uses the corresponding intrinsic.
+ B.buildUndef(DstReg);
+ MI.eraseFromParent();
+ return true;
+ }
+
+ if (Arg->isMasked()) {
+ // Don't bother inserting AssertZext for packed IDs since we're emitting the
+ // masking operations anyway.
+ //
+ // TODO: We could assert the top bit is 0 for the source copy.
+ if (!loadInputValue(DstReg, B, ArgType))
+ return false;
+ } else {
+ Register TmpReg = MRI.createGenericVirtualRegister(LLT::scalar(32));
+ if (!loadInputValue(TmpReg, B, ArgType))
+ return false;
+ B.buildAssertZExt(DstReg, TmpReg, 32 - countLeadingZeros(MaxID));
+ }
+
+ MI.eraseFromParent();
+ return true;
+}
+
+Register AMDGPULegalizerInfo::getKernargParameterPtr(MachineIRBuilder &B,
+ int64_t Offset) const {
+ LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
+ Register KernArgReg = B.getMRI()->createGenericVirtualRegister(PtrTy);
+
+ // TODO: If we passed in the base kernel offset we could have a better
+ // alignment than 4, but we don't really need it.
+ if (!loadInputValue(KernArgReg, B,
+ AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
+ llvm_unreachable("failed to find kernarg segment ptr");
+
+ auto COffset = B.buildConstant(LLT::scalar(64), Offset);
+ // TODO: Should get nuw
+ return B.buildPtrAdd(PtrTy, KernArgReg, COffset).getReg(0);
+}
+
+/// Legalize a value that's loaded from kernel arguments. This is only used by
+/// legacy intrinsics.
+bool AMDGPULegalizerInfo::legalizeKernargMemParameter(MachineInstr &MI,
+ MachineIRBuilder &B,
+ uint64_t Offset,
+ Align Alignment) const {
+ Register DstReg = MI.getOperand(0).getReg();
+
+ assert(B.getMRI()->getType(DstReg) == LLT::scalar(32) &&
+ "unexpected kernarg parameter type");
+
+ Register Ptr = getKernargParameterPtr(B, Offset);
+ MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
+ B.buildLoad(DstReg, Ptr, PtrInfo, Align(4),
+ MachineMemOperand::MODereferenceable |
+ MachineMemOperand::MOInvariant);
+ MI.eraseFromParent();
+ return true;
+}
+
bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
MachineRegisterInfo &MRI,
MachineIRBuilder &B) const {
@@ -3688,9 +4154,9 @@ bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper,
// The remaining operands were used to set fields in the MemOperand on
// construction.
for (int I = 6; I > 3; --I)
- MI.RemoveOperand(I);
+ MI.removeOperand(I);
- MI.RemoveOperand(1); // Remove the intrinsic ID.
+ MI.removeOperand(1); // Remove the intrinsic ID.
Observer.changedInstr(MI);
return true;
}
@@ -4359,7 +4825,7 @@ static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
///
/// We don't want to directly select image instructions just yet, but also want
/// to expose all register repacking to the legalizer/combiners. We also don't
-/// want a selected instrution entering RegBankSelect. In order to avoid
+/// want a selected instruction entering RegBankSelect. In order to avoid
/// defining a multitude of intermediate image instructions, directly hack on
/// the intrinsic's arguments. In cases like a16 addresses, this requires
/// padding now unnecessary arguments with $noreg.
@@ -4508,6 +4974,10 @@ bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
//
// SIShrinkInstructions will convert NSA encodings to non-NSA after register
// allocation when possible.
+ //
+ // TODO: we can actually allow partial NSA where the final register is a
+ // contiguous set of the remaining addresses.
+ // This could help where there are more addresses than supported.
const bool UseNSA = ST.hasNSAEncoding() && CorrectedNumVAddrs >= 3 &&
CorrectedNumVAddrs <= ST.getNSAMaxSize();
@@ -4607,7 +5077,7 @@ bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
return false;
// TODO: Make sure the TFE operand bit is set.
- MI.RemoveOperand(1);
+ MI.removeOperand(1);
// Handle the easy case that requires no repack instructions.
if (Ty == S32) {
@@ -4737,7 +5207,7 @@ bool AMDGPULegalizerInfo::legalizeSBufferLoad(
// should be fixed to have a memory operand. Since it's readnone, we're not
// allowed to add one.
MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD));
- MI.RemoveOperand(1); // Remove intrinsic ID
+ MI.removeOperand(1); // Remove intrinsic ID
// FIXME: When intrinsic definition is fixed, this should have an MMO already.
// TODO: Should this use datalayout alignment?
@@ -4797,6 +5267,47 @@ bool AMDGPULegalizerInfo::legalizeTrapEndpgm(
bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr(
MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
+ MachineFunction &MF = B.getMF();
+ const LLT S64 = LLT::scalar(64);
+
+ Register SGPR01(AMDGPU::SGPR0_SGPR1);
+ // For code object version 5, queue_ptr is passed through implicit kernarg.
+ if (AMDGPU::getAmdhsaCodeObjectVersion() == 5) {
+ AMDGPUTargetLowering::ImplicitParameter Param =
+ AMDGPUTargetLowering::QUEUE_PTR;
+ uint64_t Offset =
+ ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param);
+
+ Register KernargPtrReg = MRI.createGenericVirtualRegister(
+ LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
+
+ if (!loadInputValue(KernargPtrReg, B,
+ AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
+ return false;
+
+ // TODO: can we be smarter about machine pointer info?
+ MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
+ MachineMemOperand *MMO = MF.getMachineMemOperand(
+ PtrInfo,
+ MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
+ MachineMemOperand::MOInvariant,
+ LLT::scalar(64), commonAlignment(Align(64), Offset));
+
+ // Pointer address
+ Register LoadAddr = MRI.createGenericVirtualRegister(
+ LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
+ B.buildPtrAdd(LoadAddr, KernargPtrReg,
+ B.buildConstant(LLT::scalar(64), Offset).getReg(0));
+ // Load address
+ Register Temp = B.buildLoad(S64, LoadAddr, *MMO).getReg(0);
+ B.buildCopy(SGPR01, Temp);
+ B.buildInstr(AMDGPU::S_TRAP)
+ .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
+ .addReg(SGPR01, RegState::Implicit);
+ MI.eraseFromParent();
+ return true;
+ }
+
// Pass queue pointer to trap handler as input, and insert trap instruction
// Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
Register LiveIn =
@@ -4804,7 +5315,6 @@ bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr(
if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
return false;
- Register SGPR01(AMDGPU::SGPR0_SGPR1);
B.buildCopy(SGPR01, LiveIn);
B.buildInstr(AMDGPU::S_TRAP)
.addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
@@ -4848,6 +5358,8 @@ bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI,
MachineRegisterInfo &MRI = *B.getMRI();
const LLT S16 = LLT::scalar(16);
const LLT S32 = LLT::scalar(32);
+ const LLT V2S16 = LLT::fixed_vector(2, 16);
+ const LLT V3S32 = LLT::fixed_vector(3, 32);
Register DstReg = MI.getOperand(0).getReg();
Register NodePtr = MI.getOperand(2).getReg();
@@ -4865,61 +5377,98 @@ bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI,
return false;
}
+ const bool IsGFX11Plus = AMDGPU::isGFX11Plus(ST);
const bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16;
const bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64;
const unsigned NumVDataDwords = 4;
const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
- const bool UseNSA =
- ST.hasNSAEncoding() && NumVAddrDwords <= ST.getNSAMaxSize();
+ const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords;
+ const bool UseNSA = ST.hasNSAEncoding() && NumVAddrs <= ST.getNSAMaxSize();
const unsigned BaseOpcodes[2][2] = {
{AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
{AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
int Opcode;
if (UseNSA) {
- Opcode =
- AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16], AMDGPU::MIMGEncGfx10NSA,
- NumVDataDwords, NumVAddrDwords);
- } else {
Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
- AMDGPU::MIMGEncGfx10Default, NumVDataDwords,
- PowerOf2Ceil(NumVAddrDwords));
+ IsGFX11Plus ? AMDGPU::MIMGEncGfx11NSA
+ : AMDGPU::MIMGEncGfx10NSA,
+ NumVDataDwords, NumVAddrDwords);
+ } else {
+ Opcode = AMDGPU::getMIMGOpcode(
+ BaseOpcodes[Is64][IsA16],
+ IsGFX11Plus ? AMDGPU::MIMGEncGfx11Default : AMDGPU::MIMGEncGfx10Default,
+ NumVDataDwords, PowerOf2Ceil(NumVAddrDwords));
}
assert(Opcode != -1);
SmallVector<Register, 12> Ops;
- if (Is64) {
- auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr);
- Ops.push_back(Unmerge.getReg(0));
- Ops.push_back(Unmerge.getReg(1));
- } else {
- Ops.push_back(NodePtr);
- }
- Ops.push_back(RayExtent);
+ if (UseNSA && IsGFX11Plus) {
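+    // On GFX11+ the NSA form takes packed operands: each xyz vector is merged
+    // into a single v3s32 register rather than three scalar registers.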
+ auto packLanes = [&Ops, &S32, &V3S32, &B](Register Src) {
+ auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src);
+ auto Merged = B.buildMerge(
+ V3S32, {Unmerge.getReg(0), Unmerge.getReg(1), Unmerge.getReg(2)});
+ Ops.push_back(Merged.getReg(0));
+ };
- auto packLanes = [&Ops, &S32, &B](Register Src) {
- auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src);
- Ops.push_back(Unmerge.getReg(0));
- Ops.push_back(Unmerge.getReg(1));
- Ops.push_back(Unmerge.getReg(2));
- };
+ Ops.push_back(NodePtr);
+ Ops.push_back(RayExtent);
+ packLanes(RayOrigin);
- packLanes(RayOrigin);
- if (IsA16) {
- auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir);
- auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir);
- Register R1 = MRI.createGenericVirtualRegister(S32);
- Register R2 = MRI.createGenericVirtualRegister(S32);
- Register R3 = MRI.createGenericVirtualRegister(S32);
- B.buildMerge(R1, {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)});
- B.buildMerge(R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)});
- B.buildMerge(R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)});
- Ops.push_back(R1);
- Ops.push_back(R2);
- Ops.push_back(R3);
+ if (IsA16) {
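+      // For 16-bit addresses, direction and inverse-direction components are
+      // interleaved into v2s16 pairs and bitcast to s32 before packing.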
+ auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir);
+ auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir);
+ auto MergedDir = B.buildMerge(
+ V3S32,
+ {B.buildBitcast(S32, B.buildMerge(V2S16, {UnmergeRayInvDir.getReg(0),
+ UnmergeRayDir.getReg(0)}))
+ .getReg(0),
+ B.buildBitcast(S32, B.buildMerge(V2S16, {UnmergeRayInvDir.getReg(1),
+ UnmergeRayDir.getReg(1)}))
+ .getReg(0),
+ B.buildBitcast(S32, B.buildMerge(V2S16, {UnmergeRayInvDir.getReg(2),
+ UnmergeRayDir.getReg(2)}))
+ .getReg(0)});
+ Ops.push_back(MergedDir.getReg(0));
+ } else {
+ packLanes(RayDir);
+ packLanes(RayInvDir);
+ }
} else {
- packLanes(RayDir);
- packLanes(RayInvDir);
+ if (Is64) {
+ auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr);
+ Ops.push_back(Unmerge.getReg(0));
+ Ops.push_back(Unmerge.getReg(1));
+ } else {
+ Ops.push_back(NodePtr);
+ }
+ Ops.push_back(RayExtent);
+
+ auto packLanes = [&Ops, &S32, &B](Register Src) {
+ auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src);
+ Ops.push_back(Unmerge.getReg(0));
+ Ops.push_back(Unmerge.getReg(1));
+ Ops.push_back(Unmerge.getReg(2));
+ };
+
+ packLanes(RayOrigin);
+ if (IsA16) {
+ auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir);
+ auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir);
+ Register R1 = MRI.createGenericVirtualRegister(S32);
+ Register R2 = MRI.createGenericVirtualRegister(S32);
+ Register R3 = MRI.createGenericVirtualRegister(S32);
+ B.buildMerge(R1, {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)});
+ B.buildMerge(R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)});
+ B.buildMerge(R3,
+ {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)});
+ Ops.push_back(R1);
+ Ops.push_back(R2);
+ Ops.push_back(R3);
+ } else {
+ packLanes(RayDir);
+ packLanes(RayInvDir);
+ }
}
if (!UseNSA) {
@@ -4946,9 +5495,24 @@ bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI,
return true;
}
-static bool replaceWithConstant(MachineIRBuilder &B, MachineInstr &MI, int64_t C) {
- B.buildConstant(MI.getOperand(0).getReg(), C);
+bool AMDGPULegalizerInfo::legalizeFPTruncRound(MachineInstr &MI,
+ MachineIRBuilder &B) const {
+ unsigned Opc;
+ int RoundMode = MI.getOperand(2).getImm();
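+  // Only rounding toward positive or negative infinity maps onto the
+  // FPTRUNC_ROUND pseudos; any other mode fails to legalize.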
+
+ if (RoundMode == (int)RoundingMode::TowardPositive)
+ Opc = AMDGPU::G_FPTRUNC_ROUND_UPWARD;
+ else if (RoundMode == (int)RoundingMode::TowardNegative)
+ Opc = AMDGPU::G_FPTRUNC_ROUND_DOWNWARD;
+ else
+ return false;
+
+ B.buildInstr(Opc)
+ .addDef(MI.getOperand(0).getReg())
+ .addUse(MI.getOperand(1).getReg());
+
MI.eraseFromParent();
+
return true;
}
@@ -5055,22 +5619,14 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
case Intrinsic::amdgcn_implicitarg_ptr:
return legalizeImplicitArgPtr(MI, MRI, B);
case Intrinsic::amdgcn_workitem_id_x:
- if (ST.getMaxWorkitemID(B.getMF().getFunction(), 0) == 0)
- return replaceWithConstant(B, MI, 0);
- return legalizePreloadedArgIntrin(MI, MRI, B,
- AMDGPUFunctionArgInfo::WORKITEM_ID_X);
+ return legalizeWorkitemIDIntrinsic(MI, MRI, B, 0,
+ AMDGPUFunctionArgInfo::WORKITEM_ID_X);
case Intrinsic::amdgcn_workitem_id_y:
- if (ST.getMaxWorkitemID(B.getMF().getFunction(), 1) == 0)
- return replaceWithConstant(B, MI, 0);
-
- return legalizePreloadedArgIntrin(MI, MRI, B,
- AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
+ return legalizeWorkitemIDIntrinsic(MI, MRI, B, 1,
+ AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
case Intrinsic::amdgcn_workitem_id_z:
- if (ST.getMaxWorkitemID(B.getMF().getFunction(), 2) == 0)
- return replaceWithConstant(B, MI, 0);
-
- return legalizePreloadedArgIntrin(MI, MRI, B,
- AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
+ return legalizeWorkitemIDIntrinsic(MI, MRI, B, 2,
+ AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
case Intrinsic::amdgcn_workgroup_id_x:
return legalizePreloadedArgIntrin(MI, MRI, B,
AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
@@ -5092,6 +5648,31 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
case Intrinsic::amdgcn_dispatch_id:
return legalizePreloadedArgIntrin(MI, MRI, B,
AMDGPUFunctionArgInfo::DISPATCH_ID);
+ case Intrinsic::r600_read_ngroups_x:
+ // TODO: Emit error for hsa
+ return legalizeKernargMemParameter(MI, B,
+ SI::KernelInputOffsets::NGROUPS_X);
+ case Intrinsic::r600_read_ngroups_y:
+ return legalizeKernargMemParameter(MI, B,
+ SI::KernelInputOffsets::NGROUPS_Y);
+ case Intrinsic::r600_read_ngroups_z:
+ return legalizeKernargMemParameter(MI, B,
+ SI::KernelInputOffsets::NGROUPS_Z);
+ case Intrinsic::r600_read_local_size_x:
+ // TODO: Could insert G_ASSERT_ZEXT from s16
+ return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::LOCAL_SIZE_X);
+ case Intrinsic::r600_read_local_size_y:
+ // TODO: Could insert G_ASSERT_ZEXT from s16
+ return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::LOCAL_SIZE_Y);
+  case Intrinsic::r600_read_local_size_z:
+    // TODO: Could insert G_ASSERT_ZEXT from s16
+    return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::LOCAL_SIZE_Z);
+ case Intrinsic::r600_read_global_size_x:
+ return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::GLOBAL_SIZE_X);
+ case Intrinsic::r600_read_global_size_y:
+ return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::GLOBAL_SIZE_Y);
+ case Intrinsic::r600_read_global_size_z:
+ return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::GLOBAL_SIZE_Z);
case Intrinsic::amdgcn_fdiv_fast:
return legalizeFDIVFastIntrin(MI, MRI, B);
case Intrinsic::amdgcn_is_shared:
@@ -5157,7 +5738,8 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
case Intrinsic::amdgcn_struct_buffer_atomic_fadd: {
Register DstReg = MI.getOperand(0).getReg();
- if (!MRI.use_empty(DstReg) && !ST.hasGFX90AInsts()) {
+ if (!MRI.use_empty(DstReg) &&
+ !AMDGPU::hasAtomicFaddRtnForTy(ST, MRI.getType(DstReg))) {
Function &F = B.getMF().getFunction();
DiagnosticInfoUnsupported NoFpRet(
F, "return versions of fp atomics not supported", B.getDebugLoc(),
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
index 964a41d3d740..cee533aa34ec 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
@@ -88,6 +88,12 @@ public:
bool legalizeBuildVector(MachineInstr &MI, MachineRegisterInfo &MRI,
MachineIRBuilder &B) const;
+
+ void buildMultiply(LegalizerHelper &Helper, MutableArrayRef<Register> Accum,
+ ArrayRef<Register> Src0, ArrayRef<Register> Src1,
+ bool UsePartialMad64_32,
+ bool SeparateOddAlignedProducts) const;
+ bool legalizeMul(LegalizerHelper &Helper, MachineInstr &MI) const;
bool legalizeCTLZ_CTTZ(MachineInstr &MI, MachineRegisterInfo &MRI,
MachineIRBuilder &B) const;
@@ -96,9 +102,18 @@ public:
const TargetRegisterClass *ArgRC, LLT ArgTy) const;
bool loadInputValue(Register DstReg, MachineIRBuilder &B,
AMDGPUFunctionArgInfo::PreloadedValue ArgType) const;
+
bool legalizePreloadedArgIntrin(
MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
AMDGPUFunctionArgInfo::PreloadedValue ArgType) const;
+ bool legalizeWorkitemIDIntrinsic(
+ MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
+ unsigned Dim, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const;
+
+ Register getKernargParameterPtr(MachineIRBuilder &B, int64_t Offset) const;
+ bool legalizeKernargMemParameter(MachineInstr &MI, MachineIRBuilder &B,
+ uint64_t Offset,
+ Align Alignment = Align(4)) const;
bool legalizeUnsignedDIV_REM(MachineInstr &MI, MachineRegisterInfo &MRI,
MachineIRBuilder &B) const;
@@ -169,6 +184,8 @@ public:
bool legalizeBVHIntrinsic(MachineInstr &MI, MachineIRBuilder &B) const;
+ bool legalizeFPTruncRound(MachineInstr &MI, MachineIRBuilder &B) const;
+
bool legalizeImageIntrinsic(
MachineInstr &MI, MachineIRBuilder &B,
GISelChangeObserver &Observer,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp b/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp
index bbbadfdfd444..78e092b2e872 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp
@@ -1593,8 +1593,9 @@ bool AMDGPULibCalls::evaluateCall(CallInst *aCI, const FuncInfo &FInfo) {
// max vector size is 16, and sincos will generate two results.
double DVal0[16], DVal1[16];
+ int FuncVecSize = getVecSize(FInfo);
bool hasTwoResults = (FInfo.getId() == AMDGPULibFunc::EI_SINCOS);
- if (getVecSize(FInfo) == 1) {
+ if (FuncVecSize == 1) {
if (!evaluateScalarMathFunc(FInfo, DVal0[0],
DVal1[0], copr0, copr1, copr2)) {
return false;
@@ -1603,7 +1604,7 @@ bool AMDGPULibCalls::evaluateCall(CallInst *aCI, const FuncInfo &FInfo) {
ConstantDataVector *CDV0 = dyn_cast_or_null<ConstantDataVector>(copr0);
ConstantDataVector *CDV1 = dyn_cast_or_null<ConstantDataVector>(copr1);
ConstantDataVector *CDV2 = dyn_cast_or_null<ConstantDataVector>(copr2);
- for (int i=0; i < getVecSize(FInfo); ++i) {
+ for (int i = 0; i < FuncVecSize; ++i) {
Constant *celt0 = CDV0 ? CDV0->getElementAsConstant(i) : nullptr;
Constant *celt1 = CDV1 ? CDV1->getElementAsConstant(i) : nullptr;
Constant *celt2 = CDV2 ? CDV2->getElementAsConstant(i) : nullptr;
@@ -1616,19 +1617,19 @@ bool AMDGPULibCalls::evaluateCall(CallInst *aCI, const FuncInfo &FInfo) {
LLVMContext &context = CI->getParent()->getParent()->getContext();
Constant *nval0, *nval1;
- if (getVecSize(FInfo) == 1) {
+ if (FuncVecSize == 1) {
nval0 = ConstantFP::get(CI->getType(), DVal0[0]);
if (hasTwoResults)
nval1 = ConstantFP::get(CI->getType(), DVal1[0]);
} else {
if (getArgType(FInfo) == AMDGPULibFunc::F32) {
SmallVector <float, 0> FVal0, FVal1;
- for (int i=0; i < getVecSize(FInfo); ++i)
+ for (int i = 0; i < FuncVecSize; ++i)
FVal0.push_back((float)DVal0[i]);
ArrayRef<float> tmp0(FVal0);
nval0 = ConstantDataVector::get(context, tmp0);
if (hasTwoResults) {
- for (int i=0; i < getVecSize(FInfo); ++i)
+ for (int i = 0; i < FuncVecSize; ++i)
FVal1.push_back((float)DVal1[i]);
ArrayRef<float> tmp1(FVal1);
nval1 = ConstantDataVector::get(context, tmp1);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULibFunc.h b/llvm/lib/Target/AMDGPU/AMDGPULibFunc.h
index dc0ac72016f3..bf0fda25b2c0 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULibFunc.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPULibFunc.h
@@ -324,8 +324,8 @@ public:
class AMDGPULibFuncImpl : public AMDGPULibFuncBase {
public:
- AMDGPULibFuncImpl() {}
- virtual ~AMDGPULibFuncImpl() {}
+ AMDGPULibFuncImpl() = default;
+ virtual ~AMDGPULibFuncImpl() = default;
/// Get unmangled name for mangled library function and name for unmangled
/// library function.
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp
index b700dd5aa301..93d1eed2cf63 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp
@@ -13,7 +13,6 @@
#include "llvm/IR/Constants.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"
#include "llvm/IR/Module.h"
#include "llvm/Support/CommandLine.h"
@@ -156,11 +155,8 @@ bool AMDGPULowerIntrinsics::runOnModule(Module &M) {
Changed = true;
break;
- case Intrinsic::amdgcn_workitem_id_x:
case Intrinsic::r600_read_tidig_x:
- case Intrinsic::amdgcn_workitem_id_y:
case Intrinsic::r600_read_tidig_y:
- case Intrinsic::amdgcn_workitem_id_z:
case Intrinsic::r600_read_tidig_z:
case Intrinsic::r600_read_local_size_x:
case Intrinsic::r600_read_local_size_y:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp
index c34c12ab9fec..2e5c35f1f571 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp
@@ -73,7 +73,7 @@ bool AMDGPULowerKernelArguments::runOnFunction(Function &F) {
const uint64_t BaseOffset = ST.getExplicitKernelArgOffset(F);
Align MaxAlign;
- // FIXME: Alignment is broken broken with explicit arg offset.;
+  // FIXME: Alignment is broken with explicit arg offset.
const uint64_t TotalKernArgSize = ST.getKernArgSegmentSize(F, MaxAlign);
if (TotalKernArgSize == 0)
return false;
@@ -92,9 +92,8 @@ bool AMDGPULowerKernelArguments::runOnFunction(Function &F) {
for (Argument &Arg : F.args()) {
const bool IsByRef = Arg.hasByRefAttr();
Type *ArgTy = IsByRef ? Arg.getParamByRefType() : Arg.getType();
- MaybeAlign ABITypeAlign = IsByRef ? Arg.getParamAlign() : None;
- if (!ABITypeAlign)
- ABITypeAlign = DL.getABITypeAlign(ArgTy);
+ MaybeAlign ParamAlign = IsByRef ? Arg.getParamAlign() : None;
+ Align ABITypeAlign = DL.getValueOrABITypeAlignment(ParamAlign, ArgTy);
uint64_t Size = DL.getTypeSizeInBits(ArgTy);
uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp
index 08a1b970648d..f5903b3afb81 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp
@@ -163,39 +163,29 @@ static bool processUse(CallInst *CI) {
if (!GroupSize || !GridSize)
continue;
+ using namespace llvm::PatternMatch;
+ auto GroupIDIntrin =
+ I == 0 ? m_Intrinsic<Intrinsic::amdgcn_workgroup_id_x>()
+ : (I == 1 ? m_Intrinsic<Intrinsic::amdgcn_workgroup_id_y>()
+ : m_Intrinsic<Intrinsic::amdgcn_workgroup_id_z>());
+
for (User *U : GroupSize->users()) {
auto *ZextGroupSize = dyn_cast<ZExtInst>(U);
if (!ZextGroupSize)
continue;
- for (User *ZextUser : ZextGroupSize->users()) {
- auto *SI = dyn_cast<SelectInst>(ZextUser);
- if (!SI)
- continue;
-
- using namespace llvm::PatternMatch;
- auto GroupIDIntrin = I == 0 ?
- m_Intrinsic<Intrinsic::amdgcn_workgroup_id_x>() :
- (I == 1 ? m_Intrinsic<Intrinsic::amdgcn_workgroup_id_y>() :
- m_Intrinsic<Intrinsic::amdgcn_workgroup_id_z>());
-
- auto SubExpr = m_Sub(m_Specific(GridSize),
- m_Mul(GroupIDIntrin, m_Specific(ZextGroupSize)));
-
- ICmpInst::Predicate Pred;
- if (match(SI,
- m_Select(m_ICmp(Pred, SubExpr, m_Specific(ZextGroupSize)),
- SubExpr,
- m_Specific(ZextGroupSize))) &&
- Pred == ICmpInst::ICMP_ULT) {
+ for (User *UMin : ZextGroupSize->users()) {
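+        // Match umin(GridSize - GroupID * GroupSize, GroupSize), the
+        // computation of the size of a possibly partial trailing workgroup.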
+ if (match(UMin,
+ m_UMin(m_Sub(m_Specific(GridSize),
+ m_Mul(GroupIDIntrin, m_Specific(ZextGroupSize))),
+ m_Specific(ZextGroupSize)))) {
if (HasReqdWorkGroupSize) {
ConstantInt *KnownSize
= mdconst::extract<ConstantInt>(MD->getOperand(I));
- SI->replaceAllUsesWith(ConstantExpr::getIntegerCast(KnownSize,
- SI->getType(),
- false));
+ UMin->replaceAllUsesWith(ConstantExpr::getIntegerCast(
+ KnownSize, UMin->getType(), false));
} else {
- SI->replaceAllUsesWith(ZextGroupSize);
+ UMin->replaceAllUsesWith(ZextGroupSize);
}
MadeChange = true;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
index 6e2b5dc471bc..35922341de26 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
@@ -14,7 +14,7 @@
// known address. AMDGPUMachineFunction allocates the LDS global.
//
// Local variables with constant annotation or non-undef initializer are passed
-// through unchanged for simplication or error diagnostics in later passes.
+// through unchanged for simplification or error diagnostics in later passes.
//
// To reduce the memory overhead variables that are only used by kernels are
// excluded from this transform. The analysis to determine whether a variable
@@ -28,8 +28,9 @@
#include "AMDGPU.h"
#include "Utils/AMDGPUBaseInfo.h"
-#include "Utils/AMDGPULDSUtils.h"
+#include "Utils/AMDGPUMemoryUtils.h"
#include "llvm/ADT/STLExtras.h"
+#include "llvm/Analysis/CallGraph.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/IRBuilder.h"
@@ -163,9 +164,10 @@ public:
}
bool runOnModule(Module &M) override {
+ CallGraph CG = CallGraph(M);
UsedList = getUsedList(M);
bool Changed = superAlignLDSGlobals(M);
- Changed |= processUsedLDS(M);
+ Changed |= processUsedLDS(CG, M);
for (Function &F : M.functions()) {
if (F.isDeclaration())
@@ -174,7 +176,7 @@ public:
// Only lower compute kernels' LDS.
if (!AMDGPU::isKernel(F.getCallingConv()))
continue;
- Changed |= processUsedLDS(M, &F);
+ Changed |= processUsedLDS(CG, M, &F);
}
UsedList.clear();
@@ -226,7 +228,7 @@ private:
return Changed;
}
- bool processUsedLDS(Module &M, Function *F = nullptr) {
+ bool processUsedLDS(CallGraph const &CG, Module &M, Function *F = nullptr) {
LLVMContext &Ctx = M.getContext();
const DataLayout &DL = M.getDataLayout();
@@ -374,7 +376,20 @@ private:
IRBuilder<> Builder(Ctx);
for (Function &Func : M.functions()) {
if (!Func.isDeclaration() && AMDGPU::isKernelCC(&Func)) {
- markUsedByKernel(Builder, &Func, SGV);
+ const CallGraphNode *N = CG[&Func];
+ const bool CalleesRequireModuleLDS = N->size() > 0;
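+          // A kernel whose call graph node has no outgoing edges calls no
+          // functions, so it cannot reach any user of the module LDS struct.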
+
+ if (CalleesRequireModuleLDS) {
+ // If a function this kernel might call requires module LDS,
+ // annotate the kernel to let later passes know it will allocate
+ // this structure, even if not apparent from the IR.
+ markUsedByKernel(Builder, &Func, SGV);
+ } else {
+ // However if we are certain this kernel cannot call a function that
+ // requires module LDS, annotate the kernel so the backend can elide
+ // the allocation without repeating callgraph walks.
+ Func.addFnAttr("amdgpu-elide-module-lds");
+ }
}
}
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
index 3fad7e192195..ed6ddbf426fd 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
@@ -120,8 +120,7 @@ void AMDGPUMCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const {
// FIXME: Should be able to handle this with emitPseudoExpansionLowering. We
// need to select it to the subtarget specific version, and there's no way to
// do that with a single pseudo source operation.
- if (Opcode == AMDGPU::S_SETPC_B64_return ||
- Opcode == AMDGPU::S_SETPC_B64_return_gfx)
+ if (Opcode == AMDGPU::S_SETPC_B64_return)
Opcode = AMDGPU::S_SETPC_B64;
else if (Opcode == AMDGPU::SI_CALL) {
// SI_CALL is just S_SWAPPC_B64 with an additional operand to track the
@@ -208,6 +207,16 @@ void AMDGPUAsmPrinter::emitInstruction(const MachineInstr *MI) {
return;
}
+ if (MI->getOpcode() == AMDGPU::SCHED_BARRIER) {
+ if (isVerbose()) {
+ std::string HexString;
+ raw_string_ostream HexStream(HexString);
+ HexStream << format_hex(MI->getOperand(0).getImm(), 10, true);
+ OutStreamer->emitRawComment(" sched_barrier mask(" + HexString + ")");
+ }
+ return;
+ }
+
if (MI->getOpcode() == AMDGPU::SI_MASKED_UNREACHABLE) {
if (isVerbose())
OutStreamer->emitRawComment(" divergent unreachable");
@@ -240,7 +249,7 @@ void AMDGPUAsmPrinter::emitInstruction(const MachineInstr *MI) {
raw_svector_ostream CodeStream(CodeBytes);
std::unique_ptr<MCCodeEmitter> InstEmitter(createSIMCCodeEmitter(
- *STI.getInstrInfo(), *OutContext.getRegisterInfo(), OutContext));
+ *STI.getInstrInfo(), OutContext));
InstEmitter->encodeInstruction(TmpInst, CodeStream, Fixups, STI);
assert(CodeBytes.size() == STI.getInstrInfo()->getInstSizeInBytes(*MI));
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.h b/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.h
index 0e43b4fe9461..5c656f158e71 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.h
@@ -1,4 +1,4 @@
-//===- AMDGPUMCInstLower.h - Lower AMDGPU MachineInstr to an MCInst -------===//
+//===- AMDGPUMCInstLower.h - Lower MachineInstr to MCInst ------*- C++ -*--===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMIRFormatter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMIRFormatter.cpp
index c3441f81a78e..0712466a0e88 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMIRFormatter.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMIRFormatter.cpp
@@ -21,17 +21,18 @@ bool AMDGPUMIRFormatter::parseCustomPseudoSourceValue(
StringRef Src, MachineFunction &MF, PerFunctionMIParsingState &PFS,
const PseudoSourceValue *&PSV, ErrorCallbackType ErrorCallback) const {
SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
- const SIInstrInfo &TII = *MF.getSubtarget<GCNSubtarget>().getInstrInfo();
+ const AMDGPUTargetMachine &TM =
+ static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
if (Src == "BufferResource") {
- PSV = MFI->getBufferPSV(TII);
+ PSV = MFI->getBufferPSV(TM);
return false;
}
if (Src == "ImageResource") {
- PSV = MFI->getImagePSV(TII);
+ PSV = MFI->getImagePSV(TM);
return false;
}
if (Src == "GWSResource") {
- PSV = MFI->getGWSPSV(TII);
+ PSV = MFI->getGWSPSV(TM);
return false;
}
llvm_unreachable("unknown MIR custom pseudo source value");
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMIRFormatter.h b/llvm/lib/Target/AMDGPU/AMDGPUMIRFormatter.h
index 47faa6c72481..753f7edc9385 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMIRFormatter.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMIRFormatter.h
@@ -25,7 +25,7 @@ struct PerFunctionMIParsingState;
class AMDGPUMIRFormatter final : public MIRFormatter {
public:
- AMDGPUMIRFormatter() {}
+ AMDGPUMIRFormatter() = default;
virtual ~AMDGPUMIRFormatter() = default;
/// Implement target specific parsing of target custom pseudo source value.
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp
index 4e2f98d2a5db..d837f8cb2f60 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp
@@ -1295,7 +1295,7 @@ static void fixRegionTerminator(RegionMRT *Region) {
}
}
-// If a region region is just a sequence of regions (and the exit
+// If a region is just a sequence of regions (and the exit
// block in the case of the top level region), we can simply skip
// linearizing it, because it is already linear
bool regionIsSequence(RegionMRT *Region) {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
index 593388a4d819..b461c3c4bfdc 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
@@ -7,6 +7,7 @@
//===----------------------------------------------------------------------===//
#include "AMDGPUMachineFunction.h"
+#include "AMDGPU.h"
#include "AMDGPUPerfHintAnalysis.h"
#include "AMDGPUSubtarget.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
@@ -32,6 +33,15 @@ AMDGPUMachineFunction::AMDGPUMachineFunction(const MachineFunction &MF)
Attribute WaveLimitAttr = F.getFnAttribute("amdgpu-wave-limiter");
WaveLimiter = WaveLimitAttr.getValueAsBool();
+ // FIXME: How is this attribute supposed to interact with statically known
+ // global sizes?
+ StringRef S = F.getFnAttribute("amdgpu-gds-size").getValueAsString();
+ if (!S.empty())
+ S.consumeInteger(0, GDSSize);
+
+ // Assume the attribute allocates before any known GDS globals.
+ StaticGDSSize = GDSSize;
+
CallingConv::ID CC = F.getCallingConv();
if (CC == CallingConv::AMDGPU_KERNEL || CC == CallingConv::SPIR_KERNEL)
ExplicitKernArgSize = ST.getExplicitKernArgSize(F, MaxKernArgAlign);
@@ -46,25 +56,43 @@ unsigned AMDGPUMachineFunction::allocateLDSGlobal(const DataLayout &DL,
Align Alignment =
DL.getValueOrABITypeAlignment(GV.getAlign(), GV.getValueType());
- /// TODO: We should sort these to minimize wasted space due to alignment
- /// padding. Currently the padding is decided by the first encountered use
- /// during lowering.
- unsigned Offset = StaticLDSSize = alignTo(StaticLDSSize, Alignment);
+ unsigned Offset;
+ if (GV.getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
+ /// TODO: We should sort these to minimize wasted space due to alignment
+ /// padding. Currently the padding is decided by the first encountered use
+ /// during lowering.
+ Offset = StaticLDSSize = alignTo(StaticLDSSize, Alignment);
- Entry.first->second = Offset;
- StaticLDSSize += DL.getTypeAllocSize(GV.getValueType());
+ StaticLDSSize += DL.getTypeAllocSize(GV.getValueType());
- // Update the LDS size considering the padding to align the dynamic shared
- // memory.
- LDSSize = alignTo(StaticLDSSize, DynLDSAlign);
+ // Update the LDS size considering the padding to align the dynamic shared
+ // memory.
+ LDSSize = alignTo(StaticLDSSize, DynLDSAlign);
+ } else {
+ assert(GV.getAddressSpace() == AMDGPUAS::REGION_ADDRESS &&
+ "expected region address space");
+ Offset = StaticGDSSize = alignTo(StaticGDSSize, Alignment);
+ StaticGDSSize += DL.getTypeAllocSize(GV.getValueType());
+
+ // FIXME: Apply alignment of dynamic GDS
+ GDSSize = StaticGDSSize;
+ }
+
+ Entry.first->second = Offset;
return Offset;
}
-void AMDGPUMachineFunction::allocateModuleLDSGlobal(const Module *M) {
+// This kernel calls no functions that require the module lds struct
+static bool canElideModuleLDS(const Function &F) {
+ return F.hasFnAttribute("amdgpu-elide-module-lds");
+}
+
+void AMDGPUMachineFunction::allocateModuleLDSGlobal(const Function &F) {
+ const Module *M = F.getParent();
if (isModuleEntryFunction()) {
const GlobalVariable *GV = M->getNamedGlobal("llvm.amdgcn.module.lds");
- if (GV) {
+ if (GV && !canElideModuleLDS(F)) {
unsigned Offset = allocateLDSGlobal(M->getDataLayout(), *GV);
(void)Offset;
assert(Offset == 0 &&
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h
index 48cf46b5f871..df62c2314617 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h
@@ -12,6 +12,10 @@
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/Function.h"
namespace llvm {
@@ -25,11 +29,13 @@ protected:
Align MaxKernArgAlign; // Cache for this.
/// Number of bytes in the LDS that are being used.
- unsigned LDSSize = 0;
+ uint32_t LDSSize = 0;
+ uint32_t GDSSize = 0;
/// Number of bytes in the LDS allocated statically. This field is only used
/// in the instruction selector and not part of the machine function info.
- unsigned StaticLDSSize = 0;
+ uint32_t StaticLDSSize = 0;
+ uint32_t StaticGDSSize = 0;
/// Align for dynamic shared memory if any. Dynamic shared memory is
/// allocated directly after the static one, i.e., LDSSize. Need to pad
@@ -63,12 +69,16 @@ public:
return ExplicitKernArgSize;
}
- unsigned getMaxKernArgAlign() const { return MaxKernArgAlign.value(); }
+ Align getMaxKernArgAlign() const { return MaxKernArgAlign; }
- unsigned getLDSSize() const {
+ uint32_t getLDSSize() const {
return LDSSize;
}
+ uint32_t getGDSSize() const {
+ return GDSSize;
+ }
+
AMDGPU::SIModeRegisterDefaults getMode() const {
return Mode;
}
@@ -92,7 +102,7 @@ public:
}
unsigned allocateLDSGlobal(const DataLayout &DL, const GlobalVariable &GV);
- void allocateModuleLDSGlobal(const Module *M);
+ void allocateModuleLDSGlobal(const Function &F);
Align getDynLDSAlign() const { return DynLDSAlign; }
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMachineModuleInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMachineModuleInfo.cpp
index 6646cce8186b..2d48be9ea542 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMachineModuleInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMachineModuleInfo.cpp
@@ -13,6 +13,7 @@
//===----------------------------------------------------------------------===//
#include "AMDGPUMachineModuleInfo.h"
+#include "llvm/MC/MCSymbol.h"
namespace llvm {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUOpenCLEnqueuedBlockLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUOpenCLEnqueuedBlockLowering.cpp
index 5a5a5d213a1a..fb7709d66c76 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUOpenCLEnqueuedBlockLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUOpenCLEnqueuedBlockLowering.cpp
@@ -34,6 +34,7 @@
#include "AMDGPU.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/SmallString.h"
+#include "llvm/IR/Constants.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Mangler.h"
#include "llvm/IR/Module.h"
@@ -71,7 +72,7 @@ ModulePass* llvm::createAMDGPUOpenCLEnqueuedBlockLoweringPass() {
return new AMDGPUOpenCLEnqueuedBlockLowering();
}
-/// Collect direct or indrect callers of \p F and save them
+/// Collect direct or indirect callers of \p F and save them
/// to \p Callers.
static void collectCallers(Function *F, DenseSet<Function *> &Callers) {
for (auto U : F->users()) {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp
index 8ad344816ad2..09dbd2150db6 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp
@@ -116,7 +116,6 @@ private:
bool isGlobalAddr(const Value *V) const;
bool isLocalAddr(const Value *V) const;
- bool isConstantAddr(const Value *V) const;
};
static std::pair<const Value *, const Type *> getMemoryInstrPtrAndType(
@@ -153,7 +152,7 @@ bool AMDGPUPerfHint::isIndirectAccess(const Instruction *Inst) const {
if (auto LD = dyn_cast<LoadInst>(V)) {
auto M = LD->getPointerOperand();
- if (isGlobalAddr(M) || isLocalAddr(M) || isConstantAddr(M)) {
+ if (isGlobalAddr(M)) {
LLVM_DEBUG(dbgs() << " is IA\n");
return true;
}
@@ -267,19 +266,23 @@ bool AMDGPUPerfHint::runOnFunction(Function &F) {
<< " LSMInst cost: " << Info->LSMInstCost << '\n'
<< " TotalInst cost: " << Info->InstCost << '\n');
+ bool Changed = false;
+
if (isMemBound(*Info)) {
LLVM_DEBUG(dbgs() << F.getName() << " is memory bound\n");
NumMemBound++;
F.addFnAttr("amdgpu-memory-bound", "true");
+ Changed = true;
}
if (AMDGPU::isEntryFunctionCC(F.getCallingConv()) && needLimitWave(*Info)) {
LLVM_DEBUG(dbgs() << F.getName() << " needs limit wave\n");
NumLimitWave++;
F.addFnAttr("amdgpu-wave-limiter", "true");
+ Changed = true;
}
- return true;
+ return Changed;
}
bool AMDGPUPerfHint::isMemBound(const AMDGPUPerfHintAnalysis::FuncInfo &FI) {
@@ -332,15 +335,6 @@ AMDGPUPerfHint::makeMemAccessInfo(Instruction *Inst) const {
return MAI;
}
-bool AMDGPUPerfHint::isConstantAddr(const Value *V) const {
- if (auto PT = dyn_cast<PointerType>(V->getType())) {
- unsigned As = PT->getAddressSpace();
- return As == AMDGPUAS::CONSTANT_ADDRESS ||
- As == AMDGPUAS::CONSTANT_ADDRESS_32BIT;
- }
- return false;
-}
-
bool AMDGPUPerfHint::MemAccessInfo::isLargeStride(
MemAccessInfo &Reference) const {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp
index c029046ab65f..bfe2e9b66ed4 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp
@@ -16,6 +16,7 @@
#include "AMDGPULegalizerInfo.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "llvm/CodeGen/GlobalISel/CSEInfo.h"
#include "llvm/CodeGen/GlobalISel/Combiner.h"
#include "llvm/CodeGen/GlobalISel/CombinerHelper.h"
#include "llvm/CodeGen/GlobalISel/CombinerInfo.h"
@@ -125,7 +126,6 @@ void AMDGPUPreLegalizerCombinerHelper::applyClampI64ToI16(
LLT::scalar(64));
const LLT S32 = LLT::scalar(32);
- B.setMBB(*MI.getParent());
B.setInstrAndDebugLoc(MI);
auto Unmerge = B.buildUnmerge(S32, Src);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp
index f91f31508ad2..1db7c18e4598 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp
@@ -19,6 +19,7 @@
//===----------------------------------------------------------------------===//
#include "AMDGPU.h"
+#include "llvm/ADT/Triple.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/IR/Dominators.h"
@@ -66,7 +67,7 @@ private:
Value *simplify(Instruction *I, const TargetLibraryInfo *TLI,
const DominatorTree *DT) {
- return SimplifyInstruction(I, {*TD, TLI, DT});
+ return simplifyInstruction(I, {*TD, TLI, DT});
}
const DataLayout *TD;
@@ -562,15 +563,6 @@ bool AMDGPUPrintfRuntimeBindingImpl::run(Module &M) {
if (Printfs.empty())
return false;
- if (auto HostcallFunction = M.getFunction("__ockl_hostcall_internal")) {
- for (auto &U : HostcallFunction->uses()) {
- if (auto *CI = dyn_cast<CallInst>(U.getUser())) {
- M.getContext().emitError(
- CI, "Cannot use both printf and hostcall in the same module");
- }
- }
- }
-
TD = &M.getDataLayout();
return lowerPrintfForGpu(M);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
index 99b7ffb33884..5a4426ba8113 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -334,86 +334,49 @@ static FixedVectorType *arrayTypeToVecType(ArrayType *ArrayTy) {
ArrayTy->getNumElements());
}
-static Value *stripBitcasts(Value *V) {
- while (Instruction *I = dyn_cast<Instruction>(V)) {
- if (I->getOpcode() != Instruction::BitCast)
- break;
- V = I->getOperand(0);
- }
- return V;
-}
-
static Value *
calculateVectorIndex(Value *Ptr,
const std::map<GetElementPtrInst *, Value *> &GEPIdx) {
- GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(stripBitcasts(Ptr));
+ auto *GEP = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts());
if (!GEP)
- return nullptr;
+ return ConstantInt::getNullValue(Type::getInt32Ty(Ptr->getContext()));
auto I = GEPIdx.find(GEP);
- return I == GEPIdx.end() ? nullptr : I->second;
+ assert(I != GEPIdx.end() && "Must have entry for GEP!");
+ return I->second;
}
-static Value* GEPToVectorIndex(GetElementPtrInst *GEP) {
- // FIXME we only support simple cases
- if (GEP->getNumOperands() != 3)
+static Value *GEPToVectorIndex(GetElementPtrInst *GEP, AllocaInst *Alloca,
+ Type *VecElemTy, const DataLayout &DL) {
+ // TODO: Extracting a "multiple of X" from a GEP might be a useful generic
+ // helper.
+ unsigned BW = DL.getIndexTypeSizeInBits(GEP->getType());
+ MapVector<Value *, APInt> VarOffsets;
+ APInt ConstOffset(BW, 0);
+ if (GEP->getPointerOperand()->stripPointerCasts() != Alloca ||
+ !GEP->collectOffset(DL, BW, VarOffsets, ConstOffset))
return nullptr;
- ConstantInt *I0 = dyn_cast<ConstantInt>(GEP->getOperand(1));
- if (!I0 || !I0->isZero())
+ unsigned VecElemSize = DL.getTypeAllocSize(VecElemTy);
+ if (VarOffsets.size() > 1)
return nullptr;
- return GEP->getOperand(2);
-}
-
-// Not an instruction handled below to turn into a vector.
-//
-// TODO: Check isTriviallyVectorizable for calls and handle other
-// instructions.
-static bool canVectorizeInst(Instruction *Inst, User *User,
- const DataLayout &DL) {
- switch (Inst->getOpcode()) {
- case Instruction::Load: {
- // Currently only handle the case where the Pointer Operand is a GEP.
- // Also we could not vectorize volatile or atomic loads.
- LoadInst *LI = cast<LoadInst>(Inst);
- if (isa<AllocaInst>(User) &&
- LI->getPointerOperandType() == User->getType() &&
- isa<VectorType>(LI->getType()))
- return true;
-
- Instruction *PtrInst = dyn_cast<Instruction>(LI->getPointerOperand());
- if (!PtrInst)
- return false;
-
- return (PtrInst->getOpcode() == Instruction::GetElementPtr ||
- PtrInst->getOpcode() == Instruction::BitCast) &&
- LI->isSimple();
+ if (VarOffsets.size() == 1) {
+ // Only handle cases where we don't need to insert extra arithmetic
+ // instructions.
+ const auto &VarOffset = VarOffsets.front();
+ if (!ConstOffset.isZero() || VarOffset.second != VecElemSize)
+ return nullptr;
+ return VarOffset.first;
}
- case Instruction::BitCast:
- return true;
- case Instruction::Store: {
- // Must be the stored pointer operand, not a stored value, plus
- // since it should be canonical form, the User should be a GEP.
- // Also we could not vectorize volatile or atomic stores.
- StoreInst *SI = cast<StoreInst>(Inst);
- if (isa<AllocaInst>(User) &&
- SI->getPointerOperandType() == User->getType() &&
- isa<VectorType>(SI->getValueOperand()->getType()))
- return true;
- Instruction *UserInst = dyn_cast<Instruction>(User);
- if (!UserInst)
- return false;
+ APInt Quot;
+ uint64_t Rem;
+ APInt::udivrem(ConstOffset, VecElemSize, Quot, Rem);
+ if (Rem != 0)
+ return nullptr;
- return (SI->getPointerOperand() == User) &&
- (UserInst->getOpcode() == Instruction::GetElementPtr ||
- UserInst->getOpcode() == Instruction::BitCast) &&
- SI->isSimple();
- }
- default:
- return false;
- }
+ return ConstantInt::get(GEP->getContext(), Quot);
}
static bool tryPromoteAllocaToVector(AllocaInst *Alloca, const DataLayout &DL,
@@ -455,73 +418,87 @@ static bool tryPromoteAllocaToVector(AllocaInst *Alloca, const DataLayout &DL,
}
std::map<GetElementPtrInst*, Value*> GEPVectorIdx;
- std::vector<Value *> WorkList;
- SmallVector<User *, 8> Users(Alloca->users());
- SmallVector<User *, 8> UseUsers(Users.size(), Alloca);
+ SmallVector<Instruction *> WorkList;
+ SmallVector<Use *, 8> Uses;
+ for (Use &U : Alloca->uses())
+ Uses.push_back(&U);
+
Type *VecEltTy = VectorTy->getElementType();
- while (!Users.empty()) {
- User *AllocaUser = Users.pop_back_val();
- User *UseUser = UseUsers.pop_back_val();
- Instruction *Inst = dyn_cast<Instruction>(AllocaUser);
+ while (!Uses.empty()) {
+ Use *U = Uses.pop_back_val();
+ Instruction *Inst = dyn_cast<Instruction>(U->getUser());
- GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(AllocaUser);
- if (!GEP) {
- if (!canVectorizeInst(Inst, UseUser, DL))
+ if (Value *Ptr = getLoadStorePointerOperand(Inst)) {
+ // This is a store of the pointer, not to the pointer.
+ if (isa<StoreInst>(Inst) &&
+ U->getOperandNo() != StoreInst::getPointerOperandIndex())
return false;
- if (Inst->getOpcode() == Instruction::BitCast) {
- Type *FromTy = Inst->getOperand(0)->getType()->getPointerElementType();
- Type *ToTy = Inst->getType()->getPointerElementType();
- if (FromTy->isAggregateType() || ToTy->isAggregateType() ||
- DL.getTypeSizeInBits(FromTy) != DL.getTypeSizeInBits(ToTy))
- continue;
-
- for (User *CastUser : Inst->users()) {
- if (isAssumeLikeIntrinsic(cast<Instruction>(CastUser)))
- continue;
- Users.push_back(CastUser);
- UseUsers.push_back(Inst);
- }
+ Type *AccessTy = getLoadStoreType(Inst);
+ Ptr = Ptr->stripPointerCasts();
+ // Alloca already accessed as vector, leave alone.
+ if (Ptr == Alloca && DL.getTypeStoreSize(Alloca->getAllocatedType()) ==
+ DL.getTypeStoreSize(AccessTy))
continue;
- }
- WorkList.push_back(AllocaUser);
+ // Check that this is a simple access of a vector element.
+ bool IsSimple = isa<LoadInst>(Inst) ? cast<LoadInst>(Inst)->isSimple()
+ : cast<StoreInst>(Inst)->isSimple();
+ if (!IsSimple ||
+ !CastInst::isBitOrNoopPointerCastable(VecEltTy, AccessTy, DL))
+ return false;
+
+ WorkList.push_back(Inst);
continue;
}
- Value *Index = GEPToVectorIndex(GEP);
+ if (isa<BitCastInst>(Inst)) {
+ // Look through bitcasts.
+ for (Use &U : Inst->uses())
+ Uses.push_back(&U);
+ continue;
+ }
- // If we can't compute a vector index from this GEP, then we can't
- // promote this alloca to vector.
- if (!Index) {
- LLVM_DEBUG(dbgs() << " Cannot compute vector index for GEP " << *GEP
- << '\n');
- return false;
+ if (auto *GEP = dyn_cast<GetElementPtrInst>(Inst)) {
+ // If we can't compute a vector index from this GEP, then we can't
+ // promote this alloca to vector.
+ Value *Index = GEPToVectorIndex(GEP, Alloca, VecEltTy, DL);
+ if (!Index) {
+ LLVM_DEBUG(dbgs() << " Cannot compute vector index for GEP " << *GEP
+ << '\n');
+ return false;
+ }
+
+ GEPVectorIdx[GEP] = Index;
+ for (Use &U : Inst->uses())
+ Uses.push_back(&U);
+ continue;
}
- GEPVectorIdx[GEP] = Index;
- Users.append(GEP->user_begin(), GEP->user_end());
- UseUsers.append(GEP->getNumUses(), GEP);
+ // Ignore assume-like intrinsics and comparisons used in assumes.
+ if (isAssumeLikeIntrinsic(Inst))
+ continue;
+
+ if (isa<ICmpInst>(Inst) && all_of(Inst->users(), [](User *U) {
+ return isAssumeLikeIntrinsic(cast<Instruction>(U));
+ }))
+ continue;
+
+ // Unknown user.
+ return false;
}
LLVM_DEBUG(dbgs() << " Converting alloca to vector " << *AllocaTy << " -> "
<< *VectorTy << '\n');
- for (Value *V : WorkList) {
- Instruction *Inst = cast<Instruction>(V);
+ for (Instruction *Inst : WorkList) {
IRBuilder<> Builder(Inst);
switch (Inst->getOpcode()) {
case Instruction::Load: {
- if (Inst->getType() == AllocaTy || Inst->getType()->isVectorTy())
- break;
-
Value *Ptr = cast<LoadInst>(Inst)->getPointerOperand();
Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx);
- if (!Index)
- break;
-
- Type *VecPtrTy = VectorTy->getPointerTo(AMDGPUAS::PRIVATE_ADDRESS);
+ Type *VecPtrTy = VectorTy->getPointerTo(Alloca->getAddressSpace());
Value *BitCast = Builder.CreateBitCast(Alloca, VecPtrTy);
Value *VecValue = Builder.CreateLoad(VectorTy, BitCast);
Value *ExtractElement = Builder.CreateExtractElement(VecValue, Index);
@@ -533,16 +510,9 @@ static bool tryPromoteAllocaToVector(AllocaInst *Alloca, const DataLayout &DL,
}
case Instruction::Store: {
StoreInst *SI = cast<StoreInst>(Inst);
- if (SI->getValueOperand()->getType() == AllocaTy ||
- SI->getValueOperand()->getType()->isVectorTy())
- break;
-
Value *Ptr = SI->getPointerOperand();
Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx);
- if (!Index)
- break;
-
- Type *VecPtrTy = VectorTy->getPointerTo(AMDGPUAS::PRIVATE_ADDRESS);
+ Type *VecPtrTy = VectorTy->getPointerTo(Alloca->getAddressSpace());
Value *BitCast = Builder.CreateBitCast(Alloca, VecPtrTy);
Value *VecValue = Builder.CreateLoad(VectorTy, BitCast);
Value *Elt = SI->getValueOperand();
@@ -808,10 +778,7 @@ bool AMDGPUPromoteAllocaImpl::hasSufficientLocalMem(const Function &F) {
//
// FIXME: We should really do something to fix the addresses to a more optimal
// value instead
- llvm::sort(AllocatedSizes, [](std::pair<uint64_t, Align> LHS,
- std::pair<uint64_t, Align> RHS) {
- return LHS.second < RHS.second;
- });
+ llvm::sort(AllocatedSizes, llvm::less_second());
// Check how much local memory is being used by global objects
CurrentLocalMemUsage = 0;
@@ -917,7 +884,7 @@ bool AMDGPUPromoteAllocaImpl::handleAlloca(AllocaInst &I, bool SufficientLDS) {
// usage order.
//
// FIXME: It is also possible that if we're allowed to use all of the memory
- // could could end up using more than the maximum due to alignment padding.
+ // could end up using more than the maximum due to alignment padding.
uint32_t NewSize = alignTo(CurrentLocalMemUsage, Alignment);
uint32_t AllocSize = WorkGroupSize * DL.getTypeAllocSize(AllocaTy);
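Note on the GEPToVectorIndex rewrite above: the vector lane is now derived from the GEP's byte offset rather than from its literal operand list. A constant offset maps to a lane only when it is an exact multiple of the element size, and a single variable offset is accepted only when it is already scaled by the element size. A minimal standalone sketch of the constant-offset arithmetic (illustrative only, not part of the patch; the names are hypothetical):

#include <cassert>
#include <cstdint>

// Maps a constant GEP byte offset onto a vector lane for an alloca promoted
// to a vector whose elements are VecElemSize bytes wide. Returns -1 when the
// offset is not element-aligned, in which case promotion is rejected.
static int64_t offsetToLane(uint64_t OffsetBytes, uint64_t VecElemSize) {
  if (OffsetBytes % VecElemSize != 0)
    return -1;
  return static_cast<int64_t>(OffsetBytes / VecElemSize);
}

int main() {
  assert(offsetToLane(8, 4) == 2);  // byte 8 of a <4 x float> is element 2
  assert(offsetToLane(6, 4) == -1); // misaligned offset blocks promotion
  return 0;
}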
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteKernelArguments.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteKernelArguments.cpp
index 01d03d17ec47..ed450f59e4b3 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteKernelArguments.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteKernelArguments.cpp
@@ -16,7 +16,9 @@
//===----------------------------------------------------------------------===//
#include "AMDGPU.h"
+#include "Utils/AMDGPUMemoryUtils.h"
#include "llvm/ADT/SmallVector.h"
+#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/MemorySSA.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/InitializePasses.h"
@@ -30,6 +32,8 @@ namespace {
class AMDGPUPromoteKernelArguments : public FunctionPass {
MemorySSA *MSSA;
+ AliasAnalysis *AA;
+
Instruction *ArgCastInsertPt;
SmallVector<Value *> Ptrs;
@@ -38,16 +42,19 @@ class AMDGPUPromoteKernelArguments : public FunctionPass {
bool promotePointer(Value *Ptr);
+ bool promoteLoad(LoadInst *LI);
+
public:
static char ID;
AMDGPUPromoteKernelArguments() : FunctionPass(ID) {}
- bool run(Function &F, MemorySSA &MSSA);
+ bool run(Function &F, MemorySSA &MSSA, AliasAnalysis &AA);
bool runOnFunction(Function &F) override;
void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<AAResultsWrapperPass>();
AU.addRequired<MemorySSAWrapperPass>();
AU.setPreservesAll();
}
@@ -68,17 +75,10 @@ void AMDGPUPromoteKernelArguments::enqueueUsers(Value *Ptr) {
break;
case Instruction::Load: {
LoadInst *LD = cast<LoadInst>(U);
- PointerType *PT = dyn_cast<PointerType>(LD->getType());
- if (!PT ||
- (PT->getAddressSpace() != AMDGPUAS::FLAT_ADDRESS &&
- PT->getAddressSpace() != AMDGPUAS::GLOBAL_ADDRESS &&
- PT->getAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS) ||
- LD->getPointerOperand()->stripInBoundsOffsets() != Ptr)
- break;
- const MemoryAccess *MA = MSSA->getWalker()->getClobberingMemoryAccess(LD);
- // TODO: This load poprobably can be promoted to constant address space.
- if (MSSA->isLiveOnEntryDef(MA))
+ if (LD->getPointerOperand()->stripInBoundsOffsets() == Ptr &&
+ !AMDGPU::isClobberedInFunction(LD, MSSA, AA))
Ptrs.push_back(LD);
+
break;
}
case Instruction::GetElementPtr:
@@ -92,15 +92,26 @@ void AMDGPUPromoteKernelArguments::enqueueUsers(Value *Ptr) {
}
bool AMDGPUPromoteKernelArguments::promotePointer(Value *Ptr) {
- enqueueUsers(Ptr);
+ bool Changed = false;
+
+ LoadInst *LI = dyn_cast<LoadInst>(Ptr);
+ if (LI)
+ Changed |= promoteLoad(LI);
+
+ PointerType *PT = dyn_cast<PointerType>(Ptr->getType());
+ if (!PT)
+ return Changed;
+
+ if (PT->getAddressSpace() == AMDGPUAS::FLAT_ADDRESS ||
+ PT->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS ||
+ PT->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS)
+ enqueueUsers(Ptr);
- PointerType *PT = cast<PointerType>(Ptr->getType());
if (PT->getAddressSpace() != AMDGPUAS::FLAT_ADDRESS)
- return false;
+ return Changed;
- bool IsArg = isa<Argument>(Ptr);
- IRBuilder<> B(IsArg ? ArgCastInsertPt
- : &*std::next(cast<Instruction>(Ptr)->getIterator()));
+ IRBuilder<> B(LI ? &*std::next(cast<Instruction>(Ptr)->getIterator())
+ : ArgCastInsertPt);
// Cast pointer to global address space and back to flat and let
// Infer Address Spaces pass to do all necessary rewriting.
@@ -116,6 +127,14 @@ bool AMDGPUPromoteKernelArguments::promotePointer(Value *Ptr) {
return true;
}
+bool AMDGPUPromoteKernelArguments::promoteLoad(LoadInst *LI) {
+ if (!LI->isSimple())
+ return false;
+
+ LI->setMetadata("amdgpu.noclobber", MDNode::get(LI->getContext(), {}));
+ return true;
+}
+
// skip allocas
static BasicBlock::iterator getInsertPt(BasicBlock &BB) {
BasicBlock::iterator InsPt = BB.getFirstInsertionPt();
@@ -131,7 +150,8 @@ static BasicBlock::iterator getInsertPt(BasicBlock &BB) {
return InsPt;
}
-bool AMDGPUPromoteKernelArguments::run(Function &F, MemorySSA &MSSA) {
+bool AMDGPUPromoteKernelArguments::run(Function &F, MemorySSA &MSSA,
+ AliasAnalysis &AA) {
if (skipFunction(F))
return false;
@@ -141,6 +161,7 @@ bool AMDGPUPromoteKernelArguments::run(Function &F, MemorySSA &MSSA) {
ArgCastInsertPt = &*getInsertPt(*F.begin());
this->MSSA = &MSSA;
+ this->AA = &AA;
for (Argument &Arg : F.args()) {
if (Arg.use_empty())
@@ -166,11 +187,13 @@ bool AMDGPUPromoteKernelArguments::run(Function &F, MemorySSA &MSSA) {
bool AMDGPUPromoteKernelArguments::runOnFunction(Function &F) {
MemorySSA &MSSA = getAnalysis<MemorySSAWrapperPass>().getMSSA();
- return run(F, MSSA);
+ AliasAnalysis &AA = getAnalysis<AAResultsWrapperPass>().getAAResults();
+ return run(F, MSSA, AA);
}
INITIALIZE_PASS_BEGIN(AMDGPUPromoteKernelArguments, DEBUG_TYPE,
"AMDGPU Promote Kernel Arguments", false, false)
+INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
INITIALIZE_PASS_DEPENDENCY(MemorySSAWrapperPass)
INITIALIZE_PASS_END(AMDGPUPromoteKernelArguments, DEBUG_TYPE,
"AMDGPU Promote Kernel Arguments", false, false)
@@ -185,7 +208,8 @@ PreservedAnalyses
AMDGPUPromoteKernelArgumentsPass::run(Function &F,
FunctionAnalysisManager &AM) {
MemorySSA &MSSA = AM.getResult<MemorySSAAnalysis>(F).getMSSA();
- if (AMDGPUPromoteKernelArguments().run(F, MSSA)) {
+ AliasAnalysis &AA = AM.getResult<AAManager>(F);
+ if (AMDGPUPromoteKernelArguments().run(F, MSSA, AA)) {
PreservedAnalyses PA;
PA.preserveSet<CFGAnalyses>();
PA.preserve<MemorySSAAnalysis>();
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index de2dccef804a..0830cbd919a0 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -76,10 +76,11 @@
#include "GCNSubtarget.h"
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
+#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
-#include "llvm/CodeGen/GlobalISel/RegisterBank.h"
+#include "llvm/CodeGen/RegisterBank.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#define GET_TARGET_REGBANK_IMPL
@@ -193,9 +194,7 @@ public:
}
AMDGPURegisterBankInfo::AMDGPURegisterBankInfo(const GCNSubtarget &ST)
- : AMDGPUGenRegisterBankInfo(),
- Subtarget(ST),
- TRI(Subtarget.getRegisterInfo()),
+ : Subtarget(ST), TRI(Subtarget.getRegisterInfo()),
TII(Subtarget.getInstrInfo()) {
// HACK: Until this is fully tablegen'd.
@@ -428,11 +427,6 @@ AMDGPURegisterBankInfo::getInstrAlternativeMappingsIntrinsicWSideEffects(
}
}
-static bool memOpHasNoClobbered(const MachineMemOperand *MMO) {
- const Instruction *I = dyn_cast_or_null<Instruction>(MMO->getValue());
- return I && I->getMetadata("amdgpu.noclobber");
-}
-
// FIXME: Returns uniform if there's no source value information. This is
// probably wrong.
static bool isScalarLoadLegal(const MachineInstr &MI) {
@@ -451,7 +445,7 @@ static bool isScalarLoadLegal(const MachineInstr &MI) {
// spaces.
(IsConst || !MMO->isVolatile()) &&
// Memory must be known constant, or not written before this load.
- (IsConst || MMO->isInvariant() || memOpHasNoClobbered(MMO)) &&
+ (IsConst || MMO->isInvariant() || (MMO->getFlags() & MONoClobber)) &&
AMDGPUInstrInfo::isUniformMMO(MMO);
}
@@ -684,6 +678,62 @@ static LLT getHalfSizedType(LLT Ty) {
return LLT::scalar(Ty.getScalarSizeInBits() / 2);
}
+// Build one or more V_READFIRSTLANE_B32 instructions to move the given vector
+// source value into a scalar register.
+Register AMDGPURegisterBankInfo::buildReadFirstLane(MachineIRBuilder &B,
+ MachineRegisterInfo &MRI,
+ Register Src) const {
+ LLT Ty = MRI.getType(Src);
+ const RegisterBank *Bank = getRegBank(Src, MRI, *TRI);
+
+ if (Bank == &AMDGPU::SGPRRegBank)
+ return Src;
+
+ unsigned Bits = Ty.getSizeInBits();
+ assert(Bits % 32 == 0);
+
+ if (Bank != &AMDGPU::VGPRRegBank) {
+ // We need to copy from AGPR to VGPR
+ Src = B.buildCopy(Ty, Src).getReg(0);
+ MRI.setRegBank(Src, AMDGPU::VGPRRegBank);
+ }
+
+ LLT S32 = LLT::scalar(32);
+ unsigned NumParts = Bits / 32;
+ SmallVector<Register, 8> SrcParts;
+ SmallVector<Register, 8> DstParts;
+
+ if (Bits == 32) {
+ SrcParts.push_back(Src);
+ } else {
+ auto Unmerge = B.buildUnmerge(S32, Src);
+ for (unsigned i = 0; i < NumParts; ++i)
+ SrcParts.push_back(Unmerge.getReg(i));
+ }
+
+ for (unsigned i = 0; i < NumParts; ++i) {
+ Register SrcPart = SrcParts[i];
+ Register DstPart = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+ MRI.setType(DstPart, NumParts == 1 ? Ty : S32);
+
+ const TargetRegisterClass *Constrained =
+ constrainGenericRegister(SrcPart, AMDGPU::VGPR_32RegClass, MRI);
+ (void)Constrained;
+ assert(Constrained && "Failed to constrain readfirstlane src reg");
+
+ B.buildInstr(AMDGPU::V_READFIRSTLANE_B32, {DstPart}, {SrcPart});
+
+ DstParts.push_back(DstPart);
+ }
+
+ if (Bits == 32)
+ return DstParts[0];
+
+ Register Dst = B.buildMerge(Ty, DstParts).getReg(0);
+ MRI.setRegBank(Dst, AMDGPU::SGPRRegBank);
+ return Dst;
+}
+
/// Legalize instruction \p MI where operands in \p OpIndices must be SGPRs. If
/// any of the required SGPR operands are VGPRs, perform a waterfall loop to
/// execute the instruction for each unique combination of values in all lanes
@@ -716,8 +766,6 @@ bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
MachineFunction *MF = &B.getMF();
const TargetRegisterClass *WaveRC = TRI->getWaveMaskRegClass();
- const unsigned WaveAndOpc = Subtarget.isWave32() ?
- AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
const unsigned MovExecOpc =
Subtarget.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
const unsigned MovExecTermOpc =
@@ -747,16 +795,19 @@ bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
// To insert the loop we need to split the block. Move everything before this
// point to a new block, and insert a new empty block before this instruction.
MachineBasicBlock *LoopBB = MF->CreateMachineBasicBlock();
+ MachineBasicBlock *BodyBB = MF->CreateMachineBasicBlock();
MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
MachineBasicBlock *RestoreExecBB = MF->CreateMachineBasicBlock();
MachineFunction::iterator MBBI(MBB);
++MBBI;
MF->insert(MBBI, LoopBB);
+ MF->insert(MBBI, BodyBB);
MF->insert(MBBI, RestoreExecBB);
MF->insert(MBBI, RemainderBB);
- LoopBB->addSuccessor(RestoreExecBB);
- LoopBB->addSuccessor(LoopBB);
+ LoopBB->addSuccessor(BodyBB);
+ BodyBB->addSuccessor(RestoreExecBB);
+ BodyBB->addSuccessor(LoopBB);
// Move the rest of the block into a new block.
RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
@@ -768,27 +819,27 @@ bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
B.setInsertPt(*LoopBB, LoopBB->end());
B.buildInstr(TargetOpcode::PHI)
- .addDef(PhiExec)
- .addReg(InitSaveExecReg)
- .addMBB(&MBB)
- .addReg(NewExec)
- .addMBB(LoopBB);
+ .addDef(PhiExec)
+ .addReg(InitSaveExecReg)
+ .addMBB(&MBB)
+ .addReg(NewExec)
+ .addMBB(BodyBB);
const DebugLoc &DL = B.getDL();
MachineInstr &FirstInst = *Range.begin();
- // Move the instruction into the loop. Note we moved everything after
+ // Move the instruction into the loop body. Note we moved everything after
// Range.end() already into a new block, so Range.end() is no longer valid.
- LoopBB->splice(LoopBB->end(), &MBB, Range.begin(), MBB.end());
+ BodyBB->splice(BodyBB->end(), &MBB, Range.begin(), MBB.end());
// Figure out the iterator range after splicing the instructions.
MachineBasicBlock::iterator NewBegin = FirstInst.getIterator();
- auto NewEnd = LoopBB->end();
+ auto NewEnd = BodyBB->end();
- MachineBasicBlock::iterator I = Range.begin();
- B.setInsertPt(*LoopBB, I);
+ B.setMBB(*LoopBB);
+ LLT S1 = LLT::scalar(1);
Register CondReg;
assert(std::distance(NewBegin, NewEnd) == OrigRangeSize);
@@ -819,164 +870,62 @@ bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
B.setMBB(MBB);
OpReg = B.buildCopy(OpTy, OpReg).getReg(0);
MRI.setRegBank(OpReg, AMDGPU::VGPRRegBank);
- B.setInstr(*I);
+ B.setMBB(*LoopBB);
}
- unsigned OpSize = OpTy.getSizeInBits();
-
- // Can only do a readlane of 32-bit pieces.
- if (OpSize == 32) {
- // Avoid extra copies in the simple case of one 32-bit register.
- Register CurrentLaneOpReg
- = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
- MRI.setType(CurrentLaneOpReg, OpTy);
-
- constrainGenericRegister(OpReg, AMDGPU::VGPR_32RegClass, MRI);
- // Read the next variant <- also loop target.
- BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
- CurrentLaneOpReg)
- .addReg(OpReg);
+ Register CurrentLaneReg = buildReadFirstLane(B, MRI, OpReg);
- Register NewCondReg = MRI.createVirtualRegister(WaveRC);
- bool First = CondReg == AMDGPU::NoRegister;
- if (First)
- CondReg = NewCondReg;
-
- // Compare the just read M0 value to all possible Idx values.
- B.buildInstr(AMDGPU::V_CMP_EQ_U32_e64)
- .addDef(NewCondReg)
- .addReg(CurrentLaneOpReg)
- .addReg(OpReg);
- Op.setReg(CurrentLaneOpReg);
-
- if (!First) {
- Register AndReg = MRI.createVirtualRegister(WaveRC);
+ // Build the comparison(s).
+ unsigned OpSize = OpTy.getSizeInBits();
+ bool Is64 = OpSize % 64 == 0;
+ unsigned PartSize = Is64 ? 64 : 32;
+ LLT PartTy = LLT::scalar(PartSize);
+ unsigned NumParts = OpSize / PartSize;
+ SmallVector<Register, 8> OpParts;
+ SmallVector<Register, 8> CurrentLaneParts;
- // If there are multiple operands to consider, and the conditions.
- B.buildInstr(WaveAndOpc)
- .addDef(AndReg)
- .addReg(NewCondReg)
- .addReg(CondReg);
- CondReg = AndReg;
- }
+ if (NumParts == 1) {
+ OpParts.push_back(OpReg);
+ CurrentLaneParts.push_back(CurrentLaneReg);
} else {
- LLT S32 = LLT::scalar(32);
- SmallVector<Register, 8> ReadlanePieces;
-
- // The compares can be done as 64-bit, but the extract needs to be done
- // in 32-bit pieces.
-
- bool Is64 = OpSize % 64 == 0;
-
- unsigned UnmergeTySize = Is64 ? 64 : 32;
- unsigned CmpOp =
- Is64 ? AMDGPU::V_CMP_EQ_U64_e64 : AMDGPU::V_CMP_EQ_U32_e64;
-
- // Insert the unmerge before the loop.
-
- B.setMBB(MBB);
- unsigned NumPieces = OpSize / UnmergeTySize;
- SmallVector<Register, 8> UnmergePieces;
- if (NumPieces == 1) {
- UnmergePieces.push_back(OpReg);
- } else {
- LLT UnmergeTy = LLT::scalar(UnmergeTySize);
- MachineInstrBuilder Unmerge = B.buildUnmerge(UnmergeTy, OpReg);
- for (unsigned PieceIdx = 0; PieceIdx != NumPieces; ++PieceIdx)
- UnmergePieces.push_back(Unmerge.getReg(PieceIdx));
+ auto UnmergeOp = B.buildUnmerge(PartTy, OpReg);
+ auto UnmergeCurrentLane = B.buildUnmerge(PartTy, CurrentLaneReg);
+ for (unsigned i = 0; i < NumParts; ++i) {
+ OpParts.push_back(UnmergeOp.getReg(i));
+ CurrentLaneParts.push_back(UnmergeCurrentLane.getReg(i));
+ MRI.setRegBank(OpParts[i], AMDGPU::VGPRRegBank);
+ MRI.setRegBank(CurrentLaneParts[i], AMDGPU::SGPRRegBank);
}
- B.setInstr(*I);
-
- for (Register UnmergePiece : UnmergePieces) {
- Register CurrentLaneOpReg;
- if (Is64) {
- Register CurrentLaneOpRegLo = MRI.createGenericVirtualRegister(S32);
- Register CurrentLaneOpRegHi = MRI.createGenericVirtualRegister(S32);
-
- MRI.setRegClass(UnmergePiece, &AMDGPU::VReg_64RegClass);
- MRI.setRegClass(CurrentLaneOpRegLo, &AMDGPU::SReg_32_XM0RegClass);
- MRI.setRegClass(CurrentLaneOpRegHi, &AMDGPU::SReg_32_XM0RegClass);
-
- // Read the next variant <- also loop target.
- BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
- CurrentLaneOpRegLo)
- .addReg(UnmergePiece, 0, AMDGPU::sub0);
-
- // Read the next variant <- also loop target.
- BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
- CurrentLaneOpRegHi)
- .addReg(UnmergePiece, 0, AMDGPU::sub1);
-
- CurrentLaneOpReg =
- B.buildMerge(LLT::scalar(64),
- {CurrentLaneOpRegLo, CurrentLaneOpRegHi})
- .getReg(0);
-
- MRI.setRegClass(CurrentLaneOpReg, &AMDGPU::SReg_64_XEXECRegClass);
-
- if (OpTy.getScalarSizeInBits() == 64) {
- // If we need to produce a 64-bit element vector, so use the
- // merged pieces
- ReadlanePieces.push_back(CurrentLaneOpReg);
- } else {
- // 32-bit element type.
- ReadlanePieces.push_back(CurrentLaneOpRegLo);
- ReadlanePieces.push_back(CurrentLaneOpRegHi);
- }
- } else {
- CurrentLaneOpReg = MRI.createGenericVirtualRegister(S32);
- MRI.setRegClass(UnmergePiece, &AMDGPU::VGPR_32RegClass);
- MRI.setRegClass(CurrentLaneOpReg, &AMDGPU::SReg_32_XM0RegClass);
-
- // Read the next variant <- also loop target.
- BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
- CurrentLaneOpReg)
- .addReg(UnmergePiece);
- ReadlanePieces.push_back(CurrentLaneOpReg);
- }
-
- Register NewCondReg = MRI.createVirtualRegister(WaveRC);
- bool First = CondReg == AMDGPU::NoRegister;
- if (First)
- CondReg = NewCondReg;
-
- B.buildInstr(CmpOp)
- .addDef(NewCondReg)
- .addReg(CurrentLaneOpReg)
- .addReg(UnmergePiece);
+ }
- if (!First) {
- Register AndReg = MRI.createVirtualRegister(WaveRC);
+ for (unsigned i = 0; i < NumParts; ++i) {
+ auto CmpReg = B.buildICmp(CmpInst::ICMP_EQ, S1, CurrentLaneParts[i],
+ OpParts[i]).getReg(0);
+ MRI.setRegBank(CmpReg, AMDGPU::VCCRegBank);
- // If there are multiple operands to consider, and the conditions.
- B.buildInstr(WaveAndOpc)
- .addDef(AndReg)
- .addReg(NewCondReg)
- .addReg(CondReg);
- CondReg = AndReg;
- }
- }
-
- // FIXME: Build merge seems to switch to CONCAT_VECTORS but not
- // BUILD_VECTOR
- if (OpTy.isVector()) {
- auto Merge = B.buildBuildVector(OpTy, ReadlanePieces);
- Op.setReg(Merge.getReg(0));
- MRI.setRegBank(Op.getReg(), AMDGPU::SGPRRegBank);
- } else if (ReadlanePieces.size() > 1) {
- auto Merge = B.buildMerge(OpTy, ReadlanePieces);
- Op.setReg(Merge.getReg(0));
- MRI.setRegBank(Op.getReg(), AMDGPU::SGPRRegBank);
+ if (!CondReg) {
+ CondReg = CmpReg;
} else {
- Op.setReg(ReadlanePieces[0]);
+ CondReg = B.buildAnd(S1, CondReg, CmpReg).getReg(0);
+ MRI.setRegBank(CondReg, AMDGPU::VCCRegBank);
}
}
+ Op.setReg(CurrentLaneReg);
+
// Make sure we don't re-process this register again.
WaterfalledRegMap.insert(std::make_pair(OldReg, Op.getReg()));
}
}
+ // The ballot becomes a no-op during instruction selection.
+ CondReg = B.buildIntrinsic(Intrinsic::amdgcn_ballot,
+ {LLT::scalar(Subtarget.isWave32() ? 32 : 64)},
+ false)
+ .addReg(CondReg)
+ .getReg(0);
+ MRI.setRegClass(CondReg, WaveRC);
+
// Update EXEC, save the original EXEC value to VCC.
B.buildInstr(AndSaveExecOpc)
.addDef(NewExec)
@@ -984,7 +933,7 @@ bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
MRI.setSimpleHint(NewExec, CondReg);
- B.setInsertPt(*LoopBB, LoopBB->end());
+ B.setInsertPt(*BodyBB, BodyBB->end());
// Update EXEC, switch all done bits to 0 and all todo bits to 1.
B.buildInstr(XorTermOpc)
@@ -1064,28 +1013,10 @@ void AMDGPURegisterBankInfo::constrainOpWithReadfirstlane(
if (Bank == &AMDGPU::SGPRRegBank)
return;
- LLT Ty = MRI.getType(Reg);
MachineIRBuilder B(MI);
- if (Bank != &AMDGPU::VGPRRegBank) {
- // We need to copy from AGPR to VGPR
- Reg = B.buildCopy(Ty, Reg).getReg(0);
- MRI.setRegBank(Reg, AMDGPU::VGPRRegBank);
- }
-
- Register SGPR = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
- B.buildInstr(AMDGPU::V_READFIRSTLANE_B32)
- .addDef(SGPR)
- .addReg(Reg);
-
- MRI.setType(SGPR, Ty);
-
- const TargetRegisterClass *Constrained =
- constrainGenericRegister(Reg, AMDGPU::VGPR_32RegClass, MRI);
- (void)Constrained;
- assert(Constrained && "Failed to constrain readfirstlane src reg");
-
- MI.getOperand(OpIdx).setReg(SGPR);
+ Reg = buildReadFirstLane(B, MRI, Reg);
+ MI.getOperand(OpIdx).setReg(Reg);
}
/// Split \p Ty into 2 pieces. The first will have \p FirstSize bits, and the
@@ -1624,6 +1555,157 @@ bool AMDGPURegisterBankInfo::applyMappingBFE(const OperandsMapper &OpdMapper,
return true;
}
+bool AMDGPURegisterBankInfo::applyMappingMAD_64_32(
+ const OperandsMapper &OpdMapper) const {
+ MachineInstr &MI = OpdMapper.getMI();
+ MachineRegisterInfo &MRI = OpdMapper.getMRI();
+
+ // Insert basic copies.
+ applyDefaultMapping(OpdMapper);
+
+ Register Dst0 = MI.getOperand(0).getReg();
+ Register Dst1 = MI.getOperand(1).getReg();
+ Register Src0 = MI.getOperand(2).getReg();
+ Register Src1 = MI.getOperand(3).getReg();
+ Register Src2 = MI.getOperand(4).getReg();
+
+ if (MRI.getRegBankOrNull(Src0) == &AMDGPU::VGPRRegBank)
+ return true;
+
+ bool IsUnsigned = MI.getOpcode() == AMDGPU::G_AMDGPU_MAD_U64_U32;
+ LLT S1 = LLT::scalar(1);
+ LLT S32 = LLT::scalar(32);
+
+ bool DstOnValu = MRI.getRegBankOrNull(Src2) == &AMDGPU::VGPRRegBank;
+ bool Accumulate = true;
+
+ if (!DstOnValu) {
+ if (mi_match(Src2, MRI, m_ZeroInt()))
+ Accumulate = false;
+ }
+
+ // Keep the multiplication on the SALU.
+ MachineIRBuilder B(MI);
+
+ Register DstHi;
+ Register DstLo = B.buildMul(S32, Src0, Src1).getReg(0);
+ bool MulHiInVgpr = false;
+
+ MRI.setRegBank(DstLo, AMDGPU::SGPRRegBank);
+
+ if (Subtarget.hasSMulHi()) {
+ DstHi = IsUnsigned ? B.buildUMulH(S32, Src0, Src1).getReg(0)
+ : B.buildSMulH(S32, Src0, Src1).getReg(0);
+ MRI.setRegBank(DstHi, AMDGPU::SGPRRegBank);
+ } else {
+ Register VSrc0 = B.buildCopy(S32, Src0).getReg(0);
+ Register VSrc1 = B.buildCopy(S32, Src1).getReg(0);
+
+ MRI.setRegBank(VSrc0, AMDGPU::VGPRRegBank);
+ MRI.setRegBank(VSrc1, AMDGPU::VGPRRegBank);
+
+ DstHi = IsUnsigned ? B.buildUMulH(S32, VSrc0, VSrc1).getReg(0)
+ : B.buildSMulH(S32, VSrc0, VSrc1).getReg(0);
+ MRI.setRegBank(DstHi, AMDGPU::VGPRRegBank);
+
+ if (!DstOnValu) {
+ DstHi = buildReadFirstLane(B, MRI, DstHi);
+ } else {
+ MulHiInVgpr = true;
+ }
+ }
+
+ // Accumulate and produce the "carry-out" bit.
+ //
+ // The "carry-out" is defined as bit 64 of the result when computed as a
+ // big integer. For unsigned multiply-add, this matches the usual definition
+ // of carry-out. For signed multiply-add, bit 64 is the sign bit of the
+ // result, which is determined as:
+ // sign(Src0 * Src1) + sign(Src2) + carry-out from unsigned 64-bit add
+ LLT CarryType = DstOnValu ? S1 : S32;
+ const RegisterBank &CarryBank =
+ DstOnValu ? AMDGPU::VCCRegBank : AMDGPU::SGPRRegBank;
+ const RegisterBank &DstBank =
+ DstOnValu ? AMDGPU::VGPRRegBank : AMDGPU::SGPRRegBank;
+ Register Carry;
+ Register Zero;
+
+ if (!IsUnsigned) {
+ Zero = B.buildConstant(S32, 0).getReg(0);
+ MRI.setRegBank(Zero,
+ MulHiInVgpr ? AMDGPU::VGPRRegBank : AMDGPU::SGPRRegBank);
+
+ Carry = B.buildICmp(CmpInst::ICMP_SLT, MulHiInVgpr ? S1 : S32, DstHi, Zero)
+ .getReg(0);
+ MRI.setRegBank(Carry, MulHiInVgpr ? AMDGPU::VCCRegBank
+ : AMDGPU::SGPRRegBank);
+
+ if (DstOnValu && !MulHiInVgpr) {
+ Carry = B.buildTrunc(S1, Carry).getReg(0);
+ MRI.setRegBank(Carry, AMDGPU::VCCRegBank);
+ }
+ }
+
+ if (Accumulate) {
+ if (DstOnValu) {
+ DstLo = B.buildCopy(S32, DstLo).getReg(0);
+ DstHi = B.buildCopy(S32, DstHi).getReg(0);
+ MRI.setRegBank(DstLo, AMDGPU::VGPRRegBank);
+ MRI.setRegBank(DstHi, AMDGPU::VGPRRegBank);
+ }
+
+ auto Unmerge = B.buildUnmerge(S32, Src2);
+ Register Src2Lo = Unmerge.getReg(0);
+ Register Src2Hi = Unmerge.getReg(1);
+ MRI.setRegBank(Src2Lo, DstBank);
+ MRI.setRegBank(Src2Hi, DstBank);
+
+ if (!IsUnsigned) {
+ auto Src2Sign = B.buildICmp(CmpInst::ICMP_SLT, CarryType, Src2Hi, Zero);
+ MRI.setRegBank(Src2Sign.getReg(0), CarryBank);
+
+ Carry = B.buildXor(CarryType, Carry, Src2Sign).getReg(0);
+ MRI.setRegBank(Carry, CarryBank);
+ }
+
+ auto AddLo = B.buildUAddo(S32, CarryType, DstLo, Src2Lo);
+ DstLo = AddLo.getReg(0);
+ Register CarryLo = AddLo.getReg(1);
+ MRI.setRegBank(DstLo, DstBank);
+ MRI.setRegBank(CarryLo, CarryBank);
+
+ auto AddHi = B.buildUAdde(S32, CarryType, DstHi, Src2Hi, CarryLo);
+ DstHi = AddHi.getReg(0);
+ MRI.setRegBank(DstHi, DstBank);
+
+ Register CarryHi = AddHi.getReg(1);
+ MRI.setRegBank(CarryHi, CarryBank);
+
+ if (IsUnsigned) {
+ Carry = CarryHi;
+ } else {
+ Carry = B.buildXor(CarryType, Carry, CarryHi).getReg(0);
+ MRI.setRegBank(Carry, CarryBank);
+ }
+ } else {
+ if (IsUnsigned) {
+ Carry = B.buildConstant(CarryType, 0).getReg(0);
+ MRI.setRegBank(Carry, CarryBank);
+ }
+ }
+
+ B.buildMerge(Dst0, {DstLo, DstHi});
+
+ if (DstOnValu) {
+ B.buildCopy(Dst1, Carry);
+ } else {
+ B.buildTrunc(Dst1, Carry);
+ }
+
+ MI.eraseFromParent();
+ return true;
+}
+
// Return a suitable opcode for extending the operands of Opc when widening.
static unsigned getExtendOp(unsigned Opc) {
switch (Opc) {
@@ -1794,7 +1876,7 @@ bool AMDGPURegisterBankInfo::buildVCopy(MachineIRBuilder &B, Register DstReg,
}
/// Utility function for pushing dynamic vector indexes with a constant offset
-/// into waterwall loops.
+/// into waterfall loops.
static void reinsertVectorIndexAdd(MachineIRBuilder &B,
MachineInstr &IdxUseInstr,
unsigned OpIdx,
@@ -1857,7 +1939,7 @@ bool AMDGPURegisterBankInfo::foldExtractEltToCmpSelect(
unsigned NumElem = VecTy.getNumElements();
if (!SITargetLowering::shouldExpandVectorDynExt(EltSize, NumElem,
- IsDivergentIdx))
+ IsDivergentIdx, &Subtarget))
return false;
MachineIRBuilder B(MI);
@@ -1955,7 +2037,7 @@ bool AMDGPURegisterBankInfo::foldInsertEltToCmpSelect(
unsigned NumElem = VecTy.getNumElements();
if (!SITargetLowering::shouldExpandVectorDynExt(EltSize, NumElem,
- IsDivergentIdx))
+ IsDivergentIdx, &Subtarget))
return false;
MachineIRBuilder B(MI);
@@ -2926,7 +3008,8 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
case Intrinsic::amdgcn_interp_p2:
case Intrinsic::amdgcn_interp_mov:
case Intrinsic::amdgcn_interp_p1_f16:
- case Intrinsic::amdgcn_interp_p2_f16: {
+ case Intrinsic::amdgcn_interp_p2_f16:
+ case Intrinsic::amdgcn_lds_param_load: {
applyDefaultMapping(OpdMapper);
// Readlane for m0 value, which is always the last operand.
@@ -2934,6 +3017,12 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
constrainOpWithReadfirstlane(MI, MRI, MI.getNumOperands() - 1); // Index
return;
}
+ case Intrinsic::amdgcn_interp_inreg_p10:
+ case Intrinsic::amdgcn_interp_inreg_p2:
+ case Intrinsic::amdgcn_interp_inreg_p10_f16:
+ case Intrinsic::amdgcn_interp_inreg_p2_f16:
+ applyDefaultMapping(OpdMapper);
+ return;
case Intrinsic::amdgcn_permlane16:
case Intrinsic::amdgcn_permlanex16: {
// Doing a waterfall loop over these wouldn't make any sense.
@@ -3015,6 +3104,35 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
constrainOpWithReadfirstlane(MI, MRI, 2);
return;
}
+ case Intrinsic::amdgcn_raw_buffer_load_lds: {
+ applyDefaultMapping(OpdMapper);
+ constrainOpWithReadfirstlane(MI, MRI, 1); // rsrc
+ constrainOpWithReadfirstlane(MI, MRI, 2); // M0
+ constrainOpWithReadfirstlane(MI, MRI, 5); // soffset
+ return;
+ }
+ case Intrinsic::amdgcn_struct_buffer_load_lds: {
+ applyDefaultMapping(OpdMapper);
+ constrainOpWithReadfirstlane(MI, MRI, 1); // rsrc
+ constrainOpWithReadfirstlane(MI, MRI, 2); // M0
+ constrainOpWithReadfirstlane(MI, MRI, 6); // soffset
+ return;
+ }
+ case Intrinsic::amdgcn_global_load_lds: {
+ applyDefaultMapping(OpdMapper);
+ constrainOpWithReadfirstlane(MI, MRI, 2);
+ return;
+ }
+ case Intrinsic::amdgcn_lds_direct_load: {
+ applyDefaultMapping(OpdMapper);
+ // Readlane for m0 value, which is always the last operand.
+ constrainOpWithReadfirstlane(MI, MRI, MI.getNumOperands() - 1); // Index
+ return;
+ }
+ case Intrinsic::amdgcn_exp_row:
+ applyDefaultMapping(OpdMapper);
+ constrainOpWithReadfirstlane(MI, MRI, 8); // M0
+ return;
default: {
if (const AMDGPU::RsrcIntrinsic *RSrcIntrin =
AMDGPU::lookupRsrcIntrinsic(IntrID)) {
@@ -3143,6 +3261,10 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
case AMDGPU::G_UBFX:
applyMappingBFE(OpdMapper, /*Signed*/ false);
return;
+ case AMDGPU::G_AMDGPU_MAD_U64_U32:
+ case AMDGPU::G_AMDGPU_MAD_I64_I32:
+ applyMappingMAD_64_32(OpdMapper);
+ return;
default:
break;
}
@@ -3668,6 +3790,48 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
return getDefaultMappingSOP(MI);
return getDefaultMappingVOP(MI);
}
+ case AMDGPU::G_AMDGPU_MAD_U64_U32:
+ case AMDGPU::G_AMDGPU_MAD_I64_I32: {
+ // Three possible mappings:
+ //
+ // - Default SOP
+ // - Default VOP
+ // - Scalar multiply: src0 and src1 are SGPRs, the rest is VOP.
+ //
+ // This allows instruction selection to keep the multiplication part of the
+ // instruction on the SALU.
+ bool AllSalu = true;
+ bool MulSalu = true;
+ for (unsigned i = 0; i < 5; ++i) {
+ Register Reg = MI.getOperand(i).getReg();
+ if (const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI)) {
+ if (Bank->getID() != AMDGPU::SGPRRegBankID) {
+ AllSalu = false;
+ if (i == 2 || i == 3) {
+ MulSalu = false;
+ break;
+ }
+ }
+ }
+ }
+
+ if (AllSalu)
+ return getDefaultMappingSOP(MI);
+
+ // If the multiply-add is full-rate in VALU, use that even if the
+ // multiplication part is scalar. Accumulating separately on the VALU would
+ // take two instructions.
+ if (!MulSalu || Subtarget.hasFullRate64Ops())
+ return getDefaultMappingVOP(MI);
+
+ // Keep the multiplication on the SALU, then accumulate on the VALU.
+ OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 64);
+ OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
+ OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
+ OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
+ OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 64);
+ break;
+ }
case AMDGPU::G_IMPLICIT_DEF: {
unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
@@ -3828,10 +3992,9 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
}
case AMDGPU::G_FCMP: {
unsigned Size = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
- unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI);
OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
OpdsMapping[1] = nullptr; // Predicate Operand.
- OpdsMapping[2] = AMDGPU::getValueMapping(Op2Bank, Size);
+ OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
break;
}
@@ -4102,6 +4265,17 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case Intrinsic::amdgcn_udot4:
case Intrinsic::amdgcn_sdot8:
case Intrinsic::amdgcn_udot8:
+ case Intrinsic::amdgcn_fdot2_bf16_bf16:
+ case Intrinsic::amdgcn_fdot2_f16_f16:
+ case Intrinsic::amdgcn_fdot2_f32_bf16:
+ case Intrinsic::amdgcn_sudot4:
+ case Intrinsic::amdgcn_sudot8:
+ case Intrinsic::amdgcn_wmma_bf16_16x16x16_bf16:
+ case Intrinsic::amdgcn_wmma_f16_16x16x16_f16:
+ case Intrinsic::amdgcn_wmma_f32_16x16x16_bf16:
+ case Intrinsic::amdgcn_wmma_f32_16x16x16_f16:
+ case Intrinsic::amdgcn_wmma_i32_16x16x16_iu4:
+ case Intrinsic::amdgcn_wmma_i32_16x16x16_iu8:
return getDefaultMappingVOP(MI);
case Intrinsic::amdgcn_sbfe:
case Intrinsic::amdgcn_ubfe:
@@ -4120,6 +4294,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case Intrinsic::amdgcn_wqm:
case Intrinsic::amdgcn_softwqm:
case Intrinsic::amdgcn_set_inactive:
+ case Intrinsic::amdgcn_permlane64:
return getDefaultMappingAllVGPR(MI);
case Intrinsic::amdgcn_kernarg_segment_ptr:
case Intrinsic::amdgcn_s_getpc:
@@ -4247,24 +4422,50 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case Intrinsic::amdgcn_mfma_f32_32x32x8bf16_1k:
case Intrinsic::amdgcn_mfma_f32_16x16x16bf16_1k:
case Intrinsic::amdgcn_mfma_f64_16x16x4f64:
- case Intrinsic::amdgcn_mfma_f64_4x4x4f64: {
+ case Intrinsic::amdgcn_mfma_f64_4x4x4f64:
+ case Intrinsic::amdgcn_mfma_i32_16x16x32_i8:
+ case Intrinsic::amdgcn_mfma_i32_32x32x16_i8:
+ case Intrinsic::amdgcn_mfma_f32_16x16x8_xf32:
+ case Intrinsic::amdgcn_mfma_f32_32x32x4_xf32: {
// Default for MAI intrinsics.
// srcC can also be an immediate which can be folded later.
// FIXME: Should we eventually add an alternative mapping with AGPR src
// for srcA/srcB?
//
// vdst, srcA, srcB, srcC
+ const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
+ OpdsMapping[0] =
+ Info->mayNeedAGPRs()
+ ? getAGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI)
+ : getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
+ OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
+ OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
+ OpdsMapping[4] =
+ Info->mayNeedAGPRs()
+ ? getAGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI)
+ : getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
+ break;
+ }
+ case Intrinsic::amdgcn_smfmac_f32_16x16x32_f16:
+ case Intrinsic::amdgcn_smfmac_f32_32x32x16_f16:
+ case Intrinsic::amdgcn_smfmac_f32_16x16x32_bf16:
+ case Intrinsic::amdgcn_smfmac_f32_32x32x16_bf16:
+ case Intrinsic::amdgcn_smfmac_i32_16x16x64_i8:
+ case Intrinsic::amdgcn_smfmac_i32_32x32x32_i8: {
+ // vdst, srcA, srcB, srcC, idx
OpdsMapping[0] = getAGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
OpdsMapping[4] = getAGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
+ OpdsMapping[5] = getVGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
break;
}
case Intrinsic::amdgcn_interp_p1:
case Intrinsic::amdgcn_interp_p2:
case Intrinsic::amdgcn_interp_mov:
case Intrinsic::amdgcn_interp_p1_f16:
- case Intrinsic::amdgcn_interp_p2_f16: {
+ case Intrinsic::amdgcn_interp_p2_f16:
+ case Intrinsic::amdgcn_lds_param_load: {
const int M0Idx = MI.getNumOperands() - 1;
Register M0Reg = MI.getOperand(M0Idx).getReg();
unsigned M0Bank = getRegBankID(M0Reg, MRI, AMDGPU::SGPRRegBankID);
@@ -4279,6 +4480,17 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
OpdsMapping[M0Idx] = AMDGPU::getValueMapping(M0Bank, 32);
break;
}
+ case Intrinsic::amdgcn_interp_inreg_p10:
+ case Intrinsic::amdgcn_interp_inreg_p2:
+ case Intrinsic::amdgcn_interp_inreg_p10_f16:
+ case Intrinsic::amdgcn_interp_inreg_p2_f16: {
+ unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
+ OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
+ OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
+ OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
+ OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
+ break;
+ }
case Intrinsic::amdgcn_ballot: {
unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
unsigned SrcSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
@@ -4314,8 +4526,10 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
} else {
// NSA form
- for (unsigned I = 2; I < N; ++I)
- OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
+ for (unsigned I = 2; I < N; ++I) {
+ unsigned Size = MRI.getType(MI.getOperand(I).getReg()).getSizeInBits();
+ OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
+ }
}
break;
}
@@ -4325,7 +4539,8 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case Intrinsic::amdgcn_s_getreg:
case Intrinsic::amdgcn_s_memtime:
case Intrinsic::amdgcn_s_memrealtime:
- case Intrinsic::amdgcn_s_get_waveid_in_workgroup: {
+ case Intrinsic::amdgcn_s_get_waveid_in_workgroup:
+ case Intrinsic::amdgcn_s_sendmsg_rtn: {
unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
break;
@@ -4337,6 +4552,8 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case Intrinsic::amdgcn_flat_atomic_fadd:
case Intrinsic::amdgcn_flat_atomic_fmin:
case Intrinsic::amdgcn_flat_atomic_fmax:
+ case Intrinsic::amdgcn_global_atomic_fadd_v2bf16:
+ case Intrinsic::amdgcn_flat_atomic_fadd_v2bf16:
return getDefaultMappingAllVGPR(MI);
case Intrinsic::amdgcn_ds_ordered_add:
case Intrinsic::amdgcn_ds_ordered_swap: {
@@ -4366,6 +4583,13 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
OpdsMapping[5] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
OpdsMapping[6] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
break;
+ case Intrinsic::amdgcn_exp_row:
+ OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
+ OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
+ OpdsMapping[5] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
+ OpdsMapping[6] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
+ OpdsMapping[8] = getSGPROpMapping(MI.getOperand(8).getReg(), MRI, *TRI);
+ break;
case Intrinsic::amdgcn_s_sendmsg:
case Intrinsic::amdgcn_s_sendmsghalt: {
// This must be an SGPR, but accept a VGPR.
@@ -4412,6 +4636,13 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
break;
}
+ case Intrinsic::amdgcn_raw_buffer_load_lds: {
+ OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
+ OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
+ OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
+ OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
+ break;
+ }
case Intrinsic::amdgcn_raw_buffer_store:
case Intrinsic::amdgcn_raw_buffer_store_format:
case Intrinsic::amdgcn_raw_tbuffer_store: {
@@ -4430,6 +4661,14 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
break;
}
+ case Intrinsic::amdgcn_struct_buffer_load_lds: {
+ OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
+ OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
+ OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
+ OpdsMapping[5] = getVGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
+ OpdsMapping[6] = getSGPROpMapping(MI.getOperand(6).getReg(), MRI, *TRI);
+ break;
+ }
case Intrinsic::amdgcn_struct_buffer_store:
case Intrinsic::amdgcn_struct_tbuffer_store: {
OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
@@ -4464,6 +4703,31 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
OpdsMapping[1] = AMDGPU::getValueMapping(Bank, 32);
break;
}
+ case Intrinsic::amdgcn_global_load_lds: {
+ OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
+ OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
+ break;
+ }
+ case Intrinsic::amdgcn_lds_direct_load: {
+ const int M0Idx = MI.getNumOperands() - 1;
+ Register M0Reg = MI.getOperand(M0Idx).getReg();
+ unsigned M0Bank = getRegBankID(M0Reg, MRI, AMDGPU::SGPRRegBankID);
+ unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
+
+ OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
+ for (int I = 2; I != M0Idx && MI.getOperand(I).isReg(); ++I)
+ OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
+
+ // Must be SGPR, but we must take whatever the original bank is and fix it
+ // later.
+ OpdsMapping[M0Idx] = AMDGPU::getValueMapping(M0Bank, 32);
+ break;
+ }
+ case Intrinsic::amdgcn_ds_add_gs_reg_rtn:
+ case Intrinsic::amdgcn_ds_sub_gs_reg_rtn:
+ OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
+ OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
+ break;
default:
return getInvalidInstructionMapping();
}
@@ -4568,6 +4832,9 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
OpdsMapping[0] = AMDGPU::getValueMapping(Bank, 1);
break;
}
+ case AMDGPU::G_FPTRUNC_ROUND_UPWARD:
+ case AMDGPU::G_FPTRUNC_ROUND_DOWNWARD:
+ return getDefaultMappingVOP(MI);
}
return getInstructionMapping(/*ID*/1, /*Cost*/1,
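The applyMappingMAD_64_32 lowering added above computes the signed "carry-out" as sign(product) ^ sign(addend) ^ (unsigned carry of the 64-bit add), matching the comment in that function. A self-contained sketch that checks this identity against 128-bit reference arithmetic (illustrative only, not part of the patch; it relies on the __int128 extension available to clang and gcc):

#include <cassert>
#include <cstdint>

// Scalar model of the signed MAD_64_32 carry-out: bit 64 of the infinitely
// sign-extended result equals sign(product) ^ sign(addend) ^ unsigned carry
// of the low 64-bit add.
static bool signedMadCarry(int32_t Src0, int32_t Src1, int64_t Src2) {
  int64_t Product = static_cast<int64_t>(Src0) * static_cast<int64_t>(Src1);
  uint64_t Sum = static_cast<uint64_t>(Product) + static_cast<uint64_t>(Src2);
  bool UnsignedCarry = Sum < static_cast<uint64_t>(Product);
  return (Product < 0) ^ (Src2 < 0) ^ UnsignedCarry;
}

int main() {
  // Reference: bit 64 of the exact result, computed with 128-bit arithmetic.
  auto RefBit64 = [](int32_t A, int32_t B, int64_t C) {
    __int128 R = static_cast<__int128>(A) * B + C;
    return static_cast<bool>((R >> 64) & 1);
  };
  assert(signedMadCarry(-3, 5, 10) == RefBit64(-3, 5, 10));
  assert(signedMadCarry(INT32_MIN, INT32_MIN, -1) ==
         RefBit64(INT32_MIN, INT32_MIN, -1));
  return 0;
}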
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h
index 2b9d0923ab49..c9741c2202e6 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h
@@ -16,7 +16,7 @@
#include "llvm/ADT/SmallSet.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/Register.h"
-#include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h"
+#include "llvm/CodeGen/RegisterBankInfo.h"
#define GET_REGBANK_DECLARATIONS
#include "AMDGPUGenRegisterBank.inc"
@@ -59,6 +59,9 @@ public:
SmallSet<Register, 4> &SGPROperandRegs,
MachineRegisterInfo &MRI) const;
+ Register buildReadFirstLane(MachineIRBuilder &B, MachineRegisterInfo &MRI,
+ Register Src) const;
+
bool executeInWaterfallLoop(MachineIRBuilder &B,
MachineInstr &MI,
MachineRegisterInfo &MRI,
@@ -83,6 +86,8 @@ public:
bool applyMappingBFE(const OperandsMapper &OpdMapper, bool Signed) const;
+ bool applyMappingMAD_64_32(const OperandsMapper &OpdMapper) const;
+
Register handleD16VData(MachineIRBuilder &B, MachineRegisterInfo &MRI,
Register Reg) const;
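The buildReadFirstLane helper declared above (defined in AMDGPURegisterBankInfo.cpp earlier in this patch) uniformizes a VGPR value by reading lane 0 in 32-bit pieces and re-merging them. A rough scalar model of the 64-bit case (illustrative only, not part of the patch; readFirstLane32 stands in for V_READFIRSTLANE_B32 acting on one 32-bit part):

#include <cstdint>

// Placeholder for V_READFIRSTLANE_B32: broadcasts lane 0 of a VGPR into an
// SGPR. Modeled as the identity here since a scalar program has one "lane".
static uint32_t readFirstLane32(uint32_t VgprPart) { return VgprPart; }

// A 64-bit value is unmerged into two 32-bit parts, each part is read
// individually, and the scalar results are merged back together.
static uint64_t readFirstLane64(uint64_t VgprValue) {
  uint32_t Lo = readFirstLane32(static_cast<uint32_t>(VgprValue));
  uint32_t Hi = readFirstLane32(static_cast<uint32_t>(VgprValue >> 32));
  return (static_cast<uint64_t>(Hi) << 32) | Lo;
}

int main() {
  return readFirstLane64(0x1122334455667788ull) == 0x1122334455667788ull ? 0 : 1;
}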
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUReleaseVGPRs.cpp b/llvm/lib/Target/AMDGPU/AMDGPUReleaseVGPRs.cpp
new file mode 100644
index 000000000000..a86871a4a653
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUReleaseVGPRs.cpp
@@ -0,0 +1,140 @@
+//===- AMDGPUReleaseVGPRs.cpp - Automatically release vgprs on GFX11+ -----===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// Insert S_SENDMSG instructions to release vgprs on GFX11+.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "AMDGPUSubtarget.h"
+#include "GCNSubtarget.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "SIDefines.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineOperand.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "release-vgprs"
+
+namespace {
+
+class AMDGPUReleaseVGPRs : public MachineFunctionPass {
+public:
+ static char ID;
+
+ const SIInstrInfo *SII;
+ const SIRegisterInfo *TRI;
+
+ AMDGPUReleaseVGPRs() : MachineFunctionPass(ID) {}
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesAll();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+ // Used to cache the result of isLastVGPRUseVMEMStore for each block
+ using BlockVMEMStoreType = DenseMap<MachineBasicBlock *, bool>;
+ BlockVMEMStoreType BlockVMEMStore;
+
+ // Return true if the last instruction referencing a vgpr in this MBB
+ // is a VMEM store, otherwise return false.
+ // Visit previous basic blocks to find this last instruction if needed.
+ // Because this pass is late in the pipeline, the last vgpr use is
+ // expected to be a vmem store, ds, or exp instruction.
+ // Loads and other vgpr operations would have been
+ // deleted by this point, except for complex control flow involving loops.
+ // This is why we are just testing the type of instructions rather
+ // than the operands.
+ bool isLastVGPRUseVMEMStore(MachineBasicBlock &MBB) {
+ // Use the cache to break infinite loops and save some time. Initialize to
+ // false in case we have a cycle.
+ BlockVMEMStoreType::iterator It;
+ bool Inserted;
+ std::tie(It, Inserted) = BlockVMEMStore.insert({&MBB, false});
+ bool &CacheEntry = It->second;
+ if (!Inserted)
+ return CacheEntry;
+
+ for (auto &MI : reverse(MBB.instrs())) {
+ // If it's a VMEM store, a vgpr will be used, so return true.
+ if ((SIInstrInfo::isVMEM(MI) || SIInstrInfo::isFLAT(MI)) && MI.mayStore())
+ return CacheEntry = true;
+
+ // If it's referencing a VGPR but is not a VMEM store, return false.
+ if (SIInstrInfo::isDS(MI) || SIInstrInfo::isEXP(MI) ||
+ SIInstrInfo::isVMEM(MI) || SIInstrInfo::isFLAT(MI) ||
+ SIInstrInfo::isVALU(MI))
+ return CacheEntry = false;
+ }
+
+ // Recursive call into parent blocks. Look into predecessors if there is no
+ // vgpr used in this block.
+ return CacheEntry = llvm::any_of(MBB.predecessors(),
+ [this](MachineBasicBlock *Parent) {
+ return isLastVGPRUseVMEMStore(*Parent);
+ });
+ }
+
+ bool runOnMachineBasicBlock(MachineBasicBlock &MBB) {
+
+ bool Changed = false;
+
+ for (auto &MI : MBB.terminators()) {
+ // Look for S_ENDPGM instructions
+ if (MI.getOpcode() == AMDGPU::S_ENDPGM ||
+ MI.getOpcode() == AMDGPU::S_ENDPGM_SAVED) {
+ // If the last instruction using a VGPR in the block is a VMEM store,
+ // release VGPRs. The VGPR release will be placed just before ending
+ // the program.
+ if (isLastVGPRUseVMEMStore(MBB)) {
+ BuildMI(MBB, MI, DebugLoc(), SII->get(AMDGPU::S_SENDMSG))
+ .addImm(AMDGPU::SendMsg::ID_DEALLOC_VGPRS_GFX11Plus);
+ Changed = true;
+ }
+ }
+ }
+
+ return Changed;
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override {
+ Function &F = MF.getFunction();
+ if (skipFunction(F) || !AMDGPU::isEntryFunctionCC(F.getCallingConv()))
+ return false;
+
+ // This pass only runs on GFX11+
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+ if (ST.getGeneration() < AMDGPUSubtarget::GFX11)
+ return false;
+
+ LLVM_DEBUG(dbgs() << "AMDGPUReleaseVGPRs running on " << MF.getName()
+ << "\n");
+
+ SII = ST.getInstrInfo();
+ TRI = ST.getRegisterInfo();
+
+ bool Changed = false;
+ for (auto &MBB : MF) {
+ Changed |= runOnMachineBasicBlock(MBB);
+ }
+
+ BlockVMEMStore.clear();
+
+ return Changed;
+ }
+};
+
+} // namespace
+
+char AMDGPUReleaseVGPRs::ID = 0;
+
+char &llvm::AMDGPUReleaseVGPRsID = AMDGPUReleaseVGPRs::ID;
+
+INITIALIZE_PASS(AMDGPUReleaseVGPRs, DEBUG_TYPE, "Release VGPRs", false, false)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUReplaceLDSUseWithPointer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUReplaceLDSUseWithPointer.cpp
index 2475b44b42a3..4d7a3f4028e8 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUReplaceLDSUseWithPointer.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUReplaceLDSUseWithPointer.cpp
@@ -83,7 +83,7 @@
#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "Utils/AMDGPUBaseInfo.h"
-#include "Utils/AMDGPULDSUtils.h"
+#include "Utils/AMDGPUMemoryUtils.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetOperations.h"
@@ -442,7 +442,7 @@ class CollectReachableCallees {
continue;
for (const auto &GI : *CGN) {
- auto *RCB = cast<CallBase>(GI.first.getValue());
+ auto *RCB = cast<CallBase>(*GI.first);
auto *RCGN = GI.second;
if (auto *DCallee = RCGN->getFunction()) {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp
index cb511e5e3483..f7f93c75c870 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp
@@ -27,7 +27,9 @@
#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "SIMachineFunctionInfo.h"
+#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/Analysis/CallGraph.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/GlobalAlias.h"
#include "llvm/IR/GlobalValue.h"
@@ -87,9 +89,7 @@ int32_t AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo::getTotalNumSGPRs(
int32_t AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo::getTotalNumVGPRs(
const GCNSubtarget &ST, int32_t ArgNumAGPR, int32_t ArgNumVGPR) const {
- if (ST.hasGFX90AInsts() && ArgNumAGPR)
- return alignTo(ArgNumVGPR, 4) + ArgNumAGPR;
- return std::max(ArgNumVGPR, ArgNumAGPR);
+ return AMDGPU::getTotalNumVGPRs(ST.hasGFX90AInsts(), ArgNumAGPR, ArgNumVGPR);
}
int32_t AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo::getTotalNumVGPRs(
@@ -97,28 +97,31 @@ int32_t AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo::getTotalNumVGPRs(
return getTotalNumVGPRs(ST, NumAGPR, NumVGPR);
}
-bool AMDGPUResourceUsageAnalysis::runOnSCC(CallGraphSCC &SCC) {
+bool AMDGPUResourceUsageAnalysis::runOnModule(Module &M) {
auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
if (!TPC)
return false;
+ MachineModuleInfo &MMI = getAnalysis<MachineModuleInfoWrapperPass>().getMMI();
const TargetMachine &TM = TPC->getTM<TargetMachine>();
bool HasIndirectCall = false;
- for (CallGraphNode *I : SCC) {
- Function *F = I->getFunction();
+ CallGraph CG = CallGraph(M);
+ auto End = po_end(&CG);
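+ // Walk the call graph in post order so that callees are visited before
+ // their callers and their resource usage is already known when each
+ // caller is analyzed.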
+
+ for (auto IT = po_begin(&CG); IT != End; ++IT) {
+ Function *F = IT->getFunction();
if (!F || F->isDeclaration())
continue;
- MachineModuleInfo &MMI =
- getAnalysis<MachineModuleInfoWrapperPass>().getMMI();
- MachineFunction &MF = MMI.getOrCreateMachineFunction(*F);
+ MachineFunction *MF = MMI.getMachineFunction(*F);
+ assert(MF && "function must have been generated already");
auto CI = CallGraphResourceInfo.insert(
- std::make_pair(&MF.getFunction(), SIFunctionResourceInfo()));
+ std::make_pair(F, SIFunctionResourceInfo()));
SIFunctionResourceInfo &Info = CI.first->second;
assert(CI.second && "should only be called once per function");
- Info = analyzeResourceUsage(MF, TM);
+ Info = analyzeResourceUsage(*MF, TM);
HasIndirectCall |= Info.HasIndirectCall;
}
@@ -246,6 +249,7 @@ AMDGPUResourceUsageAnalysis::analyzeResourceUsage(
case AMDGPU::SRC_PRIVATE_BASE:
case AMDGPU::SRC_PRIVATE_LIMIT:
case AMDGPU::SGPR_NULL:
+ case AMDGPU::SGPR_NULL64:
case AMDGPU::MODE:
continue;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.h b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.h
index b0a2d3bffc62..df0789e471c1 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.h
@@ -24,7 +24,7 @@ class GCNSubtarget;
class MachineFunction;
class TargetMachine;
-struct AMDGPUResourceUsageAnalysis : public CallGraphSCCPass {
+struct AMDGPUResourceUsageAnalysis : public ModulePass {
static char ID;
public:
@@ -50,15 +50,15 @@ public:
int32_t getTotalNumVGPRs(const GCNSubtarget &ST) const;
};
- AMDGPUResourceUsageAnalysis() : CallGraphSCCPass(ID) {}
+ AMDGPUResourceUsageAnalysis() : ModulePass(ID) {}
- bool runOnSCC(CallGraphSCC &SCC) override;
-
- bool doInitialization(CallGraph &CG) override {
+ bool doInitialization(Module &M) override {
CallGraphResourceInfo.clear();
- return CallGraphSCCPass::doInitialization(CG);
+ return ModulePass::doInitialization(M);
}
+ bool runOnModule(Module &M) override;
+
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<MachineModuleInfoWrapperPass>();
AU.setPreservesAll();
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURewriteOutArguments.cpp b/llvm/lib/Target/AMDGPU/AMDGPURewriteOutArguments.cpp
index 1c6c63dd5b25..4f8a61a77097 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURewriteOutArguments.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURewriteOutArguments.cpp
@@ -83,12 +83,8 @@ private:
const DataLayout *DL = nullptr;
MemoryDependenceResults *MDA = nullptr;
- bool checkArgumentUses(Value &Arg) const;
- bool isOutArgumentCandidate(Argument &Arg) const;
-
-#ifndef NDEBUG
- bool isVec3ToVec4Shuffle(Type *Ty0, Type* Ty1) const;
-#endif
+ Type *getStoredType(Value &Arg) const;
+ Type *getOutArgumentType(Argument &Arg) const;
public:
static char ID;
@@ -114,72 +110,61 @@ INITIALIZE_PASS_END(AMDGPURewriteOutArguments, DEBUG_TYPE,
char AMDGPURewriteOutArguments::ID = 0;
-bool AMDGPURewriteOutArguments::checkArgumentUses(Value &Arg) const {
+Type *AMDGPURewriteOutArguments::getStoredType(Value &Arg) const {
const int MaxUses = 10;
int UseCount = 0;
- for (Use &U : Arg.uses()) {
- StoreInst *SI = dyn_cast<StoreInst>(U.getUser());
- if (UseCount > MaxUses)
- return false;
-
- if (!SI) {
- auto *BCI = dyn_cast<BitCastInst>(U.getUser());
- if (!BCI || !BCI->hasOneUse())
- return false;
+ SmallVector<Use *> Worklist;
+ for (Use &U : Arg.uses())
+ Worklist.push_back(&U);
- // We don't handle multiple stores currently, so stores to aggregate
- // pointers aren't worth the trouble since they are canonically split up.
- Type *DestEltTy = BCI->getType()->getPointerElementType();
- if (DestEltTy->isAggregateType())
- return false;
+ Type *StoredType = nullptr;
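+ // Walk the uses of the argument, looking through bitcasts. Every store
+ // through the pointer must store one common type, which is returned.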
+ while (!Worklist.empty()) {
+ Use *U = Worklist.pop_back_val();
- // We could handle these if we had a convenient way to bitcast between
- // them.
- Type *SrcEltTy = Arg.getType()->getPointerElementType();
- if (SrcEltTy->isArrayTy())
- return false;
+ if (auto *BCI = dyn_cast<BitCastInst>(U->getUser())) {
+ for (Use &U : BCI->uses())
+ Worklist.push_back(&U);
+ continue;
+ }
- // Special case handle structs with single members. It is useful to handle
- // some casts between structs and non-structs, but we can't bitcast
- // directly between them. Blender uses some casts that look like
- // { <3 x float> }* to <4 x float>*
- if ((SrcEltTy->isStructTy() && (SrcEltTy->getStructNumElements() != 1)))
- return false;
+ if (auto *SI = dyn_cast<StoreInst>(U->getUser())) {
+ if (UseCount++ > MaxUses)
+ return nullptr;
- // Clang emits OpenCL 3-vector type accesses with a bitcast to the
- // equivalent 4-element vector and accesses that, and we're looking for
- // this pointer cast.
- if (DL->getTypeAllocSize(SrcEltTy) != DL->getTypeAllocSize(DestEltTy))
- return false;
+ if (!SI->isSimple() ||
+ U->getOperandNo() != StoreInst::getPointerOperandIndex())
+ return nullptr;
- return checkArgumentUses(*BCI);
+ if (StoredType && StoredType != SI->getValueOperand()->getType())
+ return nullptr; // More than one type.
+ StoredType = SI->getValueOperand()->getType();
+ continue;
}
- if (!SI->isSimple() ||
- U.getOperandNo() != StoreInst::getPointerOperandIndex())
- return false;
-
- ++UseCount;
+ // Unsupported user.
+ return nullptr;
}
- // Skip unused arguments.
- return UseCount > 0;
+ return StoredType;
}
-bool AMDGPURewriteOutArguments::isOutArgumentCandidate(Argument &Arg) const {
+Type *AMDGPURewriteOutArguments::getOutArgumentType(Argument &Arg) const {
const unsigned MaxOutArgSizeBytes = 4 * MaxNumRetRegs;
PointerType *ArgTy = dyn_cast<PointerType>(Arg.getType());
// TODO: It might be useful for any out arguments, not just privates.
if (!ArgTy || (ArgTy->getAddressSpace() != DL->getAllocaAddrSpace() &&
!AnyAddressSpace) ||
- Arg.hasByValAttr() || Arg.hasStructRetAttr() ||
- DL->getTypeStoreSize(ArgTy->getPointerElementType()) > MaxOutArgSizeBytes) {
- return false;
+ Arg.hasByValAttr() || Arg.hasStructRetAttr()) {
+ return nullptr;
}
- return checkArgumentUses(Arg);
+ Type *StoredType = getStoredType(Arg);
+ if (!StoredType || DL->getTypeStoreSize(StoredType) > MaxOutArgSizeBytes)
+ return nullptr;
+
+ return StoredType;
}
bool AMDGPURewriteOutArguments::doInitialization(Module &M) {
@@ -187,22 +172,6 @@ bool AMDGPURewriteOutArguments::doInitialization(Module &M) {
return false;
}
-#ifndef NDEBUG
-bool AMDGPURewriteOutArguments::isVec3ToVec4Shuffle(Type *Ty0, Type* Ty1) const {
- auto *VT0 = dyn_cast<FixedVectorType>(Ty0);
- auto *VT1 = dyn_cast<FixedVectorType>(Ty1);
- if (!VT0 || !VT1)
- return false;
-
- if (VT0->getNumElements() != 3 ||
- VT1->getNumElements() != 4)
- return false;
-
- return DL->getTypeSizeInBits(VT0->getElementType()) ==
- DL->getTypeSizeInBits(VT1->getElementType());
-}
-#endif
-
bool AMDGPURewriteOutArguments::runOnFunction(Function &F) {
if (skipFunction(F))
return false;
@@ -215,7 +184,7 @@ bool AMDGPURewriteOutArguments::runOnFunction(Function &F) {
MDA = &getAnalysis<MemoryDependenceWrapperPass>().getMemDep();
unsigned ReturnNumRegs = 0;
- SmallSet<int, 4> OutArgIndexes;
+ SmallDenseMap<int, Type *, 4> OutArgIndexes;
SmallVector<Type *, 4> ReturnTypes;
Type *RetTy = F.getReturnType();
if (!RetTy->isVoidTy()) {
@@ -227,12 +196,12 @@ bool AMDGPURewriteOutArguments::runOnFunction(Function &F) {
ReturnTypes.push_back(RetTy);
}
- SmallVector<Argument *, 4> OutArgs;
+ SmallVector<std::pair<Argument *, Type *>, 4> OutArgs;
for (Argument &Arg : F.args()) {
- if (isOutArgumentCandidate(Arg)) {
+ if (Type *Ty = getOutArgumentType(Arg)) {
LLVM_DEBUG(dbgs() << "Found possible out argument " << Arg
<< " in function " << F.getName() << '\n');
- OutArgs.push_back(&Arg);
+ OutArgs.push_back({&Arg, Ty});
}
}
@@ -264,11 +233,12 @@ bool AMDGPURewriteOutArguments::runOnFunction(Function &F) {
// first. On the second iteration we've removed that out clobbering argument
// (by effectively moving it into another function) and will find the second
// argument is OK to move.
- for (Argument *OutArg : OutArgs) {
+ for (const auto &Pair : OutArgs) {
bool ThisReplaceable = true;
SmallVector<std::pair<ReturnInst *, StoreInst *>, 4> ReplaceableStores;
- Type *ArgTy = OutArg->getType()->getPointerElementType();
+ Argument *OutArg = Pair.first;
+ Type *ArgTy = Pair.second;
// Skip this argument if converting it will push us over the register
// count to return limit.
@@ -324,7 +294,7 @@ bool AMDGPURewriteOutArguments::runOnFunction(Function &F) {
if (ThisReplaceable) {
ReturnTypes.push_back(ArgTy);
- OutArgIndexes.insert(OutArg->getArgNo());
+ OutArgIndexes.insert({OutArg->getArgNo(), ArgTy});
++NumOutArgumentsReplaced;
Changing = true;
}
@@ -376,32 +346,8 @@ bool AMDGPURewriteOutArguments::runOnFunction(Function &F) {
if (RetVal)
NewRetVal = B.CreateInsertValue(NewRetVal, RetVal, RetIdx++);
- for (std::pair<Argument *, Value *> ReturnPoint : Replacement.second) {
- Argument *Arg = ReturnPoint.first;
- Value *Val = ReturnPoint.second;
- Type *EltTy = Arg->getType()->getPointerElementType();
- if (Val->getType() != EltTy) {
- Type *EffectiveEltTy = EltTy;
- if (StructType *CT = dyn_cast<StructType>(EltTy)) {
- assert(CT->getNumElements() == 1);
- EffectiveEltTy = CT->getElementType(0);
- }
-
- if (DL->getTypeSizeInBits(EffectiveEltTy) !=
- DL->getTypeSizeInBits(Val->getType())) {
- assert(isVec3ToVec4Shuffle(EffectiveEltTy, Val->getType()));
- Val = B.CreateShuffleVector(Val, ArrayRef<int>{0, 1, 2});
- }
-
- Val = B.CreateBitCast(Val, EffectiveEltTy);
-
- // Re-create single element composite.
- if (EltTy != EffectiveEltTy)
- Val = B.CreateInsertValue(UndefValue::get(EltTy), Val, 0);
- }
-
- NewRetVal = B.CreateInsertValue(NewRetVal, Val, RetIdx++);
- }
+ for (std::pair<Argument *, Value *> ReturnPoint : Replacement.second)
+ NewRetVal = B.CreateInsertValue(NewRetVal, ReturnPoint.second, RetIdx++);
if (RetVal)
RI->setOperand(0, NewRetVal);
@@ -433,7 +379,7 @@ bool AMDGPURewriteOutArguments::runOnFunction(Function &F) {
PointerType *ArgType = cast<PointerType>(Arg.getType());
- auto *EltTy = ArgType->getPointerElementType();
+ Type *EltTy = OutArgIndexes[Arg.getArgNo()];
const auto Align =
DL->getValueOrABITypeAlignment(Arg.getParamAlign(), EltTy);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
index afe016731395..8297635d7bb2 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
@@ -39,7 +39,8 @@ class GcnBufferFormatBase<bits<8> f, bits<8> bpc, bits<8> numc, bits<8> nfmt, bi
}
class Gfx9BufferFormat<bits<8> f, bits<8> bpc, bits<8> numc, bits<8> nfmt, bits<8> dfmt> : GcnBufferFormatBase<f, bpc, numc, nfmt, dfmt>;
-class Gfx10PlusBufferFormat<bits<8> f, bits<8> bpc, bits<8> numc, bits<8> nfmt, bits<8> dfmt> : GcnBufferFormatBase<f, bpc, numc, nfmt, dfmt>;
+class Gfx10BufferFormat<bits<8> f, bits<8> bpc, bits<8> numc, bits<8> nfmt, bits<8> dfmt> : GcnBufferFormatBase<f, bpc, numc, nfmt, dfmt>;
+class Gfx11PlusBufferFormat<bits<8> f, bits<8> bpc, bits<8> numc, bits<8> nfmt, bits<8> dfmt> : GcnBufferFormatBase<f, bpc, numc, nfmt, dfmt>;
class GcnBufferFormatTable : GenericTable {
let CppTypeName = "GcnBufferFormatInfo";
@@ -51,17 +52,25 @@ def Gfx9BufferFormat : GcnBufferFormatTable {
let FilterClass = "Gfx9BufferFormat";
let PrimaryKeyName = "getGfx9BufferFormatInfo";
}
-def Gfx10PlusBufferFormat : GcnBufferFormatTable {
- let FilterClass = "Gfx10PlusBufferFormat";
- let PrimaryKeyName = "getGfx10PlusBufferFormatInfo";
+def Gfx10BufferFormat : GcnBufferFormatTable {
+ let FilterClass = "Gfx10BufferFormat";
+ let PrimaryKeyName = "getGfx10BufferFormatInfo";
+}
+def Gfx11PlusBufferFormat : GcnBufferFormatTable {
+ let FilterClass = "Gfx11PlusBufferFormat";
+ let PrimaryKeyName = "getGfx11PlusBufferFormatInfo";
}
def getGfx9BufferFormatInfo : SearchIndex {
let Table = Gfx9BufferFormat;
let Key = ["Format"];
}
-def getGfx10PlusBufferFormatInfo : SearchIndex {
- let Table = Gfx10PlusBufferFormat;
+def getGfx10BufferFormatInfo : SearchIndex {
+ let Table = Gfx10BufferFormat;
+ let Key = ["Format"];
+}
+def getGfx11PlusBufferFormatInfo : SearchIndex {
+ let Table = Gfx11PlusBufferFormat;
let Key = ["Format"];
}
@@ -119,57 +128,87 @@ def : Gfx9BufferFormat< /*FORMAT_32_32_32_32_SINT*/ 0x5E, 32, 4, /*NUM_FORMA
def : Gfx9BufferFormat< /*FORMAT_32_32_32_32_FLOAT*/ 0x7E, 32, 4, /*NUM_FORMAT_FLOAT*/ 7, /*DATA_FORMAT_32_32_32_32*/ 14>;
// Buffer formats with equal component sizes (GFX10 and later)
-def : Gfx10PlusBufferFormat< /*FORMAT_8_UNORM*/ 0x01, 8, 1, /*NUM_FORMAT_UNORM*/ 0, /*DATA_FORMAT_8*/ 1>;
-def : Gfx10PlusBufferFormat< /*FORMAT_8_SNORM*/ 0x02, 8, 1, /*NUM_FORMAT_SNORM*/ 1, /*DATA_FORMAT_8*/ 1>;
-def : Gfx10PlusBufferFormat< /*FORMAT_8_USCALED*/ 0x03, 8, 1, /*NUM_FORMAT_USCALED*/ 2, /*DATA_FORMAT_8*/ 1>;
-def : Gfx10PlusBufferFormat< /*FORMAT_8_SSCALED*/ 0x04, 8, 1, /*NUM_FORMAT_SSCALED*/ 3, /*DATA_FORMAT_8*/ 1>;
-def : Gfx10PlusBufferFormat< /*FORMAT_8_UINT*/ 0x05, 8, 1, /*NUM_FORMAT_UINT*/ 4, /*DATA_FORMAT_8*/ 1>;
-def : Gfx10PlusBufferFormat< /*FORMAT_8_SINT*/ 0x06, 8, 1, /*NUM_FORMAT_SINT*/ 5, /*DATA_FORMAT_8*/ 1>;
-def : Gfx10PlusBufferFormat< /*FORMAT_16_UNORM*/ 0x07, 16, 1, /*NUM_FORMAT_UNORM*/ 0, /*DATA_FORMAT_16*/ 2>;
-def : Gfx10PlusBufferFormat< /*FORMAT_16_SNORM*/ 0x08, 16, 1, /*NUM_FORMAT_SNORM*/ 1, /*DATA_FORMAT_16*/ 2>;
-def : Gfx10PlusBufferFormat< /*FORMAT_16_USCALED*/ 0x09, 16, 1, /*NUM_FORMAT_USCALED*/ 2, /*DATA_FORMAT_16*/ 2>;
-def : Gfx10PlusBufferFormat< /*FORMAT_16_SSCALED*/ 0x0A, 16, 1, /*NUM_FORMAT_SSCALED*/ 3, /*DATA_FORMAT_16*/ 2>;
-def : Gfx10PlusBufferFormat< /*FORMAT_16_UINT*/ 0x0B, 16, 1, /*NUM_FORMAT_UINT*/ 4, /*DATA_FORMAT_16*/ 2>;
-def : Gfx10PlusBufferFormat< /*FORMAT_16_SINT*/ 0x0C, 16, 1, /*NUM_FORMAT_SINT*/ 5, /*DATA_FORMAT_16*/ 2>;
-def : Gfx10PlusBufferFormat< /*FORMAT_16_FLOAT*/ 0x0D, 16, 1, /*NUM_FORMAT_FLOAT*/ 7, /*DATA_FORMAT_16*/ 2>;
-def : Gfx10PlusBufferFormat< /*FORMAT_8_8_UNORM*/ 0x0E, 8, 2, /*NUM_FORMAT_UNORM*/ 0, /*DATA_FORMAT_8_8*/ 3>;
-def : Gfx10PlusBufferFormat< /*FORMAT_8_8_SNORM*/ 0x0F, 8, 2, /*NUM_FORMAT_SNORM*/ 1, /*DATA_FORMAT_8_8*/ 3>;
-def : Gfx10PlusBufferFormat< /*FORMAT_8_8_USCALED*/ 0x10, 8, 2, /*NUM_FORMAT_USCALED*/ 2, /*DATA_FORMAT_8_8*/ 3>;
-def : Gfx10PlusBufferFormat< /*FORMAT_8_8_SSCALED*/ 0x11, 8, 2, /*NUM_FORMAT_SSCALED*/ 3, /*DATA_FORMAT_8_8*/ 3>;
-def : Gfx10PlusBufferFormat< /*FORMAT_8_8_UINT*/ 0x12, 8, 2, /*NUM_FORMAT_UINT*/ 4, /*DATA_FORMAT_8_8*/ 3>;
-def : Gfx10PlusBufferFormat< /*FORMAT_8_8_SINT*/ 0x13, 8, 2, /*NUM_FORMAT_SINT*/ 5, /*DATA_FORMAT_8_8*/ 3>;
-def : Gfx10PlusBufferFormat< /*FORMAT_32_UINT*/ 0x14, 32, 1, /*NUM_FORMAT_UINT*/ 4, /*DATA_FORMAT_32*/ 4>;
-def : Gfx10PlusBufferFormat< /*FORMAT_32_SINT*/ 0x15, 32, 1, /*NUM_FORMAT_SINT*/ 5, /*DATA_FORMAT_32*/ 4>;
-def : Gfx10PlusBufferFormat< /*FORMAT_32_FLOAT*/ 0x16, 32, 1, /*NUM_FORMAT_FLOAT*/ 7, /*DATA_FORMAT_32*/ 4>;
-def : Gfx10PlusBufferFormat< /*FORMAT_16_16_UNORM*/ 0x17, 16, 2, /*NUM_FORMAT_UNORM*/ 0, /*DATA_FORMAT_16_16*/ 5>;
-def : Gfx10PlusBufferFormat< /*FORMAT_16_16_SNORM*/ 0x18, 16, 2, /*NUM_FORMAT_SNORM*/ 1, /*DATA_FORMAT_16_16*/ 5>;
-def : Gfx10PlusBufferFormat< /*FORMAT_16_16_USCALED*/ 0x19, 16, 2, /*NUM_FORMAT_USCALED*/ 2, /*DATA_FORMAT_16_16*/ 5>;
-def : Gfx10PlusBufferFormat< /*FORMAT_16_16_SSCALED*/ 0x1A, 16, 2, /*NUM_FORMAT_SSCALED*/ 3, /*DATA_FORMAT_16_16*/ 5>;
-def : Gfx10PlusBufferFormat< /*FORMAT_16_16_UINT*/ 0x1B, 16, 2, /*NUM_FORMAT_UINT*/ 4, /*DATA_FORMAT_16_16*/ 5>;
-def : Gfx10PlusBufferFormat< /*FORMAT_16_16_SINT*/ 0x1C, 16, 2, /*NUM_FORMAT_SINT*/ 5, /*DATA_FORMAT_16_16*/ 5>;
-def : Gfx10PlusBufferFormat< /*FORMAT_16_16_FLOAT*/ 0x1D, 16, 2, /*NUM_FORMAT_FLOAT*/ 7, /*DATA_FORMAT_16_16*/ 5>;
-def : Gfx10PlusBufferFormat< /*FORMAT_8_8_8_8_UNORM*/ 0x38, 8, 4, /*NUM_FORMAT_UNORM*/ 0, /*DATA_FORMAT_8_8_8_8*/ 10>;
-def : Gfx10PlusBufferFormat< /*FORMAT_8_8_8_8_SNORM*/ 0x39, 8, 4, /*NUM_FORMAT_SNORM*/ 1, /*DATA_FORMAT_8_8_8_8*/ 10>;
-def : Gfx10PlusBufferFormat< /*FORMAT_8_8_8_8_USCALED*/ 0x3A, 8, 4, /*NUM_FORMAT_USCALED*/ 2, /*DATA_FORMAT_8_8_8_8*/ 10>;
-def : Gfx10PlusBufferFormat< /*FORMAT_8_8_8_8_SSCALED*/ 0x3B, 8, 4, /*NUM_FORMAT_SSCALED*/ 3, /*DATA_FORMAT_8_8_8_8*/ 10>;
-def : Gfx10PlusBufferFormat< /*FORMAT_8_8_8_8_UINT*/ 0x3C, 8, 4, /*NUM_FORMAT_UINT*/ 4, /*DATA_FORMAT_8_8_8_8*/ 10>;
-def : Gfx10PlusBufferFormat< /*FORMAT_8_8_8_8_SINT*/ 0x3D, 8, 4, /*NUM_FORMAT_SINT*/ 5, /*DATA_FORMAT_8_8_8_8*/ 10>;
-def : Gfx10PlusBufferFormat< /*FORMAT_32_32_UINT*/ 0x3E, 32, 2, /*NUM_FORMAT_UINT*/ 4, /*DATA_FORMAT_32_32*/ 11>;
-def : Gfx10PlusBufferFormat< /*FORMAT_32_32_SINT*/ 0x3F, 32, 2, /*NUM_FORMAT_SINT*/ 5, /*DATA_FORMAT_32_32*/ 11>;
-def : Gfx10PlusBufferFormat< /*FORMAT_32_32_FLOAT*/ 0x40, 32, 2, /*NUM_FORMAT_FLOAT*/ 7, /*DATA_FORMAT_32_32*/ 11>;
-def : Gfx10PlusBufferFormat< /*FORMAT_16_16_16_16_UNORM*/ 0x41, 16, 4, /*NUM_FORMAT_UNORM*/ 0, /*DATA_FORMAT_16_16_16_16*/ 12>;
-def : Gfx10PlusBufferFormat< /*FORMAT_16_16_16_16_SNORM*/ 0x42, 16, 4, /*NUM_FORMAT_SNORM*/ 1, /*DATA_FORMAT_16_16_16_16*/ 12>;
-def : Gfx10PlusBufferFormat< /*FORMAT_16_16_16_16_USCALED*/ 0x43, 16, 4, /*NUM_FORMAT_USCALED*/ 2, /*DATA_FORMAT_16_16_16_16*/ 12>;
-def : Gfx10PlusBufferFormat< /*FORMAT_16_16_16_16_SSCALED*/ 0x44, 16, 4, /*NUM_FORMAT_SSCALED*/ 3, /*DATA_FORMAT_16_16_16_16*/ 12>;
-def : Gfx10PlusBufferFormat< /*FORMAT_16_16_16_16_UINT*/ 0x45, 16, 4, /*NUM_FORMAT_UINT*/ 4, /*DATA_FORMAT_16_16_16_16*/ 12>;
-def : Gfx10PlusBufferFormat< /*FORMAT_16_16_16_16_SINT*/ 0x46, 16, 4, /*NUM_FORMAT_SINT*/ 5, /*DATA_FORMAT_16_16_16_16*/ 12>;
-def : Gfx10PlusBufferFormat< /*FORMAT_16_16_16_16_FLOAT*/ 0x47, 16, 4, /*NUM_FORMAT_FLOAT*/ 7, /*DATA_FORMAT_16_16_16_16*/ 12>;
-def : Gfx10PlusBufferFormat< /*FORMAT_32_32_32_UINT*/ 0x48, 32, 3, /*NUM_FORMAT_UINT*/ 4, /*DATA_FORMAT_32_32_32*/ 13>;
-def : Gfx10PlusBufferFormat< /*FORMAT_32_32_32_SINT*/ 0x49, 32, 3, /*NUM_FORMAT_SINT*/ 5, /*DATA_FORMAT_32_32_32*/ 13>;
-def : Gfx10PlusBufferFormat< /*FORMAT_32_32_32_FLOAT*/ 0x4A, 32, 3, /*NUM_FORMAT_FLOAT*/ 7, /*DATA_FORMAT_32_32_32*/ 13>;
-def : Gfx10PlusBufferFormat< /*FORMAT_32_32_32_32_UINT*/ 0x4B, 32, 4, /*NUM_FORMAT_UINT*/ 4, /*DATA_FORMAT_32_32_32_32*/ 14>;
-def : Gfx10PlusBufferFormat< /*FORMAT_32_32_32_32_SINT*/ 0x4C, 32, 4, /*NUM_FORMAT_SINT*/ 5, /*DATA_FORMAT_32_32_32_32*/ 14>;
-def : Gfx10PlusBufferFormat< /*FORMAT_32_32_32_32_FLOAT*/ 0x4D, 32, 4, /*NUM_FORMAT_FLOAT*/ 7, /*DATA_FORMAT_32_32_32_32*/ 14>;
+multiclass Gfx10PlusBufferFormat<bits<8> f, bits<8> bpc, bits<8> numc, bits<8> nfmt, bits<8> dfmt> {
+ def : Gfx10BufferFormat<f, bpc, numc, nfmt, dfmt>;
+ def : Gfx11PlusBufferFormat<f, bpc, numc, nfmt, dfmt>;
+}
+defm : Gfx10PlusBufferFormat< /*FORMAT_8_UNORM*/ 0x01, 8, 1, /*NUM_FORMAT_UNORM*/ 0, /*DATA_FORMAT_8*/ 1>;
+defm : Gfx10PlusBufferFormat< /*FORMAT_8_SNORM*/ 0x02, 8, 1, /*NUM_FORMAT_SNORM*/ 1, /*DATA_FORMAT_8*/ 1>;
+defm : Gfx10PlusBufferFormat< /*FORMAT_8_USCALED*/ 0x03, 8, 1, /*NUM_FORMAT_USCALED*/ 2, /*DATA_FORMAT_8*/ 1>;
+defm : Gfx10PlusBufferFormat< /*FORMAT_8_SSCALED*/ 0x04, 8, 1, /*NUM_FORMAT_SSCALED*/ 3, /*DATA_FORMAT_8*/ 1>;
+defm : Gfx10PlusBufferFormat< /*FORMAT_8_UINT*/ 0x05, 8, 1, /*NUM_FORMAT_UINT*/ 4, /*DATA_FORMAT_8*/ 1>;
+defm : Gfx10PlusBufferFormat< /*FORMAT_8_SINT*/ 0x06, 8, 1, /*NUM_FORMAT_SINT*/ 5, /*DATA_FORMAT_8*/ 1>;
+defm : Gfx10PlusBufferFormat< /*FORMAT_16_UNORM*/ 0x07, 16, 1, /*NUM_FORMAT_UNORM*/ 0, /*DATA_FORMAT_16*/ 2>;
+defm : Gfx10PlusBufferFormat< /*FORMAT_16_SNORM*/ 0x08, 16, 1, /*NUM_FORMAT_SNORM*/ 1, /*DATA_FORMAT_16*/ 2>;
+defm : Gfx10PlusBufferFormat< /*FORMAT_16_USCALED*/ 0x09, 16, 1, /*NUM_FORMAT_USCALED*/ 2, /*DATA_FORMAT_16*/ 2>;
+defm : Gfx10PlusBufferFormat< /*FORMAT_16_SSCALED*/ 0x0A, 16, 1, /*NUM_FORMAT_SSCALED*/ 3, /*DATA_FORMAT_16*/ 2>;
+defm : Gfx10PlusBufferFormat< /*FORMAT_16_UINT*/ 0x0B, 16, 1, /*NUM_FORMAT_UINT*/ 4, /*DATA_FORMAT_16*/ 2>;
+defm : Gfx10PlusBufferFormat< /*FORMAT_16_SINT*/ 0x0C, 16, 1, /*NUM_FORMAT_SINT*/ 5, /*DATA_FORMAT_16*/ 2>;
+defm : Gfx10PlusBufferFormat< /*FORMAT_16_FLOAT*/ 0x0D, 16, 1, /*NUM_FORMAT_FLOAT*/ 7, /*DATA_FORMAT_16*/ 2>;
+defm : Gfx10PlusBufferFormat< /*FORMAT_8_8_UNORM*/ 0x0E, 8, 2, /*NUM_FORMAT_UNORM*/ 0, /*DATA_FORMAT_8_8*/ 3>;
+defm : Gfx10PlusBufferFormat< /*FORMAT_8_8_SNORM*/ 0x0F, 8, 2, /*NUM_FORMAT_SNORM*/ 1, /*DATA_FORMAT_8_8*/ 3>;
+defm : Gfx10PlusBufferFormat< /*FORMAT_8_8_USCALED*/ 0x10, 8, 2, /*NUM_FORMAT_USCALED*/ 2, /*DATA_FORMAT_8_8*/ 3>;
+defm : Gfx10PlusBufferFormat< /*FORMAT_8_8_SSCALED*/ 0x11, 8, 2, /*NUM_FORMAT_SSCALED*/ 3, /*DATA_FORMAT_8_8*/ 3>;
+defm : Gfx10PlusBufferFormat< /*FORMAT_8_8_UINT*/ 0x12, 8, 2, /*NUM_FORMAT_UINT*/ 4, /*DATA_FORMAT_8_8*/ 3>;
+defm : Gfx10PlusBufferFormat< /*FORMAT_8_8_SINT*/ 0x13, 8, 2, /*NUM_FORMAT_SINT*/ 5, /*DATA_FORMAT_8_8*/ 3>;
+defm : Gfx10PlusBufferFormat< /*FORMAT_32_UINT*/ 0x14, 32, 1, /*NUM_FORMAT_UINT*/ 4, /*DATA_FORMAT_32*/ 4>;
+defm : Gfx10PlusBufferFormat< /*FORMAT_32_SINT*/ 0x15, 32, 1, /*NUM_FORMAT_SINT*/ 5, /*DATA_FORMAT_32*/ 4>;
+defm : Gfx10PlusBufferFormat< /*FORMAT_32_FLOAT*/ 0x16, 32, 1, /*NUM_FORMAT_FLOAT*/ 7, /*DATA_FORMAT_32*/ 4>;
+defm : Gfx10PlusBufferFormat< /*FORMAT_16_16_UNORM*/ 0x17, 16, 2, /*NUM_FORMAT_UNORM*/ 0, /*DATA_FORMAT_16_16*/ 5>;
+defm : Gfx10PlusBufferFormat< /*FORMAT_16_16_SNORM*/ 0x18, 16, 2, /*NUM_FORMAT_SNORM*/ 1, /*DATA_FORMAT_16_16*/ 5>;
+defm : Gfx10PlusBufferFormat< /*FORMAT_16_16_USCALED*/ 0x19, 16, 2, /*NUM_FORMAT_USCALED*/ 2, /*DATA_FORMAT_16_16*/ 5>;
+defm : Gfx10PlusBufferFormat< /*FORMAT_16_16_SSCALED*/ 0x1A, 16, 2, /*NUM_FORMAT_SSCALED*/ 3, /*DATA_FORMAT_16_16*/ 5>;
+defm : Gfx10PlusBufferFormat< /*FORMAT_16_16_UINT*/ 0x1B, 16, 2, /*NUM_FORMAT_UINT*/ 4, /*DATA_FORMAT_16_16*/ 5>;
+defm : Gfx10PlusBufferFormat< /*FORMAT_16_16_SINT*/ 0x1C, 16, 2, /*NUM_FORMAT_SINT*/ 5, /*DATA_FORMAT_16_16*/ 5>;
+defm : Gfx10PlusBufferFormat< /*FORMAT_16_16_FLOAT*/ 0x1D, 16, 2, /*NUM_FORMAT_FLOAT*/ 7, /*DATA_FORMAT_16_16*/ 5>;
+
+// Buffer formats with equal component sizes (GFX10 only)
+def : Gfx10BufferFormat< /*FORMAT_8_8_8_8_UNORM*/ 0x38, 8, 4, /*NUM_FORMAT_UNORM*/ 0, /*DATA_FORMAT_8_8_8_8*/ 10>;
+def : Gfx10BufferFormat< /*FORMAT_8_8_8_8_SNORM*/ 0x39, 8, 4, /*NUM_FORMAT_SNORM*/ 1, /*DATA_FORMAT_8_8_8_8*/ 10>;
+def : Gfx10BufferFormat< /*FORMAT_8_8_8_8_USCALED*/ 0x3A, 8, 4, /*NUM_FORMAT_USCALED*/ 2, /*DATA_FORMAT_8_8_8_8*/ 10>;
+def : Gfx10BufferFormat< /*FORMAT_8_8_8_8_SSCALED*/ 0x3B, 8, 4, /*NUM_FORMAT_SSCALED*/ 3, /*DATA_FORMAT_8_8_8_8*/ 10>;
+def : Gfx10BufferFormat< /*FORMAT_8_8_8_8_UINT*/ 0x3C, 8, 4, /*NUM_FORMAT_UINT*/ 4, /*DATA_FORMAT_8_8_8_8*/ 10>;
+def : Gfx10BufferFormat< /*FORMAT_8_8_8_8_SINT*/ 0x3D, 8, 4, /*NUM_FORMAT_SINT*/ 5, /*DATA_FORMAT_8_8_8_8*/ 10>;
+def : Gfx10BufferFormat< /*FORMAT_32_32_UINT*/ 0x3E, 32, 2, /*NUM_FORMAT_UINT*/ 4, /*DATA_FORMAT_32_32*/ 11>;
+def : Gfx10BufferFormat< /*FORMAT_32_32_SINT*/ 0x3F, 32, 2, /*NUM_FORMAT_SINT*/ 5, /*DATA_FORMAT_32_32*/ 11>;
+def : Gfx10BufferFormat< /*FORMAT_32_32_FLOAT*/ 0x40, 32, 2, /*NUM_FORMAT_FLOAT*/ 7, /*DATA_FORMAT_32_32*/ 11>;
+def : Gfx10BufferFormat< /*FORMAT_16_16_16_16_UNORM*/ 0x41, 16, 4, /*NUM_FORMAT_UNORM*/ 0, /*DATA_FORMAT_16_16_16_16*/ 12>;
+def : Gfx10BufferFormat< /*FORMAT_16_16_16_16_SNORM*/ 0x42, 16, 4, /*NUM_FORMAT_SNORM*/ 1, /*DATA_FORMAT_16_16_16_16*/ 12>;
+def : Gfx10BufferFormat< /*FORMAT_16_16_16_16_USCALED*/ 0x43, 16, 4, /*NUM_FORMAT_USCALED*/ 2, /*DATA_FORMAT_16_16_16_16*/ 12>;
+def : Gfx10BufferFormat< /*FORMAT_16_16_16_16_SSCALED*/ 0x44, 16, 4, /*NUM_FORMAT_SSCALED*/ 3, /*DATA_FORMAT_16_16_16_16*/ 12>;
+def : Gfx10BufferFormat< /*FORMAT_16_16_16_16_UINT*/ 0x45, 16, 4, /*NUM_FORMAT_UINT*/ 4, /*DATA_FORMAT_16_16_16_16*/ 12>;
+def : Gfx10BufferFormat< /*FORMAT_16_16_16_16_SINT*/ 0x46, 16, 4, /*NUM_FORMAT_SINT*/ 5, /*DATA_FORMAT_16_16_16_16*/ 12>;
+def : Gfx10BufferFormat< /*FORMAT_16_16_16_16_FLOAT*/ 0x47, 16, 4, /*NUM_FORMAT_FLOAT*/ 7, /*DATA_FORMAT_16_16_16_16*/ 12>;
+def : Gfx10BufferFormat< /*FORMAT_32_32_32_UINT*/ 0x48, 32, 3, /*NUM_FORMAT_UINT*/ 4, /*DATA_FORMAT_32_32_32*/ 13>;
+def : Gfx10BufferFormat< /*FORMAT_32_32_32_SINT*/ 0x49, 32, 3, /*NUM_FORMAT_SINT*/ 5, /*DATA_FORMAT_32_32_32*/ 13>;
+def : Gfx10BufferFormat< /*FORMAT_32_32_32_FLOAT*/ 0x4A, 32, 3, /*NUM_FORMAT_FLOAT*/ 7, /*DATA_FORMAT_32_32_32*/ 13>;
+def : Gfx10BufferFormat< /*FORMAT_32_32_32_32_UINT*/ 0x4B, 32, 4, /*NUM_FORMAT_UINT*/ 4, /*DATA_FORMAT_32_32_32_32*/ 14>;
+def : Gfx10BufferFormat< /*FORMAT_32_32_32_32_SINT*/ 0x4C, 32, 4, /*NUM_FORMAT_SINT*/ 5, /*DATA_FORMAT_32_32_32_32*/ 14>;
+def : Gfx10BufferFormat< /*FORMAT_32_32_32_32_FLOAT*/ 0x4D, 32, 4, /*NUM_FORMAT_FLOAT*/ 7, /*DATA_FORMAT_32_32_32_32*/ 14>;
+
+// Buffer formats with equal component sizes (GFX11 and later)
+def : Gfx11PlusBufferFormat< /*FORMAT_8_8_8_8_UNORM*/ 0x2A, 8, 4, /*NUM_FORMAT_UNORM*/ 0, /*DATA_FORMAT_8_8_8_8*/ 10>;
+def : Gfx11PlusBufferFormat< /*FORMAT_8_8_8_8_SNORM*/ 0x2B, 8, 4, /*NUM_FORMAT_SNORM*/ 1, /*DATA_FORMAT_8_8_8_8*/ 10>;
+def : Gfx11PlusBufferFormat< /*FORMAT_8_8_8_8_USCALED*/ 0x2C, 8, 4, /*NUM_FORMAT_USCALED*/ 2, /*DATA_FORMAT_8_8_8_8*/ 10>;
+def : Gfx11PlusBufferFormat< /*FORMAT_8_8_8_8_SSCALED*/ 0x2D, 8, 4, /*NUM_FORMAT_SSCALED*/ 3, /*DATA_FORMAT_8_8_8_8*/ 10>;
+def : Gfx11PlusBufferFormat< /*FORMAT_8_8_8_8_UINT*/ 0x2E, 8, 4, /*NUM_FORMAT_UINT*/ 4, /*DATA_FORMAT_8_8_8_8*/ 10>;
+def : Gfx11PlusBufferFormat< /*FORMAT_8_8_8_8_SINT*/ 0x2F, 8, 4, /*NUM_FORMAT_SINT*/ 5, /*DATA_FORMAT_8_8_8_8*/ 10>;
+def : Gfx11PlusBufferFormat< /*FORMAT_32_32_UINT*/ 0x30, 32, 2, /*NUM_FORMAT_UINT*/ 4, /*DATA_FORMAT_32_32*/ 11>;
+def : Gfx11PlusBufferFormat< /*FORMAT_32_32_SINT*/ 0x31, 32, 2, /*NUM_FORMAT_SINT*/ 5, /*DATA_FORMAT_32_32*/ 11>;
+def : Gfx11PlusBufferFormat< /*FORMAT_32_32_FLOAT*/ 0x32, 32, 2, /*NUM_FORMAT_FLOAT*/ 7, /*DATA_FORMAT_32_32*/ 11>;
+def : Gfx11PlusBufferFormat< /*FORMAT_16_16_16_16_UNORM*/ 0x33, 16, 4, /*NUM_FORMAT_UNORM*/ 0, /*DATA_FORMAT_16_16_16_16*/ 12>;
+def : Gfx11PlusBufferFormat< /*FORMAT_16_16_16_16_SNORM*/ 0x34, 16, 4, /*NUM_FORMAT_SNORM*/ 1, /*DATA_FORMAT_16_16_16_16*/ 12>;
+def : Gfx11PlusBufferFormat< /*FORMAT_16_16_16_16_USCALED*/ 0x35, 16, 4, /*NUM_FORMAT_USCALED*/ 2, /*DATA_FORMAT_16_16_16_16*/ 12>;
+def : Gfx11PlusBufferFormat< /*FORMAT_16_16_16_16_SSCALED*/ 0x36, 16, 4, /*NUM_FORMAT_SSCALED*/ 3, /*DATA_FORMAT_16_16_16_16*/ 12>;
+def : Gfx11PlusBufferFormat< /*FORMAT_16_16_16_16_UINT*/ 0x37, 16, 4, /*NUM_FORMAT_UINT*/ 4, /*DATA_FORMAT_16_16_16_16*/ 12>;
+def : Gfx11PlusBufferFormat< /*FORMAT_16_16_16_16_SINT*/ 0x38, 16, 4, /*NUM_FORMAT_SINT*/ 5, /*DATA_FORMAT_16_16_16_16*/ 12>;
+def : Gfx11PlusBufferFormat< /*FORMAT_16_16_16_16_FLOAT*/ 0x39, 16, 4, /*NUM_FORMAT_FLOAT*/ 7, /*DATA_FORMAT_16_16_16_16*/ 12>;
+def : Gfx11PlusBufferFormat< /*FORMAT_32_32_32_UINT*/ 0x3A, 32, 3, /*NUM_FORMAT_UINT*/ 4, /*DATA_FORMAT_32_32_32*/ 13>;
+def : Gfx11PlusBufferFormat< /*FORMAT_32_32_32_SINT*/ 0x3B, 32, 3, /*NUM_FORMAT_SINT*/ 5, /*DATA_FORMAT_32_32_32*/ 13>;
+def : Gfx11PlusBufferFormat< /*FORMAT_32_32_32_FLOAT*/ 0x3C, 32, 3, /*NUM_FORMAT_FLOAT*/ 7, /*DATA_FORMAT_32_32_32*/ 13>;
+def : Gfx11PlusBufferFormat< /*FORMAT_32_32_32_32_UINT*/ 0x3D, 32, 4, /*NUM_FORMAT_UINT*/ 4, /*DATA_FORMAT_32_32_32_32*/ 14>;
+def : Gfx11PlusBufferFormat< /*FORMAT_32_32_32_32_SINT*/ 0x3E, 32, 4, /*NUM_FORMAT_SINT*/ 5, /*DATA_FORMAT_32_32_32_32*/ 14>;
+def : Gfx11PlusBufferFormat< /*FORMAT_32_32_32_32_FLOAT*/ 0x3F, 32, 4, /*NUM_FORMAT_FLOAT*/ 7, /*DATA_FORMAT_32_32_32_32*/ 14>;
class SourceOfDivergence<Intrinsic intr> {
Intrinsic Intr = intr;
@@ -191,6 +230,8 @@ def : SourceOfDivergence<int_amdgcn_interp_p1>;
def : SourceOfDivergence<int_amdgcn_interp_p2>;
def : SourceOfDivergence<int_amdgcn_interp_p1_f16>;
def : SourceOfDivergence<int_amdgcn_interp_p2_f16>;
+def : SourceOfDivergence<int_amdgcn_lds_direct_load>;
+def : SourceOfDivergence<int_amdgcn_lds_param_load>;
def : SourceOfDivergence<int_amdgcn_mbcnt_hi>;
def : SourceOfDivergence<int_amdgcn_mbcnt_lo>;
def : SourceOfDivergence<int_r600_read_tidig_x>;
@@ -205,9 +246,12 @@ def : SourceOfDivergence<int_amdgcn_global_atomic_fmax>;
def : SourceOfDivergence<int_amdgcn_flat_atomic_fadd>;
def : SourceOfDivergence<int_amdgcn_flat_atomic_fmin>;
def : SourceOfDivergence<int_amdgcn_flat_atomic_fmax>;
+def : SourceOfDivergence<int_amdgcn_global_atomic_fadd_v2bf16>;
+def : SourceOfDivergence<int_amdgcn_flat_atomic_fadd_v2bf16>;
def : SourceOfDivergence<int_amdgcn_ds_fadd>;
def : SourceOfDivergence<int_amdgcn_ds_fmin>;
def : SourceOfDivergence<int_amdgcn_ds_fmax>;
+def : SourceOfDivergence<int_amdgcn_ds_fadd_v2bf16>;
def : SourceOfDivergence<int_amdgcn_buffer_atomic_swap>;
def : SourceOfDivergence<int_amdgcn_buffer_atomic_add>;
def : SourceOfDivergence<int_amdgcn_buffer_atomic_sub>;
@@ -292,6 +336,16 @@ def : SourceOfDivergence<int_amdgcn_mfma_f32_32x32x8bf16_1k>;
def : SourceOfDivergence<int_amdgcn_mfma_f32_16x16x16bf16_1k>;
def : SourceOfDivergence<int_amdgcn_mfma_f64_16x16x4f64>;
def : SourceOfDivergence<int_amdgcn_mfma_f64_4x4x4f64>;
+def : SourceOfDivergence<int_amdgcn_mfma_i32_16x16x32_i8>;
+def : SourceOfDivergence<int_amdgcn_mfma_i32_32x32x16_i8>;
+def : SourceOfDivergence<int_amdgcn_mfma_f32_16x16x8_xf32>;
+def : SourceOfDivergence<int_amdgcn_mfma_f32_32x32x4_xf32>;
+def : SourceOfDivergence<int_amdgcn_smfmac_f32_16x16x32_f16>;
+def : SourceOfDivergence<int_amdgcn_smfmac_f32_32x32x16_f16>;
+def : SourceOfDivergence<int_amdgcn_smfmac_f32_16x16x32_bf16>;
+def : SourceOfDivergence<int_amdgcn_smfmac_f32_32x32x16_bf16>;
+def : SourceOfDivergence<int_amdgcn_smfmac_i32_16x16x64_i8>;
+def : SourceOfDivergence<int_amdgcn_smfmac_i32_32x32x32_i8>;
// The dummy boolean output is divergent from the IR's perspective,
// but the mask results are uniform. These produce a divergent and
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSetWavePriority.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSetWavePriority.cpp
new file mode 100644
index 000000000000..34702ee6623b
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSetWavePriority.cpp
@@ -0,0 +1,166 @@
+//===- AMDGPUSetWavePriority.cpp - Set wave priority ----------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// Pass to temporarily raise the wave priority from the start of
+/// the shader function until its last VMEM instructions, to allow younger
+/// waves to issue their VMEM instructions as well.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "GCNSubtarget.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "SIInstrInfo.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Support/Allocator.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "amdgpu-set-wave-priority"
+
+namespace {
+
+struct MBBInfo {
+ MBBInfo() = default;
+ bool MayReachVMEMLoad = false;
+};
+
+using MBBInfoSet = DenseMap<const MachineBasicBlock *, MBBInfo>;
+
+class AMDGPUSetWavePriority : public MachineFunctionPass {
+public:
+ static char ID;
+
+ AMDGPUSetWavePriority() : MachineFunctionPass(ID) {}
+
+ StringRef getPassName() const override { return "Set wave priority"; }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+private:
+ MachineInstr *BuildSetprioMI(MachineFunction &MF, unsigned priority) const;
+
+ const SIInstrInfo *TII;
+};
+
+} // End anonymous namespace.
+
+INITIALIZE_PASS(AMDGPUSetWavePriority, DEBUG_TYPE, "Set wave priority", false,
+ false)
+
+char AMDGPUSetWavePriority::ID = 0;
+
+FunctionPass *llvm::createAMDGPUSetWavePriorityPass() {
+ return new AMDGPUSetWavePriority();
+}
+
+MachineInstr *AMDGPUSetWavePriority::BuildSetprioMI(MachineFunction &MF,
+ unsigned priority) const {
+ return BuildMI(MF, DebugLoc(), TII->get(AMDGPU::S_SETPRIO)).addImm(priority);
+}
+
+// Checks that for every predecessor Pred that can reach a VMEM load,
+// none of Pred's successors can reach a VMEM load.
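+// If some successor of such a predecessor can still reach a VMEM load,
+// lowering the priority in that predecessor would be premature, so the
+// lowering has to be done in MBB itself instead.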
+static bool CanLowerPriorityDirectlyInPredecessors(const MachineBasicBlock &MBB,
+ MBBInfoSet &MBBInfos) {
+ for (const MachineBasicBlock *Pred : MBB.predecessors()) {
+ if (!MBBInfos[Pred].MayReachVMEMLoad)
+ continue;
+ for (const MachineBasicBlock *Succ : Pred->successors()) {
+ if (MBBInfos[Succ].MayReachVMEMLoad)
+ return false;
+ }
+ }
+ return true;
+}
+
+static bool isVMEMLoad(const MachineInstr &MI) {
+ return SIInstrInfo::isVMEM(MI) && MI.mayLoad();
+}
+
+bool AMDGPUSetWavePriority::runOnMachineFunction(MachineFunction &MF) {
+ const unsigned HighPriority = 3;
+ const unsigned LowPriority = 0;
+
+ Function &F = MF.getFunction();
+ if (skipFunction(F) || !AMDGPU::isEntryFunctionCC(F.getCallingConv()))
+ return false;
+
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+ TII = ST.getInstrInfo();
+
+ MBBInfoSet MBBInfos;
+ SmallVector<const MachineBasicBlock *, 16> Worklist;
+ for (MachineBasicBlock &MBB : MF) {
+ if (any_of(MBB, isVMEMLoad))
+ Worklist.push_back(&MBB);
+ }
+
+ // Mark blocks from which control may reach VMEM loads.
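+ // This is a backwards walk over the CFG, seeded with the blocks that
+ // contain a VMEM load and propagated to their predecessors.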
+ while (!Worklist.empty()) {
+ const MachineBasicBlock *MBB = Worklist.pop_back_val();
+ MBBInfo &Info = MBBInfos[MBB];
+ if (!Info.MayReachVMEMLoad) {
+ Info.MayReachVMEMLoad = true;
+ Worklist.append(MBB->pred_begin(), MBB->pred_end());
+ }
+ }
+
+ MachineBasicBlock &Entry = MF.front();
+ if (!MBBInfos[&Entry].MayReachVMEMLoad)
+ return false;
+
+ // Raise the priority at the beginning of the shader.
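+ // The S_SETPRIO is inserted before the first VALU instruction in the
+ // entry block, or at the first terminator / end of block if there is
+ // no VALU instruction.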
+ MachineBasicBlock::iterator I = Entry.begin(), E = Entry.end();
+ while (I != E && !SIInstrInfo::isVALU(*I) && !I->isTerminator())
+ ++I;
+ Entry.insert(I, BuildSetprioMI(MF, HighPriority));
+
+ // Lower the priority on edges where control leaves blocks from which
+ // VMEM loads are reachable.
+ SmallSet<MachineBasicBlock *, 16> PriorityLoweringBlocks;
+ for (MachineBasicBlock &MBB : MF) {
+ if (MBBInfos[&MBB].MayReachVMEMLoad) {
+ if (MBB.succ_empty())
+ PriorityLoweringBlocks.insert(&MBB);
+ continue;
+ }
+
+ if (CanLowerPriorityDirectlyInPredecessors(MBB, MBBInfos)) {
+ for (MachineBasicBlock *Pred : MBB.predecessors()) {
+ if (MBBInfos[Pred].MayReachVMEMLoad)
+ PriorityLoweringBlocks.insert(Pred);
+ }
+ continue;
+ }
+
+ // Where lowering the priority in predecessors is not possible, the
+ // block receiving control either was never part of a loop, or the loop
+ // simplification/canonicalization pass should already have tried to
+ // split the edge and insert a preheader. If for whatever reason that
+ // failed, the only remaining option is to lower the priority within
+ // the loop.
+ PriorityLoweringBlocks.insert(&MBB);
+ }
+
+ for (MachineBasicBlock *MBB : PriorityLoweringBlocks) {
+ MachineBasicBlock::iterator I = MBB->end(), B = MBB->begin();
+ while (I != B) {
+ if (isVMEMLoad(*--I)) {
+ ++I;
+ break;
+ }
+ }
+ MBB->insert(I, BuildSetprioMI(MF, LowPriority));
+ }
+
+ return true;
+}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
index e82f9232b114..77816a783630 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -50,11 +50,6 @@ static cl::opt<bool> EnableVGPRIndexMode(
cl::desc("Use GPR indexing mode instead of movrel for vector indexing"),
cl::init(false));
-static cl::opt<bool> EnableFlatScratch(
- "amdgpu-enable-flat-scratch",
- cl::desc("Use flat scratch instructions"),
- cl::init(false));
-
static cl::opt<bool> UseAA("amdgpu-use-aa-in-codegen",
cl::desc("Enable the use of AA during codegen."),
cl::init(true));
@@ -159,26 +154,7 @@ GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
return *this;
}
-AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT) :
- TargetTriple(TT),
- GCN3Encoding(false),
- Has16BitInsts(false),
- HasMadMixInsts(false),
- HasMadMacF32Insts(false),
- HasDsSrc2Insts(false),
- HasSDWA(false),
- HasVOP3PInsts(false),
- HasMulI24(true),
- HasMulU24(true),
- HasSMulHi(false),
- HasInv2PiInlineImm(false),
- HasFminFmaxLegacy(true),
- EnablePromoteAlloca(false),
- HasTrigReducedRange(false),
- MaxWavesPerEU(10),
- LocalMemorySize(0),
- WavefrontSizeLog2(0)
- { }
+AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT) : TargetTriple(TT) {}
GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
const GCNTargetMachine &TM)
@@ -187,120 +163,7 @@ GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
AMDGPUSubtarget(TT),
TargetTriple(TT),
TargetID(*this),
- Gen(INVALID),
InstrItins(getInstrItineraryForCPU(GPU)),
- LDSBankCount(0),
- MaxPrivateElementSize(0),
-
- FastFMAF32(false),
- FastDenormalF32(false),
- HalfRate64Ops(false),
- FullRate64Ops(false),
-
- FlatForGlobal(false),
- AutoWaitcntBeforeBarrier(false),
- UnalignedScratchAccess(false),
- UnalignedAccessMode(false),
-
- HasApertureRegs(false),
- SupportsXNACK(false),
- EnableXNACK(false),
- EnableTgSplit(false),
- EnableCuMode(false),
- TrapHandler(false),
-
- EnableLoadStoreOpt(false),
- EnableUnsafeDSOffsetFolding(false),
- EnableSIScheduler(false),
- EnableDS128(false),
- EnablePRTStrictNull(false),
- DumpCode(false),
-
- FP64(false),
- CIInsts(false),
- GFX8Insts(false),
- GFX9Insts(false),
- GFX90AInsts(false),
- GFX10Insts(false),
- GFX10_3Insts(false),
- GFX7GFX8GFX9Insts(false),
- SGPRInitBug(false),
- NegativeScratchOffsetBug(false),
- NegativeUnalignedScratchOffsetBug(false),
- HasSMemRealTime(false),
- HasIntClamp(false),
- HasFmaMixInsts(false),
- HasMovrel(false),
- HasVGPRIndexMode(false),
- HasScalarStores(false),
- HasScalarAtomics(false),
- HasSDWAOmod(false),
- HasSDWAScalar(false),
- HasSDWASdst(false),
- HasSDWAMac(false),
- HasSDWAOutModsVOPC(false),
- HasDPP(false),
- HasDPP8(false),
- Has64BitDPP(false),
- HasPackedFP32Ops(false),
- HasExtendedImageInsts(false),
- HasR128A16(false),
- HasGFX10A16(false),
- HasG16(false),
- HasNSAEncoding(false),
- NSAMaxSize(0),
- GFX10_AEncoding(false),
- GFX10_BEncoding(false),
- HasDLInsts(false),
- HasDot1Insts(false),
- HasDot2Insts(false),
- HasDot3Insts(false),
- HasDot4Insts(false),
- HasDot5Insts(false),
- HasDot6Insts(false),
- HasDot7Insts(false),
- HasMAIInsts(false),
- HasPkFmacF16Inst(false),
- HasAtomicFaddInsts(false),
- SupportsSRAMECC(false),
- EnableSRAMECC(false),
- HasNoSdstCMPX(false),
- HasVscnt(false),
- HasGetWaveIdInst(false),
- HasSMemTimeInst(false),
- HasShaderCyclesRegister(false),
- HasVOP3Literal(false),
- HasNoDataDepHazard(false),
- FlatAddressSpace(false),
- FlatInstOffsets(false),
- FlatGlobalInsts(false),
- FlatScratchInsts(false),
- ScalarFlatScratchInsts(false),
- HasArchitectedFlatScratch(false),
- AddNoCarryInsts(false),
- HasUnpackedD16VMem(false),
- LDSMisalignedBug(false),
- HasMFMAInlineLiteralBug(false),
- UnalignedBufferAccess(false),
- UnalignedDSAccess(false),
- HasPackedTID(false),
-
- ScalarizeGlobal(false),
-
- HasVcmpxPermlaneHazard(false),
- HasVMEMtoScalarWriteHazard(false),
- HasSMEMtoVectorWriteHazard(false),
- HasInstFwdPrefetchBug(false),
- HasVcmpxExecWARHazard(false),
- HasLdsBranchVmemWARHazard(false),
- HasNSAtoVMEMBug(false),
- HasNSAClauseBug(false),
- HasOffset3fBug(false),
- HasFlatSegmentOffsetBug(false),
- HasImageStoreD16Bug(false),
- HasImageGather4D16Bug(false),
-
- FeatureDisable(false),
InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
TLInfo(TM, *this),
FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) {
@@ -314,11 +177,6 @@ GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
*this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get()), TM));
}
-bool GCNSubtarget::enableFlatScratch() const {
- return flatScratchIsArchitected() ||
- (EnableFlatScratch && hasFlatScratchInsts());
-}
-
unsigned GCNSubtarget::getConstantBusLimit(unsigned Opcode) const {
if (getGeneration() < GFX10)
return 1;
@@ -326,12 +184,15 @@ unsigned GCNSubtarget::getConstantBusLimit(unsigned Opcode) const {
switch (Opcode) {
case AMDGPU::V_LSHLREV_B64_e64:
case AMDGPU::V_LSHLREV_B64_gfx10:
+ case AMDGPU::V_LSHLREV_B64_e64_gfx11:
case AMDGPU::V_LSHL_B64_e64:
case AMDGPU::V_LSHRREV_B64_e64:
case AMDGPU::V_LSHRREV_B64_gfx10:
+ case AMDGPU::V_LSHRREV_B64_e64_gfx11:
case AMDGPU::V_LSHR_B64_e64:
case AMDGPU::V_ASHRREV_I64_e64:
case AMDGPU::V_ASHRREV_I64_gfx10:
+ case AMDGPU::V_ASHRREV_I64_e64_gfx11:
case AMDGPU::V_ASHR_I64_e64:
return 1;
}
@@ -658,7 +519,8 @@ unsigned AMDGPUSubtarget::getImplicitArgNumBytes(const Function &F) const {
return 16;
// Assume all implicit inputs are used by default
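+ // Code object version 5 grows the implicit argument area to 256 bytes;
+ // earlier versions use 56 bytes.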
- return AMDGPU::getIntegerAttribute(F, "amdgpu-implicitarg-num-bytes", 56);
+ unsigned NBytes = (AMDGPU::getAmdhsaCodeObjectVersion() >= 5) ? 256 : 56;
+ return AMDGPU::getIntegerAttribute(F, "amdgpu-implicitarg-num-bytes", NBytes);
}
uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F,
@@ -673,13 +535,11 @@ uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F,
for (const Argument &Arg : F.args()) {
const bool IsByRef = Arg.hasByRefAttr();
Type *ArgTy = IsByRef ? Arg.getParamByRefType() : Arg.getType();
- MaybeAlign Alignment = IsByRef ? Arg.getParamAlign() : None;
- if (!Alignment)
- Alignment = DL.getABITypeAlign(ArgTy);
-
+ Align Alignment = DL.getValueOrABITypeAlignment(
+ IsByRef ? Arg.getParamAlign() : None, ArgTy);
uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
ExplicitArgBytes = alignTo(ExplicitArgBytes, Alignment) + AllocSize;
- MaxAlign = max(MaxAlign, Alignment);
+ MaxAlign = std::max(MaxAlign, Alignment);
}
return ExplicitArgBytes;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
index 7f1b94be4ffe..7400c81effd0 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -38,30 +38,32 @@ public:
SEA_ISLANDS = 6,
VOLCANIC_ISLANDS = 7,
GFX9 = 8,
- GFX10 = 9
+ GFX10 = 9,
+ GFX11 = 10
};
private:
Triple TargetTriple;
protected:
- bool GCN3Encoding;
- bool Has16BitInsts;
- bool HasMadMixInsts;
- bool HasMadMacF32Insts;
- bool HasDsSrc2Insts;
- bool HasSDWA;
- bool HasVOP3PInsts;
- bool HasMulI24;
- bool HasMulU24;
- bool HasSMulHi;
- bool HasInv2PiInlineImm;
- bool HasFminFmaxLegacy;
- bool EnablePromoteAlloca;
- bool HasTrigReducedRange;
- unsigned MaxWavesPerEU;
- unsigned LocalMemorySize;
- char WavefrontSizeLog2;
+ bool GCN3Encoding = false;
+ bool Has16BitInsts = false;
+ bool HasTrue16BitInsts = false;
+ bool HasMadMixInsts = false;
+ bool HasMadMacF32Insts = false;
+ bool HasDsSrc2Insts = false;
+ bool HasSDWA = false;
+ bool HasVOP3PInsts = false;
+ bool HasMulI24 = true;
+ bool HasMulU24 = true;
+ bool HasSMulHi = false;
+ bool HasInv2PiInlineImm = false;
+ bool HasFminFmaxLegacy = true;
+ bool EnablePromoteAlloca = false;
+ bool HasTrigReducedRange = false;
+ unsigned MaxWavesPerEU = 10;
+ unsigned LocalMemorySize = 0;
+ char WavefrontSizeLog2 = 0;
public:
AMDGPUSubtarget(const Triple &TT);
@@ -145,6 +147,8 @@ public:
return Has16BitInsts;
}
+ bool hasTrue16BitInsts() const { return HasTrue16BitInsts; }
+
bool hasMadMixInsts() const {
return HasMadMixInsts;
}
@@ -267,7 +271,7 @@ public:
/// \p WavefrontSize.
AMDGPUDwarfFlavour getAMDGPUDwarfFlavour() const;
- virtual ~AMDGPUSubtarget() {}
+ virtual ~AMDGPUSubtarget() = default;
};
} // end namespace llvm
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index a2c61f9da8da..1c6b9d35695a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -16,6 +16,7 @@
#include "AMDGPU.h"
#include "AMDGPUAliasAnalysis.h"
#include "AMDGPUExportClustering.h"
+#include "AMDGPUIGroupLP.h"
#include "AMDGPUMacroFusion.h"
#include "AMDGPUTargetObjectFile.h"
#include "AMDGPUTargetTransformInfo.h"
@@ -27,6 +28,7 @@
#include "SIMachineScheduler.h"
#include "TargetInfo/AMDGPUTargetInfo.h"
#include "llvm/Analysis/CGSCCPassManager.h"
+#include "llvm/CodeGen/GlobalISel/CSEInfo.h"
#include "llvm/CodeGen/GlobalISel/IRTranslator.h"
#include "llvm/CodeGen/GlobalISel/InstructionSelect.h"
#include "llvm/CodeGen/GlobalISel/Legalizer.h"
@@ -56,6 +58,7 @@
#include "llvm/Transforms/Vectorize.h"
using namespace llvm;
+using namespace llvm::PatternMatch;
namespace {
class SGPRRegisterRegAlloc : public RegisterRegAllocBase<SGPRRegisterRegAlloc> {
@@ -269,12 +272,22 @@ static cl::opt<bool> EnableSIModeRegisterPass(
cl::init(true),
cl::Hidden);
+// Enable GFX11+ s_delay_alu insertion
+static cl::opt<bool>
+ EnableInsertDelayAlu("amdgpu-enable-delay-alu",
+ cl::desc("Enable s_delay_alu insertion"),
+ cl::init(true), cl::Hidden);
+
// Option is used in lit tests to prevent deadcoding of patterns inspected.
static cl::opt<bool>
EnableDCEInRA("amdgpu-dce-in-ra",
cl::init(true), cl::Hidden,
cl::desc("Enable machine DCE inside regalloc"));
+static cl::opt<bool> EnableSetWavePriority("amdgpu-set-wave-priority",
+ cl::desc("Adjust wave priority"),
+ cl::init(false), cl::Hidden);
+
static cl::opt<bool> EnableScalarIRPasses(
"amdgpu-scalar-ir-passes",
cl::desc("Enable scalar IR passes"),
@@ -330,7 +343,6 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
initializeSIOptimizeExecMaskingPreRAPass(*PR);
initializeSIOptimizeVGPRLiveRangePass(*PR);
initializeSILoadStoreOptimizerPass(*PR);
- initializeAMDGPUFixFunctionBitcastsPass(*PR);
initializeAMDGPUCtorDtorLoweringPass(*PR);
initializeAMDGPUAlwaysInlinePass(*PR);
initializeAMDGPUAttributorPass(*PR);
@@ -357,6 +369,8 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
initializeAMDGPURewriteOutArgumentsPass(*PR);
initializeAMDGPUUnifyMetadataPass(*PR);
initializeSIAnnotateControlFlowPass(*PR);
+ initializeAMDGPUReleaseVGPRsPass(*PR);
+ initializeAMDGPUInsertDelayAluPass(*PR);
initializeSIInsertHardClausesPass(*PR);
initializeSIInsertWaitcntsPass(*PR);
initializeSIModeRegisterPass(*PR);
@@ -390,9 +404,14 @@ static ScheduleDAGInstrs *createSIMachineScheduler(MachineSchedContext *C) {
static ScheduleDAGInstrs *
createGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
+ const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
ScheduleDAGMILive *DAG =
new GCNScheduleDAGMILive(C, std::make_unique<GCNMaxOccupancySchedStrategy>(C));
DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
+ if (ST.shouldClusterStores())
+ DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
+ DAG->addMutation(createIGroupLPDAGMutation());
+ DAG->addMutation(createSchedBarrierDAGMutation());
DAG->addMutation(createAMDGPUMacroFusionDAGMutation());
DAG->addMutation(createAMDGPUExportClusteringDAGMutation());
return DAG;
@@ -400,9 +419,12 @@ createGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
static ScheduleDAGInstrs *
createIterativeGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
+ const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
auto DAG = new GCNIterativeScheduler(C,
GCNIterativeScheduler::SCHEDULE_LEGACYMAXOCCUPANCY);
DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
+ if (ST.shouldClusterStores())
+ DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
return DAG;
}
@@ -413,9 +435,12 @@ static ScheduleDAGInstrs *createMinRegScheduler(MachineSchedContext *C) {
static ScheduleDAGInstrs *
createIterativeILPMachineScheduler(MachineSchedContext *C) {
+ const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
auto DAG = new GCNIterativeScheduler(C,
GCNIterativeScheduler::SCHEDULE_ILP);
DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
+ if (ST.shouldClusterStores())
+ DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
DAG->addMutation(createAMDGPUMacroFusionDAGMutation());
return DAG;
}
@@ -801,6 +826,23 @@ AMDGPUTargetMachine::getPredicatedAddrSpace(const Value *V) const {
return std::make_pair(nullptr, -1);
}
+unsigned
+AMDGPUTargetMachine::getAddressSpaceForPseudoSourceKind(unsigned Kind) const {
+ switch (Kind) {
+ case PseudoSourceValue::Stack:
+ case PseudoSourceValue::FixedStack:
+ return AMDGPUAS::PRIVATE_ADDRESS;
+ case PseudoSourceValue::ConstantPool:
+ case PseudoSourceValue::GOT:
+ case PseudoSourceValue::JumpTable:
+ case PseudoSourceValue::GlobalValueCallEntry:
+ case PseudoSourceValue::ExternalSymbolCallEntry:
+ case PseudoSourceValue::TargetCustom:
+ return AMDGPUAS::CONSTANT_ADDRESS;
+ }
+ return AMDGPUAS::FLAT_ADDRESS;
+}
+
//===----------------------------------------------------------------------===//
// GCN Target Machine (SI+)
//===----------------------------------------------------------------------===//
@@ -836,7 +878,7 @@ GCNTargetMachine::getSubtargetImpl(const Function &F) const {
}
TargetTransformInfo
-GCNTargetMachine::getTargetTransformInfo(const Function &F) {
+GCNTargetMachine::getTargetTransformInfo(const Function &F) const {
return TargetTransformInfo(GCNTTIImpl(this, F));
}
@@ -873,7 +915,11 @@ public:
ScheduleDAGMI *DAG = createGenericSchedPostRA(C);
const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
+ if (ST.shouldClusterStores())
+ DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
DAG->addMutation(ST.createFillMFMAShadowMutation(DAG->TII));
+ DAG->addMutation(createIGroupLPDAGMutation());
+ DAG->addMutation(createSchedBarrierDAGMutation());
return DAG;
}
@@ -953,10 +999,6 @@ void AMDGPUPassConfig::addIRPasses() {
addPass(createAMDGPUPrintfRuntimeBinding());
addPass(createAMDGPUCtorDtorLoweringPass());
- // This must occur before inlining, as the inliner will not look through
- // bitcast calls.
- addPass(createAMDGPUFixFunctionBitcastsPass());
-
// A call to propagate attributes pass in the backend in case opt was not run.
addPass(createAMDGPUPropagateAttributesEarlyPass(&TM));
@@ -967,7 +1009,7 @@ void AMDGPUPassConfig::addIRPasses() {
addPass(createAlwaysInlinerLegacyPass());
// We need to add the barrier noop pass, otherwise adding the function
// inlining pass will cause all of the PassConfigs passes to be run
- // one function at a time, which means if we have a nodule with two
+ // one function at a time, which means if we have a module with two
// functions, then we will generate code for the first function
// without ever running any passes on the second.
addPass(createBarrierNoopPass());
@@ -1079,8 +1121,11 @@ bool AMDGPUPassConfig::addGCPasses() {
llvm::ScheduleDAGInstrs *
AMDGPUPassConfig::createMachineScheduler(MachineSchedContext *C) const {
+ const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
ScheduleDAGMILive *DAG = createGenericSchedLive(C);
DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
+ if (ST.shouldClusterStores())
+ DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
return DAG;
}
@@ -1363,6 +1408,8 @@ void GCNPassConfig::addPreEmitPass() {
addPass(&SIInsertHardClausesID);
addPass(&SILateBranchLoweringPassID);
+ if (isPassEnabled(EnableSetWavePriority, CodeGenOpt::Less))
+ addPass(createAMDGPUSetWavePriorityPass());
if (getOptLevel() > CodeGenOpt::None)
addPass(&SIPreEmitPeepholeID);
// The hazard recognizer that runs as part of the post-ra scheduler does not
@@ -1374,6 +1421,13 @@ void GCNPassConfig::addPreEmitPass() {
// Here we add a stand-alone hazard recognizer pass which can handle all
// cases.
addPass(&PostRAHazardRecognizerID);
+
+ if (getOptLevel() > CodeGenOpt::Less)
+ addPass(&AMDGPUReleaseVGPRsID);
+
+ if (isPassEnabled(EnableInsertDelayAlu, CodeGenOpt::Less))
+ addPass(&AMDGPUInsertDelayAluID);
+
addPass(&BranchRelaxationPassID);
}
@@ -1396,7 +1450,7 @@ bool GCNTargetMachine::parseMachineFunctionInfo(
const yaml::MachineFunctionInfo &MFI_, PerFunctionMIParsingState &PFS,
SMDiagnostic &Error, SMRange &SourceRange) const {
const yaml::SIMachineFunctionInfo &YamlMFI =
- reinterpret_cast<const yaml::SIMachineFunctionInfo &>(MFI_);
+ static_cast<const yaml::SIMachineFunctionInfo &>(MFI_);
MachineFunction &MF = PFS.MF;
SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
@@ -1420,6 +1474,14 @@ bool GCNTargetMachine::parseMachineFunctionInfo(
return false;
};
+ auto parseOptionalRegister = [&](const yaml::StringValue &RegName,
+ Register &RegVal) {
+ return !RegName.Value.empty() && parseRegister(RegName, RegVal);
+ };
+
+ if (parseOptionalRegister(YamlMFI.VGPRForAGPRCopy, MFI->VGPRForAGPRCopy))
+ return true;
+
auto diagnoseRegisterClass = [&](const yaml::StringValue &RegName) {
// Create a diagnostic for the register string literal.
const MemoryBuffer &Buffer =
@@ -1452,6 +1514,14 @@ bool GCNTargetMachine::parseMachineFunctionInfo(
return diagnoseRegisterClass(YamlMFI.StackPtrOffsetReg);
}
+ for (const auto &YamlReg : YamlMFI.WWMReservedRegs) {
+ Register ParsedReg;
+ if (parseRegister(YamlReg, ParsedReg))
+ return true;
+
+ MFI->reserveWWMRegister(ParsedReg);
+ }
+
auto parseAndCheckArgument = [&](const Optional<yaml::SIArgument> &A,
const TargetRegisterClass &RC,
ArgDescriptor &Arg, unsigned UserSGPRs,
@@ -1473,7 +1543,7 @@ bool GCNTargetMachine::parseMachineFunctionInfo(
Arg = ArgDescriptor::createStack(A->StackOffset);
// Check and apply the optional mask.
if (A->Mask)
- Arg = ArgDescriptor::createArg(Arg, A->Mask.getValue());
+ Arg = ArgDescriptor::createArg(Arg, *A->Mask);
MFI->NumUserSGPRs += UserSGPRs;
MFI->NumSystemSGPRs += SystemSGPRs;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h
index dd3676f3b707..567cc9d610d2 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h
@@ -7,7 +7,7 @@
//===----------------------------------------------------------------------===//
//
/// \file
-/// The AMDGPU TargetMachine interface definition for hw codgen targets.
+/// The AMDGPU TargetMachine interface definition for hw codegen targets.
//
//===----------------------------------------------------------------------===//
@@ -64,6 +64,8 @@ public:
std::pair<const Value *, unsigned>
getPredicatedAddrSpace(const Value *V) const override;
+
+ unsigned getAddressSpaceForPseudoSourceKind(unsigned Kind) const override;
};
//===----------------------------------------------------------------------===//
@@ -84,7 +86,7 @@ public:
const TargetSubtargetInfo *getSubtargetImpl(const Function &) const override;
- TargetTransformInfo getTargetTransformInfo(const Function &F) override;
+ TargetTransformInfo getTargetTransformInfo(const Function &F) const override;
bool useIPRA() const override {
return true;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index a8df7789c8a1..a79cd2e9499e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -288,33 +288,21 @@ GCNTTIImpl::GCNTTIImpl(const AMDGPUTargetMachine *TM, const Function &F)
: BaseT(TM, F.getParent()->getDataLayout()),
ST(static_cast<const GCNSubtarget *>(TM->getSubtargetImpl(F))),
TLI(ST->getTargetLowering()), CommonTTI(TM, F),
- IsGraphics(AMDGPU::isGraphics(F.getCallingConv())),
- MaxVGPRs(ST->getMaxNumVGPRs(
- std::max(ST->getWavesPerEU(F).first,
- ST->getWavesPerEUForWorkGroup(
- ST->getFlatWorkGroupSizes(F).second)))) {
+ IsGraphics(AMDGPU::isGraphics(F.getCallingConv())) {
AMDGPU::SIModeRegisterDefaults Mode(F);
HasFP32Denormals = Mode.allFP32Denormals();
HasFP64FP16Denormals = Mode.allFP64FP16Denormals();
}
-unsigned GCNTTIImpl::getHardwareNumberOfRegisters(bool Vec) const {
- // The concept of vector registers doesn't really exist. Some packed vector
- // operations operate on the normal 32-bit registers.
- return MaxVGPRs;
-}
+unsigned GCNTTIImpl::getNumberOfRegisters(unsigned RCID) const {
+ // NB: RCID is not an RCID. In fact it is 0 or 1 for scalar or vector
+ // registers. See getRegisterClassForType for the implementation.
+ // In this case vector registers are not vector in terms of
+ // VGPRs, but those which can hold multiple values.
-unsigned GCNTTIImpl::getNumberOfRegisters(bool Vec) const {
// This is really the number of registers to fill when vectorizing /
// interleaving loops, so we lie to avoid trying to use all registers.
- return getHardwareNumberOfRegisters(Vec) >> 3;
-}
-
-unsigned GCNTTIImpl::getNumberOfRegisters(unsigned RCID) const {
- const SIRegisterInfo *TRI = ST->getRegisterInfo();
- const TargetRegisterClass *RC = TRI->getRegClass(RCID);
- unsigned NumVGPRs = (TRI->getRegSizeInBits(*RC) + 31) / 32;
- return getHardwareNumberOfRegisters(false) / NumVGPRs;
+ return 4;
}
TypeSize
@@ -410,11 +398,14 @@ bool GCNTTIImpl::isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes,
// unaligned access is legal?
//
// FIXME: This could use fine tuning and microbenchmarks.
-Type *GCNTTIImpl::getMemcpyLoopLoweringType(LLVMContext &Context, Value *Length,
- unsigned SrcAddrSpace,
- unsigned DestAddrSpace,
- unsigned SrcAlign,
- unsigned DestAlign) const {
+Type *GCNTTIImpl::getMemcpyLoopLoweringType(
+ LLVMContext &Context, Value *Length, unsigned SrcAddrSpace,
+ unsigned DestAddrSpace, unsigned SrcAlign, unsigned DestAlign,
+ Optional<uint32_t> AtomicElementSize) const {
+
+ if (AtomicElementSize)
+ return Type::getIntNTy(Context, *AtomicElementSize * 8);
+
unsigned MinAlign = std::min(SrcAlign, DestAlign);
// A (multi-)dword access at an address == 2 (mod 4) will be decomposed by the
@@ -439,11 +430,17 @@ Type *GCNTTIImpl::getMemcpyLoopLoweringType(LLVMContext &Context, Value *Length,
}
void GCNTTIImpl::getMemcpyLoopResidualLoweringType(
- SmallVectorImpl<Type *> &OpsOut, LLVMContext &Context,
- unsigned RemainingBytes, unsigned SrcAddrSpace, unsigned DestAddrSpace,
- unsigned SrcAlign, unsigned DestAlign) const {
+ SmallVectorImpl<Type *> &OpsOut, LLVMContext &Context,
+ unsigned RemainingBytes, unsigned SrcAddrSpace, unsigned DestAddrSpace,
+ unsigned SrcAlign, unsigned DestAlign,
+ Optional<uint32_t> AtomicCpySize) const {
assert(RemainingBytes < 16);
+ if (AtomicCpySize)
+ BaseT::getMemcpyLoopResidualLoweringType(
+ OpsOut, Context, RemainingBytes, SrcAddrSpace, DestAddrSpace, SrcAlign,
+ DestAlign, AtomicCpySize);
+
unsigned MinAlign = std::min(SrcAlign, DestAlign);
if (MinAlign != 2) {
@@ -1042,7 +1039,8 @@ Value *GCNTTIImpl::rewriteIntrinsicWithAddressSpace(IntrinsicInst *II,
InstructionCost GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
VectorType *VT, ArrayRef<int> Mask,
- int Index, VectorType *SubTp) {
+ int Index, VectorType *SubTp,
+ ArrayRef<const Value *> Args) {
Kind = improveShuffleKindFromMask(Kind, Mask);
if (ST->hasVOP3PInsts()) {
if (cast<FixedVectorType>(VT)->getNumElements() == 2 &&
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
index e901b5c5747d..f2260c31e678 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
@@ -68,7 +68,6 @@ class GCNTTIImpl final : public BasicTTIImplBase<GCNTTIImpl> {
bool IsGraphics;
bool HasFP32Denormals;
bool HasFP64FP16Denormals;
- unsigned MaxVGPRs;
static const FeatureBitset InlineFeatureIgnoreList;
@@ -113,8 +112,6 @@ public:
return TTI::PSK_FastHardware;
}
- unsigned getHardwareNumberOfRegisters(bool Vector) const;
- unsigned getNumberOfRegisters(bool Vector) const;
unsigned getNumberOfRegisters(unsigned RCID) const;
TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind Vector) const;
unsigned getMinVectorRegisterBitWidth() const;
@@ -135,15 +132,14 @@ public:
unsigned AddrSpace) const;
Type *getMemcpyLoopLoweringType(LLVMContext &Context, Value *Length,
unsigned SrcAddrSpace, unsigned DestAddrSpace,
- unsigned SrcAlign, unsigned DestAlign) const;
+ unsigned SrcAlign, unsigned DestAlign,
+ Optional<uint32_t> AtomicElementSize) const;
- void getMemcpyLoopResidualLoweringType(SmallVectorImpl<Type *> &OpsOut,
- LLVMContext &Context,
- unsigned RemainingBytes,
- unsigned SrcAddrSpace,
- unsigned DestAddrSpace,
- unsigned SrcAlign,
- unsigned DestAlign) const;
+ void getMemcpyLoopResidualLoweringType(
+ SmallVectorImpl<Type *> &OpsOut, LLVMContext &Context,
+ unsigned RemainingBytes, unsigned SrcAddrSpace, unsigned DestAddrSpace,
+ unsigned SrcAlign, unsigned DestAlign,
+ Optional<uint32_t> AtomicCpySize) const;
unsigned getMaxInterleaveFactor(unsigned VF);
bool getTgtMemIntrinsic(IntrinsicInst *Inst, MemIntrinsicInfo &Info) const;
@@ -201,7 +197,8 @@ public:
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp,
ArrayRef<int> Mask, int Index,
- VectorType *SubTp);
+ VectorType *SubTp,
+ ArrayRef<const Value *> Args = None);
bool areInlineCompatible(const Function *Caller,
const Function *Callee) const;
diff --git a/llvm/lib/Target/AMDGPU/AMDKernelCodeT.h b/llvm/lib/Target/AMDGPU/AMDKernelCodeT.h
index 654153ea5151..8e5f966b7c6c 100644
--- a/llvm/lib/Target/AMDGPU/AMDKernelCodeT.h
+++ b/llvm/lib/Target/AMDGPU/AMDKernelCodeT.h
@@ -142,7 +142,7 @@ enum amd_code_property_mask_t {
/// is provided to the finalizer when it is invoked and is recorded
/// here. The hardware will interleave the memory requests of each
/// lane of a wavefront by this element size to ensure each
- /// work-item gets a distinct memory memory location. Therefore, the
+ /// work-item gets a distinct memory location. Therefore, the
/// finalizer ensures that all load and store operations done to
/// private memory do not exceed this size. For example, if the
/// element size is 4 (32-bits or dword) and a 64-bit value must be
diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
index ffe626513d47..e12d0ffef35c 100644
--- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -20,10 +20,13 @@
#include "llvm/ADT/SmallBitVector.h"
#include "llvm/ADT/StringSet.h"
#include "llvm/ADT/Twine.h"
+#include "llvm/BinaryFormat/ELF.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrDesc.h"
+#include "llvm/MC/MCParser/MCAsmLexer.h"
#include "llvm/MC/MCParser/MCAsmParser.h"
#include "llvm/MC/MCParser/MCParsedAsmOperand.h"
#include "llvm/MC/MCParser/MCTargetAsmParser.h"
@@ -33,6 +36,7 @@
#include "llvm/Support/AMDHSAKernelDescriptor.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/MachineValueType.h"
+#include "llvm/Support/MathExtras.h"
#include "llvm/Support/TargetParser.h"
using namespace llvm;
@@ -120,12 +124,6 @@ public:
ImmTyD16,
ImmTyClampSI,
ImmTyOModSI,
- ImmTyDPP8,
- ImmTyDppCtrl,
- ImmTyDppRowMask,
- ImmTyDppBankMask,
- ImmTyDppBoundCtrl,
- ImmTyDppFi,
ImmTySdwaDstSel,
ImmTySdwaSrc0Sel,
ImmTySdwaSrc1Sel,
@@ -151,6 +149,12 @@ public:
ImmTyOpSelHi,
ImmTyNegLo,
ImmTyNegHi,
+ ImmTyDPP8,
+ ImmTyDppCtrl,
+ ImmTyDppRowMask,
+ ImmTyDppBankMask,
+ ImmTyDppBoundCtrl,
+ ImmTyDppFi,
ImmTySwizzle,
ImmTyGprIdxMode,
ImmTyHigh,
@@ -158,6 +162,8 @@ public:
ImmTyCBSZ,
ImmTyABID,
ImmTyEndpgm,
+ ImmTyWaitVDST,
+ ImmTyWaitEXP,
};
enum ImmKindTy {
@@ -262,6 +268,14 @@ public:
return isRegOrImmWithInputMods(AMDGPU::VS_32RegClassID, MVT::i32);
}
+ bool isRegOrInlineImmWithInt16InputMods() const {
+ return isRegOrInline(AMDGPU::VS_32RegClassID, MVT::i16);
+ }
+
+ bool isRegOrInlineImmWithInt32InputMods() const {
+ return isRegOrInline(AMDGPU::VS_32RegClassID, MVT::i32);
+ }
+
bool isRegOrImmWithInt64InputMods() const {
return isRegOrImmWithInputMods(AMDGPU::VS_64RegClassID, MVT::i64);
}
@@ -278,6 +292,15 @@ public:
return isRegOrImmWithInputMods(AMDGPU::VS_64RegClassID, MVT::f64);
}
+ bool isRegOrInlineImmWithFP16InputMods() const {
+ return isRegOrInline(AMDGPU::VS_32RegClassID, MVT::f16);
+ }
+
+ bool isRegOrInlineImmWithFP32InputMods() const {
+ return isRegOrInline(AMDGPU::VS_32RegClassID, MVT::f32);
+ }
+
+
bool isVReg() const {
return isRegClass(AMDGPU::VGPR_32RegClassID) ||
isRegClass(AMDGPU::VReg_64RegClassID) ||
@@ -815,6 +838,8 @@ public:
}
bool isSWaitCnt() const;
+ bool isDepCtr() const;
+ bool isSDelayAlu() const;
bool isHwreg() const;
bool isSendMsg() const;
bool isSwizzle() const;
@@ -830,6 +855,8 @@ public:
bool isS16Imm() const;
bool isU16Imm() const;
bool isEndpgm() const;
+ bool isWaitVDST() const;
+ bool isWaitEXP() const;
StringRef getExpressionAsToken() const {
assert(isExpr());
@@ -1037,6 +1064,8 @@ public:
case ImmTyCBSZ: OS << "CBSZ"; break;
case ImmTyABID: OS << "ABID"; break;
case ImmTyEndpgm: OS << "Endpgm"; break;
+ case ImmTyWaitVDST: OS << "WaitVDST"; break;
+ case ImmTyWaitEXP: OS << "WaitEXP"; break;
}
}
@@ -1123,7 +1152,9 @@ raw_ostream &operator <<(raw_ostream &OS, AMDGPUOperand::Modifiers Mods) {
class KernelScopeInfo {
int SgprIndexUnusedMin = -1;
int VgprIndexUnusedMin = -1;
+ int AgprIndexUnusedMin = -1;
MCContext *Ctx = nullptr;
+ MCSubtargetInfo const *MSTI = nullptr;
void usesSgprAt(int i) {
if (i >= SgprIndexUnusedMin) {
@@ -1142,7 +1173,31 @@ class KernelScopeInfo {
if (Ctx) {
MCSymbol* const Sym =
Ctx->getOrCreateSymbol(Twine(".kernel.vgpr_count"));
- Sym->setVariableValue(MCConstantExpr::create(VgprIndexUnusedMin, *Ctx));
+ int totalVGPR = getTotalNumVGPRs(isGFX90A(*MSTI), AgprIndexUnusedMin,
+ VgprIndexUnusedMin);
+ Sym->setVariableValue(MCConstantExpr::create(totalVGPR, *Ctx));
+ }
+ }
+ }
+
+ void usesAgprAt(int i) {
+ // Instruction will error in AMDGPUAsmParser::MatchAndEmitInstruction
+ if (!hasMAIInsts(*MSTI))
+ return;
+
+ if (i >= AgprIndexUnusedMin) {
+ AgprIndexUnusedMin = ++i;
+ if (Ctx) {
+ MCSymbol* const Sym =
+ Ctx->getOrCreateSymbol(Twine(".kernel.agpr_count"));
+ Sym->setVariableValue(MCConstantExpr::create(AgprIndexUnusedMin, *Ctx));
+
+ // Also update vgpr_count (dependent on agpr_count for gfx908/gfx90a)
+ MCSymbol* const vSym =
+ Ctx->getOrCreateSymbol(Twine(".kernel.vgpr_count"));
+ int totalVGPR = getTotalNumVGPRs(isGFX90A(*MSTI), AgprIndexUnusedMin,
+ VgprIndexUnusedMin);
+ vSym->setVariableValue(MCConstantExpr::create(totalVGPR, *Ctx));
}
}
}
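For the .kernel.vgpr_count update above, the AGPR and VGPR high-water marks are combined through AMDGPU::getTotalNumVGPRs, since gfx90a-class targets allocate AGPRs and VGPRs out of a single register file. A rough sketch of that combination, under the assumption that VGPRs are rounded up to an allocation granule before AGPRs are appended (the authoritative rule lives in AMDGPUBaseInfo and may differ in detail):

// Rough sketch only; the real computation is AMDGPU::getTotalNumVGPRs().
// Assumption: with a unified file (gfx90a-like), the VGPR count is aligned up
// to a granule of 4 before AGPRs are appended; otherwise the files are
// separate and the larger of the two counts is reported.
#include <algorithm>

unsigned totalVgprCountSketch(bool UnifiedFile, unsigned NumAgpr, unsigned NumVgpr) {
  if (!UnifiedFile || NumAgpr == 0)
    return std::max(NumAgpr, NumVgpr);
  unsigned AlignedVgpr = (NumVgpr + 3) & ~3u; // align up to a granule of 4
  return AlignedVgpr + NumAgpr;
}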
@@ -1152,16 +1207,29 @@ public:
void initialize(MCContext &Context) {
Ctx = &Context;
+ MSTI = Ctx->getSubtargetInfo();
+
usesSgprAt(SgprIndexUnusedMin = -1);
usesVgprAt(VgprIndexUnusedMin = -1);
+ if (hasMAIInsts(*MSTI)) {
+ usesAgprAt(AgprIndexUnusedMin = -1);
+ }
}
- void usesRegister(RegisterKind RegKind, unsigned DwordRegIndex, unsigned RegWidth) {
+ void usesRegister(RegisterKind RegKind, unsigned DwordRegIndex,
+ unsigned RegWidth) {
switch (RegKind) {
- case IS_SGPR: usesSgprAt(DwordRegIndex + RegWidth - 1); break;
- case IS_AGPR: // fall through
- case IS_VGPR: usesVgprAt(DwordRegIndex + RegWidth - 1); break;
- default: break;
+ case IS_SGPR:
+ usesSgprAt(DwordRegIndex + divideCeil(RegWidth, 32) - 1);
+ break;
+ case IS_AGPR:
+ usesAgprAt(DwordRegIndex + divideCeil(RegWidth, 32) - 1);
+ break;
+ case IS_VGPR:
+ usesVgprAt(DwordRegIndex + divideCeil(RegWidth, 32) - 1);
+ break;
+ default:
+ break;
}
}
};
@@ -1353,10 +1421,15 @@ public:
return AMDGPU::isGFX9(getSTI());
}
+ // TODO: isGFX90A is also true for GFX940. We need to clean it.
bool isGFX90A() const {
return AMDGPU::isGFX90A(getSTI());
}
+ bool isGFX940() const {
+ return AMDGPU::isGFX940(getSTI());
+ }
+
bool isGFX9Plus() const {
return AMDGPU::isGFX9Plus(getSTI());
}
@@ -1367,6 +1440,14 @@ public:
bool isGFX10Plus() const { return AMDGPU::isGFX10Plus(getSTI()); }
+ bool isGFX11() const {
+ return AMDGPU::isGFX11(getSTI());
+ }
+
+ bool isGFX11Plus() const {
+ return AMDGPU::isGFX11Plus(getSTI());
+ }
+
bool isGFX10_BEncoding() const {
return AMDGPU::isGFX10_BEncoding(getSTI());
}
@@ -1496,6 +1577,14 @@ public:
bool parseCnt(int64_t &IntVal);
OperandMatchResultTy parseSWaitCntOps(OperandVector &Operands);
+
+ bool parseDepCtr(int64_t &IntVal, unsigned &Mask);
+ void depCtrError(SMLoc Loc, int ErrorId, StringRef DepCtrName);
+ OperandMatchResultTy parseDepCtrOps(OperandVector &Operands);
+
+ bool parseDelay(int64_t &Delay);
+ OperandMatchResultTy parseSDelayAluOps(OperandVector &Operands);
+
OperandMatchResultTy parseHwreg(OperandVector &Operands);
private:
@@ -1522,6 +1611,7 @@ private:
SMLoc getFlatOffsetLoc(const OperandVector &Operands) const;
SMLoc getSMEMOffsetLoc(const OperandVector &Operands) const;
+ SMLoc getBLGPLoc(const OperandVector &Operands) const;
SMLoc getOperandLoc(std::function<bool(const AMDGPUOperand&)> Test,
const OperandVector &Operands) const;
@@ -1540,7 +1630,7 @@ private:
bool validateMIMGAtomicDMask(const MCInst &Inst);
bool validateMIMGGatherDMask(const MCInst &Inst);
bool validateMovrels(const MCInst &Inst, const OperandVector &Operands);
- bool validateMIMGDataSize(const MCInst &Inst);
+ Optional<StringRef> validateMIMGDataSize(const MCInst &Inst);
bool validateMIMGAddrSize(const MCInst &Inst);
bool validateMIMGD16(const MCInst &Inst);
bool validateMIMGDim(const MCInst &Inst);
@@ -1553,10 +1643,14 @@ private:
bool validateMFMA(const MCInst &Inst, const OperandVector &Operands);
bool validateAGPRLdSt(const MCInst &Inst) const;
bool validateVGPRAlign(const MCInst &Inst) const;
+ bool validateBLGP(const MCInst &Inst, const OperandVector &Operands);
bool validateGWS(const MCInst &Inst, const OperandVector &Operands);
bool validateDivScale(const MCInst &Inst);
bool validateCoherencyBits(const MCInst &Inst, const OperandVector &Operands,
const SMLoc &IDLoc);
+ bool validateFlatLdsDMA(const MCInst &Inst, const OperandVector &Operands,
+ const SMLoc &IDLoc);
+ bool validateExeczVcczOperands(const OperandVector &Operands);
Optional<StringRef> validateLdsDirect(const MCInst &Inst);
unsigned getConstantBusLimit(unsigned Opcode) const;
bool usesConstantBus(const MCInst &Inst, unsigned OpIdx);
@@ -1586,7 +1680,7 @@ private:
bool parseExpr(int64_t &Imm, StringRef Expected = "");
bool parseExpr(OperandVector &Operands);
StringRef getTokenStr() const;
- AsmToken peekToken();
+ AsmToken peekToken(bool ShouldSkipSpace = true);
AsmToken getToken() const;
SMLoc getLoc() const;
void lex();
@@ -1644,10 +1738,12 @@ public:
void cvtVOP3OpSel(MCInst &Inst, const OperandVector &Operands);
void cvtVOP3(MCInst &Inst, const OperandVector &Operands);
void cvtVOP3P(MCInst &Inst, const OperandVector &Operands);
+ void cvtVOPD(MCInst &Inst, const OperandVector &Operands);
void cvtVOP3P(MCInst &Inst, const OperandVector &Operands,
OptionalImmIndexMap &OptionalIdx);
void cvtVOP3Interp(MCInst &Inst, const OperandVector &Operands);
+ void cvtVINTERP(MCInst &Inst, const OperandVector &Operands);
void cvtMIMG(MCInst &Inst, const OperandVector &Operands,
bool IsAtomic = false);
@@ -1668,7 +1764,24 @@ public:
AMDGPUOperand::Ptr defaultBoundCtrl() const;
AMDGPUOperand::Ptr defaultFI() const;
void cvtDPP(MCInst &Inst, const OperandVector &Operands, bool IsDPP8 = false);
- void cvtDPP8(MCInst &Inst, const OperandVector &Operands) { cvtDPP(Inst, Operands, true); }
+ void cvtDPP8(MCInst &Inst, const OperandVector &Operands) {
+ cvtDPP(Inst, Operands, true);
+ }
+ void cvtVOPCNoDstDPP(MCInst &Inst, const OperandVector &Operands,
+ bool IsDPP8 = false);
+ void cvtVOPCNoDstDPP8(MCInst &Inst, const OperandVector &Operands) {
+ cvtVOPCNoDstDPP(Inst, Operands, true);
+ }
+ void cvtVOP3DPP(MCInst &Inst, const OperandVector &Operands,
+ bool IsDPP8 = false);
+ void cvtVOP3DPP8(MCInst &Inst, const OperandVector &Operands) {
+ cvtVOP3DPP(Inst, Operands, true);
+ }
+ void cvtVOPC64NoDstDPP(MCInst &Inst, const OperandVector &Operands,
+ bool IsDPP8 = false);
+ void cvtVOPC64NoDstDPP8(MCInst &Inst, const OperandVector &Operands) {
+ cvtVOPC64NoDstDPP(Inst, Operands, true);
+ }
OperandMatchResultTy parseSDWASel(OperandVector &Operands, StringRef Prefix,
AMDGPUOperand::ImmTy Type);
@@ -1689,6 +1802,10 @@ public:
OperandMatchResultTy parseEndpgmOp(OperandVector &Operands);
AMDGPUOperand::Ptr defaultEndpgmImmOperands() const;
+
+ AMDGPUOperand::Ptr defaultWaitVDST() const;
+ AMDGPUOperand::Ptr defaultWaitEXP() const;
+ OperandMatchResultTy parseVOPD(OperandVector &Operands);
};
struct OptionalOperand {
@@ -1897,7 +2014,7 @@ bool AMDGPUOperand::isLiteralImm(MVT type) const {
// We allow fp literals with f16x2 operands assuming that the specified
// literal goes into the lower half and the upper half is zero. We also
- // require that the literal may be losslesly converted to f16.
+ // require that the literal may be losslessly converted to f16.
MVT ExpectedType = (type == MVT::v2f16)? MVT::f16 :
(type == MVT::v2i16)? MVT::i16 :
(type == MVT::v2f32)? MVT::f32 : type;
@@ -2211,52 +2328,86 @@ static int getRegClass(RegisterKind Is, unsigned RegWidth) {
if (Is == IS_VGPR) {
switch (RegWidth) {
default: return -1;
- case 1: return AMDGPU::VGPR_32RegClassID;
- case 2: return AMDGPU::VReg_64RegClassID;
- case 3: return AMDGPU::VReg_96RegClassID;
- case 4: return AMDGPU::VReg_128RegClassID;
- case 5: return AMDGPU::VReg_160RegClassID;
- case 6: return AMDGPU::VReg_192RegClassID;
- case 7: return AMDGPU::VReg_224RegClassID;
- case 8: return AMDGPU::VReg_256RegClassID;
- case 16: return AMDGPU::VReg_512RegClassID;
- case 32: return AMDGPU::VReg_1024RegClassID;
+ case 32:
+ return AMDGPU::VGPR_32RegClassID;
+ case 64:
+ return AMDGPU::VReg_64RegClassID;
+ case 96:
+ return AMDGPU::VReg_96RegClassID;
+ case 128:
+ return AMDGPU::VReg_128RegClassID;
+ case 160:
+ return AMDGPU::VReg_160RegClassID;
+ case 192:
+ return AMDGPU::VReg_192RegClassID;
+ case 224:
+ return AMDGPU::VReg_224RegClassID;
+ case 256:
+ return AMDGPU::VReg_256RegClassID;
+ case 512:
+ return AMDGPU::VReg_512RegClassID;
+ case 1024:
+ return AMDGPU::VReg_1024RegClassID;
}
} else if (Is == IS_TTMP) {
switch (RegWidth) {
default: return -1;
- case 1: return AMDGPU::TTMP_32RegClassID;
- case 2: return AMDGPU::TTMP_64RegClassID;
- case 4: return AMDGPU::TTMP_128RegClassID;
- case 8: return AMDGPU::TTMP_256RegClassID;
- case 16: return AMDGPU::TTMP_512RegClassID;
+ case 32:
+ return AMDGPU::TTMP_32RegClassID;
+ case 64:
+ return AMDGPU::TTMP_64RegClassID;
+ case 128:
+ return AMDGPU::TTMP_128RegClassID;
+ case 256:
+ return AMDGPU::TTMP_256RegClassID;
+ case 512:
+ return AMDGPU::TTMP_512RegClassID;
}
} else if (Is == IS_SGPR) {
switch (RegWidth) {
default: return -1;
- case 1: return AMDGPU::SGPR_32RegClassID;
- case 2: return AMDGPU::SGPR_64RegClassID;
- case 3: return AMDGPU::SGPR_96RegClassID;
- case 4: return AMDGPU::SGPR_128RegClassID;
- case 5: return AMDGPU::SGPR_160RegClassID;
- case 6: return AMDGPU::SGPR_192RegClassID;
- case 7: return AMDGPU::SGPR_224RegClassID;
- case 8: return AMDGPU::SGPR_256RegClassID;
- case 16: return AMDGPU::SGPR_512RegClassID;
+ case 32:
+ return AMDGPU::SGPR_32RegClassID;
+ case 64:
+ return AMDGPU::SGPR_64RegClassID;
+ case 96:
+ return AMDGPU::SGPR_96RegClassID;
+ case 128:
+ return AMDGPU::SGPR_128RegClassID;
+ case 160:
+ return AMDGPU::SGPR_160RegClassID;
+ case 192:
+ return AMDGPU::SGPR_192RegClassID;
+ case 224:
+ return AMDGPU::SGPR_224RegClassID;
+ case 256:
+ return AMDGPU::SGPR_256RegClassID;
+ case 512:
+ return AMDGPU::SGPR_512RegClassID;
}
} else if (Is == IS_AGPR) {
switch (RegWidth) {
default: return -1;
- case 1: return AMDGPU::AGPR_32RegClassID;
- case 2: return AMDGPU::AReg_64RegClassID;
- case 3: return AMDGPU::AReg_96RegClassID;
- case 4: return AMDGPU::AReg_128RegClassID;
- case 5: return AMDGPU::AReg_160RegClassID;
- case 6: return AMDGPU::AReg_192RegClassID;
- case 7: return AMDGPU::AReg_224RegClassID;
- case 8: return AMDGPU::AReg_256RegClassID;
- case 16: return AMDGPU::AReg_512RegClassID;
- case 32: return AMDGPU::AReg_1024RegClassID;
+ case 32:
+ return AMDGPU::AGPR_32RegClassID;
+ case 64:
+ return AMDGPU::AReg_64RegClassID;
+ case 96:
+ return AMDGPU::AReg_96RegClassID;
+ case 128:
+ return AMDGPU::AReg_128RegClassID;
+ case 160:
+ return AMDGPU::AReg_160RegClassID;
+ case 192:
+ return AMDGPU::AReg_192RegClassID;
+ case 224:
+ return AMDGPU::AReg_224RegClassID;
+ case 256:
+ return AMDGPU::AReg_256RegClassID;
+ case 512:
+ return AMDGPU::AReg_512RegClassID;
+ case 1024:
+ return AMDGPU::AReg_1024RegClassID;
}
}
return -1;
@@ -2343,32 +2494,32 @@ bool AMDGPUAsmParser::AddNextRegisterToList(unsigned &Reg, unsigned &RegWidth,
case IS_SPECIAL:
if (Reg == AMDGPU::EXEC_LO && Reg1 == AMDGPU::EXEC_HI) {
Reg = AMDGPU::EXEC;
- RegWidth = 2;
+ RegWidth = 64;
return true;
}
if (Reg == AMDGPU::FLAT_SCR_LO && Reg1 == AMDGPU::FLAT_SCR_HI) {
Reg = AMDGPU::FLAT_SCR;
- RegWidth = 2;
+ RegWidth = 64;
return true;
}
if (Reg == AMDGPU::XNACK_MASK_LO && Reg1 == AMDGPU::XNACK_MASK_HI) {
Reg = AMDGPU::XNACK_MASK;
- RegWidth = 2;
+ RegWidth = 64;
return true;
}
if (Reg == AMDGPU::VCC_LO && Reg1 == AMDGPU::VCC_HI) {
Reg = AMDGPU::VCC;
- RegWidth = 2;
+ RegWidth = 64;
return true;
}
if (Reg == AMDGPU::TBA_LO && Reg1 == AMDGPU::TBA_HI) {
Reg = AMDGPU::TBA;
- RegWidth = 2;
+ RegWidth = 64;
return true;
}
if (Reg == AMDGPU::TMA_LO && Reg1 == AMDGPU::TMA_HI) {
Reg = AMDGPU::TMA;
- RegWidth = 2;
+ RegWidth = 64;
return true;
}
Error(Loc, "register does not fit in the list");
@@ -2377,11 +2528,11 @@ bool AMDGPUAsmParser::AddNextRegisterToList(unsigned &Reg, unsigned &RegWidth,
case IS_SGPR:
case IS_AGPR:
case IS_TTMP:
- if (Reg1 != Reg + RegWidth) {
+ if (Reg1 != Reg + RegWidth / 32) {
Error(Loc, "registers in a list must have consecutive indices");
return false;
}
- RegWidth++;
+ RegWidth += 32;
return true;
default:
llvm_unreachable("unexpected register kind");
@@ -2470,7 +2621,7 @@ AMDGPUAsmParser::getRegularReg(RegisterKind RegKind,
if (RegKind == IS_SGPR || RegKind == IS_TTMP) {
// SGPR and TTMP registers must be aligned.
// Max required alignment is 4 dwords.
- AlignSize = std::min(RegWidth, 4u);
+ AlignSize = std::min(RegWidth / 32, 4u);
}
if (RegNum % AlignSize != 0) {
@@ -2495,8 +2646,7 @@ AMDGPUAsmParser::getRegularReg(RegisterKind RegKind,
return RC.getRegister(RegIdx);
}
-bool
-AMDGPUAsmParser::ParseRegRange(unsigned& Num, unsigned& Width) {
+bool AMDGPUAsmParser::ParseRegRange(unsigned &Num, unsigned &RegWidth) {
int64_t RegLo, RegHi;
if (!skipToken(AsmToken::LBrac, "missing register index"))
return false;
@@ -2534,7 +2684,7 @@ AMDGPUAsmParser::ParseRegRange(unsigned& Num, unsigned& Width) {
}
Num = static_cast<unsigned>(RegLo);
- Width = (RegHi - RegLo) + 1;
+ RegWidth = 32 * ((RegHi - RegLo) + 1);
return true;
}
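Register widths now flow through the parser in bits rather than dword counts, so a range like v[0:3] produces RegWidth = 32 * (3 - 0 + 1) = 128, which getRegClass maps to AMDGPU::VReg_128RegClassID. A small self-contained sketch of that arithmetic (the class names in the comments are just the mappings visible in the switch above):

// Sketch of the width-in-bits convention used above; only the arithmetic is
// the point here, the class-ID mapping itself is done by getRegClass().
#include <cassert>

unsigned regRangeWidthInBits(unsigned RegLo, unsigned RegHi) {
  assert(RegHi >= RegLo && "range must be increasing");
  return 32 * (RegHi - RegLo + 1);
}

// e.g. v[0:3]  -> regRangeWidthInBits(0, 3)  == 128  (VReg_128)
//      s[4:5]  -> regRangeWidthInBits(4, 5)  == 64   (SGPR_64)
//      a[0:31] -> regRangeWidthInBits(0, 31) == 1024 (AReg_1024)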
@@ -2545,7 +2695,7 @@ unsigned AMDGPUAsmParser::ParseSpecialReg(RegisterKind &RegKind,
unsigned Reg = getSpecialRegForName(getTokenStr());
if (Reg) {
RegNum = 0;
- RegWidth = 1;
+ RegWidth = 32;
RegKind = IS_SPECIAL;
Tokens.push_back(getToken());
lex(); // skip register name
@@ -2577,7 +2727,7 @@ unsigned AMDGPUAsmParser::ParseRegularReg(RegisterKind &RegKind,
Error(Loc, "invalid register index");
return AMDGPU::NoRegister;
}
- RegWidth = 1;
+ RegWidth = 32;
} else {
// Range of registers: v[XX:YY]. ":YY" is optional.
if (!ParseRegRange(RegNum, RegWidth))
@@ -2603,7 +2753,7 @@ unsigned AMDGPUAsmParser::ParseRegList(RegisterKind &RegKind, unsigned &RegNum,
auto Loc = getLoc();
if (!ParseAMDGPURegister(RegKind, Reg, RegNum, RegWidth))
return AMDGPU::NoRegister;
- if (RegWidth != 1) {
+ if (RegWidth != 32) {
Error(Loc, "expected a single 32-bit register");
return AMDGPU::NoRegister;
}
@@ -2618,7 +2768,7 @@ unsigned AMDGPUAsmParser::ParseRegList(RegisterKind &RegKind, unsigned &RegNum,
Tokens)) {
return AMDGPU::NoRegister;
}
- if (NextRegWidth != 1) {
+ if (NextRegWidth != 32) {
Error(Loc, "expected a single 32-bit register");
return AMDGPU::NoRegister;
}
@@ -2721,7 +2871,7 @@ bool AMDGPUAsmParser::updateGprCountSymbols(RegisterKind RegKind,
return true;
MCSymbol *Sym = getContext().getOrCreateSymbol(*SymbolName);
- int64_t NewMax = DwordRegIndex + RegWidth - 1;
+ int64_t NewMax = DwordRegIndex + divideCeil(RegWidth, 32) - 1;
int64_t OldCount;
if (!Sym->isVariable())
@@ -2761,7 +2911,8 @@ OperandMatchResultTy
AMDGPUAsmParser::parseImm(OperandVector &Operands, bool HasSP3AbsModifier) {
// TODO: add syntactic sugar for 1/(2*PI)
- assert(!isRegister());
+ if (isRegister())
+ return MatchOperand_NoMatch;
assert(!isModifier());
const auto& Tok = getToken();
@@ -2927,7 +3078,7 @@ AMDGPUAsmParser::isModifier() {
// v_exp_f32_e32 v5, -1 // VOP1: src0 = 0xFFFFFFFF
// v_exp_f32_e64 v5, -1 // VOP3: src0 = 0x80000001
// Negative fp literals with preceding "-" are
-// handled likewise for unifomtity
+// handled likewise for uniformity
//
bool
AMDGPUAsmParser::parseSP3NegModifier() {
@@ -3110,7 +3261,8 @@ unsigned AMDGPUAsmParser::checkTargetMatchPredicate(MCInst &Inst) {
static ArrayRef<unsigned> getAllVariants() {
static const unsigned Variants[] = {
AMDGPUAsmVariants::DEFAULT, AMDGPUAsmVariants::VOP3,
- AMDGPUAsmVariants::SDWA, AMDGPUAsmVariants::SDWA9, AMDGPUAsmVariants::DPP
+ AMDGPUAsmVariants::SDWA, AMDGPUAsmVariants::SDWA9,
+ AMDGPUAsmVariants::DPP, AMDGPUAsmVariants::VOP3_DPP
};
return makeArrayRef(Variants);
@@ -3118,6 +3270,10 @@ static ArrayRef<unsigned> getAllVariants() {
// What asm variants we should check
ArrayRef<unsigned> AMDGPUAsmParser::getMatchedVariants() const {
+ if (isForcedDPP() && isForcedVOP3()) {
+ static const unsigned Variants[] = {AMDGPUAsmVariants::VOP3_DPP};
+ return makeArrayRef(Variants);
+ }
if (getForcedEncodingSize() == 32) {
static const unsigned Variants[] = {AMDGPUAsmVariants::DEFAULT};
return makeArrayRef(Variants);
@@ -3143,6 +3299,9 @@ ArrayRef<unsigned> AMDGPUAsmParser::getMatchedVariants() const {
}
StringRef AMDGPUAsmParser::getMatchedVariantName() const {
+ if (isForcedDPP() && isForcedVOP3())
+ return "e64_dpp";
+
if (getForcedEncodingSize() == 32)
return "e32";
@@ -3231,10 +3390,13 @@ unsigned AMDGPUAsmParser::getConstantBusLimit(unsigned Opcode) const {
// 64-bit shift instructions can use only one scalar value input
case AMDGPU::V_LSHLREV_B64_e64:
case AMDGPU::V_LSHLREV_B64_gfx10:
+ case AMDGPU::V_LSHLREV_B64_e64_gfx11:
case AMDGPU::V_LSHRREV_B64_e64:
case AMDGPU::V_LSHRREV_B64_gfx10:
+ case AMDGPU::V_LSHRREV_B64_e64_gfx11:
case AMDGPU::V_ASHRREV_I64_e64:
case AMDGPU::V_ASHRREV_I64_gfx10:
+ case AMDGPU::V_ASHRREV_I64_e64_gfx11:
case AMDGPU::V_LSHL_B64_e64:
case AMDGPU::V_LSHR_B64_e64:
case AMDGPU::V_ASHR_I64_e64:
@@ -3305,8 +3467,7 @@ AMDGPUAsmParser::validateConstantBusLimitations(const MCInst &Inst,
// flat_scratch_lo, flat_scratch_hi
// are theoretically valid but they are disabled anyway.
// Note that this code mimics SIInstrInfo::verifyInstruction
- if (!SGPRsUsed.count(LastSGPR)) {
- SGPRsUsed.insert(LastSGPR);
+ if (SGPRsUsed.insert(LastSGPR).second) {
++ConstantBusUseCount;
}
} else { // Expression or a literal
@@ -3369,7 +3530,6 @@ AMDGPUAsmParser::validateEarlyClobberLimitations(const MCInst &Inst,
assert(DstIdx != -1);
const MCOperand &Dst = Inst.getOperand(DstIdx);
assert(Dst.isReg());
- const unsigned DstReg = mc2PseudoReg(Dst.getReg());
const int SrcIndices[] = { Src0Idx, Src1Idx, Src2Idx };
@@ -3377,8 +3537,8 @@ AMDGPUAsmParser::validateEarlyClobberLimitations(const MCInst &Inst,
if (SrcIdx == -1) break;
const MCOperand &Src = Inst.getOperand(SrcIdx);
if (Src.isReg()) {
- const unsigned SrcReg = mc2PseudoReg(Src.getReg());
- if (isRegIntersect(DstReg, SrcReg, TRI)) {
+ if (TRI->regsOverlap(Dst.getReg(), Src.getReg())) {
+ const unsigned SrcReg = mc2PseudoReg(Src.getReg());
Error(getRegLoc(SrcReg, Operands),
"destination must be different than all sources");
return false;
@@ -3403,13 +3563,13 @@ bool AMDGPUAsmParser::validateIntClampSupported(const MCInst &Inst) {
return true;
}
-bool AMDGPUAsmParser::validateMIMGDataSize(const MCInst &Inst) {
+Optional<StringRef> AMDGPUAsmParser::validateMIMGDataSize(const MCInst &Inst) {
const unsigned Opc = Inst.getOpcode();
const MCInstrDesc &Desc = MII.get(Opc);
if ((Desc.TSFlags & SIInstrFlags::MIMG) == 0)
- return true;
+ return None;
int VDataIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata);
int DMaskIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::dmask);
@@ -3418,7 +3578,7 @@ bool AMDGPUAsmParser::validateMIMGDataSize(const MCInst &Inst) {
assert(VDataIdx != -1);
if (DMaskIdx == -1 || TFEIdx == -1) // intersect_ray
- return true;
+ return None;
unsigned VDataSize = AMDGPU::getRegOperandSize(getMRI(), Desc, VDataIdx);
unsigned TFESize = (TFEIdx != -1 && Inst.getOperand(TFEIdx).getImm()) ? 1 : 0;
@@ -3426,15 +3586,22 @@ bool AMDGPUAsmParser::validateMIMGDataSize(const MCInst &Inst) {
if (DMask == 0)
DMask = 1;
+ bool isPackedD16 = false;
unsigned DataSize =
(Desc.TSFlags & SIInstrFlags::Gather4) ? 4 : countPopulation(DMask);
if (hasPackedD16()) {
int D16Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::d16);
- if (D16Idx >= 0 && Inst.getOperand(D16Idx).getImm())
+ isPackedD16 = D16Idx >= 0;
+ if (isPackedD16 && Inst.getOperand(D16Idx).getImm())
DataSize = (DataSize + 1) / 2;
}
- return (VDataSize / 4) == DataSize + TFESize;
+ if ((VDataSize / 4) == DataSize + TFESize)
+ return None;
+
+ return StringRef(isPackedD16
+ ? "image data size does not match dmask, d16 and tfe"
+ : "image data size does not match dmask and tfe");
}
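The new diagnostic only distinguishes whether packed D16 participated in the size calculation; the rule itself is unchanged: the vdata register tuple must provide one dword per enabled dmask component (four for gather4), halved and rounded up when packed D16 is in effect, plus one extra dword when tfe is set. A hedged sketch of just that size rule:

// Sketch of the size rule validated above (not the validator itself).
unsigned popcount4(unsigned DMask) {           // dmask is a 4-bit field
  unsigned N = 0;
  for (unsigned I = 0; I < 4; ++I)
    N += (DMask >> I) & 1;
  return N;
}

unsigned expectedVDataDwords(unsigned DMask, bool Gather4, bool PackedD16, bool TFE) {
  unsigned Data = Gather4 ? 4 : popcount4(DMask ? DMask : 1); // dmask 0 behaves as 1
  if (PackedD16)
    Data = (Data + 1) / 2;                     // two 16-bit components per dword
  return Data + (TFE ? 1 : 0);
}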
bool AMDGPUAsmParser::validateMIMGAddrSize(const MCInst &Inst) {
@@ -3607,7 +3774,7 @@ bool AMDGPUAsmParser::validateMAIAccWrite(const MCInst &Inst,
auto Reg = mc2PseudoReg(Src0.getReg());
const MCRegisterInfo *TRI = getContext().getRegisterInfo();
- if (isSGPR(Reg, TRI)) {
+ if (!isGFX90A() && isSGPR(Reg, TRI)) {
Error(getRegLoc(Reg, Operands),
"source operand must be either a VGPR or an inline constant");
return false;
@@ -3641,7 +3808,7 @@ bool AMDGPUAsmParser::validateMFMA(const MCInst &Inst,
if (TRI->getRegClass(Desc.OpInfo[0].RegClass).getSizeInBits() <= 128)
return true;
- if (isRegIntersect(Src2Reg, DstReg, TRI)) {
+ if (TRI->regsOverlap(Src2Reg, DstReg)) {
Error(getRegLoc(mc2PseudoReg(Src2Reg), Operands),
"source 2 operand must not partially overlap with dst");
return false;
@@ -3861,7 +4028,7 @@ Optional<StringRef> AMDGPUAsmParser::validateLdsDirect(const MCInst &Inst) {
const auto &Src = Inst.getOperand(SrcIdx);
if (Src.isReg() && Src.getReg() == LDS_DIRECT) {
- if (isGFX90A())
+ if (isGFX90A() || isGFX11Plus())
return StringRef("lds_direct is not supported on this GPU");
if (IsRevOpcode(Opcode) || (Desc.TSFlags & SIInstrFlags::SDWA))
@@ -4009,6 +4176,20 @@ bool AMDGPUAsmParser::validateOpSel(const MCInst &Inst) {
if (OpSel & ~3)
return false;
}
+
+ if (isGFX940() && (MII.get(Opc).TSFlags & SIInstrFlags::IsDOT)) {
+ int OpSelIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::op_sel);
+ if (OpSelIdx != -1) {
+ if (Inst.getOperand(OpSelIdx).getImm() != 0)
+ return false;
+ }
+ int OpSelHiIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::op_sel_hi);
+ if (OpSelHiIdx != -1) {
+ if (Inst.getOperand(OpSelHiIdx).getImm() != -1)
+ return false;
+ }
+ }
+
return true;
}
@@ -4179,6 +4360,47 @@ bool AMDGPUAsmParser::validateVGPRAlign(const MCInst &Inst) const {
return true;
}
+SMLoc AMDGPUAsmParser::getBLGPLoc(const OperandVector &Operands) const {
+ for (unsigned i = 1, e = Operands.size(); i != e; ++i) {
+ AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[i]);
+ if (Op.isBLGP())
+ return Op.getStartLoc();
+ }
+ return SMLoc();
+}
+
+bool AMDGPUAsmParser::validateBLGP(const MCInst &Inst,
+ const OperandVector &Operands) {
+ unsigned Opc = Inst.getOpcode();
+ int BlgpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::blgp);
+ if (BlgpIdx == -1)
+ return true;
+ SMLoc BLGPLoc = getBLGPLoc(Operands);
+ if (!BLGPLoc.isValid())
+ return true;
+ bool IsNeg = StringRef(BLGPLoc.getPointer()).startswith("neg:");
+ auto FB = getFeatureBits();
+ bool UsesNeg = false;
+ if (FB[AMDGPU::FeatureGFX940Insts]) {
+ switch (Opc) {
+ case AMDGPU::V_MFMA_F64_16X16X4F64_gfx940_acd:
+ case AMDGPU::V_MFMA_F64_16X16X4F64_gfx940_vcd:
+ case AMDGPU::V_MFMA_F64_4X4X4F64_gfx940_acd:
+ case AMDGPU::V_MFMA_F64_4X4X4F64_gfx940_vcd:
+ UsesNeg = true;
+ }
+ }
+
+ if (IsNeg == UsesNeg)
+ return true;
+
+ Error(BLGPLoc,
+ UsesNeg ? "invalid modifier: blgp is not supported"
+ : "invalid modifier: neg is not supported");
+
+ return false;
+}
+
// gfx90a has an undocumented limitation:
// DS_GWS opcodes must use even aligned registers.
bool AMDGPUAsmParser::validateGWS(const MCInst &Inst,
@@ -4218,13 +4440,19 @@ bool AMDGPUAsmParser::validateCoherencyBits(const MCInst &Inst,
unsigned CPol = Inst.getOperand(CPolPos).getImm();
uint64_t TSFlags = MII.get(Inst.getOpcode()).TSFlags;
- if ((TSFlags & (SIInstrFlags::SMRD)) &&
- (CPol & ~(AMDGPU::CPol::GLC | AMDGPU::CPol::DLC))) {
- Error(IDLoc, "invalid cache policy for SMRD instruction");
- return false;
+ if (TSFlags & SIInstrFlags::SMRD) {
+ if (CPol && (isSI() || isCI())) {
+ SMLoc S = getImmLoc(AMDGPUOperand::ImmTyCPol, Operands);
+ Error(S, "cache policy is not supported for SMRD instructions");
+ return false;
+ }
+ if (CPol & ~(AMDGPU::CPol::GLC | AMDGPU::CPol::DLC)) {
+ Error(IDLoc, "invalid cache policy for SMEM instruction");
+ return false;
+ }
}
- if (isGFX90A() && (CPol & CPol::SCC)) {
+ if (isGFX90A() && !isGFX940() && (CPol & CPol::SCC)) {
SMLoc S = getImmLoc(AMDGPUOperand::ImmTyCPol, Operands);
StringRef CStr(S.getPointer());
S = SMLoc::getFromPointer(&CStr.data()[CStr.find("scc")]);
@@ -4237,15 +4465,18 @@ bool AMDGPUAsmParser::validateCoherencyBits(const MCInst &Inst,
if (TSFlags & SIInstrFlags::IsAtomicRet) {
if (!(TSFlags & SIInstrFlags::MIMG) && !(CPol & CPol::GLC)) {
- Error(IDLoc, "instruction must use glc");
+ Error(IDLoc, isGFX940() ? "instruction must use sc0"
+ : "instruction must use glc");
return false;
}
} else {
if (CPol & CPol::GLC) {
SMLoc S = getImmLoc(AMDGPUOperand::ImmTyCPol, Operands);
StringRef CStr(S.getPointer());
- S = SMLoc::getFromPointer(&CStr.data()[CStr.find("glc")]);
- Error(S, "instruction must not use glc");
+ S = SMLoc::getFromPointer(
+ &CStr.data()[CStr.find(isGFX940() ? "sc0" : "glc")]);
+ Error(S, isGFX940() ? "instruction must not use sc0"
+ : "instruction must not use glc");
return false;
}
}
@@ -4253,6 +4484,47 @@ bool AMDGPUAsmParser::validateCoherencyBits(const MCInst &Inst,
return true;
}
+bool AMDGPUAsmParser::validateFlatLdsDMA(const MCInst &Inst,
+ const OperandVector &Operands,
+ const SMLoc &IDLoc) {
+ if (isGFX940())
+ return true;
+
+ uint64_t TSFlags = MII.get(Inst.getOpcode()).TSFlags;
+ if ((TSFlags & (SIInstrFlags::VALU | SIInstrFlags::FLAT)) !=
+ (SIInstrFlags::VALU | SIInstrFlags::FLAT))
+ return true;
+ // This is FLAT LDS DMA.
+
+ SMLoc S = getImmLoc(AMDGPUOperand::ImmTyLDS, Operands);
+ StringRef CStr(S.getPointer());
+ if (!CStr.startswith("lds")) {
+    // This is an incorrectly selected LDS DMA version of a FLAT load opcode.
+    // The LDS version should have the 'lds' modifier, but it follows optional
+    // operands, so its absence is ignored by the matcher.
+ Error(IDLoc, "invalid operands for instruction");
+ return false;
+ }
+
+ return true;
+}
+
+bool AMDGPUAsmParser::validateExeczVcczOperands(const OperandVector &Operands) {
+ if (!isGFX11Plus())
+ return true;
+ for (auto &Operand : Operands) {
+ if (!Operand->isReg())
+ continue;
+ unsigned Reg = Operand->getReg();
+ if (Reg == SRC_EXECZ || Reg == SRC_VCCZ) {
+ Error(getRegLoc(Reg, Operands),
+ "execz and vccz are not supported on this GPU");
+ return false;
+ }
+ }
+ return true;
+}
+
bool AMDGPUAsmParser::validateInstruction(const MCInst &Inst,
const SMLoc &IDLoc,
const OperandVector &Operands) {
@@ -4302,9 +4574,8 @@ bool AMDGPUAsmParser::validateInstruction(const MCInst &Inst,
"invalid dim; must be MSAA type");
return false;
}
- if (!validateMIMGDataSize(Inst)) {
- Error(IDLoc,
- "image data size does not match dmask and tfe");
+ if (auto ErrMsg = validateMIMGDataSize(Inst)) {
+ Error(IDLoc, *ErrMsg);
return false;
}
if (!validateMIMGAddrSize(Inst)) {
@@ -4357,6 +4628,10 @@ bool AMDGPUAsmParser::validateInstruction(const MCInst &Inst,
return false;
}
+ if (!validateBLGP(Inst, Operands)) {
+ return false;
+ }
+
if (!validateDivScale(Inst)) {
Error(IDLoc, "ABS not allowed in VOP3B instructions");
return false;
@@ -4364,6 +4639,13 @@ bool AMDGPUAsmParser::validateInstruction(const MCInst &Inst,
if (!validateCoherencyBits(Inst, Operands, IDLoc)) {
return false;
}
+ if (!validateExeczVcczOperands(Operands)) {
+ return false;
+ }
+
+ if (!validateFlatLdsDMA(Inst, Operands, IDLoc)) {
+ return false;
+ }
return true;
}
@@ -4606,6 +4888,7 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() {
SMRange VGPRRange;
uint64_t NextFreeVGPR = 0;
uint64_t AccumOffset = 0;
+ uint64_t SharedVGPRCount = 0;
SMRange SGPRRange;
uint64_t NextFreeSGPR = 0;
@@ -4630,9 +4913,8 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() {
if (ID == ".end_amdhsa_kernel")
break;
- if (Seen.find(ID) != Seen.end())
+ if (!Seen.insert(ID).second)
return TokError(".amdhsa_ directives cannot be repeated");
- Seen.insert(ID);
SMLoc ValStart = getLoc();
int64_t IVal;
@@ -4833,6 +5115,13 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() {
return Error(IDRange.Start, "directive requires gfx10+", IDRange);
PARSE_BITS_ENTRY(KD.compute_pgm_rsrc1, COMPUTE_PGM_RSRC1_FWD_PROGRESS, Val,
ValRange);
+ } else if (ID == ".amdhsa_shared_vgpr_count") {
+ if (IVersion.Major < 10)
+ return Error(IDRange.Start, "directive requires gfx10+", IDRange);
+ SharedVGPRCount = Val;
+ PARSE_BITS_ENTRY(KD.compute_pgm_rsrc3,
+ COMPUTE_PGM_RSRC3_GFX10_PLUS_SHARED_VGPR_COUNT, Val,
+ ValRange);
} else if (ID == ".amdhsa_exception_fp_ieee_invalid_op") {
PARSE_BITS_ENTRY(
KD.compute_pgm_rsrc2,
@@ -4922,6 +5211,19 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() {
(AccumOffset / 4 - 1));
}
+ if (IVersion.Major == 10) {
+ // SharedVGPRCount < 16 checked by PARSE_ENTRY_BITS
+ if (SharedVGPRCount && EnableWavefrontSize32) {
+ return TokError("shared_vgpr_count directive not valid on "
+ "wavefront size 32");
+ }
+ if (SharedVGPRCount * 2 + VGPRBlocks > 63) {
+ return TokError("shared_vgpr_count*2 + "
+ "compute_pgm_rsrc1.GRANULATED_WORKITEM_VGPR_COUNT cannot "
+ "exceed 63\n");
+ }
+ }
+
getTargetStreamer().EmitAmdhsaKernelDescriptor(
getSTI(), KernelName, KD, NextFreeVGPR, NextFreeSGPR, ReserveVCC,
ReserveFlatScr);
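The .amdhsa_shared_vgpr_count check above encodes a gfx10 wave64 packing constraint: twice the shared VGPR count and the granulated workitem VGPR count draw from a common budget of 63 (and shared VGPRs are rejected outright for wavefront size 32). A minimal sketch of that check:

// Minimal sketch of the budget check performed for .amdhsa_shared_vgpr_count.
#include <cstdint>

bool sharedVgprBudgetOk(uint64_t SharedVGPRCount, uint64_t VGPRBlocks) {
  // shared_vgpr_count*2 + GRANULATED_WORKITEM_VGPR_COUNT must not exceed 63.
  return SharedVGPRCount * 2 + VGPRBlocks <= 63;
}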
@@ -5253,8 +5555,7 @@ bool AMDGPUAsmParser::ParseDirectiveAMDGPULDS() {
return Error(AlignLoc, "alignment is too large");
}
- if (parseToken(AsmToken::EndOfStatement,
- "unexpected token in '.amdgpu_lds' directive"))
+ if (parseEOL())
return true;
Symbol->redefineIfPossible();
@@ -5313,26 +5614,21 @@ bool AMDGPUAsmParser::ParseDirective(AsmToken DirectiveID) {
bool AMDGPUAsmParser::subtargetHasRegister(const MCRegisterInfo &MRI,
unsigned RegNo) {
- for (MCRegAliasIterator R(AMDGPU::TTMP12_TTMP13_TTMP14_TTMP15, &MRI, true);
- R.isValid(); ++R) {
- if (*R == RegNo)
- return isGFX9Plus();
- }
+ if (MRI.regsOverlap(AMDGPU::TTMP12_TTMP13_TTMP14_TTMP15, RegNo))
+ return isGFX9Plus();
- // GFX10 has 2 more SGPRs 104 and 105.
- for (MCRegAliasIterator R(AMDGPU::SGPR104_SGPR105, &MRI, true);
- R.isValid(); ++R) {
- if (*R == RegNo)
- return hasSGPR104_SGPR105();
- }
+ // GFX10+ has 2 more SGPRs 104 and 105.
+ if (MRI.regsOverlap(AMDGPU::SGPR104_SGPR105, RegNo))
+ return hasSGPR104_SGPR105();
switch (RegNo) {
case AMDGPU::SRC_SHARED_BASE:
case AMDGPU::SRC_SHARED_LIMIT:
case AMDGPU::SRC_PRIVATE_BASE:
case AMDGPU::SRC_PRIVATE_LIMIT:
- case AMDGPU::SRC_POPS_EXITING_WAVE_ID:
return isGFX9Plus();
+ case AMDGPU::SRC_POPS_EXITING_WAVE_ID:
+ return isGFX9Plus() && !isGFX11Plus();
case AMDGPU::TBA:
case AMDGPU::TBA_LO:
case AMDGPU::TBA_HI:
@@ -5355,7 +5651,7 @@ bool AMDGPUAsmParser::subtargetHasRegister(const MCRegisterInfo &MRI,
if (isSI() || isGFX10Plus()) {
// No flat_scr on SI.
- // On GFX10 flat scratch is not a valid register operand and can only be
+ // On GFX10Plus flat scratch is not a valid register operand and can only be
// accessed with s_setreg/s_getreg.
switch (RegNo) {
case AMDGPU::FLAT_SCR:
@@ -5369,11 +5665,8 @@ bool AMDGPUAsmParser::subtargetHasRegister(const MCRegisterInfo &MRI,
// VI only has 102 SGPRs, so make sure we aren't trying to use the 2 more that
// SI/CI have.
- for (MCRegAliasIterator R(AMDGPU::SGPR102_SGPR103, &MRI, true);
- R.isValid(); ++R) {
- if (*R == RegNo)
- return hasSGPR102_SGPR103();
- }
+ if (MRI.regsOverlap(AMDGPU::SGPR102_SGPR103, RegNo))
+ return hasSGPR102_SGPR103();
return true;
}
@@ -5381,8 +5674,13 @@ bool AMDGPUAsmParser::subtargetHasRegister(const MCRegisterInfo &MRI,
OperandMatchResultTy
AMDGPUAsmParser::parseOperand(OperandVector &Operands, StringRef Mnemonic,
OperandMode Mode) {
+ OperandMatchResultTy ResTy = parseVOPD(Operands);
+ if (ResTy == MatchOperand_Success || ResTy == MatchOperand_ParseFail ||
+ isToken(AsmToken::EndOfStatement))
+ return ResTy;
+
// Try to parse with a custom parser
- OperandMatchResultTy ResTy = MatchOperandParserImpl(Operands, Mnemonic);
+ ResTy = MatchOperandParserImpl(Operands, Mnemonic);
// If we successfully parsed the operand or if there as an error parsing,
// we are done.
@@ -5435,7 +5733,11 @@ StringRef AMDGPUAsmParser::parseMnemonicSuffix(StringRef Name) {
setForcedDPP(false);
setForcedSDWA(false);
- if (Name.endswith("_e64")) {
+ if (Name.endswith("_e64_dpp")) {
+ setForcedDPP(true);
+ setForcedEncodingSize(64);
+ return Name.substr(0, Name.size() - 8);
+ } else if (Name.endswith("_e64")) {
setForcedEncodingSize(64);
return Name.substr(0, Name.size() - 4);
} else if (Name.endswith("_e32")) {
@@ -5451,11 +5753,20 @@ StringRef AMDGPUAsmParser::parseMnemonicSuffix(StringRef Name) {
return Name;
}
+static void applyMnemonicAliases(StringRef &Mnemonic,
+ const FeatureBitset &Features,
+ unsigned VariantID);
+
bool AMDGPUAsmParser::ParseInstruction(ParseInstructionInfo &Info,
StringRef Name,
SMLoc NameLoc, OperandVector &Operands) {
// Add the instruction mnemonic
Name = parseMnemonicSuffix(Name);
+
+ // If the target architecture uses MnemonicAlias, call it here to parse
+ // operands correctly.
+ applyMnemonicAliases(Name, getAvailableFeatures(), 0);
+
Operands.push_back(AMDGPUOperand::CreateToken(this, Name, NameLoc));
bool IsMIMG = Name.startswith("image_");
@@ -5603,7 +5914,24 @@ AMDGPUAsmParser::parseCPol(OperandVector &Operands) {
unsigned CPolOff = 0;
SMLoc S = getLoc();
- if (trySkipId("glc"))
+ StringRef Mnemo = ((AMDGPUOperand &)*Operands[0]).getToken();
+ if (isGFX940() && !Mnemo.startswith("s_")) {
+ if (trySkipId("sc0"))
+ CPolOn = AMDGPU::CPol::SC0;
+ else if (trySkipId("nosc0"))
+ CPolOff = AMDGPU::CPol::SC0;
+ else if (trySkipId("nt"))
+ CPolOn = AMDGPU::CPol::NT;
+ else if (trySkipId("nont"))
+ CPolOff = AMDGPU::CPol::NT;
+ else if (trySkipId("sc1"))
+ CPolOn = AMDGPU::CPol::SC1;
+ else if (trySkipId("nosc1"))
+ CPolOff = AMDGPU::CPol::SC1;
+ else
+ return MatchOperand_NoMatch;
+ }
+ else if (trySkipId("glc"))
CPolOn = AMDGPU::CPol::GLC;
else if (trySkipId("noglc"))
CPolOff = AMDGPU::CPol::GLC;
@@ -5809,7 +6137,7 @@ AMDGPUAsmParser::parseSymbolicSplitFormat(StringRef FormatStr,
Nfmt = (Nfmt == NFMT_UNDEF) ? NFMT_DEFAULT : Nfmt;
if (isGFX10Plus()) {
- auto Ufmt = convertDfmtNfmt2Ufmt(Dfmt, Nfmt);
+ auto Ufmt = convertDfmtNfmt2Ufmt(Dfmt, Nfmt, getSTI());
if (Ufmt == UFMT_UNDEF) {
Error(FormatLoc, "unsupported format");
return MatchOperand_ParseFail;
@@ -5828,7 +6156,7 @@ AMDGPUAsmParser::parseSymbolicUnifiedFormat(StringRef FormatStr,
int64_t &Format) {
using namespace llvm::AMDGPU::MTBUFFormat;
- auto Id = getUnifiedFormat(FormatStr);
+ auto Id = getUnifiedFormat(FormatStr, getSTI());
if (Id == UFMT_UNDEF)
return MatchOperand_NoMatch;
@@ -5969,6 +6297,7 @@ void AMDGPUAsmParser::cvtDSOffset01(MCInst &Inst,
void AMDGPUAsmParser::cvtDSImpl(MCInst &Inst, const OperandVector &Operands,
bool IsGdsHardcoded) {
OptionalImmIndexMap OptionalIdx;
+ AMDGPUOperand::ImmTy OffsetType = AMDGPUOperand::ImmTyOffset;
for (unsigned i = 1, e = Operands.size(); i != e; ++i) {
AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[i]);
@@ -5986,13 +6315,10 @@ void AMDGPUAsmParser::cvtDSImpl(MCInst &Inst, const OperandVector &Operands,
// Handle optional arguments
OptionalIdx[Op.getImmTy()] = i;
- }
- AMDGPUOperand::ImmTy OffsetType =
- (Inst.getOpcode() == AMDGPU::DS_SWIZZLE_B32_gfx10 ||
- Inst.getOpcode() == AMDGPU::DS_SWIZZLE_B32_gfx6_gfx7 ||
- Inst.getOpcode() == AMDGPU::DS_SWIZZLE_B32_vi) ? AMDGPUOperand::ImmTySwizzle :
- AMDGPUOperand::ImmTyOffset;
+ if (Op.getImmTy() == AMDGPUOperand::ImmTySwizzle)
+ OffsetType = AMDGPUOperand::ImmTySwizzle;
+ }
addOptionalImmOperand(Inst, Operands, OptionalIdx, OffsetType);
@@ -6034,7 +6360,7 @@ void AMDGPUAsmParser::cvtExp(MCInst &Inst, const OperandVector &Operands) {
continue;
}
- if (Op.isToken() && Op.getToken() == "done")
+ if (Op.isToken() && (Op.getToken() == "done" || Op.getToken() == "row_en"))
continue;
// Handle optional arguments
@@ -6157,11 +6483,179 @@ AMDGPUAsmParser::parseSWaitCntOps(OperandVector &Operands) {
return MatchOperand_Success;
}
+bool AMDGPUAsmParser::parseDelay(int64_t &Delay) {
+ SMLoc FieldLoc = getLoc();
+ StringRef FieldName = getTokenStr();
+ if (!skipToken(AsmToken::Identifier, "expected a field name") ||
+ !skipToken(AsmToken::LParen, "expected a left parenthesis"))
+ return false;
+
+ SMLoc ValueLoc = getLoc();
+ StringRef ValueName = getTokenStr();
+ if (!skipToken(AsmToken::Identifier, "expected a value name") ||
+ !skipToken(AsmToken::RParen, "expected a right parenthesis"))
+ return false;
+
+ unsigned Shift;
+ if (FieldName == "instid0") {
+ Shift = 0;
+ } else if (FieldName == "instskip") {
+ Shift = 4;
+ } else if (FieldName == "instid1") {
+ Shift = 7;
+ } else {
+ Error(FieldLoc, "invalid field name " + FieldName);
+ return false;
+ }
+
+ int Value;
+ if (Shift == 4) {
+ // Parse values for instskip.
+ Value = StringSwitch<int>(ValueName)
+ .Case("SAME", 0)
+ .Case("NEXT", 1)
+ .Case("SKIP_1", 2)
+ .Case("SKIP_2", 3)
+ .Case("SKIP_3", 4)
+ .Case("SKIP_4", 5)
+ .Default(-1);
+ } else {
+ // Parse values for instid0 and instid1.
+ Value = StringSwitch<int>(ValueName)
+ .Case("NO_DEP", 0)
+ .Case("VALU_DEP_1", 1)
+ .Case("VALU_DEP_2", 2)
+ .Case("VALU_DEP_3", 3)
+ .Case("VALU_DEP_4", 4)
+ .Case("TRANS32_DEP_1", 5)
+ .Case("TRANS32_DEP_2", 6)
+ .Case("TRANS32_DEP_3", 7)
+ .Case("FMA_ACCUM_CYCLE_1", 8)
+ .Case("SALU_CYCLE_1", 9)
+ .Case("SALU_CYCLE_2", 10)
+ .Case("SALU_CYCLE_3", 11)
+ .Default(-1);
+ }
+ if (Value < 0) {
+ Error(ValueLoc, "invalid value name " + ValueName);
+ return false;
+ }
+
+ Delay |= Value << Shift;
+ return true;
+}
+
+OperandMatchResultTy
+AMDGPUAsmParser::parseSDelayAluOps(OperandVector &Operands) {
+ int64_t Delay = 0;
+ SMLoc S = getLoc();
+
+ if (isToken(AsmToken::Identifier) && peekToken().is(AsmToken::LParen)) {
+ do {
+ if (!parseDelay(Delay))
+ return MatchOperand_ParseFail;
+ } while (trySkipToken(AsmToken::Pipe));
+ } else {
+ if (!parseExpr(Delay))
+ return MatchOperand_ParseFail;
+ }
+
+ Operands.push_back(AMDGPUOperand::CreateImm(this, Delay, S));
+ return MatchOperand_Success;
+}
+
bool
AMDGPUOperand::isSWaitCnt() const {
return isImm();
}
+bool AMDGPUOperand::isSDelayAlu() const { return isImm(); }
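parseDelay packs each field(value) pair into the s_delay_alu immediate at the shifts used above: instid0 at bit 0, instskip at bit 4, and instid1 at bit 7, with the symbolic names (NO_DEP, VALU_DEP_1..4, TRANS32_DEP_1..3, FMA_ACCUM_CYCLE_1, SALU_CYCLE_1..3 for the instid fields; SAME, NEXT, SKIP_1..4 for instskip) mapping to the small integers listed in the StringSwitch tables. A sketch of the resulting encoding:

// Sketch: assemble the s_delay_alu immediate from already-decoded field values,
// using the same shifts as parseDelay() (instid0=0, instskip=4, instid1=7).
#include <cstdint>

int64_t encodeSDelayAlu(int64_t InstId0, int64_t InstSkip, int64_t InstId1) {
  return InstId0 | (InstSkip << 4) | (InstId1 << 7);
}

// e.g. "s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)"
// corresponds to encodeSDelayAlu(/*VALU_DEP_1=*/1, /*NEXT=*/1, /*VALU_DEP_2=*/2).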
+
+//===----------------------------------------------------------------------===//
+// DepCtr
+//===----------------------------------------------------------------------===//
+
+void AMDGPUAsmParser::depCtrError(SMLoc Loc, int ErrorId,
+ StringRef DepCtrName) {
+ switch (ErrorId) {
+ case OPR_ID_UNKNOWN:
+ Error(Loc, Twine("invalid counter name ", DepCtrName));
+ return;
+ case OPR_ID_UNSUPPORTED:
+ Error(Loc, Twine(DepCtrName, " is not supported on this GPU"));
+ return;
+ case OPR_ID_DUPLICATE:
+ Error(Loc, Twine("duplicate counter name ", DepCtrName));
+ return;
+ case OPR_VAL_INVALID:
+ Error(Loc, Twine("invalid value for ", DepCtrName));
+ return;
+ default:
+ assert(false);
+ }
+}
+
+bool AMDGPUAsmParser::parseDepCtr(int64_t &DepCtr, unsigned &UsedOprMask) {
+
+ using namespace llvm::AMDGPU::DepCtr;
+
+ SMLoc DepCtrLoc = getLoc();
+ StringRef DepCtrName = getTokenStr();
+
+ if (!skipToken(AsmToken::Identifier, "expected a counter name") ||
+ !skipToken(AsmToken::LParen, "expected a left parenthesis"))
+ return false;
+
+ int64_t ExprVal;
+ if (!parseExpr(ExprVal))
+ return false;
+
+ unsigned PrevOprMask = UsedOprMask;
+ int CntVal = encodeDepCtr(DepCtrName, ExprVal, UsedOprMask, getSTI());
+
+ if (CntVal < 0) {
+ depCtrError(DepCtrLoc, CntVal, DepCtrName);
+ return false;
+ }
+
+ if (!skipToken(AsmToken::RParen, "expected a closing parenthesis"))
+ return false;
+
+ if (trySkipToken(AsmToken::Amp) || trySkipToken(AsmToken::Comma)) {
+ if (isToken(AsmToken::EndOfStatement)) {
+ Error(getLoc(), "expected a counter name");
+ return false;
+ }
+ }
+
+ unsigned CntValMask = PrevOprMask ^ UsedOprMask;
+ DepCtr = (DepCtr & ~CntValMask) | CntVal;
+ return true;
+}
+
+OperandMatchResultTy AMDGPUAsmParser::parseDepCtrOps(OperandVector &Operands) {
+ using namespace llvm::AMDGPU::DepCtr;
+
+ int64_t DepCtr = getDefaultDepCtrEncoding(getSTI());
+ SMLoc Loc = getLoc();
+
+ if (isToken(AsmToken::Identifier) && peekToken().is(AsmToken::LParen)) {
+ unsigned UsedOprMask = 0;
+ while (!isToken(AsmToken::EndOfStatement)) {
+ if (!parseDepCtr(DepCtr, UsedOprMask))
+ return MatchOperand_ParseFail;
+ }
+ } else {
+ if (!parseExpr(DepCtr))
+ return MatchOperand_ParseFail;
+ }
+
+ Operands.push_back(AMDGPUOperand::CreateImm(this, DepCtr, Loc));
+ return MatchOperand_Success;
+}
+
+bool AMDGPUOperand::isDepCtr() const { return isS16Imm(); }
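Each counter parsed by parseDepCtr owns a disjoint bit field of the immediate; encodeDepCtr reports which bits it consumed via UsedOprMask, and the XOR with PrevOprMask isolates exactly the bits written by the counter just parsed, so earlier counters keep their values while duplicates are rejected. A small illustration of that merge step (the field masks here are hypothetical, not the real depctr layout):

// Illustration only: merge one counter's field into the running encoding,
// replacing just the bits owned by that field. Masks are made up for the example.
#include <cstdint>

uint32_t mergeDepCtrField(uint32_t Enc, uint32_t FieldMask, uint32_t FieldVal) {
  return (Enc & ~FieldMask) | (FieldVal & FieldMask);
}

// Writing a hypothetical field A (mask 0x000F) and then field B (mask 0x0F00)
// touches disjoint bits, mirroring the UsedOprMask bookkeeping above.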
+
//===----------------------------------------------------------------------===//
// hwreg
//===----------------------------------------------------------------------===//
@@ -6175,7 +6669,7 @@ AMDGPUAsmParser::parseHwregBody(OperandInfoTy &HwReg,
// The register may be specified by name or using a numeric code
HwReg.Loc = getLoc();
if (isToken(AsmToken::Identifier) &&
- (HwReg.Id = getHwregId(getTokenStr())) >= 0) {
+ (HwReg.Id = getHwregId(getTokenStr(), getSTI())) != OPR_ID_UNKNOWN) {
HwReg.IsSymbolic = true;
lex(); // skip register name
} else if (!parseExpr(HwReg.Id, "a register name")) {
@@ -6208,15 +6702,18 @@ AMDGPUAsmParser::validateHwreg(const OperandInfoTy &HwReg,
using namespace llvm::AMDGPU::Hwreg;
- if (HwReg.IsSymbolic && !isValidHwreg(HwReg.Id, getSTI())) {
- Error(HwReg.Loc,
- "specified hardware register is not supported on this GPU");
- return false;
- }
- if (!isValidHwreg(HwReg.Id)) {
- Error(HwReg.Loc,
- "invalid code of hardware register: only 6-bit values are legal");
- return false;
+ if (HwReg.IsSymbolic) {
+ if (HwReg.Id == OPR_ID_UNSUPPORTED) {
+ Error(HwReg.Loc,
+ "specified hardware register is not supported on this GPU");
+ return false;
+ }
+ } else {
+ if (!isValidHwreg(HwReg.Id)) {
+ Error(HwReg.Loc,
+ "invalid code of hardware register: only 6-bit values are legal");
+ return false;
+ }
}
if (!isValidHwregOffset(Offset.Id)) {
Error(Offset.Loc, "invalid bit offset: only 5-bit values are legal");
@@ -6238,7 +6735,7 @@ AMDGPUAsmParser::parseHwreg(OperandVector &Operands) {
SMLoc Loc = getLoc();
if (trySkipId("hwreg", AsmToken::LParen)) {
- OperandInfoTy HwReg(ID_UNKNOWN_);
+ OperandInfoTy HwReg(OPR_ID_UNKNOWN);
OperandInfoTy Offset(OFFSET_DEFAULT_);
OperandInfoTy Width(WIDTH_DEFAULT_);
if (parseHwregBody(HwReg, Offset, Width) &&
@@ -6275,7 +6772,8 @@ AMDGPUAsmParser::parseSendMsgBody(OperandInfoTy &Msg,
using namespace llvm::AMDGPU::SendMsg;
Msg.Loc = getLoc();
- if (isToken(AsmToken::Identifier) && (Msg.Id = getMsgId(getTokenStr())) >= 0) {
+ if (isToken(AsmToken::Identifier) &&
+ (Msg.Id = getMsgId(getTokenStr(), getSTI())) != OPR_ID_UNKNOWN) {
Msg.IsSymbolic = true;
lex(); // skip message name
} else if (!parseExpr(Msg.Id, "a message name")) {
@@ -6310,15 +6808,22 @@ AMDGPUAsmParser::validateSendMsg(const OperandInfoTy &Msg,
using namespace llvm::AMDGPU::SendMsg;
// Validation strictness depends on whether message is specified
- // in a symbolc or in a numeric form. In the latter case
+ // in a symbolic or in a numeric form. In the latter case
// only encoding possibility is checked.
bool Strict = Msg.IsSymbolic;
- if (!isValidMsgId(Msg.Id, getSTI(), Strict)) {
- Error(Msg.Loc, "invalid message id");
- return false;
+ if (Strict) {
+ if (Msg.Id == OPR_ID_UNSUPPORTED) {
+ Error(Msg.Loc, "specified message id is not supported on this GPU");
+ return false;
+ }
+ } else {
+ if (!isValidMsgId(Msg.Id, getSTI())) {
+ Error(Msg.Loc, "invalid message id");
+ return false;
+ }
}
- if (Strict && (msgRequiresOp(Msg.Id) != Op.IsDefined)) {
+ if (Strict && (msgRequiresOp(Msg.Id, getSTI()) != Op.IsDefined)) {
if (Op.IsDefined) {
Error(Op.Loc, "message does not support operations");
} else {
@@ -6330,7 +6835,8 @@ AMDGPUAsmParser::validateSendMsg(const OperandInfoTy &Msg,
Error(Op.Loc, "invalid operation id");
return false;
}
- if (Strict && !msgSupportsStream(Msg.Id, Op.Id) && Stream.IsDefined) {
+ if (Strict && !msgSupportsStream(Msg.Id, Op.Id, getSTI()) &&
+ Stream.IsDefined) {
Error(Stream.Loc, "message operation does not support streams");
return false;
}
@@ -6349,7 +6855,7 @@ AMDGPUAsmParser::parseSendMsgOp(OperandVector &Operands) {
SMLoc Loc = getLoc();
if (trySkipId("sendmsg", AsmToken::LParen)) {
- OperandInfoTy Msg(ID_UNKNOWN_);
+ OperandInfoTy Msg(OPR_ID_UNKNOWN);
OperandInfoTy Op(OP_NONE_);
OperandInfoTy Stream(STREAM_ID_NONE_);
if (parseSendMsgBody(Msg, Op, Stream) &&
@@ -6610,9 +7116,10 @@ AMDGPUAsmParser::getToken() const {
return Parser.getTok();
}
-AsmToken
-AMDGPUAsmParser::peekToken() {
- return isToken(AsmToken::EndOfStatement) ? getToken() : getLexer().peekTok();
+AsmToken AMDGPUAsmParser::peekToken(bool ShouldSkipSpace) {
+ return isToken(AsmToken::EndOfStatement)
+ ? getToken()
+ : getLexer().peekTok(ShouldSkipSpace);
}
void
@@ -7078,8 +7585,6 @@ void AMDGPUAsmParser::cvtMubufImpl(MCInst &Inst,
const OperandVector &Operands,
bool IsAtomic,
bool IsLds) {
- bool IsLdsOpcode = IsLds;
- bool HasLdsModifier = false;
OptionalImmIndexMap OptionalIdx;
unsigned FirstOperandIdx = 1;
bool IsAtomicReturn = false;
@@ -7123,8 +7628,6 @@ void AMDGPUAsmParser::cvtMubufImpl(MCInst &Inst,
continue;
}
- HasLdsModifier |= Op.isLDS();
-
// Handle tokens like 'offen' which are sometimes hard-coded into the
// asm string. There are no MCInst operands for these.
if (Op.isToken()) {
@@ -7136,25 +7639,10 @@ void AMDGPUAsmParser::cvtMubufImpl(MCInst &Inst,
OptionalIdx[Op.getImmTy()] = i;
}
- // This is a workaround for an llvm quirk which may result in an
- // incorrect instruction selection. Lds and non-lds versions of
- // MUBUF instructions are identical except that lds versions
- // have mandatory 'lds' modifier. However this modifier follows
- // optional modifiers and llvm asm matcher regards this 'lds'
- // modifier as an optional one. As a result, an lds version
- // of opcode may be selected even if it has no 'lds' modifier.
- if (IsLdsOpcode && !HasLdsModifier) {
- int NoLdsOpcode = AMDGPU::getMUBUFNoLdsInst(Inst.getOpcode());
- if (NoLdsOpcode != -1) { // Got lds version - correct it.
- Inst.setOpcode(NoLdsOpcode);
- IsLdsOpcode = false;
- }
- }
-
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyOffset);
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyCPol, 0);
- if (!IsLdsOpcode) { // tfe is not legal with lds opcodes
+ if (!IsLds) { // tfe is not legal with lds opcodes
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyTFE);
}
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySWZ);
@@ -7327,7 +7815,8 @@ bool AMDGPUOperand::isSMRDOffset8() const {
}
bool AMDGPUOperand::isSMEMOffset() const {
- return isImm(); // Offset range is checked later by validator.
+ return isImmTy(ImmTyNone) ||
+ isImmTy(ImmTyOffset); // Offset range is checked later by validator.
}
bool AMDGPUOperand::isSMRDLiteralOffset() const {
@@ -7415,10 +7904,6 @@ static const OptionalOperand AMDGPUOptionalOperandTable[] = {
{"d16", AMDGPUOperand::ImmTyD16, true, nullptr},
{"dmask", AMDGPUOperand::ImmTyDMask, false, nullptr},
{"dim", AMDGPUOperand::ImmTyDim, false, nullptr},
- {"row_mask", AMDGPUOperand::ImmTyDppRowMask, false, nullptr},
- {"bank_mask", AMDGPUOperand::ImmTyDppBankMask, false, nullptr},
- {"bound_ctrl", AMDGPUOperand::ImmTyDppBoundCtrl, false, ConvertBoundCtrl},
- {"fi", AMDGPUOperand::ImmTyDppFi, false, nullptr},
{"dst_sel", AMDGPUOperand::ImmTySdwaDstSel, false, nullptr},
{"src0_sel", AMDGPUOperand::ImmTySdwaSrc0Sel, false, nullptr},
{"src1_sel", AMDGPUOperand::ImmTySdwaSrc1Sel, false, nullptr},
@@ -7429,9 +7914,17 @@ static const OptionalOperand AMDGPUOptionalOperandTable[] = {
{"op_sel_hi", AMDGPUOperand::ImmTyOpSelHi, false, nullptr},
{"neg_lo", AMDGPUOperand::ImmTyNegLo, false, nullptr},
{"neg_hi", AMDGPUOperand::ImmTyNegHi, false, nullptr},
+ {"dpp8", AMDGPUOperand::ImmTyDPP8, false, nullptr},
+ {"dpp_ctrl", AMDGPUOperand::ImmTyDppCtrl, false, nullptr},
+ {"row_mask", AMDGPUOperand::ImmTyDppRowMask, false, nullptr},
+ {"bank_mask", AMDGPUOperand::ImmTyDppBankMask, false, nullptr},
+ {"bound_ctrl", AMDGPUOperand::ImmTyDppBoundCtrl, false, ConvertBoundCtrl},
+ {"fi", AMDGPUOperand::ImmTyDppFi, false, nullptr},
{"blgp", AMDGPUOperand::ImmTyBLGP, false, nullptr},
{"cbsz", AMDGPUOperand::ImmTyCBSZ, false, nullptr},
- {"abid", AMDGPUOperand::ImmTyABID, false, nullptr}
+ {"abid", AMDGPUOperand::ImmTyABID, false, nullptr},
+ {"wait_vdst", AMDGPUOperand::ImmTyWaitVDST, false, nullptr},
+ {"wait_exp", AMDGPUOperand::ImmTyWaitEXP, false, nullptr}
};
void AMDGPUAsmParser::onBeginOfFile() {
@@ -7497,8 +7990,17 @@ OperandMatchResultTy AMDGPUAsmParser::parseOptionalOpr(OperandVector &Operands)
res = parseDim(Operands);
} else if (Op.Type == AMDGPUOperand::ImmTyCPol) {
res = parseCPol(Operands);
+ } else if (Op.Type == AMDGPUOperand::ImmTyDPP8) {
+ res = parseDPP8(Operands);
+ } else if (Op.Type == AMDGPUOperand::ImmTyDppCtrl) {
+ res = parseDPPCtrl(Operands);
} else {
res = parseIntWithPrefix(Op.Name, Operands, Op.Type, Op.ConvertResult);
+ if (Op.Type == AMDGPUOperand::ImmTyBLGP && res == MatchOperand_NoMatch) {
+ res = parseOperandArrayWithPrefix("neg", Operands,
+ AMDGPUOperand::ImmTyBLGP,
+ nullptr);
+ }
}
if (res != MatchOperand_NoMatch) {
return res;
@@ -7596,6 +8098,66 @@ void AMDGPUAsmParser::cvtVOP3Interp(MCInst &Inst, const OperandVector &Operands)
}
}
+void AMDGPUAsmParser::cvtVINTERP(MCInst &Inst, const OperandVector &Operands)
+{
+ OptionalImmIndexMap OptionalIdx;
+ unsigned Opc = Inst.getOpcode();
+
+ unsigned I = 1;
+ const MCInstrDesc &Desc = MII.get(Inst.getOpcode());
+ for (unsigned J = 0; J < Desc.getNumDefs(); ++J) {
+ ((AMDGPUOperand &)*Operands[I++]).addRegOperands(Inst, 1);
+ }
+
+ for (unsigned E = Operands.size(); I != E; ++I) {
+ AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[I]);
+ if (isRegOrImmWithInputMods(Desc, Inst.getNumOperands())) {
+ Op.addRegOrImmWithFPInputModsOperands(Inst, 2);
+ } else if (Op.isImmModifier()) {
+ OptionalIdx[Op.getImmTy()] = I;
+ } else {
+ llvm_unreachable("unhandled operand type");
+ }
+ }
+
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyClampSI);
+
+ int OpSelIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::op_sel);
+ if (OpSelIdx != -1)
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyOpSel);
+
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyWaitEXP);
+
+ if (OpSelIdx == -1)
+ return;
+
+ const int Ops[] = { AMDGPU::OpName::src0,
+ AMDGPU::OpName::src1,
+ AMDGPU::OpName::src2 };
+ const int ModOps[] = { AMDGPU::OpName::src0_modifiers,
+ AMDGPU::OpName::src1_modifiers,
+ AMDGPU::OpName::src2_modifiers };
+
+ unsigned OpSel = Inst.getOperand(OpSelIdx).getImm();
+
+ for (int J = 0; J < 3; ++J) {
+ int OpIdx = AMDGPU::getNamedOperandIdx(Opc, Ops[J]);
+ if (OpIdx == -1)
+ break;
+
+ int ModIdx = AMDGPU::getNamedOperandIdx(Opc, ModOps[J]);
+ uint32_t ModVal = Inst.getOperand(ModIdx).getImm();
+
+ if ((OpSel & (1 << J)) != 0)
+ ModVal |= SISrcMods::OP_SEL_0;
+ if (ModOps[J] == AMDGPU::OpName::src0_modifiers &&
+ (OpSel & (1 << 3)) != 0)
+ ModVal |= SISrcMods::DST_OP_SEL;
+
+ Inst.getOperand(ModIdx).setImm(ModVal);
+ }
+}
+
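
The op_sel loop above folds the parsed 4-bit op_sel immediate back into the per-source modifier operands: bit J selects the high half of source J, and bit 3 (carried on src0_modifiers) selects the high half of the destination. A minimal standalone sketch of that folding, with illustrative bit positions standing in for SISrcMods::OP_SEL_0 and SISrcMods::DST_OP_SEL:

#include <array>
#include <cstdint>
#include <cstdio>

// Illustrative stand-ins; the real values are defined in SISrcMods.
constexpr uint32_t OP_SEL_0   = 1u << 2;
constexpr uint32_t DST_OP_SEL = 1u << 3;

// Fold a 4-bit op_sel field into three source-modifier words.
std::array<uint32_t, 3> foldVinterpOpSel(uint32_t OpSel,
                                          std::array<uint32_t, 3> Mods) {
  for (int J = 0; J < 3; ++J) {
    if (OpSel & (1u << J))
      Mods[J] |= OP_SEL_0;            // high half of source J
    if (J == 0 && (OpSel & (1u << 3)))
      Mods[J] |= DST_OP_SEL;          // dst half selector rides on src0
  }
  return Mods;
}

int main() {
  // op_sel = 0b1001: high half of src0 and, via bit 3, of the destination.
  auto Mods = foldVinterpOpSel(0b1001, {0, 0, 0});
  std::printf("%x %x %x\n", (unsigned)Mods[0], (unsigned)Mods[1],
              (unsigned)Mods[2]); // prints: c 0 0
}
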
void AMDGPUAsmParser::cvtVOP3(MCInst &Inst, const OperandVector &Operands,
OptionalImmIndexMap &OptionalIdx) {
unsigned Opc = Inst.getOpcode();
@@ -7652,9 +8214,12 @@ void AMDGPUAsmParser::cvtVOP3(MCInst &Inst, const OperandVector &Operands,
Opc == AMDGPU::V_MAC_F16_e64_vi ||
Opc == AMDGPU::V_FMAC_F64_e64_gfx90a ||
Opc == AMDGPU::V_FMAC_F32_e64_gfx10 ||
+ Opc == AMDGPU::V_FMAC_F32_e64_gfx11 ||
Opc == AMDGPU::V_FMAC_F32_e64_vi ||
Opc == AMDGPU::V_FMAC_LEGACY_F32_e64_gfx10 ||
- Opc == AMDGPU::V_FMAC_F16_e64_gfx10) {
+ Opc == AMDGPU::V_FMAC_DX9_ZERO_F32_e64_gfx11 ||
+ Opc == AMDGPU::V_FMAC_F16_e64_gfx10 ||
+ Opc == AMDGPU::V_FMAC_F16_e64_gfx11) {
auto it = Inst.begin();
std::advance(it, AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2_modifiers));
it = Inst.insert(it, MCOperand::createImm(0)); // no modifiers for src2
@@ -7731,6 +8296,11 @@ void AMDGPUAsmParser::cvtVOP3P(MCInst &Inst, const OperandVector &Operands,
if (OpIdx == -1)
break;
+ int ModIdx = AMDGPU::getNamedOperandIdx(Opc, ModOps[J]);
+
+ if (ModIdx == -1)
+ continue;
+
uint32_t ModVal = 0;
if ((OpSel & (1 << J)) != 0)
@@ -7745,8 +8315,6 @@ void AMDGPUAsmParser::cvtVOP3P(MCInst &Inst, const OperandVector &Operands,
if ((NegHi & (1 << J)) != 0)
ModVal |= SISrcMods::NEG_HI;
- int ModIdx = AMDGPU::getNamedOperandIdx(Opc, ModOps[J]);
-
Inst.getOperand(ModIdx).setImm(Inst.getOperand(ModIdx).getImm() | ModVal);
}
}
@@ -7758,6 +8326,118 @@ void AMDGPUAsmParser::cvtVOP3P(MCInst &Inst, const OperandVector &Operands) {
}
//===----------------------------------------------------------------------===//
+// VOPD
+//===----------------------------------------------------------------------===//
+
+OperandMatchResultTy AMDGPUAsmParser::parseVOPD(OperandVector &Operands) {
+ if (!hasVOPD(getSTI()))
+ return MatchOperand_NoMatch;
+
+ if (isToken(AsmToken::Colon) && peekToken(false).is(AsmToken::Colon)) {
+ SMLoc S = getLoc();
+ lex();
+ lex();
+ Operands.push_back(AMDGPUOperand::CreateToken(this, "::", S));
+ const MCExpr *Expr;
+ if (isToken(AsmToken::Identifier) && !Parser.parseExpression(Expr)) {
+ Operands.push_back(AMDGPUOperand::CreateExpr(this, Expr, S));
+ return MatchOperand_Success;
+ }
+ Error(S, "invalid VOPD :: usage");
+ return MatchOperand_ParseFail;
+ }
+ return MatchOperand_NoMatch;
+}
+
+// Create VOPD MCInst operands using parsed assembler operands.
+// Parsed VOPD operands are ordered as follows:
+// OpXMnemo dstX src0X [vsrc1X|imm vsrc1X|vsrc1X imm] '::'
+// OpYMnemo dstY src0Y [vsrc1Y|imm vsrc1Y|vsrc1Y imm]
+// If both OpX and OpY have an imm, the first imm has a different name:
+// OpXMnemo dstX src0X [vsrc1X|immDeferred vsrc1X|vsrc1X immDeferred] '::'
+// OpYMnemo dstY src0Y [vsrc1Y|imm vsrc1Y|vsrc1Y imm]
+// MCInst operands have the following order:
+// dstX, dstY, src0X [, other OpX operands], src0Y [, other OpY operands]
+void AMDGPUAsmParser::cvtVOPD(MCInst &Inst, const OperandVector &Operands) {
+ auto addOp = [&](uint16_t i) { // NOLINT:function pointer
+ AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[i]);
+ if (Op.isReg()) {
+ Op.addRegOperands(Inst, 1);
+ return;
+ }
+ if (Op.isImm()) {
+ Op.addImmOperands(Inst, 1);
+ return;
+ }
+ // Handle tokens like 'offen' which are sometimes hard-coded into the
+ // asm string. There are no MCInst operands for these.
+ if (Op.isToken()) {
+ return;
+ }
+ llvm_unreachable("Unhandled operand type in cvtVOPD");
+ };
+
+ // Indices into MCInst.Operands
+ const auto FmamkOpXImmMCIndex = 3; // dstX, dstY, src0X, imm, ...
+ const auto FmaakOpXImmMCIndex = 4; // dstX, dstY, src0X, src1X, imm, ...
+ const auto MinOpYImmMCIndex = 4; // dstX, dstY, src0X, src0Y, imm, ...
+
+ unsigned Opc = Inst.getOpcode();
+ bool HasVsrc1X =
+ AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vsrc1X) != -1;
+ bool HasImmX =
+ AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::immDeferred) != -1 ||
+ (HasVsrc1X && (AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::imm) ==
+ FmamkOpXImmMCIndex ||
+ AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::imm) ==
+ FmaakOpXImmMCIndex));
+
+ bool HasVsrc1Y =
+ AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vsrc1Y) != -1;
+ bool HasImmY =
+ AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::immDeferred) != -1 ||
+ AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::imm) >=
+ MinOpYImmMCIndex + HasVsrc1X;
+
+ // Indices of parsed operands relative to dst
+ const auto DstIdx = 0;
+ const auto Src0Idx = 1;
+ const auto Vsrc1OrImmIdx = 2;
+
+ const auto OpXOperandsSize = 2 + HasImmX + HasVsrc1X;
+ const auto BridgeTokensSize = 2; // Special VOPD tokens ('::' and OpYMnemo)
+
+ // Offsets into parsed operands
+ const auto OpXFirstOperandOffset = 1;
+ const auto OpYFirstOperandOffset =
+ OpXFirstOperandOffset + OpXOperandsSize + BridgeTokensSize;
+
+ // Order of addOp calls determines MC operand order
+ addOp(OpXFirstOperandOffset + DstIdx); // vdstX
+ addOp(OpYFirstOperandOffset + DstIdx); // vdstY
+
+ addOp(OpXFirstOperandOffset + Src0Idx); // src0X
+ if (HasImmX) {
+ // immX then vsrc1X for fmamk, vsrc1X then immX for fmaak
+ addOp(OpXFirstOperandOffset + Vsrc1OrImmIdx);
+ addOp(OpXFirstOperandOffset + Vsrc1OrImmIdx + 1);
+ } else {
+ if (HasVsrc1X) // all except v_mov
+ addOp(OpXFirstOperandOffset + Vsrc1OrImmIdx); // vsrc1X
+ }
+
+ addOp(OpYFirstOperandOffset + Src0Idx); // src0Y
+ if (HasImmY) {
+ // immY then vsrc1Y for fmamk, vsrc1Y then immY for fmaak
+ addOp(OpYFirstOperandOffset + Vsrc1OrImmIdx);
+ addOp(OpYFirstOperandOffset + Vsrc1OrImmIdx + 1);
+ } else {
+ if (HasVsrc1Y) // all except v_mov
+ addOp(OpYFirstOperandOffset + Vsrc1OrImmIdx); // vsrc1Y
+ }
+}
+
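
The index arithmetic above is the subtle part: the parsed-operand list holds the OpX mnemonic, OpX's operands, the '::' and OpY mnemonic tokens, then OpY's operands, and the addOp calls re-walk it in MCInst order. A standalone sketch (plain C++, not the parser itself) of just the offset computation, with the dual-op shapes taken from the comments above:

#include <cstdio>

// Where does OpY's first operand sit in the parsed-operand list?
//   [0] OpX mnemonic, [1] dstX, [2] src0X, [3..] optional vsrc1X/immX,
//   then "::" and the OpY mnemonic, then OpY's own operands.
unsigned opYFirstOperandOffset(bool HasImmX, bool HasVsrc1X) {
  const unsigned OpXFirstOperandOffset = 1;                 // skip OpX mnemonic
  const unsigned OpXOperandsSize = 2 + HasImmX + HasVsrc1X; // dst + src0 + extras
  const unsigned BridgeTokensSize = 2;                      // "::" and OpY mnemonic
  return OpXFirstOperandOffset + OpXOperandsSize + BridgeTokensSize;
}

int main() {
  // A dual-mov pair: OpX has neither vsrc1 nor an immediate.
  std::printf("%u\n", opYFirstOperandOffset(false, false)); // prints 5
  // An fmamk/fmaak OpX carries both an immediate and vsrc1X.
  std::printf("%u\n", opYFirstOperandOffset(true, true));   // prints 7
}
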
+//===----------------------------------------------------------------------===//
// dpp
//===----------------------------------------------------------------------===//
@@ -8067,6 +8747,88 @@ AMDGPUOperand::Ptr AMDGPUAsmParser::defaultFI() const {
return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTyDppFi);
}
+// Add dummy $old operand
+void AMDGPUAsmParser::cvtVOPC64NoDstDPP(MCInst &Inst,
+ const OperandVector &Operands,
+ bool IsDPP8) {
+ Inst.addOperand(MCOperand::createReg(0));
+ cvtVOP3DPP(Inst, Operands, IsDPP8);
+}
+
+void AMDGPUAsmParser::cvtVOP3DPP(MCInst &Inst, const OperandVector &Operands, bool IsDPP8) {
+ OptionalImmIndexMap OptionalIdx;
+ unsigned Opc = Inst.getOpcode();
+ bool HasModifiers = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0_modifiers) != -1;
+ unsigned I = 1;
+ const MCInstrDesc &Desc = MII.get(Inst.getOpcode());
+ for (unsigned J = 0; J < Desc.getNumDefs(); ++J) {
+ ((AMDGPUOperand &)*Operands[I++]).addRegOperands(Inst, 1);
+ }
+
+ int Fi = 0;
+ for (unsigned E = Operands.size(); I != E; ++I) {
+ auto TiedTo = Desc.getOperandConstraint(Inst.getNumOperands(),
+ MCOI::TIED_TO);
+ if (TiedTo != -1) {
+ assert((unsigned)TiedTo < Inst.getNumOperands());
+ // handle tied old or src2 for MAC instructions
+ Inst.addOperand(Inst.getOperand(TiedTo));
+ }
+ AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[I]);
+ // Add the register arguments
+ if (IsDPP8 && Op.isFI()) {
+ Fi = Op.getImm();
+ } else if (HasModifiers &&
+ isRegOrImmWithInputMods(Desc, Inst.getNumOperands())) {
+ Op.addRegOrImmWithFPInputModsOperands(Inst, 2);
+ } else if (Op.isReg()) {
+ Op.addRegOperands(Inst, 1);
+ } else if (Op.isImm() &&
+ Desc.OpInfo[Inst.getNumOperands()].RegClass != -1) {
+ assert(!HasModifiers && "Case should be unreachable with modifiers");
+ assert(!Op.IsImmKindLiteral() && "Cannot use literal with DPP");
+ Op.addImmOperands(Inst, 1);
+ } else if (Op.isImm()) {
+ OptionalIdx[Op.getImmTy()] = I;
+ } else {
+ llvm_unreachable("unhandled operand type");
+ }
+ }
+ if (AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::clamp) != -1) {
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyClampSI);
+ }
+ if (AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::omod) != -1) {
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyOModSI);
+ }
+ if (Desc.TSFlags & SIInstrFlags::VOP3P)
+ cvtVOP3P(Inst, Operands, OptionalIdx);
+ else if (AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::op_sel) != -1) {
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyOpSel);
+ }
+
+ if (IsDPP8) {
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDPP8);
+ using namespace llvm::AMDGPU::DPP;
+ Inst.addOperand(MCOperand::createImm(Fi? DPP8_FI_1 : DPP8_FI_0));
+ } else {
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDppCtrl, 0xe4);
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDppRowMask, 0xf);
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDppBankMask, 0xf);
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDppBoundCtrl);
+ if (AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::fi) != -1) {
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDppFi);
+ }
+ }
+}
+
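
When the source used no DPP modifiers, the converter above falls back to dpp_ctrl 0xe4 and row_mask/bank_mask 0xf, i.e. an identity quad_perm with every row and bank enabled. A small standalone sketch, assuming the usual two-bits-per-lane quad_perm encoding, shows why 0xe4 is the identity control:

#include <cstdint>
#include <cstdio>

int main() {
  // quad_perm dpp_ctrl: bits [2*Lane+1 : 2*Lane] select the lane to read.
  const uint32_t Ctrl = 0xe4; // the default inserted by the converter above
  for (int Lane = 0; Lane < 4; ++Lane)
    std::printf("lane %d reads lane %u\n", Lane, (Ctrl >> (2 * Lane)) & 3u);
  // Prints 0, 1, 2, 3: every lane reads itself, i.e. quad_perm:[0,1,2,3].
}
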
+// Add dummy $old operand
+void AMDGPUAsmParser::cvtVOPCNoDstDPP(MCInst &Inst,
+ const OperandVector &Operands,
+ bool IsDPP8) {
+ Inst.addOperand(MCOperand::createReg(0));
+ cvtDPP(Inst, Operands, IsDPP8);
+}
+
void AMDGPUAsmParser::cvtDPP(MCInst &Inst, const OperandVector &Operands, bool IsDPP8) {
OptionalImmIndexMap OptionalIdx;
@@ -8352,7 +9114,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUAsmParser() {
#define GET_MNEMONIC_CHECKER
#include "AMDGPUGenAsmMatcher.inc"
-// This fuction should be defined after auto-generated include so that we have
+// This function should be defined after auto-generated include so that we have
// MatchClassKind enum defined
unsigned AMDGPUAsmParser::validateTargetOperandClass(MCParsedAsmOperand &Op,
unsigned Kind) {
@@ -8431,3 +9193,27 @@ OperandMatchResultTy AMDGPUAsmParser::parseEndpgmOp(OperandVector &Operands) {
}
bool AMDGPUOperand::isEndpgm() const { return isImmTy(ImmTyEndpgm); }
+
+//===----------------------------------------------------------------------===//
+// LDSDIR
+//===----------------------------------------------------------------------===//
+
+AMDGPUOperand::Ptr AMDGPUAsmParser::defaultWaitVDST() const {
+ return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTyWaitVDST);
+}
+
+bool AMDGPUOperand::isWaitVDST() const {
+ return isImmTy(ImmTyWaitVDST) && isUInt<4>(getImm());
+}
+
+//===----------------------------------------------------------------------===//
+// VINTERP
+//===----------------------------------------------------------------------===//
+
+AMDGPUOperand::Ptr AMDGPUAsmParser::defaultWaitEXP() const {
+ return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTyWaitEXP);
+}
+
+bool AMDGPUOperand::isWaitEXP() const {
+ return isImmTy(ImmTyWaitEXP) && isUInt<3>(getImm());
+}
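
isWaitVDST and isWaitEXP above accept only immediates that fit the 4-bit wait_vdst and 3-bit wait_exp fields. A tiny compile-time check of those bounds, with a hand-rolled stand-in for llvm::isUInt:

#include <cstdint>

// Stand-in for llvm::isUInt<N>: does V fit in an N-bit unsigned field?
template <unsigned N> constexpr bool fitsUInt(uint64_t V) {
  return V < (uint64_t(1) << N);
}

static_assert(fitsUInt<4>(15) && !fitsUInt<4>(16), "wait_vdst is 4 bits");
static_assert(fitsUInt<3>(7) && !fitsUInt<3>(8), "wait_exp is 3 bits");

int main() {}
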
diff --git a/llvm/lib/Target/AMDGPU/BUFInstructions.td b/llvm/lib/Target/AMDGPU/BUFInstructions.td
index a535c8cc0918..a087323e5de7 100644
--- a/llvm/lib/Target/AMDGPU/BUFInstructions.td
+++ b/llvm/lib/Target/AMDGPU/BUFInstructions.td
@@ -35,11 +35,6 @@ class MUBUFAddr64Table <bit is_addr64, string Name> {
string OpName = Name;
}
-class MUBUFLdsTable <bit is_lds, string Name> {
- bit IsLds = is_lds;
- string OpName = Name;
-}
-
class MTBUFAddr64Table <bit is_addr64, string Name> {
bit IsAddr64 = is_addr64;
string OpName = Name;
@@ -100,8 +95,8 @@ class MTBUF_Pseudo <string opName, dag outs, dag ins,
bits<1> sccb_value = 0;
}
-class MTBUF_Real <MTBUF_Pseudo ps> :
- InstSI <ps.OutOperandList, ps.InOperandList, ps.Mnemonic # ps.AsmOperands, []> {
+class MTBUF_Real <MTBUF_Pseudo ps, string real_name = ps.Mnemonic> :
+ InstSI <ps.OutOperandList, ps.InOperandList, real_name # ps.AsmOperands, []> {
let isPseudo = 0;
let isCodeGenOnly = 0;
@@ -136,7 +131,7 @@ class MTBUF_Real <MTBUF_Pseudo ps> :
bits<3> nfmt = format{6-4};
// GFX90A+ only: instruction uses AccVGPR for data
- // Bit superceedes tfe.
+ // Bit supersedes tfe.
bits<1> acc = !if(ps.has_vdata, vdata{9}, 0);
}
@@ -320,7 +315,7 @@ class MUBUF_Pseudo <string opName, dag outs, dag ins,
bits<1> idxen = 0;
bits<1> addr64 = 0;
bits<1> lds = 0;
- bits<1> has_vdata = 1;
+ bits<1> has_vdata = !not(lds);
bits<1> has_vaddr = 1;
bits<1> has_glc = 1;
bits<1> has_dlc = 1;
@@ -337,8 +332,8 @@ class MUBUF_Pseudo <string opName, dag outs, dag ins,
bits<1> IsBufferInv = 0;
}
-class MUBUF_Real <MUBUF_Pseudo ps> :
- InstSI <ps.OutOperandList, ps.InOperandList, ps.Mnemonic # ps.AsmOperands, []> {
+class MUBUF_Real <MUBUF_Pseudo ps, string real_name = ps.Mnemonic> :
+ InstSI <ps.OutOperandList, ps.InOperandList, real_name # ps.AsmOperands, []> {
let isPseudo = 0;
let isCodeGenOnly = 0;
@@ -360,6 +355,8 @@ class MUBUF_Real <MUBUF_Pseudo ps> :
let mayStore = ps.mayStore;
let IsAtomicRet = ps.IsAtomicRet;
let IsAtomicNoRet = ps.IsAtomicNoRet;
+ let VALU = ps.VALU;
+ let LGKM_CNT = ps.LGKM_CNT;
bits<12> offset;
bits<5> cpol;
@@ -370,8 +367,8 @@ class MUBUF_Real <MUBUF_Pseudo ps> :
bits<8> soffset;
// GFX90A+ only: instruction uses AccVGPR for data
- // Bit superceedes tfe.
- bits<1> acc = !if(ps.has_vdata, vdata{9}, 0);
+ // Bit supersedes tfe.
+ bits<1> acc = !if(ps.has_vdata, vdata{9}, !if(ps.lds, ?, 0));
}
@@ -486,16 +483,17 @@ class MUBUF_Load_Pseudo <string opName,
ValueType vdata_vt,
bit HasTiedDest = 0,
bit isLds = 0,
+ bit isLdsOpc = 0,
list<dag> pattern=[],
// Workaround bug bz30254
int addrKindCopy = addrKind,
RegisterClass vdata_rc = getVregSrcForVT<vdata_vt>.ret,
RegisterOperand vdata_op = getLdStRegisterOperand<vdata_rc>.ret>
: MUBUF_Pseudo<opName,
- (outs vdata_op:$vdata),
+ !if(!or(isLds, isLdsOpc), (outs), (outs vdata_op:$vdata)),
!con(getMUBUFIns<addrKindCopy, [], isLds>.ret,
!if(HasTiedDest, (ins vdata_op:$vdata_in), (ins))),
- " $vdata, " # getMUBUFAsmOps<addrKindCopy>.ret # "$cpol" #
+ !if(!or(isLds, isLdsOpc), " ", " $vdata, ") # getMUBUFAsmOps<addrKindCopy>.ret # "$cpol" #
!if(isLds, " lds", "$tfe") # "$swz",
pattern>,
MUBUF_SetupAddr<addrKindCopy> {
@@ -504,13 +502,16 @@ class MUBUF_Load_Pseudo <string opName,
let AsmMatchConverter = !if(isLds, "cvtMubufLds", "cvtMubuf");
let Constraints = !if(HasTiedDest, "$vdata = $vdata_in", "");
+ let LGKM_CNT = isLds;
+ let has_vdata = !not(isLdsOpc);
let mayLoad = 1;
- let mayStore = 0;
+ let mayStore = isLds;
let maybeAtomic = 1;
- let Uses = !if(isLds, [EXEC, M0], [EXEC]);
+ let Uses = !if(!or(isLds, isLdsOpc) , [EXEC, M0], [EXEC]);
let has_tfe = !not(isLds);
let lds = isLds;
let elements = getMUBUFElements<vdata_vt>.ret;
+ let VALU = isLds;
}
class MUBUF_Offset_Load_Pat <Instruction inst, ValueType load_vt = i32, SDPatternOperator ld = null_frag> : Pat <
@@ -563,6 +564,20 @@ multiclass MUBUF_Pseudo_Loads_Lds<string opName, ValueType load_vt = i32> {
defm _LDS : MUBUF_Pseudo_Loads<opName, load_vt, 0, 1>;
}
+multiclass MUBUF_Pseudo_Loads_LDSOpc<string opName,
+ ValueType load_vt = i32,
+ bit TiedDest = 0,
+ bit isLds = 0,
+ bit isLdsOpc = 1> {
+
+ defvar legal_load_vt = !if(!eq(!cast<string>(load_vt), !cast<string>(v3f16)), v4f16, load_vt);
+
+ def _OFFSET : MUBUF_Load_Pseudo <opName, BUFAddrKind.Offset, legal_load_vt, TiedDest, isLds, isLdsOpc>;
+ def _OFFEN : MUBUF_Load_Pseudo <opName, BUFAddrKind.OffEn, legal_load_vt, TiedDest, isLds, isLdsOpc>;
+ def _IDXEN : MUBUF_Load_Pseudo <opName, BUFAddrKind.IdxEn, legal_load_vt, TiedDest, isLds, isLdsOpc>;
+ def _BOTHEN : MUBUF_Load_Pseudo <opName, BUFAddrKind.BothEn, legal_load_vt, TiedDest, isLds, isLdsOpc>;
+}
+
class MUBUF_Store_Pseudo <string opName,
int addrKind,
ValueType store_vt,
@@ -615,7 +630,8 @@ class MUBUF_Pseudo_Store_Lds<string opName>
(outs),
(ins SReg_128:$srsrc, SCSrc_b32:$soffset, offset:$offset, CPol:$cpol, SWZ:$swz),
" $srsrc, $soffset$offset lds$cpol$swz"> {
- let mayLoad = 0;
+ let LGKM_CNT = 1;
+ let mayLoad = 1;
let mayStore = 1;
let maybeAtomic = 1;
@@ -623,6 +639,7 @@ class MUBUF_Pseudo_Store_Lds<string opName>
let has_vaddr = 0;
let has_tfe = 0;
let lds = 1;
+ let VALU = 1;
let Uses = [EXEC, M0];
let AsmMatchConverter = "cvtMubufLds";
@@ -785,7 +802,7 @@ multiclass MUBUF_Pseudo_Atomics_RTN <string opName,
multiclass MUBUF_Pseudo_Atomics <string opName,
RegisterClass vdataClass,
ValueType vdataType,
- SDPatternOperator atomic> :
+ SDPatternOperator atomic = null_frag> :
MUBUF_Pseudo_Atomics_NO_RTN<opName, vdataClass, vdataType>,
MUBUF_Pseudo_Atomics_RTN<opName, vdataClass, vdataType, atomic>;
@@ -898,6 +915,29 @@ defm BUFFER_LOAD_DWORDX4 : MUBUF_Pseudo_Loads <
"buffer_load_dwordx4", v4i32
>;
+defm BUFFER_LOAD_LDS_B32 : MUBUF_Pseudo_Loads_LDSOpc <
+ "buffer_load_lds_b32", i32
+>;
+defm BUFFER_LOAD_LDS_FORMAT_X : MUBUF_Pseudo_Loads_LDSOpc <
+ "buffer_load_lds_format_x", f32
+>;
+defm BUFFER_LOAD_LDS_I8 : MUBUF_Pseudo_Loads_LDSOpc <
+ "buffer_load_lds_i8", i32
+>;
+defm BUFFER_LOAD_LDS_I16 : MUBUF_Pseudo_Loads_LDSOpc <
+ "buffer_load_lds_i16", i32
+>;
+defm BUFFER_LOAD_LDS_U8 : MUBUF_Pseudo_Loads_LDSOpc <
+ "buffer_load_lds_u8", i32
+>;
+defm BUFFER_LOAD_LDS_U16 : MUBUF_Pseudo_Loads_LDSOpc <
+ "buffer_load_lds_u16", i32
+>;
+
+defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_UBYTE", i32, atomic_load_8_global>;
+defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_USHORT", i32, atomic_load_16_global>;
+defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_UBYTE", i16, atomic_load_8_global>;
+defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_USHORT", i16, atomic_load_16_global>;
defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_UBYTE", i32, extloadi8_global>;
defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_UBYTE", i32, zextloadi8_global>;
defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_SBYTE", i32, sextloadi8_global>;
@@ -909,21 +949,6 @@ defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_DWORDX2", v2i32, load_global>;
defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_DWORDX3", v3i32, load_global>;
defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_DWORDX4", v4i32, load_global>;
-// This is not described in AMD documentation,
-// but 'lds' versions of these opcodes are available
-// in at least GFX8+ chips. See Bug 37653.
-let SubtargetPredicate = isGFX8GFX9 in {
-defm BUFFER_LOAD_DWORDX2_LDS : MUBUF_Pseudo_Loads <
- "buffer_load_dwordx2", v2i32, 0, 1
->;
-defm BUFFER_LOAD_DWORDX3_LDS : MUBUF_Pseudo_Loads <
- "buffer_load_dwordx3", v3i32, 0, 1
->;
-defm BUFFER_LOAD_DWORDX4_LDS : MUBUF_Pseudo_Loads <
- "buffer_load_dwordx4", v4i32, 0, 1
->;
-}
-
defm BUFFER_STORE_BYTE : MUBUF_Pseudo_Stores <
"buffer_store_byte", i32, truncstorei8_global
>;
@@ -943,82 +968,82 @@ defm BUFFER_STORE_DWORDX4 : MUBUF_Pseudo_Stores <
"buffer_store_dwordx4", v4i32, store_global
>;
defm BUFFER_ATOMIC_SWAP : MUBUF_Pseudo_Atomics <
- "buffer_atomic_swap", VGPR_32, i32, atomic_swap_global_32
+ "buffer_atomic_swap", VGPR_32, i32
>;
defm BUFFER_ATOMIC_CMPSWAP : MUBUF_Pseudo_Atomics <
- "buffer_atomic_cmpswap", VReg_64, v2i32, null_frag
+ "buffer_atomic_cmpswap", VReg_64, v2i32
>;
defm BUFFER_ATOMIC_ADD : MUBUF_Pseudo_Atomics <
- "buffer_atomic_add", VGPR_32, i32, atomic_load_add_global_32
+ "buffer_atomic_add", VGPR_32, i32
>;
defm BUFFER_ATOMIC_SUB : MUBUF_Pseudo_Atomics <
- "buffer_atomic_sub", VGPR_32, i32, atomic_load_sub_global_32
+ "buffer_atomic_sub", VGPR_32, i32
>;
defm BUFFER_ATOMIC_SMIN : MUBUF_Pseudo_Atomics <
- "buffer_atomic_smin", VGPR_32, i32, atomic_load_min_global_32
+ "buffer_atomic_smin", VGPR_32, i32
>;
defm BUFFER_ATOMIC_UMIN : MUBUF_Pseudo_Atomics <
- "buffer_atomic_umin", VGPR_32, i32, atomic_load_umin_global_32
+ "buffer_atomic_umin", VGPR_32, i32
>;
defm BUFFER_ATOMIC_SMAX : MUBUF_Pseudo_Atomics <
- "buffer_atomic_smax", VGPR_32, i32, atomic_load_max_global_32
+ "buffer_atomic_smax", VGPR_32, i32
>;
defm BUFFER_ATOMIC_UMAX : MUBUF_Pseudo_Atomics <
- "buffer_atomic_umax", VGPR_32, i32, atomic_load_umax_global_32
+ "buffer_atomic_umax", VGPR_32, i32
>;
defm BUFFER_ATOMIC_AND : MUBUF_Pseudo_Atomics <
- "buffer_atomic_and", VGPR_32, i32, atomic_load_and_global_32
+ "buffer_atomic_and", VGPR_32, i32
>;
defm BUFFER_ATOMIC_OR : MUBUF_Pseudo_Atomics <
- "buffer_atomic_or", VGPR_32, i32, atomic_load_or_global_32
+ "buffer_atomic_or", VGPR_32, i32
>;
defm BUFFER_ATOMIC_XOR : MUBUF_Pseudo_Atomics <
- "buffer_atomic_xor", VGPR_32, i32, atomic_load_xor_global_32
+ "buffer_atomic_xor", VGPR_32, i32
>;
defm BUFFER_ATOMIC_INC : MUBUF_Pseudo_Atomics <
- "buffer_atomic_inc", VGPR_32, i32, atomic_inc_global_32
+ "buffer_atomic_inc", VGPR_32, i32
>;
defm BUFFER_ATOMIC_DEC : MUBUF_Pseudo_Atomics <
- "buffer_atomic_dec", VGPR_32, i32, atomic_dec_global_32
+ "buffer_atomic_dec", VGPR_32, i32
>;
defm BUFFER_ATOMIC_SWAP_X2 : MUBUF_Pseudo_Atomics <
- "buffer_atomic_swap_x2", VReg_64, i64, atomic_swap_global_64
+ "buffer_atomic_swap_x2", VReg_64, i64
>;
defm BUFFER_ATOMIC_CMPSWAP_X2 : MUBUF_Pseudo_Atomics <
- "buffer_atomic_cmpswap_x2", VReg_128, v2i64, null_frag
+ "buffer_atomic_cmpswap_x2", VReg_128, v2i64
>;
defm BUFFER_ATOMIC_ADD_X2 : MUBUF_Pseudo_Atomics <
- "buffer_atomic_add_x2", VReg_64, i64, atomic_load_add_global_64
+ "buffer_atomic_add_x2", VReg_64, i64
>;
defm BUFFER_ATOMIC_SUB_X2 : MUBUF_Pseudo_Atomics <
- "buffer_atomic_sub_x2", VReg_64, i64, atomic_load_sub_global_64
+ "buffer_atomic_sub_x2", VReg_64, i64
>;
defm BUFFER_ATOMIC_SMIN_X2 : MUBUF_Pseudo_Atomics <
- "buffer_atomic_smin_x2", VReg_64, i64, atomic_load_min_global_64
+ "buffer_atomic_smin_x2", VReg_64, i64
>;
defm BUFFER_ATOMIC_UMIN_X2 : MUBUF_Pseudo_Atomics <
- "buffer_atomic_umin_x2", VReg_64, i64, atomic_load_umin_global_64
+ "buffer_atomic_umin_x2", VReg_64, i64
>;
defm BUFFER_ATOMIC_SMAX_X2 : MUBUF_Pseudo_Atomics <
- "buffer_atomic_smax_x2", VReg_64, i64, atomic_load_max_global_64
+ "buffer_atomic_smax_x2", VReg_64, i64
>;
defm BUFFER_ATOMIC_UMAX_X2 : MUBUF_Pseudo_Atomics <
- "buffer_atomic_umax_x2", VReg_64, i64, atomic_load_umax_global_64
+ "buffer_atomic_umax_x2", VReg_64, i64
>;
defm BUFFER_ATOMIC_AND_X2 : MUBUF_Pseudo_Atomics <
- "buffer_atomic_and_x2", VReg_64, i64, atomic_load_and_global_64
+ "buffer_atomic_and_x2", VReg_64, i64
>;
defm BUFFER_ATOMIC_OR_X2 : MUBUF_Pseudo_Atomics <
- "buffer_atomic_or_x2", VReg_64, i64, atomic_load_or_global_64
+ "buffer_atomic_or_x2", VReg_64, i64
>;
defm BUFFER_ATOMIC_XOR_X2 : MUBUF_Pseudo_Atomics <
- "buffer_atomic_xor_x2", VReg_64, i64, atomic_load_xor_global_64
+ "buffer_atomic_xor_x2", VReg_64, i64
>;
defm BUFFER_ATOMIC_INC_X2 : MUBUF_Pseudo_Atomics <
- "buffer_atomic_inc_x2", VReg_64, i64, atomic_inc_global_64
+ "buffer_atomic_inc_x2", VReg_64, i64
>;
defm BUFFER_ATOMIC_DEC_X2 : MUBUF_Pseudo_Atomics <
- "buffer_atomic_dec_x2", VReg_64, i64, atomic_dec_global_64
+ "buffer_atomic_dec_x2", VReg_64, i64
>;
let SubtargetPredicate = HasGFX10_BEncoding in
@@ -1040,7 +1065,7 @@ def BUFFER_WBINVL1_SC : MUBUF_Invalidate <"buffer_wbinvl1_sc",
int_amdgcn_buffer_wbinvl1_sc>;
}
-let SubtargetPredicate = isGFX6GFX7GFX10 in {
+let SubtargetPredicate = isGFX6GFX7GFX10Plus in {
defm BUFFER_ATOMIC_FCMPSWAP : MUBUF_Pseudo_Atomics <
"buffer_atomic_fcmpswap", VReg_64, v2f32, null_frag
@@ -1051,6 +1076,11 @@ defm BUFFER_ATOMIC_FMIN : MUBUF_Pseudo_Atomics <
defm BUFFER_ATOMIC_FMAX : MUBUF_Pseudo_Atomics <
"buffer_atomic_fmax", VGPR_32, f32, null_frag
>;
+
+}
+
+let SubtargetPredicate = isGFX6GFX7GFX10 in {
+
defm BUFFER_ATOMIC_FCMPSWAP_X2 : MUBUF_Pseudo_Atomics <
"buffer_atomic_fcmpswap_x2", VReg_128, v2f64, null_frag
>;
@@ -1109,23 +1139,25 @@ defm BUFFER_STORE_FORMAT_D16_HI_X : MUBUF_Pseudo_Stores <
def BUFFER_WBINVL1 : MUBUF_Invalidate <"buffer_wbinvl1",
int_amdgcn_buffer_wbinvl1>;
-let SubtargetPredicate = HasAtomicFaddInsts in {
-defm BUFFER_ATOMIC_ADD_F32 : MUBUF_Pseudo_Atomics_NO_RTN <
+let SubtargetPredicate = HasAtomicFaddNoRtnInsts in
+defm BUFFER_ATOMIC_ADD_F32 : MUBUF_Pseudo_Atomics_NO_RTN<
"buffer_atomic_add_f32", VGPR_32, f32
>;
+
+let SubtargetPredicate = HasAtomicPkFaddNoRtnInsts in
defm BUFFER_ATOMIC_PK_ADD_F16 : MUBUF_Pseudo_Atomics_NO_RTN <
"buffer_atomic_pk_add_f16", VGPR_32, v2f16
>;
-let OtherPredicates = [isGFX90APlus] in {
-defm BUFFER_ATOMIC_ADD_F32 : MUBUF_Pseudo_Atomics_RTN <
+let OtherPredicates = [HasAtomicFaddRtnInsts] in
+defm BUFFER_ATOMIC_ADD_F32 : MUBUF_Pseudo_Atomics_RTN<
"buffer_atomic_add_f32", VGPR_32, f32, atomic_load_fadd_global_32
>;
+
+let OtherPredicates = [isGFX90APlus] in
defm BUFFER_ATOMIC_PK_ADD_F16 : MUBUF_Pseudo_Atomics_RTN <
"buffer_atomic_pk_add_f16", VGPR_32, v2f16, atomic_load_fadd_v2f16_global_32
>;
-}
-} // End SubtargetPredicate = HasAtomicFaddInsts
//===----------------------------------------------------------------------===//
// MTBUF Instructions
@@ -1175,15 +1207,28 @@ def BUFFER_WBINVL1_VOL : MUBUF_Invalidate <"buffer_wbinvl1_vol",
let SubtargetPredicate = isGFX90APlus in {
def BUFFER_WBL2 : MUBUF_Invalidate<"buffer_wbl2"> {
+ let has_glc = 1;
+ let has_sccb = 1;
+ let InOperandList = (ins CPol_0:$cpol);
+ let AsmOperands = "$cpol";
}
def BUFFER_INVL2 : MUBUF_Invalidate<"buffer_invl2"> {
+ let SubtargetPredicate = isGFX90AOnly;
}
- defm BUFFER_ATOMIC_ADD_F64 : MUBUF_Pseudo_Atomics<"buffer_atomic_add_f64", VReg_64, f64, int_amdgcn_global_atomic_fadd>;
- defm BUFFER_ATOMIC_MIN_F64 : MUBUF_Pseudo_Atomics<"buffer_atomic_min_f64", VReg_64, f64, int_amdgcn_global_atomic_fmin>;
- defm BUFFER_ATOMIC_MAX_F64 : MUBUF_Pseudo_Atomics<"buffer_atomic_max_f64", VReg_64, f64, int_amdgcn_global_atomic_fmax>;
+ defm BUFFER_ATOMIC_ADD_F64 : MUBUF_Pseudo_Atomics<"buffer_atomic_add_f64", VReg_64, f64>;
+ defm BUFFER_ATOMIC_MIN_F64 : MUBUF_Pseudo_Atomics<"buffer_atomic_min_f64", VReg_64, f64>;
+ defm BUFFER_ATOMIC_MAX_F64 : MUBUF_Pseudo_Atomics<"buffer_atomic_max_f64", VReg_64, f64>;
} // End SubtargetPredicate = isGFX90APlus
+def BUFFER_INV : MUBUF_Invalidate<"buffer_inv"> {
+ let SubtargetPredicate = isGFX940Plus;
+ let has_glc = 1;
+ let has_sccb = 1;
+ let InOperandList = (ins CPol_0:$cpol);
+ let AsmOperands = "$cpol";
+}
+
let SubtargetPredicate = isGFX10Plus in {
def BUFFER_GL0_INV : MUBUF_Invalidate<"buffer_gl0_inv">;
def BUFFER_GL1_INV : MUBUF_Invalidate<"buffer_gl1_inv">;
@@ -1364,75 +1409,169 @@ defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_short, i32, "BUFFER_STORE_SHORT">;
// buffer_atomic patterns
//===----------------------------------------------------------------------===//
-multiclass BufferAtomicPatterns<SDPatternOperator name, ValueType vt,
- string opcode> {
+multiclass BufferAtomicPat<string OpPrefix, ValueType vt, string Inst, bit isIntr = 0> {
+ foreach RtnMode = ["ret", "noret"] in {
+
+ defvar Op = !cast<SDPatternOperator>(OpPrefix # "_" # RtnMode
+ # !if(isIntr, "", "_" # vt.Size));
+ defvar InstSuffix = !if(!eq(RtnMode, "ret"), "_RTN", "");
+
+ def : GCNPat<
+ (vt (Op (MUBUFOffset v4i32:$srsrc, i32:$soffset, i16:$offset), vt:$vdata_in)),
+ (!cast<MUBUF_Pseudo>(Inst # "_OFFSET" # InstSuffix) getVregSrcForVT<vt>.ret:$vdata_in,
+ SReg_128:$srsrc, SCSrc_b32:$soffset, offset:$offset)
+ >;
+
+ def : GCNPat<
+ (vt (Op (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, i16:$offset),
+ vt:$vdata_in)),
+ (!cast<MUBUF_Pseudo>(Inst # "_ADDR64" # InstSuffix) getVregSrcForVT<vt>.ret:$vdata_in,
+ VReg_64:$vaddr, SReg_128:$srsrc, SCSrc_b32:$soffset, offset:$offset)
+ >;
+
+ } // end foreach RtnMode
+}
+
+multiclass BufferAtomicIntrPat<string OpPrefix, ValueType vt, string Inst> {
+ defm : BufferAtomicPat<OpPrefix, vt, Inst, /* isIntr */ 1>;
+}
+
+multiclass BufferAtomicCmpSwapPat<ValueType vt, ValueType data_vt, string Inst> {
+ foreach RtnMode = ["ret", "noret"] in {
+
+ defvar Op = !cast<SDPatternOperator>("AMDGPUatomic_cmp_swap_global_" # RtnMode
+ # "_" # vt.Size);
+ defvar InstSuffix = !if(!eq(RtnMode, "ret"), "_RTN", "");
+
+ defvar OffsetResDag = (!cast<MUBUF_Pseudo>(Inst # "_OFFSET" # InstSuffix)
+ getVregSrcForVT<data_vt>.ret:$vdata_in, SReg_128:$srsrc, SCSrc_b32:$soffset,
+ offset:$offset);
+ def : GCNPat<
+ (vt (Op (MUBUFOffset v4i32:$srsrc, i32:$soffset, i16:$offset), data_vt:$vdata_in)),
+ !if(!eq(RtnMode, "ret"),
+ (EXTRACT_SUBREG (vt (COPY_TO_REGCLASS OffsetResDag, getVregSrcForVT<data_vt>.ret)),
+ !if(!eq(vt, i32), sub0, sub0_sub1)),
+ OffsetResDag)
+ >;
+
+ defvar Addr64ResDag = (!cast<MUBUF_Pseudo>(Inst # "_ADDR64" # InstSuffix)
+ getVregSrcForVT<data_vt>.ret:$vdata_in, VReg_64:$vaddr, SReg_128:$srsrc,
+ SCSrc_b32:$soffset, offset:$offset);
+ def : GCNPat<
+ (vt (Op (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, i16:$offset),
+ data_vt:$vdata_in)),
+ !if(!eq(RtnMode, "ret"),
+ (EXTRACT_SUBREG (vt (COPY_TO_REGCLASS Addr64ResDag, getVregSrcForVT<data_vt>.ret)),
+ !if(!eq(vt, i32), sub0, sub0_sub1)),
+ Addr64ResDag)
+ >;
+
+ } // end foreach RtnMode
+}
+
+foreach Ty = [i32, i64] in {
+
+defvar Suffix = !if(!eq(Ty, i64), "_X2", "");
+
+defm : BufferAtomicPat<"atomic_swap_global", Ty, "BUFFER_ATOMIC_SWAP" # Suffix>;
+defm : BufferAtomicPat<"atomic_load_add_global", Ty, "BUFFER_ATOMIC_ADD" # Suffix>;
+defm : BufferAtomicPat<"atomic_load_sub_global", Ty, "BUFFER_ATOMIC_SUB" # Suffix>;
+defm : BufferAtomicPat<"atomic_load_min_global", Ty, "BUFFER_ATOMIC_SMIN" # Suffix>;
+defm : BufferAtomicPat<"atomic_load_umin_global", Ty, "BUFFER_ATOMIC_UMIN" # Suffix>;
+defm : BufferAtomicPat<"atomic_load_max_global", Ty, "BUFFER_ATOMIC_SMAX" # Suffix>;
+defm : BufferAtomicPat<"atomic_load_umax_global", Ty, "BUFFER_ATOMIC_UMAX" # Suffix>;
+defm : BufferAtomicPat<"atomic_load_and_global", Ty, "BUFFER_ATOMIC_AND" # Suffix>;
+defm : BufferAtomicPat<"atomic_load_or_global", Ty, "BUFFER_ATOMIC_OR" # Suffix>;
+defm : BufferAtomicPat<"atomic_load_xor_global", Ty, "BUFFER_ATOMIC_XOR" # Suffix>;
+defm : BufferAtomicPat<"atomic_inc_global", Ty, "BUFFER_ATOMIC_INC" # Suffix>;
+defm : BufferAtomicPat<"atomic_dec_global", Ty, "BUFFER_ATOMIC_DEC" # Suffix>;
+
+} // end foreach Ty
+
+defm : BufferAtomicCmpSwapPat<i32, v2i32, "BUFFER_ATOMIC_CMPSWAP">;
+defm : BufferAtomicCmpSwapPat<i64, v2i64, "BUFFER_ATOMIC_CMPSWAP_X2">;
+
+multiclass SIBufferAtomicPat<string OpPrefix, ValueType vt, string Inst,
+ list<string> RtnModes = ["ret", "noret"]> {
+ foreach RtnMode = RtnModes in {
+
+ defvar Op = !cast<SDPatternOperator>(!if(!eq(RtnMode, "none"),
+ OpPrefix, OpPrefix # "_" # RtnMode));
+ defvar InstSuffix = !if(!or(!eq(RtnMode, "none"), !eq(RtnMode, "ret")),
+ "_RTN", "");
+ defvar CachePolicy = !if(!or(!eq(RtnMode, "none"), !eq(RtnMode, "ret")),
+ (set_glc $cachepolicy), (timm:$cachepolicy));
+
def : GCNPat<
- (vt (name vt:$vdata_in, v4i32:$rsrc, 0, 0, i32:$soffset,
+ (vt (Op vt:$vdata_in, v4i32:$rsrc, 0, 0, i32:$soffset,
timm:$offset, timm:$cachepolicy, 0)),
- (!cast<MUBUF_Pseudo>(opcode # _OFFSET_RTN)
+ (!cast<MUBUF_Pseudo>(Inst # "_OFFSET" # InstSuffix)
getVregSrcForVT<vt>.ret:$vdata_in, SReg_128:$rsrc, SCSrc_b32:$soffset,
- (as_i16timm $offset), (set_glc $cachepolicy))
+ (as_i16timm $offset), CachePolicy)
>;
def : GCNPat<
- (vt (name vt:$vdata_in, v4i32:$rsrc, i32:$vindex, 0, i32:$soffset,
+ (vt (Op vt:$vdata_in, v4i32:$rsrc, i32:$vindex, 0, i32:$soffset,
timm:$offset, timm:$cachepolicy, timm)),
- (!cast<MUBUF_Pseudo>(opcode # _IDXEN_RTN) getVregSrcForVT<vt>.ret:$vdata_in,
- VGPR_32:$vindex, SReg_128:$rsrc, SCSrc_b32:$soffset,
- (as_i16timm $offset), (set_glc $cachepolicy))
+ (!cast<MUBUF_Pseudo>(Inst # "_IDXEN" # InstSuffix)
+ getVregSrcForVT<vt>.ret:$vdata_in, VGPR_32:$vindex, SReg_128:$rsrc,
+ SCSrc_b32:$soffset, (as_i16timm $offset), CachePolicy)
>;
def : GCNPat<
- (vt (name vt:$vdata_in, v4i32:$rsrc, 0, i32:$voffset,
+ (vt (Op vt:$vdata_in, v4i32:$rsrc, 0, i32:$voffset,
i32:$soffset, timm:$offset, timm:$cachepolicy, 0)),
- (!cast<MUBUF_Pseudo>(opcode # _OFFEN_RTN) getVregSrcForVT<vt>.ret:$vdata_in,
- VGPR_32:$voffset, SReg_128:$rsrc, SCSrc_b32:$soffset,
- (as_i16timm $offset), (set_glc $cachepolicy))
+ (!cast<MUBUF_Pseudo>(Inst # "_OFFEN" # InstSuffix)
+ getVregSrcForVT<vt>.ret:$vdata_in, VGPR_32:$voffset, SReg_128:$rsrc,
+ SCSrc_b32:$soffset, (as_i16timm $offset), CachePolicy)
>;
def : GCNPat<
- (vt (name vt:$vdata_in, v4i32:$rsrc, i32:$vindex, i32:$voffset,
+ (vt (Op vt:$vdata_in, v4i32:$rsrc, i32:$vindex, i32:$voffset,
i32:$soffset, timm:$offset, timm:$cachepolicy, timm)),
- (!cast<MUBUF_Pseudo>(opcode # _BOTHEN_RTN)
+ (!cast<MUBUF_Pseudo>(Inst # "_BOTHEN" # InstSuffix)
getVregSrcForVT<vt>.ret:$vdata_in,
(REG_SEQUENCE VReg_64, VGPR_32:$vindex, sub0, VGPR_32:$voffset, sub1),
- SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset),
- (set_glc $cachepolicy))
+ SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset), CachePolicy)
>;
+
+ } // end foreach RtnMode
}
-defm : BufferAtomicPatterns<SIbuffer_atomic_swap, i32, "BUFFER_ATOMIC_SWAP">;
-defm : BufferAtomicPatterns<SIbuffer_atomic_swap, f32, "BUFFER_ATOMIC_SWAP">;
-defm : BufferAtomicPatterns<SIbuffer_atomic_add, i32, "BUFFER_ATOMIC_ADD">;
-defm : BufferAtomicPatterns<SIbuffer_atomic_sub, i32, "BUFFER_ATOMIC_SUB">;
-defm : BufferAtomicPatterns<SIbuffer_atomic_smin, i32, "BUFFER_ATOMIC_SMIN">;
-defm : BufferAtomicPatterns<SIbuffer_atomic_umin, i32, "BUFFER_ATOMIC_UMIN">;
-defm : BufferAtomicPatterns<SIbuffer_atomic_smax, i32, "BUFFER_ATOMIC_SMAX">;
-defm : BufferAtomicPatterns<SIbuffer_atomic_umax, i32, "BUFFER_ATOMIC_UMAX">;
-defm : BufferAtomicPatterns<SIbuffer_atomic_and, i32, "BUFFER_ATOMIC_AND">;
-defm : BufferAtomicPatterns<SIbuffer_atomic_or, i32, "BUFFER_ATOMIC_OR">;
-defm : BufferAtomicPatterns<SIbuffer_atomic_xor, i32, "BUFFER_ATOMIC_XOR">;
-defm : BufferAtomicPatterns<SIbuffer_atomic_inc, i32, "BUFFER_ATOMIC_INC">;
-defm : BufferAtomicPatterns<SIbuffer_atomic_dec, i32, "BUFFER_ATOMIC_DEC">;
-defm : BufferAtomicPatterns<SIbuffer_atomic_csub, i32, "BUFFER_ATOMIC_CSUB">;
-defm : BufferAtomicPatterns<SIbuffer_atomic_swap, i64, "BUFFER_ATOMIC_SWAP_X2">;
-defm : BufferAtomicPatterns<SIbuffer_atomic_add, i64, "BUFFER_ATOMIC_ADD_X2">;
-defm : BufferAtomicPatterns<SIbuffer_atomic_sub, i64, "BUFFER_ATOMIC_SUB_X2">;
-defm : BufferAtomicPatterns<SIbuffer_atomic_smin, i64, "BUFFER_ATOMIC_SMIN_X2">;
-defm : BufferAtomicPatterns<SIbuffer_atomic_umin, i64, "BUFFER_ATOMIC_UMIN_X2">;
-defm : BufferAtomicPatterns<SIbuffer_atomic_smax, i64, "BUFFER_ATOMIC_SMAX_X2">;
-defm : BufferAtomicPatterns<SIbuffer_atomic_umax, i64, "BUFFER_ATOMIC_UMAX_X2">;
-defm : BufferAtomicPatterns<SIbuffer_atomic_and, i64, "BUFFER_ATOMIC_AND_X2">;
-defm : BufferAtomicPatterns<SIbuffer_atomic_or, i64, "BUFFER_ATOMIC_OR_X2">;
-defm : BufferAtomicPatterns<SIbuffer_atomic_xor, i64, "BUFFER_ATOMIC_XOR_X2">;
-defm : BufferAtomicPatterns<SIbuffer_atomic_inc, i64, "BUFFER_ATOMIC_INC_X2">;
-defm : BufferAtomicPatterns<SIbuffer_atomic_dec, i64, "BUFFER_ATOMIC_DEC_X2">;
+defm : SIBufferAtomicPat<"SIbuffer_atomic_swap", i32, "BUFFER_ATOMIC_SWAP">;
+defm : SIBufferAtomicPat<"SIbuffer_atomic_swap", f32, "BUFFER_ATOMIC_SWAP">;
+defm : SIBufferAtomicPat<"SIbuffer_atomic_add", i32, "BUFFER_ATOMIC_ADD">;
+defm : SIBufferAtomicPat<"SIbuffer_atomic_sub", i32, "BUFFER_ATOMIC_SUB">;
+defm : SIBufferAtomicPat<"SIbuffer_atomic_smin", i32, "BUFFER_ATOMIC_SMIN">;
+defm : SIBufferAtomicPat<"SIbuffer_atomic_umin", i32, "BUFFER_ATOMIC_UMIN">;
+defm : SIBufferAtomicPat<"SIbuffer_atomic_smax", i32, "BUFFER_ATOMIC_SMAX">;
+defm : SIBufferAtomicPat<"SIbuffer_atomic_umax", i32, "BUFFER_ATOMIC_UMAX">;
+defm : SIBufferAtomicPat<"SIbuffer_atomic_and", i32, "BUFFER_ATOMIC_AND">;
+defm : SIBufferAtomicPat<"SIbuffer_atomic_or", i32, "BUFFER_ATOMIC_OR">;
+defm : SIBufferAtomicPat<"SIbuffer_atomic_xor", i32, "BUFFER_ATOMIC_XOR">;
+defm : SIBufferAtomicPat<"SIbuffer_atomic_inc", i32, "BUFFER_ATOMIC_INC">;
+defm : SIBufferAtomicPat<"SIbuffer_atomic_dec", i32, "BUFFER_ATOMIC_DEC">;
+defm : SIBufferAtomicPat<"SIbuffer_atomic_csub", i32, "BUFFER_ATOMIC_CSUB", ["none"]>;
+defm : SIBufferAtomicPat<"SIbuffer_atomic_swap", i64, "BUFFER_ATOMIC_SWAP_X2">;
+defm : SIBufferAtomicPat<"SIbuffer_atomic_add", i64, "BUFFER_ATOMIC_ADD_X2">;
+defm : SIBufferAtomicPat<"SIbuffer_atomic_sub", i64, "BUFFER_ATOMIC_SUB_X2">;
+defm : SIBufferAtomicPat<"SIbuffer_atomic_smin", i64, "BUFFER_ATOMIC_SMIN_X2">;
+defm : SIBufferAtomicPat<"SIbuffer_atomic_umin", i64, "BUFFER_ATOMIC_UMIN_X2">;
+defm : SIBufferAtomicPat<"SIbuffer_atomic_smax", i64, "BUFFER_ATOMIC_SMAX_X2">;
+defm : SIBufferAtomicPat<"SIbuffer_atomic_umax", i64, "BUFFER_ATOMIC_UMAX_X2">;
+defm : SIBufferAtomicPat<"SIbuffer_atomic_and", i64, "BUFFER_ATOMIC_AND_X2">;
+defm : SIBufferAtomicPat<"SIbuffer_atomic_or", i64, "BUFFER_ATOMIC_OR_X2">;
+defm : SIBufferAtomicPat<"SIbuffer_atomic_xor", i64, "BUFFER_ATOMIC_XOR_X2">;
+defm : SIBufferAtomicPat<"SIbuffer_atomic_inc", i64, "BUFFER_ATOMIC_INC_X2">;
+defm : SIBufferAtomicPat<"SIbuffer_atomic_dec", i64, "BUFFER_ATOMIC_DEC_X2">;
+let SubtargetPredicate = isGFX6GFX7GFX10Plus in {
+ defm : SIBufferAtomicPat<"SIbuffer_atomic_fmin", f32, "BUFFER_ATOMIC_FMIN">;
+ defm : SIBufferAtomicPat<"SIbuffer_atomic_fmax", f32, "BUFFER_ATOMIC_FMAX">;
+}
let SubtargetPredicate = isGFX6GFX7GFX10 in {
- defm : BufferAtomicPatterns<SIbuffer_atomic_fmin, f32, "BUFFER_ATOMIC_FMIN">;
- defm : BufferAtomicPatterns<SIbuffer_atomic_fmax, f32, "BUFFER_ATOMIC_FMAX">;
- defm : BufferAtomicPatterns<SIbuffer_atomic_fmin, f64, "BUFFER_ATOMIC_FMIN_X2">;
- defm : BufferAtomicPatterns<SIbuffer_atomic_fmax, f64, "BUFFER_ATOMIC_FMAX_X2">;
+ defm : SIBufferAtomicPat<"SIbuffer_atomic_fmin", f64, "BUFFER_ATOMIC_FMIN_X2">;
+ defm : SIBufferAtomicPat<"SIbuffer_atomic_fmax", f64, "BUFFER_ATOMIC_FMAX_X2">;
}
class NoUseBufferAtomic<SDPatternOperator Op, ValueType vt> : PatFrag <
@@ -1482,71 +1621,89 @@ multiclass BufferAtomicPatterns_NO_RTN<SDPatternOperator name, ValueType vt,
>;
}
-let SubtargetPredicate = HasAtomicFaddInsts in {
+let SubtargetPredicate = HasAtomicFaddNoRtnInsts in
defm : BufferAtomicPatterns_NO_RTN<SIbuffer_atomic_fadd, f32, "BUFFER_ATOMIC_ADD_F32">;
+
+let SubtargetPredicate = HasAtomicPkFaddNoRtnInsts in
defm : BufferAtomicPatterns_NO_RTN<SIbuffer_atomic_fadd, v2f16, "BUFFER_ATOMIC_PK_ADD_F16">;
-}
+
+let SubtargetPredicate = HasAtomicFaddRtnInsts in
+ defm : SIBufferAtomicPat<"SIbuffer_atomic_fadd", f32, "BUFFER_ATOMIC_ADD_F32">;
let SubtargetPredicate = isGFX90APlus in {
- defm : BufferAtomicPatterns<SIbuffer_atomic_fadd, f32, "BUFFER_ATOMIC_ADD_F32">;
- defm : BufferAtomicPatterns<SIbuffer_atomic_fadd, v2f16, "BUFFER_ATOMIC_PK_ADD_F16">;
+ defm : BufferAtomicIntrPat<"int_amdgcn_global_atomic_fadd", f64, "BUFFER_ATOMIC_ADD_F64">;
+ defm : BufferAtomicIntrPat<"int_amdgcn_global_atomic_fmin", f64, "BUFFER_ATOMIC_MIN_F64">;
+ defm : BufferAtomicIntrPat<"int_amdgcn_global_atomic_fmax", f64, "BUFFER_ATOMIC_MAX_F64">;
+ defm : SIBufferAtomicPat<"SIbuffer_atomic_fadd", v2f16, "BUFFER_ATOMIC_PK_ADD_F16">;
- defm : BufferAtomicPatterns<SIbuffer_atomic_fadd, f64, "BUFFER_ATOMIC_ADD_F64">;
- defm : BufferAtomicPatterns<SIbuffer_atomic_fmin, f64, "BUFFER_ATOMIC_MIN_F64">;
- defm : BufferAtomicPatterns<SIbuffer_atomic_fmax, f64, "BUFFER_ATOMIC_MAX_F64">;
+ defm : SIBufferAtomicPat<"SIbuffer_atomic_fadd", f64, "BUFFER_ATOMIC_ADD_F64">;
+ defm : SIBufferAtomicPat<"SIbuffer_atomic_fmin", f64, "BUFFER_ATOMIC_MIN_F64">;
+ defm : SIBufferAtomicPat<"SIbuffer_atomic_fmax", f64, "BUFFER_ATOMIC_MAX_F64">;
} // End SubtargetPredicate = isGFX90APlus
+foreach RtnMode = ["ret", "noret"] in {
+
+defvar Op = !cast<SDPatternOperator>(SIbuffer_atomic_cmpswap # "_" # RtnMode);
+defvar InstSuffix = !if(!eq(RtnMode, "ret"), "_RTN", "");
+defvar CachePolicy = !if(!eq(RtnMode, "ret"), (set_glc $cachepolicy),
+ (timm:$cachepolicy));
+
+defvar OffsetResDag = (!cast<MUBUF_Pseudo>("BUFFER_ATOMIC_CMPSWAP_OFFSET" # InstSuffix)
+ (REG_SEQUENCE VReg_64, VGPR_32:$data, sub0, VGPR_32:$cmp, sub1),
+ SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset), CachePolicy);
def : GCNPat<
- (SIbuffer_atomic_cmpswap
+ (Op
i32:$data, i32:$cmp, v4i32:$rsrc, 0, 0, i32:$soffset,
timm:$offset, timm:$cachepolicy, 0),
- (EXTRACT_SUBREG (i64 (COPY_TO_REGCLASS
- (BUFFER_ATOMIC_CMPSWAP_OFFSET_RTN
- (REG_SEQUENCE VReg_64, VGPR_32:$data, sub0, VGPR_32:$cmp, sub1),
- SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset),
- (set_glc $cachepolicy)), VReg_64)), sub0)
+ !if(!eq(RtnMode, "ret"),
+ (EXTRACT_SUBREG (i64 (COPY_TO_REGCLASS OffsetResDag, VReg_64)), sub0),
+ OffsetResDag)
>;
+defvar IdxenResDag = (!cast<MUBUF_Pseudo>("BUFFER_ATOMIC_CMPSWAP_IDXEN" # InstSuffix)
+ (REG_SEQUENCE VReg_64, VGPR_32:$data, sub0, VGPR_32:$cmp, sub1),
+ VGPR_32:$vindex, SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset),
+ CachePolicy);
def : GCNPat<
- (SIbuffer_atomic_cmpswap
+ (Op
i32:$data, i32:$cmp, v4i32:$rsrc, i32:$vindex,
0, i32:$soffset, timm:$offset,
timm:$cachepolicy, timm),
- (EXTRACT_SUBREG (i64 (COPY_TO_REGCLASS
- (BUFFER_ATOMIC_CMPSWAP_IDXEN_RTN
- (REG_SEQUENCE VReg_64, VGPR_32:$data, sub0, VGPR_32:$cmp, sub1),
- VGPR_32:$vindex, SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset),
- (set_glc $cachepolicy)), VReg_64)),
- sub0)
+ !if(!eq(RtnMode, "ret"),
+ (EXTRACT_SUBREG (i64 (COPY_TO_REGCLASS IdxenResDag, VReg_64)), sub0),
+ IdxenResDag)
>;
+defvar OffenResDag = (!cast<MUBUF_Pseudo>("BUFFER_ATOMIC_CMPSWAP_OFFEN" # InstSuffix)
+ (REG_SEQUENCE VReg_64, VGPR_32:$data, sub0, VGPR_32:$cmp, sub1),
+ VGPR_32:$voffset, SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset),
+ CachePolicy);
def : GCNPat<
- (SIbuffer_atomic_cmpswap
+ (Op
i32:$data, i32:$cmp, v4i32:$rsrc, 0,
i32:$voffset, i32:$soffset, timm:$offset,
timm:$cachepolicy, 0),
- (EXTRACT_SUBREG (i64 (COPY_TO_REGCLASS
- (BUFFER_ATOMIC_CMPSWAP_OFFEN_RTN
- (REG_SEQUENCE VReg_64, VGPR_32:$data, sub0, VGPR_32:$cmp, sub1),
- VGPR_32:$voffset, SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset),
- (set_glc $cachepolicy)), VReg_64)),
- sub0)
+ !if(!eq(RtnMode, "ret"),
+ (EXTRACT_SUBREG (i64 (COPY_TO_REGCLASS OffenResDag, VReg_64)), sub0),
+ OffenResDag)
>;
+defvar BothenResDag = (!cast<MUBUF_Pseudo>("BUFFER_ATOMIC_CMPSWAP_BOTHEN" # InstSuffix)
+ (REG_SEQUENCE VReg_64, VGPR_32:$data, sub0, VGPR_32:$cmp, sub1),
+ (REG_SEQUENCE VReg_64, VGPR_32:$vindex, sub0, VGPR_32:$voffset, sub1),
+ SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset), CachePolicy);
def : GCNPat<
- (SIbuffer_atomic_cmpswap
+ (Op
i32:$data, i32:$cmp, v4i32:$rsrc, i32:$vindex,
i32:$voffset, i32:$soffset, timm:$offset,
timm:$cachepolicy, timm),
- (EXTRACT_SUBREG (i64 (COPY_TO_REGCLASS
- (BUFFER_ATOMIC_CMPSWAP_BOTHEN_RTN
- (REG_SEQUENCE VReg_64, VGPR_32:$data, sub0, VGPR_32:$cmp, sub1),
- (REG_SEQUENCE VReg_64, VGPR_32:$vindex, sub0, VGPR_32:$voffset, sub1),
- SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset),
- (set_glc $cachepolicy)), VReg_64)),
- sub0)
+ !if(!eq(RtnMode, "ret"),
+ (EXTRACT_SUBREG (i64 (COPY_TO_REGCLASS BothenResDag, VReg_64)), sub0),
+ BothenResDag)
>;
+} // end foreach RtnMode
+
class MUBUFLoad_PatternADDR64 <MUBUF_Pseudo Instr_ADDR64, ValueType vt,
PatFrag constant_ld> : GCNPat <
(vt (constant_ld (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset,
@@ -1682,8 +1839,12 @@ multiclass MUBUFStore_Atomic_Pattern <MUBUF_Pseudo Instr_ADDR64, MUBUF_Pseudo In
>;
}
let SubtargetPredicate = isGFX6GFX7 in {
-defm : MUBUFStore_Atomic_Pattern <BUFFER_STORE_DWORD_ADDR64, BUFFER_STORE_DWORD_OFFSET, i32, atomic_store_global_32>;
-defm : MUBUFStore_Atomic_Pattern <BUFFER_STORE_DWORDX2_ADDR64, BUFFER_STORE_DWORDX2_OFFSET, i64, atomic_store_global_64>;
+defm : MUBUFStore_Atomic_Pattern <BUFFER_STORE_BYTE_ADDR64, BUFFER_STORE_BYTE_OFFSET, i32, atomic_store_8_global>;
+defm : MUBUFStore_Atomic_Pattern <BUFFER_STORE_BYTE_ADDR64, BUFFER_STORE_BYTE_OFFSET, i16, atomic_store_8_global>;
+defm : MUBUFStore_Atomic_Pattern <BUFFER_STORE_SHORT_ADDR64, BUFFER_STORE_SHORT_OFFSET, i32, atomic_store_16_global>;
+defm : MUBUFStore_Atomic_Pattern <BUFFER_STORE_SHORT_ADDR64, BUFFER_STORE_SHORT_OFFSET, i16, atomic_store_16_global>;
+defm : MUBUFStore_Atomic_Pattern <BUFFER_STORE_DWORD_ADDR64, BUFFER_STORE_DWORD_OFFSET, i32, atomic_store_32_global>;
+defm : MUBUFStore_Atomic_Pattern <BUFFER_STORE_DWORDX2_ADDR64, BUFFER_STORE_DWORDX2_OFFSET, i64, atomic_store_64_global>;
} // End Predicates = isGFX6GFX7
@@ -1731,7 +1892,7 @@ defm : MUBUFScratchStorePat <BUFFER_STORE_DWORDX3_OFFEN, BUFFER_STORE_DWORDX3_OF
defm : MUBUFScratchStorePat <BUFFER_STORE_DWORDX4_OFFEN, BUFFER_STORE_DWORDX4_OFFSET, v4i32, store_private, VReg_128>;
-let OtherPredicates = [D16PreservesUnusedBits, DisableFlatScratch] in {
+let OtherPredicates = [HasD16LoadStore, DisableFlatScratch] in {
// Hiding the extract high pattern in the PatFrag seems to not
// automatically increase the complexity.
let AddedComplexity = 1 in {
@@ -1882,24 +2043,41 @@ let SubtargetPredicate = HasPackedD16VMem in {
//===----------------------------------------------------------------------===//
//===----------------------------------------------------------------------===//
-// Base ENC_MUBUF for GFX6, GFX7, GFX10.
+// Base ENC_MUBUF for GFX6, GFX7, GFX10, GFX11.
//===----------------------------------------------------------------------===//
-class Base_MUBUF_Real_gfx6_gfx7_gfx10<bits<7> op, MUBUF_Pseudo ps, int ef> :
- MUBUF_Real<ps>, Enc64, SIMCInstr<ps.PseudoInstr, ef> {
+class Base_MUBUF_Real_gfx6_gfx7_gfx10_gfx11 <MUBUF_Pseudo ps, int ef,
+ string real_name = ps.Mnemonic> :
+ MUBUF_Real<ps, real_name>, Enc64, SIMCInstr<ps.PseudoInstr, ef> {
let Inst{11-0} = !if(ps.has_offset, offset, ?);
+ let Inst{31-26} = 0x38;
+ let Inst{39-32} = !if(ps.has_vaddr, vaddr, ?);
+ let Inst{47-40} = !if(ps.has_vdata, vdata{7-0}, ?);
+ let Inst{52-48} = !if(ps.has_srsrc, srsrc{6-2}, ?);
+ let Inst{63-56} = !if(ps.has_soffset, soffset, ?);
+}
+
+class MUBUF_Real_gfx11<bits<8> op, MUBUF_Pseudo ps,
+ string real_name = ps.Mnemonic> :
+ Base_MUBUF_Real_gfx6_gfx7_gfx10_gfx11<ps, SIEncodingFamily.GFX11, real_name> {
+ let Inst{12} = !if(ps.has_slc, cpol{CPolBit.SLC}, ?);
+ let Inst{13} = !if(ps.has_dlc, cpol{CPolBit.DLC}, ps.dlc_value);
+ let Inst{14} = !if(ps.has_glc, cpol{CPolBit.GLC}, ps.glc_value);
+ let Inst{25-18} = op;
+ let Inst{53} = !if(ps.has_tfe, tfe, ?);
+ let Inst{54} = ps.offen;
+ let Inst{55} = ps.idxen;
+}
+
+class Base_MUBUF_Real_gfx6_gfx7_gfx10<bits<7> op, MUBUF_Pseudo ps, int ef> :
+ Base_MUBUF_Real_gfx6_gfx7_gfx10_gfx11<ps, ef> {
let Inst{12} = ps.offen;
let Inst{13} = ps.idxen;
let Inst{14} = !if(ps.has_glc, cpol{CPolBit.GLC}, ps.glc_value);
let Inst{16} = ps.lds;
let Inst{24-18} = op;
- let Inst{31-26} = 0x38;
- let Inst{39-32} = !if(ps.has_vaddr, vaddr, ?);
- let Inst{47-40} = !if(ps.has_vdata, vdata{7-0}, ?);
- let Inst{52-48} = !if(ps.has_srsrc, srsrc{6-2}, ?);
let Inst{54} = !if(ps.has_slc, cpol{CPolBit.SLC}, ?);
let Inst{55} = !if(ps.has_tfe, tfe, ?);
- let Inst{63-56} = !if(ps.has_soffset, soffset, ?);
}
class MUBUF_Real_gfx10<bits<8> op, MUBUF_Pseudo ps> :
@@ -1914,10 +2092,155 @@ class MUBUF_Real_gfx6_gfx7<bits<8> op, MUBUF_Pseudo ps> :
}
//===----------------------------------------------------------------------===//
+// MUBUF - GFX11.
+//===----------------------------------------------------------------------===//
+
+let AssemblerPredicate = isGFX11Only, DecoderNamespace = "GFX11" in
+multiclass MUBUF_Real_AllAddr_gfx11_Renamed_Impl<bits<8> op, string real_name> {
+ def _BOTHEN_gfx11 :
+ MUBUF_Real_gfx11<op, !cast<MUBUF_Pseudo>(NAME#"_BOTHEN"), real_name>,
+ AtomicNoRet<NAME # "_BOTHEN_gfx11", 0>;
+ def _IDXEN_gfx11 :
+ MUBUF_Real_gfx11<op, !cast<MUBUF_Pseudo>(NAME#"_IDXEN"), real_name>,
+ AtomicNoRet<NAME # "_IDXEN_gfx11", 0>;
+ def _OFFEN_gfx11 :
+ MUBUF_Real_gfx11<op, !cast<MUBUF_Pseudo>(NAME#"_OFFEN"), real_name>,
+ AtomicNoRet<NAME # "_OFFEN_gfx11", 0>;
+ def _OFFSET_gfx11 :
+ MUBUF_Real_gfx11<op, !cast<MUBUF_Pseudo>(NAME#"_OFFSET"), real_name>,
+ AtomicNoRet<NAME # "_OFFSET_gfx11", 0>;
+}
+
+multiclass MUBUF_Real_AllAddr_gfx11_Impl<bits<8> op, MUBUF_Pseudo ps> :
+ MUBUF_Real_AllAddr_gfx11_Renamed_Impl<op, ps.Mnemonic>;
+multiclass MUBUF_Real_AllAddr_gfx11<bits<8> op> :
+ MUBUF_Real_AllAddr_gfx11_Impl<op, !cast<MUBUF_Pseudo>(NAME#"_BOTHEN")>;
+
+class Pre_gfx11_MUBUF_Name <MUBUF_Pseudo ps, string real_name> :
+ MnemonicAlias<ps.Mnemonic, real_name>, Requires<[isGFX11Plus]>;
+multiclass MUBUF_Real_AllAddr_gfx11_Renamed<bits<8> op, string real_name> :
+ MUBUF_Real_AllAddr_gfx11_Renamed_Impl<op, real_name> {
+ def : Pre_gfx11_MUBUF_Name<!cast<MUBUF_Pseudo>(NAME#"_BOTHEN"), real_name>;
+}
+
+let AssemblerPredicate = isGFX11Only, DecoderNamespace = "GFX11" in
+multiclass MUBUF_Real_Atomics_RTN_gfx11_Renamed<bits<8> op, string real_name> {
+ def _BOTHEN_RTN_gfx11 :
+ MUBUF_Real_gfx11<op, !cast<MUBUF_Pseudo>(NAME#"_BOTHEN_RTN"), real_name>,
+ AtomicNoRet<NAME # "_BOTHEN_gfx11", 1>;
+ def _IDXEN_RTN_gfx11 :
+ MUBUF_Real_gfx11<op, !cast<MUBUF_Pseudo>(NAME#"_IDXEN_RTN"), real_name>,
+ AtomicNoRet<NAME # "_IDXEN_gfx11", 1>;
+ def _OFFEN_RTN_gfx11 :
+ MUBUF_Real_gfx11<op, !cast<MUBUF_Pseudo>(NAME#"_OFFEN_RTN"), real_name>,
+ AtomicNoRet<NAME # "_OFFEN_gfx11", 1>;
+ def _OFFSET_RTN_gfx11 :
+ MUBUF_Real_gfx11<op, !cast<MUBUF_Pseudo>(NAME#"_OFFSET_RTN"), real_name>,
+ AtomicNoRet<NAME # "_OFFSET_gfx11", 1>;
+}
+
+multiclass MUBUF_Real_Atomics_RTN_gfx11_impl<bits<8> op, MUBUF_Pseudo ps> :
+ MUBUF_Real_Atomics_RTN_gfx11_Renamed<op, ps.Mnemonic>;
+multiclass MUBUF_Real_Atomics_RTN_gfx11<bits<8> op> :
+ MUBUF_Real_Atomics_RTN_gfx11_impl<op, !cast<MUBUF_Pseudo>(NAME#"_BOTHEN")>;
+
+multiclass MUBUF_Real_Atomics_gfx11<bits<8> op> :
+ MUBUF_Real_AllAddr_gfx11<op>,
+ MUBUF_Real_Atomics_RTN_gfx11<op>;
+
+multiclass MUBUF_Real_Atomics_gfx11_Renamed<bits<8> op, string real_name> :
+ MUBUF_Real_AllAddr_gfx11_Renamed<op, real_name>,
+ MUBUF_Real_Atomics_RTN_gfx11_Renamed<op, real_name>;
+
+let AssemblerPredicate = isGFX11Only, DecoderNamespace = "GFX11" in {
+def BUFFER_GL0_INV_gfx11 : MUBUF_Real_gfx11<0x02B, BUFFER_GL0_INV>;
+def BUFFER_GL1_INV_gfx11 : MUBUF_Real_gfx11<0x02C, BUFFER_GL1_INV>;
+}
+
+defm BUFFER_LOAD_DWORD : MUBUF_Real_AllAddr_gfx11_Renamed<0x014, "buffer_load_b32">;
+defm BUFFER_LOAD_DWORDX2 : MUBUF_Real_AllAddr_gfx11_Renamed<0x015, "buffer_load_b64">;
+defm BUFFER_LOAD_DWORDX3 : MUBUF_Real_AllAddr_gfx11_Renamed<0x016, "buffer_load_b96">;
+defm BUFFER_LOAD_DWORDX4 : MUBUF_Real_AllAddr_gfx11_Renamed<0x017, "buffer_load_b128">;
+defm BUFFER_LOAD_SHORT_D16 : MUBUF_Real_AllAddr_gfx11_Renamed<0x020, "buffer_load_d16_b16">;
+defm BUFFER_LOAD_FORMAT_D16_X : MUBUF_Real_AllAddr_gfx11_Renamed<0x008, "buffer_load_d16_format_x">;
+defm BUFFER_LOAD_FORMAT_D16_XY : MUBUF_Real_AllAddr_gfx11_Renamed<0x009, "buffer_load_d16_format_xy">;
+defm BUFFER_LOAD_FORMAT_D16_XYZ : MUBUF_Real_AllAddr_gfx11_Renamed<0x00a, "buffer_load_d16_format_xyz">;
+defm BUFFER_LOAD_FORMAT_D16_XYZW : MUBUF_Real_AllAddr_gfx11_Renamed<0x00b, "buffer_load_d16_format_xyzw">;
+defm BUFFER_LOAD_SHORT_D16_HI : MUBUF_Real_AllAddr_gfx11_Renamed<0x023, "buffer_load_d16_hi_b16">;
+defm BUFFER_LOAD_FORMAT_D16_HI_X : MUBUF_Real_AllAddr_gfx11_Renamed<0x026, "buffer_load_d16_hi_format_x">;
+defm BUFFER_LOAD_SBYTE_D16_HI : MUBUF_Real_AllAddr_gfx11_Renamed<0x022, "buffer_load_d16_hi_i8">;
+defm BUFFER_LOAD_UBYTE_D16_HI : MUBUF_Real_AllAddr_gfx11_Renamed<0x021, "buffer_load_d16_hi_u8">;
+defm BUFFER_LOAD_SBYTE_D16 : MUBUF_Real_AllAddr_gfx11_Renamed<0x01f, "buffer_load_d16_i8">;
+defm BUFFER_LOAD_UBYTE_D16 : MUBUF_Real_AllAddr_gfx11_Renamed<0x01e, "buffer_load_d16_u8">;
+defm BUFFER_LOAD_FORMAT_X : MUBUF_Real_AllAddr_gfx11<0x000>;
+defm BUFFER_LOAD_FORMAT_XY : MUBUF_Real_AllAddr_gfx11<0x001>;
+defm BUFFER_LOAD_FORMAT_XYZ : MUBUF_Real_AllAddr_gfx11<0x002>;
+defm BUFFER_LOAD_FORMAT_XYZW : MUBUF_Real_AllAddr_gfx11<0x003>;
+defm BUFFER_LOAD_SBYTE : MUBUF_Real_AllAddr_gfx11_Renamed<0x011, "buffer_load_i8">;
+defm BUFFER_LOAD_SSHORT : MUBUF_Real_AllAddr_gfx11_Renamed<0x013, "buffer_load_i16">;
+defm BUFFER_LOAD_UBYTE : MUBUF_Real_AllAddr_gfx11_Renamed<0x010, "buffer_load_u8">;
+defm BUFFER_LOAD_USHORT : MUBUF_Real_AllAddr_gfx11_Renamed<0x012, "buffer_load_u16">;
+defm BUFFER_LOAD_LDS_B32 : MUBUF_Real_AllAddr_gfx11<0x031>;
+defm BUFFER_LOAD_LDS_FORMAT_X : MUBUF_Real_AllAddr_gfx11<0x032>;
+defm BUFFER_LOAD_LDS_I8 : MUBUF_Real_AllAddr_gfx11<0x02e>;
+defm BUFFER_LOAD_LDS_I16 : MUBUF_Real_AllAddr_gfx11<0x030>;
+defm BUFFER_LOAD_LDS_U8 : MUBUF_Real_AllAddr_gfx11<0x02d>;
+defm BUFFER_LOAD_LDS_U16 : MUBUF_Real_AllAddr_gfx11<0x02f>;
+defm BUFFER_STORE_BYTE : MUBUF_Real_AllAddr_gfx11_Renamed<0x018, "buffer_store_b8">;
+defm BUFFER_STORE_SHORT : MUBUF_Real_AllAddr_gfx11_Renamed<0x019, "buffer_store_b16">;
+defm BUFFER_STORE_DWORD : MUBUF_Real_AllAddr_gfx11_Renamed<0x01A, "buffer_store_b32">;
+defm BUFFER_STORE_DWORDX2 : MUBUF_Real_AllAddr_gfx11_Renamed<0x01B, "buffer_store_b64">;
+defm BUFFER_STORE_DWORDX3 : MUBUF_Real_AllAddr_gfx11_Renamed<0x01C, "buffer_store_b96">;
+defm BUFFER_STORE_DWORDX4 : MUBUF_Real_AllAddr_gfx11_Renamed<0x01D, "buffer_store_b128">;
+defm BUFFER_STORE_FORMAT_D16_X : MUBUF_Real_AllAddr_gfx11_Renamed<0x00C, "buffer_store_d16_format_x">;
+defm BUFFER_STORE_FORMAT_D16_XY : MUBUF_Real_AllAddr_gfx11_Renamed<0x00D, "buffer_store_d16_format_xy">;
+defm BUFFER_STORE_FORMAT_D16_XYZ : MUBUF_Real_AllAddr_gfx11_Renamed<0x00E, "buffer_store_d16_format_xyz">;
+defm BUFFER_STORE_FORMAT_D16_XYZW : MUBUF_Real_AllAddr_gfx11_Renamed<0x00F, "buffer_store_d16_format_xyzw">;
+defm BUFFER_STORE_BYTE_D16_HI : MUBUF_Real_AllAddr_gfx11_Renamed<0x024, "buffer_store_d16_hi_b8">;
+defm BUFFER_STORE_SHORT_D16_HI : MUBUF_Real_AllAddr_gfx11_Renamed<0x025, "buffer_store_d16_hi_b16">;
+defm BUFFER_STORE_FORMAT_D16_HI_X : MUBUF_Real_AllAddr_gfx11_Renamed<0x027, "buffer_store_d16_hi_format_x">;
+defm BUFFER_STORE_FORMAT_X : MUBUF_Real_AllAddr_gfx11<0x004>;
+defm BUFFER_STORE_FORMAT_XY : MUBUF_Real_AllAddr_gfx11<0x005>;
+defm BUFFER_STORE_FORMAT_XYZ : MUBUF_Real_AllAddr_gfx11<0x006>;
+defm BUFFER_STORE_FORMAT_XYZW : MUBUF_Real_AllAddr_gfx11<0x007>;
+defm BUFFER_ATOMIC_ADD_F32 : MUBUF_Real_Atomics_gfx11<0x056>;
+defm BUFFER_ATOMIC_ADD : MUBUF_Real_Atomics_gfx11_Renamed<0x035, "buffer_atomic_add_u32">;
+defm BUFFER_ATOMIC_ADD_X2 : MUBUF_Real_Atomics_gfx11_Renamed<0x043, "buffer_atomic_add_u64">;
+defm BUFFER_ATOMIC_AND : MUBUF_Real_Atomics_gfx11_Renamed<0x03C, "buffer_atomic_and_b32">;
+defm BUFFER_ATOMIC_AND_X2 : MUBUF_Real_Atomics_gfx11_Renamed<0x049, "buffer_atomic_and_b64">;
+defm BUFFER_ATOMIC_CMPSWAP : MUBUF_Real_Atomics_gfx11_Renamed<0x034, "buffer_atomic_cmpswap_b32">;
+defm BUFFER_ATOMIC_CMPSWAP_X2 : MUBUF_Real_Atomics_gfx11_Renamed<0x042, "buffer_atomic_cmpswap_b64">;
+defm BUFFER_ATOMIC_FCMPSWAP : MUBUF_Real_Atomics_gfx11_Renamed<0x050, "buffer_atomic_cmpswap_f32">;
+defm BUFFER_ATOMIC_CSUB : MUBUF_Real_Atomics_RTN_gfx11_Renamed<0x037, "buffer_atomic_csub_u32">;
+def : MnemonicAlias<"buffer_atomic_csub", "buffer_atomic_csub_u32">, Requires<[isGFX11Plus]>;
+defm BUFFER_ATOMIC_DEC : MUBUF_Real_Atomics_gfx11_Renamed<0x040, "buffer_atomic_dec_u32">;
+defm BUFFER_ATOMIC_DEC_X2 : MUBUF_Real_Atomics_gfx11_Renamed<0x04D, "buffer_atomic_dec_u64">;
+defm BUFFER_ATOMIC_INC : MUBUF_Real_Atomics_gfx11_Renamed<0x03F, "buffer_atomic_inc_u32">;
+defm BUFFER_ATOMIC_INC_X2 : MUBUF_Real_Atomics_gfx11_Renamed<0x04C, "buffer_atomic_inc_u64">;
+defm BUFFER_ATOMIC_FMAX : MUBUF_Real_Atomics_gfx11_Renamed<0x052, "buffer_atomic_max_f32">;
+defm BUFFER_ATOMIC_SMAX : MUBUF_Real_Atomics_gfx11_Renamed<0x03A, "buffer_atomic_max_i32">;
+defm BUFFER_ATOMIC_SMAX_X2 : MUBUF_Real_Atomics_gfx11_Renamed<0x047, "buffer_atomic_max_i64">;
+defm BUFFER_ATOMIC_UMAX : MUBUF_Real_Atomics_gfx11_Renamed<0x03B, "buffer_atomic_max_u32">;
+defm BUFFER_ATOMIC_UMAX_X2 : MUBUF_Real_Atomics_gfx11_Renamed<0x048, "buffer_atomic_max_u64">;
+defm BUFFER_ATOMIC_FMIN : MUBUF_Real_Atomics_gfx11_Renamed<0x051, "buffer_atomic_min_f32">;
+defm BUFFER_ATOMIC_SMIN : MUBUF_Real_Atomics_gfx11_Renamed<0x038, "buffer_atomic_min_i32">;
+defm BUFFER_ATOMIC_SMIN_X2 : MUBUF_Real_Atomics_gfx11_Renamed<0x045, "buffer_atomic_min_i64">;
+defm BUFFER_ATOMIC_UMIN : MUBUF_Real_Atomics_gfx11_Renamed<0x039, "buffer_atomic_min_u32">;
+defm BUFFER_ATOMIC_UMIN_X2 : MUBUF_Real_Atomics_gfx11_Renamed<0x046, "buffer_atomic_min_u64">;
+defm BUFFER_ATOMIC_OR : MUBUF_Real_Atomics_gfx11_Renamed<0x03D, "buffer_atomic_or_b32">;
+defm BUFFER_ATOMIC_OR_X2 : MUBUF_Real_Atomics_gfx11_Renamed<0x04A, "buffer_atomic_or_b64">;
+defm BUFFER_ATOMIC_SUB : MUBUF_Real_Atomics_gfx11_Renamed<0x036, "buffer_atomic_sub_u32">;
+defm BUFFER_ATOMIC_SUB_X2 : MUBUF_Real_Atomics_gfx11_Renamed<0x044, "buffer_atomic_sub_u64">;
+defm BUFFER_ATOMIC_SWAP : MUBUF_Real_Atomics_gfx11_Renamed<0x033, "buffer_atomic_swap_b32">;
+defm BUFFER_ATOMIC_SWAP_X2 : MUBUF_Real_Atomics_gfx11_Renamed<0x041, "buffer_atomic_swap_b64">;
+defm BUFFER_ATOMIC_XOR : MUBUF_Real_Atomics_gfx11_Renamed<0x03E, "buffer_atomic_xor_b32">;
+defm BUFFER_ATOMIC_XOR_X2 : MUBUF_Real_Atomics_gfx11_Renamed<0x04B, "buffer_atomic_xor_b64">;
+
+//===----------------------------------------------------------------------===//
// MUBUF - GFX10.
//===----------------------------------------------------------------------===//
-let AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" in {
+let AssemblerPredicate = isGFX10Only, DecoderNamespace = "GFX10" in {
multiclass MUBUF_Real_AllAddr_gfx10<bits<8> op> {
def _BOTHEN_gfx10 :
MUBUF_Real_gfx10<op, !cast<MUBUF_Pseudo>(NAME#"_BOTHEN")>;
@@ -1929,23 +2252,15 @@ let AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" in {
MUBUF_Real_gfx10<op, !cast<MUBUF_Pseudo>(NAME#"_OFFSET")>;
}
multiclass MUBUF_Real_AllAddr_Lds_gfx10<bits<8> op> {
- def _OFFSET_gfx10 : MUBUF_Real_gfx10<op, !cast<MUBUF_Pseudo>(NAME#"_OFFSET")>,
- MUBUFLdsTable<0, NAME # "_OFFSET_gfx10">;
- def _OFFEN_gfx10 : MUBUF_Real_gfx10<op, !cast<MUBUF_Pseudo>(NAME#"_OFFEN")>,
- MUBUFLdsTable<0, NAME # "_OFFEN_gfx10">;
- def _IDXEN_gfx10 : MUBUF_Real_gfx10<op, !cast<MUBUF_Pseudo>(NAME#"_IDXEN")>,
- MUBUFLdsTable<0, NAME # "_IDXEN_gfx10">;
- def _BOTHEN_gfx10 : MUBUF_Real_gfx10<op, !cast<MUBUF_Pseudo>(NAME#"_BOTHEN")>,
- MUBUFLdsTable<0, NAME # "_BOTHEN_gfx10">;
+ def _OFFSET_gfx10 : MUBUF_Real_gfx10<op, !cast<MUBUF_Pseudo>(NAME#"_OFFSET")>;
+ def _OFFEN_gfx10 : MUBUF_Real_gfx10<op, !cast<MUBUF_Pseudo>(NAME#"_OFFEN")>;
+ def _IDXEN_gfx10 : MUBUF_Real_gfx10<op, !cast<MUBUF_Pseudo>(NAME#"_IDXEN")>;
+ def _BOTHEN_gfx10 : MUBUF_Real_gfx10<op, !cast<MUBUF_Pseudo>(NAME#"_BOTHEN")>;
- def _LDS_OFFSET_gfx10 : MUBUF_Real_gfx10<op, !cast<MUBUF_Pseudo>(NAME#"_LDS_OFFSET")>,
- MUBUFLdsTable<1, NAME # "_OFFSET_gfx10">;
- def _LDS_OFFEN_gfx10 : MUBUF_Real_gfx10<op, !cast<MUBUF_Pseudo>(NAME#"_LDS_OFFEN")>,
- MUBUFLdsTable<1, NAME # "_OFFEN_gfx10">;
- def _LDS_IDXEN_gfx10 : MUBUF_Real_gfx10<op, !cast<MUBUF_Pseudo>(NAME#"_LDS_IDXEN")>,
- MUBUFLdsTable<1, NAME # "_IDXEN_gfx10">;
- def _LDS_BOTHEN_gfx10 : MUBUF_Real_gfx10<op, !cast<MUBUF_Pseudo>(NAME#"_LDS_BOTHEN")>,
- MUBUFLdsTable<1, NAME # "_BOTHEN_gfx10">;
+ def _LDS_OFFSET_gfx10 : MUBUF_Real_gfx10<op, !cast<MUBUF_Pseudo>(NAME#"_LDS_OFFSET")>;
+ def _LDS_OFFEN_gfx10 : MUBUF_Real_gfx10<op, !cast<MUBUF_Pseudo>(NAME#"_LDS_OFFEN")>;
+ def _LDS_IDXEN_gfx10 : MUBUF_Real_gfx10<op, !cast<MUBUF_Pseudo>(NAME#"_LDS_IDXEN")>;
+ def _LDS_BOTHEN_gfx10 : MUBUF_Real_gfx10<op, !cast<MUBUF_Pseudo>(NAME#"_LDS_BOTHEN")>;
}
multiclass MUBUF_Real_Atomics_RTN_gfx10<bits<8> op> {
def _BOTHEN_RTN_gfx10 :
@@ -1976,7 +2291,7 @@ let AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" in {
MUBUF_Real_gfx10<op, !cast<MUBUF_Pseudo>(NAME#"_OFFSET")>,
AtomicNoRet<NAME # "_OFFSET_gfx10", 0>;
}
-} // End AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10"
+} // End AssemblerPredicate = isGFX10Only, DecoderNamespace = "GFX10"
defm BUFFER_STORE_BYTE_D16_HI : MUBUF_Real_AllAddr_gfx10<0x019>;
defm BUFFER_STORE_SHORT_D16_HI : MUBUF_Real_AllAddr_gfx10<0x01b>;
@@ -2033,27 +2348,17 @@ let AssemblerPredicate = isGFX6GFX7, DecoderNamespace = "GFX6GFX7" in {
MUBUF_Real_gfx6_gfx7<op, !cast<MUBUF_Pseudo>(NAME#"_OFFSET")>;
}
multiclass MUBUF_Real_AllAddr_Lds_gfx6_gfx7<bits<8> op> {
- def _OFFSET_gfx6_gfx7 : MUBUF_Real_gfx6_gfx7<op, !cast<MUBUF_Pseudo>(NAME#"_OFFSET")>,
- MUBUFLdsTable<0, NAME # "_OFFSET_gfx6_gfx7">;
- def _ADDR64_gfx6_gfx7 : MUBUF_Real_gfx6_gfx7<op, !cast<MUBUF_Pseudo>(NAME#"_ADDR64")>,
- MUBUFLdsTable<0, NAME # "_ADDR64_gfx6_gfx7">;
- def _OFFEN_gfx6_gfx7 : MUBUF_Real_gfx6_gfx7<op, !cast<MUBUF_Pseudo>(NAME#"_OFFEN")>,
- MUBUFLdsTable<0, NAME # "_OFFEN_gfx6_gfx7">;
- def _IDXEN_gfx6_gfx7 : MUBUF_Real_gfx6_gfx7<op, !cast<MUBUF_Pseudo>(NAME#"_IDXEN")>,
- MUBUFLdsTable<0, NAME # "_IDXEN_gfx6_gfx7">;
- def _BOTHEN_gfx6_gfx7 : MUBUF_Real_gfx6_gfx7<op, !cast<MUBUF_Pseudo>(NAME#"_BOTHEN")>,
- MUBUFLdsTable<0, NAME # "_BOTHEN_gfx6_gfx7">;
+ def _OFFSET_gfx6_gfx7 : MUBUF_Real_gfx6_gfx7<op, !cast<MUBUF_Pseudo>(NAME#"_OFFSET")>;
+ def _ADDR64_gfx6_gfx7 : MUBUF_Real_gfx6_gfx7<op, !cast<MUBUF_Pseudo>(NAME#"_ADDR64")>;
+ def _OFFEN_gfx6_gfx7 : MUBUF_Real_gfx6_gfx7<op, !cast<MUBUF_Pseudo>(NAME#"_OFFEN")>;
+ def _IDXEN_gfx6_gfx7 : MUBUF_Real_gfx6_gfx7<op, !cast<MUBUF_Pseudo>(NAME#"_IDXEN")>;
+ def _BOTHEN_gfx6_gfx7 : MUBUF_Real_gfx6_gfx7<op, !cast<MUBUF_Pseudo>(NAME#"_BOTHEN")>;
- def _LDS_OFFSET_gfx6_gfx7 : MUBUF_Real_gfx6_gfx7<op, !cast<MUBUF_Pseudo>(NAME#"_LDS_OFFSET")>,
- MUBUFLdsTable<1, NAME # "_OFFSET_gfx6_gfx7">;
- def _LDS_ADDR64_gfx6_gfx7 : MUBUF_Real_gfx6_gfx7<op, !cast<MUBUF_Pseudo>(NAME#"_LDS_ADDR64")>,
- MUBUFLdsTable<1, NAME # "_ADDR64_gfx6_gfx7">;
- def _LDS_OFFEN_gfx6_gfx7 : MUBUF_Real_gfx6_gfx7<op, !cast<MUBUF_Pseudo>(NAME#"_LDS_OFFEN")>,
- MUBUFLdsTable<1, NAME # "_OFFEN_gfx6_gfx7">;
- def _LDS_IDXEN_gfx6_gfx7 : MUBUF_Real_gfx6_gfx7<op, !cast<MUBUF_Pseudo>(NAME#"_LDS_IDXEN")>,
- MUBUFLdsTable<1, NAME # "_IDXEN_gfx6_gfx7">;
- def _LDS_BOTHEN_gfx6_gfx7 : MUBUF_Real_gfx6_gfx7<op, !cast<MUBUF_Pseudo>(NAME#"_LDS_BOTHEN")>,
- MUBUFLdsTable<1, NAME # "_BOTHEN_gfx6_gfx7">;
+ def _LDS_OFFSET_gfx6_gfx7 : MUBUF_Real_gfx6_gfx7<op, !cast<MUBUF_Pseudo>(NAME#"_LDS_OFFSET")>;
+ def _LDS_ADDR64_gfx6_gfx7 : MUBUF_Real_gfx6_gfx7<op, !cast<MUBUF_Pseudo>(NAME#"_LDS_ADDR64")>;
+ def _LDS_OFFEN_gfx6_gfx7 : MUBUF_Real_gfx6_gfx7<op, !cast<MUBUF_Pseudo>(NAME#"_LDS_OFFEN")>;
+ def _LDS_IDXEN_gfx6_gfx7 : MUBUF_Real_gfx6_gfx7<op, !cast<MUBUF_Pseudo>(NAME#"_LDS_IDXEN")>;
+ def _LDS_BOTHEN_gfx6_gfx7 : MUBUF_Real_gfx6_gfx7<op, !cast<MUBUF_Pseudo>(NAME#"_LDS_BOTHEN")>;
}
multiclass MUBUF_Real_Atomics_gfx6_gfx7<bits<8> op> {
def _ADDR64_gfx6_gfx7 :
@@ -2167,26 +2472,89 @@ defm BUFFER_WBINVL1_VOL : MUBUF_Real_gfx7<0x070>;
def BUFFER_WBINVL1_gfx6_gfx7 : MUBUF_Real_gfx6_gfx7<0x071, BUFFER_WBINVL1>;
//===----------------------------------------------------------------------===//
-// Base ENC_MTBUF for GFX6, GFX7, GFX10.
+// Base ENC_MTBUF for GFX6, GFX7, GFX10, GFX11.
//===----------------------------------------------------------------------===//
-class Base_MTBUF_Real_gfx6_gfx7_gfx10<bits<3> op, MTBUF_Pseudo ps, int ef> :
- MTBUF_Real<ps>, Enc64, SIMCInstr<ps.PseudoInstr, ef> {
+class Base_MTBUF_Real_gfx6_gfx7_gfx10_gfx11<MTBUF_Pseudo ps, int ef,
+ string real_name = ps.Mnemonic> :
+ MTBUF_Real<ps, real_name>, Enc64, SIMCInstr<ps.PseudoInstr, ef> {
let Inst{11-0} = !if(ps.has_offset, offset, ?);
- let Inst{12} = ps.offen;
- let Inst{13} = ps.idxen;
let Inst{14} = !if(ps.has_glc, cpol{CPolBit.GLC}, ps.glc_value);
- let Inst{18-16} = op;
let Inst{31-26} = 0x3a; //encoding
let Inst{39-32} = !if(ps.has_vaddr, vaddr, ?);
let Inst{47-40} = !if(ps.has_vdata, vdata{7-0}, ?);
let Inst{52-48} = !if(ps.has_srsrc, srsrc{6-2}, ?);
+ let Inst{63-56} = !if(ps.has_soffset, soffset, ?);
+}
+
+class Base_MTBUF_Real_gfx11<bits<4> op, MTBUF_Pseudo ps,
+ string real_name = ps.Mnemonic> :
+ Base_MTBUF_Real_gfx6_gfx7_gfx10_gfx11<ps, SIEncodingFamily.GFX11, real_name> {
+ let Inst{12} = !if(ps.has_slc, cpol{CPolBit.SLC}, ?);
+ let Inst{13} = !if(ps.has_dlc, cpol{CPolBit.DLC}, ps.dlc_value);
+ let Inst{18-15} = op;
+ let Inst{25-19} = format;
+ let Inst{53} = !if(ps.has_tfe, tfe, ?);
+ let Inst{54} = ps.offen;
+ let Inst{55} = ps.idxen;
+}
+
+class Base_MTBUF_Real_gfx6_gfx7_gfx10<bits<3> op, MTBUF_Pseudo ps, int ef> :
+ Base_MTBUF_Real_gfx6_gfx7_gfx10_gfx11<ps, ef> {
+ let Inst{12} = ps.offen;
+ let Inst{13} = ps.idxen;
+ let Inst{18-16} = op;
let Inst{54} = !if(ps.has_slc, cpol{CPolBit.SLC}, ?);
let Inst{55} = !if(ps.has_tfe, tfe, ?);
- let Inst{63-56} = !if(ps.has_soffset, soffset, ?);
}
//===----------------------------------------------------------------------===//
+// MTBUF - GFX11.
+//===----------------------------------------------------------------------===//
+
+let AssemblerPredicate = isGFX11Only, DecoderNamespace = "GFX11" in
+multiclass MTBUF_Real_AllAddr_gfx11_Renamed_Impl<bits<4> op, string real_name> {
+ def _BOTHEN_gfx11 :
+ Base_MTBUF_Real_gfx11<op, !cast<MTBUF_Pseudo>(NAME#"_BOTHEN"), real_name>;
+ def _IDXEN_gfx11 :
+ Base_MTBUF_Real_gfx11<op, !cast<MTBUF_Pseudo>(NAME#"_IDXEN"), real_name>;
+ def _OFFEN_gfx11 :
+ Base_MTBUF_Real_gfx11<op, !cast<MTBUF_Pseudo>(NAME#"_OFFEN"), real_name>;
+ def _OFFSET_gfx11 :
+ Base_MTBUF_Real_gfx11<op, !cast<MTBUF_Pseudo>(NAME#"_OFFSET"), real_name>;
+}
+
+multiclass MTBUF_Real_AllAddr_gfx11_Impl<bits<4> op, MTBUF_Pseudo ps>
+ : MTBUF_Real_AllAddr_gfx11_Renamed_Impl<op, ps.Mnemonic>;
+multiclass MTBUF_Real_AllAddr_gfx11<bits<4> op>
+ : MTBUF_Real_AllAddr_gfx11_Impl<op, !cast<MTBUF_Pseudo>(NAME#"_BOTHEN")>;
+
+
+class Pre_gfx11_MTBUF_Name <MTBUF_Pseudo ps, string real_name>
+ : MnemonicAlias<ps.Mnemonic, real_name>, Requires<[isGFX11Plus]>;
+multiclass MTBUF_Real_AllAddr_gfx11_Renamed<bits<4> op, string real_name>
+ : MTBUF_Real_AllAddr_gfx11_Renamed_Impl<op, real_name> {
+ def : Pre_gfx11_MTBUF_Name<!cast<MTBUF_Pseudo>(NAME#"_BOTHEN"), real_name>;
+}
+
+defm TBUFFER_LOAD_FORMAT_D16_X : MTBUF_Real_AllAddr_gfx11_Renamed<0x008, "tbuffer_load_d16_format_x">;
+defm TBUFFER_LOAD_FORMAT_D16_XY : MTBUF_Real_AllAddr_gfx11_Renamed<0x009, "tbuffer_load_d16_format_xy">;
+defm TBUFFER_LOAD_FORMAT_D16_XYZ : MTBUF_Real_AllAddr_gfx11_Renamed<0x00a, "tbuffer_load_d16_format_xyz">;
+defm TBUFFER_LOAD_FORMAT_D16_XYZW : MTBUF_Real_AllAddr_gfx11_Renamed<0x00b, "tbuffer_load_d16_format_xyzw">;
+defm TBUFFER_LOAD_FORMAT_X : MTBUF_Real_AllAddr_gfx11<0x000>;
+defm TBUFFER_LOAD_FORMAT_XY : MTBUF_Real_AllAddr_gfx11<0x001>;
+defm TBUFFER_LOAD_FORMAT_XYZ : MTBUF_Real_AllAddr_gfx11<0x002>;
+defm TBUFFER_LOAD_FORMAT_XYZW : MTBUF_Real_AllAddr_gfx11<0x003>;
+defm TBUFFER_STORE_FORMAT_D16_X : MTBUF_Real_AllAddr_gfx11_Renamed<0x00c, "tbuffer_store_d16_format_x">;
+defm TBUFFER_STORE_FORMAT_D16_XY : MTBUF_Real_AllAddr_gfx11_Renamed<0x00d, "tbuffer_store_d16_format_xy">;
+defm TBUFFER_STORE_FORMAT_D16_XYZ : MTBUF_Real_AllAddr_gfx11_Renamed<0x00e, "tbuffer_store_d16_format_xyz">;
+defm TBUFFER_STORE_FORMAT_D16_XYZW : MTBUF_Real_AllAddr_gfx11_Renamed<0x00f, "tbuffer_store_d16_format_xyzw">;
+defm TBUFFER_STORE_FORMAT_X : MTBUF_Real_AllAddr_gfx11<0x004>;
+defm TBUFFER_STORE_FORMAT_XY : MTBUF_Real_AllAddr_gfx11<0x005>;
+defm TBUFFER_STORE_FORMAT_XYZ : MTBUF_Real_AllAddr_gfx11<0x006>;
+defm TBUFFER_STORE_FORMAT_XYZW : MTBUF_Real_AllAddr_gfx11<0x007>;
+
+//===----------------------------------------------------------------------===//
// MTBUF - GFX10.
//===----------------------------------------------------------------------===//
@@ -2197,7 +2565,7 @@ class MTBUF_Real_gfx10<bits<4> op, MTBUF_Pseudo ps> :
let Inst{53} = op{3};
}
-let AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" in {
+let AssemblerPredicate = isGFX10Only, DecoderNamespace = "GFX10" in {
multiclass MTBUF_Real_AllAddr_gfx10<bits<4> op> {
def _BOTHEN_gfx10 :
MTBUF_Real_gfx10<op, !cast<MTBUF_Pseudo>(NAME#"_BOTHEN")>;
@@ -2208,7 +2576,7 @@ let AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" in {
def _OFFSET_gfx10 :
MTBUF_Real_gfx10<op, !cast<MTBUF_Pseudo>(NAME#"_OFFSET")>;
}
-} // End AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10"
+} // End AssemblerPredicate = isGFX10Only, DecoderNamespace = "GFX10"
defm TBUFFER_LOAD_FORMAT_D16_X : MTBUF_Real_AllAddr_gfx10<0x008>;
defm TBUFFER_LOAD_FORMAT_D16_XY : MTBUF_Real_AllAddr_gfx10<0x009>;
@@ -2303,9 +2671,28 @@ class MUBUF_Real_gfx90a <bits<7> op, MUBUF_Pseudo ps,
let Inst{55} = acc;
}
+class MUBUF_Real_gfx940 <bits<7> op, MUBUF_Pseudo ps> :
+ MUBUF_Real_Base_vi<op, ps, SIEncodingFamily.GFX940> {
+ let AssemblerPredicate = isGFX940Plus;
+ let DecoderNamespace = "GFX9";
+ let AsmString = ps.Mnemonic # !subst("$tfe", "", ps.AsmOperands);
+
+ let Inst{55} = acc;
+}
+
multiclass MUBUF_Real_vi_gfx90a<bits<7> op, MUBUF_Pseudo ps> {
def _vi : MUBUF_Real_vi<op, ps>;
- def _gfx90a : MUBUF_Real_gfx90a<op, ps, !and(ps.has_sccb,!not(ps.FPAtomic))>;
+
+ foreach _ = BoolToList<!not(ps.FPAtomic)>.ret in
+ def _gfx90a : MUBUF_Real_gfx90a<op, ps>;
+
+ foreach _ = BoolToList<ps.FPAtomic>.ret in {
+ def _gfx90a : MUBUF_Real_gfx90a<op, ps, 0> {
+ let SubtargetPredicate = isGFX90AOnly;
+ let AssemblerPredicate = isGFX90AOnly;
+ }
+ def _gfx940 : MUBUF_Real_gfx940<op, ps>;
+ }
}
multiclass MUBUF_Real_AllAddr_vi<bits<7> op> {
@@ -2317,41 +2704,25 @@ multiclass MUBUF_Real_AllAddr_vi<bits<7> op> {
multiclass MUBUF_Real_AllAddr_Lds_vi<bits<7> op> {
- def _OFFSET_vi : MUBUF_Real_vi <op, !cast<MUBUF_Pseudo>(NAME#"_OFFSET")>,
- MUBUFLdsTable<0, NAME # "_OFFSET_vi">;
- def _OFFEN_vi : MUBUF_Real_vi <op, !cast<MUBUF_Pseudo>(NAME#"_OFFEN")>,
- MUBUFLdsTable<0, NAME # "_OFFEN_vi">;
- def _IDXEN_vi : MUBUF_Real_vi <op, !cast<MUBUF_Pseudo>(NAME#"_IDXEN")>,
- MUBUFLdsTable<0, NAME # "_IDXEN_vi">;
- def _BOTHEN_vi : MUBUF_Real_vi <op, !cast<MUBUF_Pseudo>(NAME#"_BOTHEN")>,
- MUBUFLdsTable<0, NAME # "_BOTHEN_vi">;
+ def _OFFSET_vi : MUBUF_Real_vi <op, !cast<MUBUF_Pseudo>(NAME#"_OFFSET")>;
+ def _OFFEN_vi : MUBUF_Real_vi <op, !cast<MUBUF_Pseudo>(NAME#"_OFFEN")>;
+ def _IDXEN_vi : MUBUF_Real_vi <op, !cast<MUBUF_Pseudo>(NAME#"_IDXEN")>;
+ def _BOTHEN_vi : MUBUF_Real_vi <op, !cast<MUBUF_Pseudo>(NAME#"_BOTHEN")>;
- def _LDS_OFFSET_vi : MUBUF_Real_vi <op, !cast<MUBUF_Pseudo>(NAME#"_LDS_OFFSET")>,
- MUBUFLdsTable<1, NAME # "_OFFSET_vi">;
- def _LDS_OFFEN_vi : MUBUF_Real_vi <op, !cast<MUBUF_Pseudo>(NAME#"_LDS_OFFEN")>,
- MUBUFLdsTable<1, NAME # "_OFFEN_vi">;
- def _LDS_IDXEN_vi : MUBUF_Real_vi <op, !cast<MUBUF_Pseudo>(NAME#"_LDS_IDXEN")>,
- MUBUFLdsTable<1, NAME # "_IDXEN_vi">;
- def _LDS_BOTHEN_vi : MUBUF_Real_vi <op, !cast<MUBUF_Pseudo>(NAME#"_LDS_BOTHEN")>,
- MUBUFLdsTable<1, NAME # "_BOTHEN_vi">;
+ def _LDS_OFFSET_vi : MUBUF_Real_vi <op, !cast<MUBUF_Pseudo>(NAME#"_LDS_OFFSET")>;
+ def _LDS_OFFEN_vi : MUBUF_Real_vi <op, !cast<MUBUF_Pseudo>(NAME#"_LDS_OFFEN")>;
+ def _LDS_IDXEN_vi : MUBUF_Real_vi <op, !cast<MUBUF_Pseudo>(NAME#"_LDS_IDXEN")>;
+ def _LDS_BOTHEN_vi : MUBUF_Real_vi <op, !cast<MUBUF_Pseudo>(NAME#"_LDS_BOTHEN")>;
- def _OFFSET_gfx90a : MUBUF_Real_gfx90a <op, !cast<MUBUF_Pseudo>(NAME#"_OFFSET")>,
- MUBUFLdsTable<0, NAME # "_OFFSET_gfx90a">;
- def _OFFEN_gfx90a : MUBUF_Real_gfx90a <op, !cast<MUBUF_Pseudo>(NAME#"_OFFEN")>,
- MUBUFLdsTable<0, NAME # "_OFFEN_gfx90a">;
- def _IDXEN_gfx90a : MUBUF_Real_gfx90a <op, !cast<MUBUF_Pseudo>(NAME#"_IDXEN")>,
- MUBUFLdsTable<0, NAME # "_IDXEN_gfx90a">;
- def _BOTHEN_gfx90a : MUBUF_Real_gfx90a <op, !cast<MUBUF_Pseudo>(NAME#"_BOTHEN")>,
- MUBUFLdsTable<0, NAME # "_BOTHEN_gfx90a">;
+ def _OFFSET_gfx90a : MUBUF_Real_gfx90a <op, !cast<MUBUF_Pseudo>(NAME#"_OFFSET")>;
+ def _OFFEN_gfx90a : MUBUF_Real_gfx90a <op, !cast<MUBUF_Pseudo>(NAME#"_OFFEN")>;
+ def _IDXEN_gfx90a : MUBUF_Real_gfx90a <op, !cast<MUBUF_Pseudo>(NAME#"_IDXEN")>;
+ def _BOTHEN_gfx90a : MUBUF_Real_gfx90a <op, !cast<MUBUF_Pseudo>(NAME#"_BOTHEN")>;
- def _LDS_OFFSET_gfx90a : MUBUF_Real_gfx90a <op, !cast<MUBUF_Pseudo>(NAME#"_LDS_OFFSET")>,
- MUBUFLdsTable<1, NAME # "_OFFSET_gfx90a">;
- def _LDS_OFFEN_gfx90a : MUBUF_Real_gfx90a <op, !cast<MUBUF_Pseudo>(NAME#"_LDS_OFFEN")>,
- MUBUFLdsTable<1, NAME # "_OFFEN_gfx90a">;
- def _LDS_IDXEN_gfx90a : MUBUF_Real_gfx90a <op, !cast<MUBUF_Pseudo>(NAME#"_LDS_IDXEN")>,
- MUBUFLdsTable<1, NAME # "_IDXEN_gfx90a">;
- def _LDS_BOTHEN_gfx90a : MUBUF_Real_gfx90a <op, !cast<MUBUF_Pseudo>(NAME#"_LDS_BOTHEN")>,
- MUBUFLdsTable<1, NAME # "_BOTHEN_gfx90a">;
+ def _LDS_OFFSET_gfx90a : MUBUF_Real_gfx90a <op, !cast<MUBUF_Pseudo>(NAME#"_LDS_OFFSET")>;
+ def _LDS_OFFEN_gfx90a : MUBUF_Real_gfx90a <op, !cast<MUBUF_Pseudo>(NAME#"_LDS_OFFEN")>;
+ def _LDS_IDXEN_gfx90a : MUBUF_Real_gfx90a <op, !cast<MUBUF_Pseudo>(NAME#"_LDS_IDXEN")>;
+ def _LDS_BOTHEN_gfx90a : MUBUF_Real_gfx90a <op, !cast<MUBUF_Pseudo>(NAME#"_LDS_BOTHEN")>;
}
class MUBUF_Real_gfx80 <bits<7> op, MUBUF_Pseudo ps> :
@@ -2424,9 +2795,9 @@ defm BUFFER_LOAD_SBYTE : MUBUF_Real_AllAddr_Lds_vi <0x11>;
defm BUFFER_LOAD_USHORT : MUBUF_Real_AllAddr_Lds_vi <0x12>;
defm BUFFER_LOAD_SSHORT : MUBUF_Real_AllAddr_Lds_vi <0x13>;
defm BUFFER_LOAD_DWORD : MUBUF_Real_AllAddr_Lds_vi <0x14>;
-defm BUFFER_LOAD_DWORDX2 : MUBUF_Real_AllAddr_Lds_vi <0x15>;
-defm BUFFER_LOAD_DWORDX3 : MUBUF_Real_AllAddr_Lds_vi <0x16>;
-defm BUFFER_LOAD_DWORDX4 : MUBUF_Real_AllAddr_Lds_vi <0x17>;
+defm BUFFER_LOAD_DWORDX2 : MUBUF_Real_AllAddr_vi <0x15>;
+defm BUFFER_LOAD_DWORDX3 : MUBUF_Real_AllAddr_vi <0x16>;
+defm BUFFER_LOAD_DWORDX4 : MUBUF_Real_AllAddr_vi <0x17>;
defm BUFFER_STORE_BYTE : MUBUF_Real_AllAddr_vi <0x18>;
defm BUFFER_STORE_BYTE_D16_HI : MUBUF_Real_AllAddr_vi <0x19>;
defm BUFFER_STORE_SHORT : MUBUF_Real_AllAddr_vi <0x1a>;
@@ -2481,12 +2852,12 @@ def BUFFER_WBINVL1_vi : MUBUF_Real_vi <0x3e, BUFFER_WBINVL1>;
def BUFFER_WBINVL1_VOL_vi : MUBUF_Real_vi <0x3f, BUFFER_WBINVL1_VOL>;
} // End AssemblerPredicate = isGFX8GFX9
-let SubtargetPredicate = HasAtomicFaddInsts in {
+let SubtargetPredicate = HasAtomicFaddNoRtnInsts in {
defm BUFFER_ATOMIC_ADD_F32 : MUBUF_Real_Atomic_vi <0x4d>;
defm BUFFER_ATOMIC_PK_ADD_F16 : MUBUF_Real_Atomic_vi <0x4e>;
-} // End SubtargetPredicate = HasAtomicFaddInsts
+} // End SubtargetPredicate = HasAtomicFaddNoRtnInsts
let SubtargetPredicate = isGFX90APlus in {
defm BUFFER_ATOMIC_ADD_F64 : MUBUF_Real_Atomic_vi<0x4f>;
@@ -2495,9 +2866,17 @@ let SubtargetPredicate = isGFX90APlus in {
} // End SubtargetPredicate = isGFX90APlus, AssemblerPredicate = isGFX90APlus
def BUFFER_WBL2_gfx90a : MUBUF_Real_gfx90a<0x28, BUFFER_WBL2> {
+ let AsmString = BUFFER_WBL2.Mnemonic; // drop flags
+ let AssemblerPredicate = isGFX90AOnly;
+ let SubtargetPredicate = isGFX90AOnly;
}
def BUFFER_INVL2_gfx90a : MUBUF_Real_gfx90a<0x29, BUFFER_INVL2>;
+let SubtargetPredicate = isGFX940Plus in {
+def BUFFER_WBL2_gfx940 : MUBUF_Real_gfx940<0x28, BUFFER_WBL2>;
+def BUFFER_INV_gfx940 : MUBUF_Real_gfx940<0x29, BUFFER_INV>;
+}
+
class MTBUF_Real_Base_vi <bits<4> op, MTBUF_Pseudo ps, int Enc> :
MTBUF_Real<ps>,
Enc64,
diff --git a/llvm/lib/Target/AMDGPU/DSInstructions.td b/llvm/lib/Target/AMDGPU/DSInstructions.td
index c4043177b618..27b723875aa4 100644
--- a/llvm/lib/Target/AMDGPU/DSInstructions.td
+++ b/llvm/lib/Target/AMDGPU/DSInstructions.td
@@ -52,8 +52,8 @@ class DS_Pseudo <string opName, dag outs, dag ins, string asmOps, list<dag> patt
let Uses = !if(has_m0_read, [M0, EXEC], [EXEC]);
}
-class DS_Real <DS_Pseudo ps> :
- InstSI <ps.OutOperandList, ps.InOperandList, ps.Mnemonic # ps.AsmOperands, []>,
+class DS_Real <DS_Pseudo ps, string opName = ps.Mnemonic> :
+ InstSI <ps.OutOperandList, ps.InOperandList, opName # ps.AsmOperands>,
Enc64 {
let isPseudo = 0;
@@ -72,6 +72,9 @@ class DS_Real <DS_Pseudo ps> :
let IsAtomicRet = ps.IsAtomicRet;
let IsAtomicNoRet = ps.IsAtomicNoRet;
+ let Constraints = ps.Constraints;
+ let DisableEncoding = ps.DisableEncoding;
+
// encoding fields
bits<10> vdst;
bits<1> gds;
@@ -172,6 +175,22 @@ multiclass DS_1A2D_Off8_NORET_mc <string opName, RegisterClass rc = VGPR_32> {
}
}
+class DS_0A1D_RET_GDS<string opName, RegisterClass rc = VGPR_32, RegisterClass src = rc,
+ RegisterOperand dst_op = getLdStRegisterOperand<rc>.ret,
+ RegisterOperand src_op = getLdStRegisterOperand<src>.ret>
+: DS_Pseudo<opName,
+ (outs dst_op:$vdst),
+ (ins src_op:$data0, offset:$offset),
+ " $vdst, $data0$offset gds"> {
+
+ let has_addr = 0;
+ let has_data1 = 0;
+ let has_gds = 0;
+ let gdsValue = 1;
+ let AsmMatchConverter = "cvtDSGds";
+ let hasSideEffects = 1;
+}
+
class DS_1A1D_RET <string opName, RegisterClass rc = VGPR_32,
RegisterOperand data_op = getLdStRegisterOperand<rc>.ret>
: DS_Pseudo<opName,
@@ -462,6 +481,22 @@ let SubtargetPredicate = isGFX90APlus in {
defm DS_ADD_RTN_F64 : DS_1A1D_RET_mc_gfx9<"ds_add_rtn_f64", VReg_64, "ds_add_f64">;
} // End SubtargetPredicate = isGFX90APlus
+let SubtargetPredicate = isGFX940Plus in {
+ defm DS_PK_ADD_F16 : DS_1A1D_NORET_mc_gfx9<"ds_pk_add_f16">;
+ defm DS_PK_ADD_RTN_F16 : DS_1A1D_RET_mc_gfx9<"ds_pk_add_rtn_f16", VGPR_32, "ds_pk_add_f16">;
+ defm DS_PK_ADD_BF16 : DS_1A1D_NORET_mc_gfx9<"ds_pk_add_bf16">;
+ defm DS_PK_ADD_RTN_BF16 : DS_1A1D_RET_mc_gfx9<"ds_pk_add_rtn_bf16", VGPR_32, "ds_pk_add_bf16">;
+} // End SubtargetPredicate = isGFX940Plus
+
+defm DS_CMPSTORE_B32 : DS_1A2D_NORET_mc<"ds_cmpstore_b32">;
+defm DS_CMPSTORE_F32 : DS_1A2D_NORET_mc<"ds_cmpstore_f32">;
+defm DS_CMPSTORE_B64 : DS_1A2D_NORET_mc<"ds_cmpstore_b64", VReg_64>;
+defm DS_CMPSTORE_F64 : DS_1A2D_NORET_mc<"ds_cmpstore_f64", VReg_64>;
+defm DS_CMPSTORE_RTN_B32 : DS_1A2D_RET_mc<"ds_cmpstore_rtn_b32", VGPR_32, "ds_cmpstore_b32">;
+defm DS_CMPSTORE_RTN_F32 : DS_1A2D_RET_mc<"ds_cmpstore_rtn_f32", VGPR_32, "ds_cmpstore_f32">;
+defm DS_CMPSTORE_RTN_B64 : DS_1A2D_RET_mc<"ds_cmpstore_rtn_b64", VReg_64, "ds_cmpstore_b64">;
+defm DS_CMPSTORE_RTN_F64 : DS_1A2D_RET_mc<"ds_cmpstore_rtn_f64", VReg_64, "ds_cmpstore_f64">;
+
defm DS_MSKOR_B32 : DS_1A2D_NORET_mc<"ds_mskor_b32">;
defm DS_CMPST_B32 : DS_1A2D_NORET_mc<"ds_cmpst_b32">;
defm DS_CMPST_F32 : DS_1A2D_NORET_mc<"ds_cmpst_f32">;
@@ -619,6 +654,8 @@ def DS_READ_ADDTID_B32 : DS_0A_RET<"ds_read_addtid_b32">;
def DS_CONSUME : DS_0A_RET<"ds_consume">;
def DS_APPEND : DS_0A_RET<"ds_append">;
+
+let SubtargetPredicate = isNotGFX90APlus in
def DS_ORDERED_COUNT : DS_1A_RET_GDS<"ds_ordered_count">;
//===----------------------------------------------------------------------===//
@@ -667,6 +704,18 @@ let SubtargetPredicate = HasLDSFPAtomicAdd, OtherPredicates = [HasDsSrc2Insts] i
def DS_ADD_SRC2_F32 : DS_1A<"ds_add_src2_f32">;
}
+
+//===----------------------------------------------------------------------===//
+// Instruction definitions for GFX11 and newer.
+//===----------------------------------------------------------------------===//
+
+let SubtargetPredicate = isGFX11Plus in {
+
+def DS_ADD_GS_REG_RTN : DS_0A1D_RET_GDS<"ds_add_gs_reg_rtn", VReg_64, VGPR_32>;
+def DS_SUB_GS_REG_RTN : DS_0A1D_RET_GDS<"ds_sub_gs_reg_rtn", VReg_64, VGPR_32>;
+
+} // let SubtargetPredicate = isGFX11Plus
+
//===----------------------------------------------------------------------===//
// DS Patterns
//===----------------------------------------------------------------------===//
@@ -777,14 +826,14 @@ foreach vt = Reg32Types.types in {
defm : DSWritePat_mc <DS_WRITE_B32, vt, "store_local">;
}
-defm : DSAtomicWritePat_mc <DS_WRITE_B8, i16, "atomic_store_local_8">;
-defm : DSAtomicWritePat_mc <DS_WRITE_B8, i32, "atomic_store_local_8">;
-defm : DSAtomicWritePat_mc <DS_WRITE_B16, i16, "atomic_store_local_16">;
-defm : DSAtomicWritePat_mc <DS_WRITE_B16, i32, "atomic_store_local_16">;
-defm : DSAtomicWritePat_mc <DS_WRITE_B32, i32, "atomic_store_local_32">;
-defm : DSAtomicWritePat_mc <DS_WRITE_B64, i64, "atomic_store_local_64">;
+defm : DSAtomicWritePat_mc <DS_WRITE_B8, i16, "atomic_store_8_local">;
+defm : DSAtomicWritePat_mc <DS_WRITE_B8, i32, "atomic_store_8_local">;
+defm : DSAtomicWritePat_mc <DS_WRITE_B16, i16, "atomic_store_16_local">;
+defm : DSAtomicWritePat_mc <DS_WRITE_B16, i32, "atomic_store_16_local">;
+defm : DSAtomicWritePat_mc <DS_WRITE_B32, i32, "atomic_store_32_local">;
+defm : DSAtomicWritePat_mc <DS_WRITE_B64, i64, "atomic_store_64_local">;
-let OtherPredicates = [D16PreservesUnusedBits] in {
+let OtherPredicates = [HasD16LoadStore] in {
def : DSWritePat <DS_WRITE_B16_D16_HI, i32, store_hi16_local>;
def : DSWritePat <DS_WRITE_B8_D16_HI, i32, truncstorei8_hi16_local>;
}
@@ -870,15 +919,30 @@ defm : DSWritePat_mc <DS_WRITE_B128, vt, "store_align16_local">;
let SubtargetPredicate = HasUnalignedAccessMode in {
-// FIXME: From performance point of view, is ds_read_b96/ds_write_b96 better choice
-// for unaligned accesses?
+// Select 64-bit loads and stores with an alignment of less than 4 bytes as a
+// single ds_read_b64/ds_write_b64 instruction, as this is faster than the
+// ds_read2_b32/ds_write2_b32 pair that would be used otherwise. Each b32
+// access would still be misaligned, but there would be two of them.
+foreach vt = VReg_64.RegTypes in {
+defm : DSReadPat_mc <DS_READ_B64, vt, "load_align_less_than_4_local">;
+defm : DSWritePat_mc <DS_WRITE_B64, vt, "store_align_less_than_4_local">;
+}
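+// Illustrative sketch only (the IR below is an assumed example, not part of
+// these patterns): an under-aligned 64-bit local access such as
+//   %v = load i64, i64 addrspace(3)* %p, align 2
+// is expected to match load_align_less_than_4_local and select to a single
+// ds_read_b64 rather than a ds_read2_b32 pair; the 128-bit patterns further
+// below do the same with ds_read_b128/ds_write_b128.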
+
+// Selection will split most unaligned 3-dword accesses for performance reasons
+// when that is beneficial. Keep these two patterns for the remaining cases.
foreach vt = VReg_96.RegTypes in {
defm : DSReadPat_mc <DS_READ_B96, vt, "load_local">;
defm : DSWritePat_mc <DS_WRITE_B96, vt, "store_local">;
}
-// For performance reasons, *do not* select ds_read_b128/ds_write_b128 for unaligned
-// accesses.
+// Select 128-bit loads and stores with an alignment of less than 4 bytes as a
+// single ds_read_b128/ds_write_b128 instruction, as this is faster than the
+// ds_read2_b64/ds_write2_b64 pair that would be used otherwise. Each b64
+// access would still be misaligned, but there would be two of them.
+foreach vt = VReg_128.RegTypes in {
+defm : DSReadPat_mc <DS_READ_B128, vt, "load_align_less_than_4_local">;
+defm : DSWritePat_mc <DS_WRITE_B128, vt, "store_align_less_than_4_local">;
+}
} // End SubtargetPredicate = HasUnalignedAccessMode
@@ -904,69 +968,143 @@ multiclass DSAtomicRetPat_mc<DS_Pseudo inst, ValueType vt, string frag> {
def : DSAtomicRetPat<inst, vt, !cast<PatFrag>(frag#"_region_m0_"#vt.Size), 1>;
}
+multiclass DSAtomicRetNoRetPat_mc<DS_Pseudo inst, DS_Pseudo noRetInst,
+ ValueType vt, string frag> {
+ let OtherPredicates = [LDSRequiresM0Init] in {
+ def : DSAtomicRetPat<inst, vt,
+ !cast<PatFrag>(frag#"_local_m0_ret_"#vt.Size)>;
+ def : DSAtomicRetPat<noRetInst, vt,
+ !cast<PatFrag>(frag#"_local_m0_noret_"#vt.Size)>;
+ }
+ let OtherPredicates = [NotLDSRequiresM0Init] in {
+ def : DSAtomicRetPat<!cast<DS_Pseudo>(!cast<string>(inst)#"_gfx9"), vt,
+ !cast<PatFrag>(frag#"_local_ret_"#vt.Size)>;
+ def : DSAtomicRetPat<!cast<DS_Pseudo>(!cast<string>(noRetInst)#"_gfx9"), vt,
+ !cast<PatFrag>(frag#"_local_noret_"#vt.Size)>;
+ }
-class DSAtomicCmpXChg<DS_Pseudo inst, ValueType vt, PatFrag frag, bit gds=0> : GCNPat <
+ def : DSAtomicRetPat<inst, vt,
+ !cast<PatFrag>(frag#"_region_m0_ret_"#vt.Size), 1>;
+ def : DSAtomicRetPat<noRetInst, vt,
+ !cast<PatFrag>(frag#"_region_m0_noret_"#vt.Size), 1>;
+}
+
+
+
+let SubtargetPredicate = isGFX6GFX7GFX8GFX9GFX10 in {
+// Caution: the order of src and cmp is the *opposite* of the order used by the BUFFER_ATOMIC_CMPSWAP opcode.
+class DSAtomicCmpXChgSwapped<DS_Pseudo inst, ValueType vt, PatFrag frag, bit gds=0> : GCNPat <
(frag (DS1Addr1Offset i32:$ptr, i16:$offset), vt:$cmp, vt:$swap),
(inst $ptr, getVregSrcForVT<vt>.ret:$cmp, getVregSrcForVT<vt>.ret:$swap, offset:$offset, (i1 gds))
>;
-multiclass DSAtomicCmpXChg_mc<DS_Pseudo inst, ValueType vt, string frag> {
+multiclass DSAtomicCmpXChgSwapped_mc<DS_Pseudo inst, DS_Pseudo noRetInst, ValueType vt,
+ string frag> {
let OtherPredicates = [LDSRequiresM0Init] in {
- def : DSAtomicCmpXChg<inst, vt, !cast<PatFrag>(frag#"_local_m0_"#vt.Size)>;
+ def : DSAtomicCmpXChgSwapped<inst, vt, !cast<PatFrag>(frag#"_local_m0_ret_"#vt.Size)>;
+ def : DSAtomicCmpXChgSwapped<noRetInst, vt, !cast<PatFrag>(frag#"_local_m0_noret_"#vt.Size)>;
}
let OtherPredicates = [NotLDSRequiresM0Init] in {
- def : DSAtomicCmpXChg<!cast<DS_Pseudo>(!cast<string>(inst)#"_gfx9"), vt,
- !cast<PatFrag>(frag#"_local_"#vt.Size)>;
+ def : DSAtomicCmpXChgSwapped<!cast<DS_Pseudo>(!cast<string>(inst)#"_gfx9"), vt,
+ !cast<PatFrag>(frag#"_local_ret_"#vt.Size)>;
+ def : DSAtomicCmpXChgSwapped<!cast<DS_Pseudo>(!cast<string>(noRetInst)#"_gfx9"), vt,
+ !cast<PatFrag>(frag#"_local_noret_"#vt.Size)>;
}
- def : DSAtomicCmpXChg<inst, vt, !cast<PatFrag>(frag#"_region_m0_"#vt.Size), 1>;
+ def : DSAtomicCmpXChgSwapped<inst, vt, !cast<PatFrag>(frag#"_region_m0_ret_"#vt.Size), 1>;
+ def : DSAtomicCmpXChgSwapped<noRetInst, vt, !cast<PatFrag>(frag#"_region_m0_noret_"#vt.Size), 1>;
}
+} // End SubtargetPredicate = isGFX6GFX7GFX8GFX9GFX10
+let SubtargetPredicate = isGFX11Plus in {
+// The order of src and cmp agrees with the BUFFER_ATOMIC_CMPSWAP opcode.
+class DSAtomicCmpXChg<DS_Pseudo inst, ValueType vt, PatFrag frag, bit gds=0> : GCNPat <
+ (frag (DS1Addr1Offset i32:$ptr, i16:$offset), vt:$cmp, vt:$swap),
+ (inst $ptr, getVregSrcForVT<vt>.ret:$swap, getVregSrcForVT<vt>.ret:$cmp, offset:$offset, (i1 gds))
+>;
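+// To spell out the difference (a summary of the two classes above, not an
+// additional pattern): for (frag $ptr, $cmp, $swap), the pre-GFX11
+// DSAtomicCmpXChgSwapped class emits (inst $ptr, $cmp, $swap, ...), i.e.
+// data0 = compare value and data1 = new value, while the GFX11 DSAtomicCmpXChg
+// class emits (inst $ptr, $swap, $cmp, ...), matching the BUFFER_ATOMIC_CMPSWAP
+// operand order.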
+multiclass DSAtomicCmpXChg_mc<DS_Pseudo inst, DS_Pseudo noRetInst, ValueType vt, string frag> {
+
+ def : DSAtomicCmpXChg<!cast<DS_Pseudo>(!cast<string>(inst)#"_gfx9"), vt,
+ !cast<PatFrag>(frag#"_local_ret_"#vt.Size)>;
+ def : DSAtomicCmpXChg<!cast<DS_Pseudo>(!cast<string>(noRetInst)#"_gfx9"), vt,
+ !cast<PatFrag>(frag#"_local_noret_"#vt.Size)>;
+
+ def : DSAtomicCmpXChg<inst, vt, !cast<PatFrag>(frag#"_region_m0_ret_"#vt.Size), 1>;
+ def : DSAtomicCmpXChg<noRetInst, vt, !cast<PatFrag>(frag#"_region_m0_noret_"#vt.Size), 1>;
+}
+} // End SubtargetPredicate = isGFX11Plus
// 32-bit atomics.
defm : DSAtomicRetPat_mc<DS_WRXCHG_RTN_B32, i32, "atomic_swap">;
-defm : DSAtomicRetPat_mc<DS_ADD_RTN_U32, i32, "atomic_load_add">;
-defm : DSAtomicRetPat_mc<DS_SUB_RTN_U32, i32, "atomic_load_sub">;
-defm : DSAtomicRetPat_mc<DS_INC_RTN_U32, i32, "atomic_inc">;
-defm : DSAtomicRetPat_mc<DS_DEC_RTN_U32, i32, "atomic_dec">;
-defm : DSAtomicRetPat_mc<DS_AND_RTN_B32, i32, "atomic_load_and">;
-defm : DSAtomicRetPat_mc<DS_OR_RTN_B32, i32, "atomic_load_or">;
-defm : DSAtomicRetPat_mc<DS_XOR_RTN_B32, i32, "atomic_load_xor">;
-defm : DSAtomicRetPat_mc<DS_MIN_RTN_I32, i32, "atomic_load_min">;
-defm : DSAtomicRetPat_mc<DS_MAX_RTN_I32, i32, "atomic_load_max">;
-defm : DSAtomicRetPat_mc<DS_MIN_RTN_U32, i32, "atomic_load_umin">;
-defm : DSAtomicRetPat_mc<DS_MAX_RTN_U32, i32, "atomic_load_umax">;
-defm : DSAtomicRetPat_mc<DS_MIN_RTN_F32, f32, "atomic_load_fmin">;
-defm : DSAtomicRetPat_mc<DS_MAX_RTN_F32, f32, "atomic_load_fmax">;
-defm : DSAtomicCmpXChg_mc<DS_CMPST_RTN_B32, i32, "atomic_cmp_swap">;
+defm : DSAtomicRetNoRetPat_mc<DS_ADD_RTN_U32, DS_ADD_U32, i32, "atomic_load_add">;
+defm : DSAtomicRetNoRetPat_mc<DS_SUB_RTN_U32, DS_SUB_U32, i32, "atomic_load_sub">;
+defm : DSAtomicRetNoRetPat_mc<DS_INC_RTN_U32, DS_INC_U32, i32, "atomic_inc">;
+defm : DSAtomicRetNoRetPat_mc<DS_DEC_RTN_U32, DS_DEC_U32, i32, "atomic_dec">;
+defm : DSAtomicRetNoRetPat_mc<DS_AND_RTN_B32, DS_AND_B32, i32, "atomic_load_and">;
+defm : DSAtomicRetNoRetPat_mc<DS_OR_RTN_B32, DS_OR_B32, i32, "atomic_load_or">;
+defm : DSAtomicRetNoRetPat_mc<DS_XOR_RTN_B32, DS_XOR_B32, i32, "atomic_load_xor">;
+defm : DSAtomicRetNoRetPat_mc<DS_MIN_RTN_I32, DS_MIN_I32, i32, "atomic_load_min">;
+defm : DSAtomicRetNoRetPat_mc<DS_MAX_RTN_I32, DS_MAX_I32, i32, "atomic_load_max">;
+defm : DSAtomicRetNoRetPat_mc<DS_MIN_RTN_U32, DS_MIN_U32, i32, "atomic_load_umin">;
+defm : DSAtomicRetNoRetPat_mc<DS_MAX_RTN_U32, DS_MAX_U32, i32, "atomic_load_umax">;
+defm : DSAtomicRetNoRetPat_mc<DS_MIN_RTN_F32, DS_MIN_F32, f32, "atomic_load_fmin">;
+defm : DSAtomicRetNoRetPat_mc<DS_MAX_RTN_F32, DS_MAX_F32, f32, "atomic_load_fmax">;
+
+let SubtargetPredicate = isGFX6GFX7GFX8GFX9GFX10 in {
+defm : DSAtomicCmpXChgSwapped_mc<DS_CMPST_RTN_B32, DS_CMPST_B32, i32, "atomic_cmp_swap">;
+}
+
+let SubtargetPredicate = isGFX11Plus in {
+defm : DSAtomicCmpXChg_mc<DS_CMPSTORE_RTN_B32, DS_CMPSTORE_B32, i32, "atomic_cmp_swap">;
+}
let SubtargetPredicate = HasLDSFPAtomicAdd in {
-defm : DSAtomicRetPat_mc<DS_ADD_RTN_F32, f32, "atomic_load_fadd">;
+defm : DSAtomicRetNoRetPat_mc<DS_ADD_RTN_F32, DS_ADD_F32, f32, "atomic_load_fadd">;
}
// 64-bit atomics.
defm : DSAtomicRetPat_mc<DS_WRXCHG_RTN_B64, i64, "atomic_swap">;
-defm : DSAtomicRetPat_mc<DS_ADD_RTN_U64, i64, "atomic_load_add">;
-defm : DSAtomicRetPat_mc<DS_SUB_RTN_U64, i64, "atomic_load_sub">;
-defm : DSAtomicRetPat_mc<DS_INC_RTN_U64, i64, "atomic_inc">;
-defm : DSAtomicRetPat_mc<DS_DEC_RTN_U64, i64, "atomic_dec">;
-defm : DSAtomicRetPat_mc<DS_AND_RTN_B64, i64, "atomic_load_and">;
-defm : DSAtomicRetPat_mc<DS_OR_RTN_B64, i64, "atomic_load_or">;
-defm : DSAtomicRetPat_mc<DS_XOR_RTN_B64, i64, "atomic_load_xor">;
-defm : DSAtomicRetPat_mc<DS_MIN_RTN_I64, i64, "atomic_load_min">;
-defm : DSAtomicRetPat_mc<DS_MAX_RTN_I64, i64, "atomic_load_max">;
-defm : DSAtomicRetPat_mc<DS_MIN_RTN_U64, i64, "atomic_load_umin">;
-defm : DSAtomicRetPat_mc<DS_MAX_RTN_U64, i64, "atomic_load_umax">;
-defm : DSAtomicRetPat_mc<DS_MIN_RTN_F64, f64, "atomic_load_fmin">;
-defm : DSAtomicRetPat_mc<DS_MAX_RTN_F64, f64, "atomic_load_fmax">;
+defm : DSAtomicRetNoRetPat_mc<DS_ADD_RTN_U64, DS_ADD_U64, i64, "atomic_load_add">;
+defm : DSAtomicRetNoRetPat_mc<DS_SUB_RTN_U64, DS_SUB_U64, i64, "atomic_load_sub">;
+defm : DSAtomicRetNoRetPat_mc<DS_INC_RTN_U64, DS_INC_U64, i64, "atomic_inc">;
+defm : DSAtomicRetNoRetPat_mc<DS_DEC_RTN_U64, DS_DEC_U64, i64, "atomic_dec">;
+defm : DSAtomicRetNoRetPat_mc<DS_AND_RTN_B64, DS_AND_B64, i64, "atomic_load_and">;
+defm : DSAtomicRetNoRetPat_mc<DS_OR_RTN_B64, DS_OR_B64, i64, "atomic_load_or">;
+defm : DSAtomicRetNoRetPat_mc<DS_XOR_RTN_B64, DS_XOR_B64, i64, "atomic_load_xor">;
+defm : DSAtomicRetNoRetPat_mc<DS_MIN_RTN_I64, DS_MIN_I64, i64, "atomic_load_min">;
+defm : DSAtomicRetNoRetPat_mc<DS_MAX_RTN_I64, DS_MAX_I64, i64, "atomic_load_max">;
+defm : DSAtomicRetNoRetPat_mc<DS_MIN_RTN_U64, DS_MIN_U64, i64, "atomic_load_umin">;
+defm : DSAtomicRetNoRetPat_mc<DS_MAX_RTN_U64, DS_MAX_U64, i64, "atomic_load_umax">;
+defm : DSAtomicRetNoRetPat_mc<DS_MIN_RTN_F64, DS_MIN_F64, f64, "atomic_load_fmin">;
+defm : DSAtomicRetNoRetPat_mc<DS_MAX_RTN_F64, DS_MAX_F64, f64, "atomic_load_fmax">;
+
+let SubtargetPredicate = isGFX6GFX7GFX8GFX9GFX10 in {
+defm : DSAtomicCmpXChgSwapped_mc<DS_CMPST_RTN_B64, DS_CMPST_B64, i64, "atomic_cmp_swap">;
+} // End SubtargetPredicate = isGFX6GFX7GFX8GFX9GFX10
-defm : DSAtomicCmpXChg_mc<DS_CMPST_RTN_B64, i64, "atomic_cmp_swap">;
+let SubtargetPredicate = isGFX11Plus in {
+defm : DSAtomicCmpXChg_mc<DS_CMPSTORE_RTN_B64, DS_CMPSTORE_B64, i64, "atomic_cmp_swap">;
+} // End SubtargetPredicate = isGFX11Plus
let SubtargetPredicate = isGFX90APlus in {
-def : DSAtomicRetPat<DS_ADD_RTN_F64, f64, atomic_load_fadd_local_64>;
+def : DSAtomicRetPat<DS_ADD_RTN_F64, f64, atomic_load_fadd_local_ret_64>;
+def : DSAtomicRetPat<DS_ADD_F64, f64, atomic_load_fadd_local_noret_64>;
+}
+
+let SubtargetPredicate = isGFX940Plus in {
+def : DSAtomicRetPat<DS_PK_ADD_RTN_F16, v2f16, atomic_load_fadd_v2f16_local_ret_32>;
+def : DSAtomicRetPat<DS_PK_ADD_F16, v2f16, atomic_load_fadd_v2f16_local_noret_32>;
+def : GCNPat <
+ (v2i16 (int_amdgcn_ds_fadd_v2bf16_ret i32:$ptr, v2i16:$src)),
+ (DS_PK_ADD_RTN_BF16 VGPR_32:$ptr, VGPR_32:$src, 0, 0)
+>;
+def : GCNPat <
+ (v2i16 (int_amdgcn_ds_fadd_v2bf16_noret i32:$ptr, v2i16:$src)),
+ (DS_PK_ADD_BF16 VGPR_32:$ptr, VGPR_32:$src, 0, 0)
+>;
}
def : Pat <
@@ -974,16 +1112,44 @@ def : Pat <
(DS_ORDERED_COUNT $value, (as_i16imm $offset))
>;
+def : GCNPat <
+ (i64 (int_amdgcn_ds_add_gs_reg_rtn i32:$src, timm:$offset32)),
+ (DS_ADD_GS_REG_RTN VGPR_32:$src, (as_i32timm $offset32))
+>;
+
+def : GCNPat <
+ (i32 (int_amdgcn_ds_add_gs_reg_rtn i32:$src, timm:$offset32)),
+ (EXTRACT_SUBREG
+ (i64 (COPY_TO_REGCLASS
+ (DS_ADD_GS_REG_RTN VGPR_32:$src, (as_i32timm $offset32)),
+ VReg_64)),
+ sub0)
+>;
+
+def : GCNPat <
+ (i64 (int_amdgcn_ds_sub_gs_reg_rtn i32:$src, timm:$offset32)),
+ (DS_SUB_GS_REG_RTN VGPR_32:$src, (as_i32timm $offset32))
+>;
+
+def : GCNPat <
+ (i32 (int_amdgcn_ds_sub_gs_reg_rtn i32:$src, timm:$offset32)),
+ (EXTRACT_SUBREG
+ (i64 (COPY_TO_REGCLASS
+ (DS_SUB_GS_REG_RTN VGPR_32:$src, (as_i32timm $offset32)),
+ VReg_64)),
+ sub0)
+>;
+
//===----------------------------------------------------------------------===//
// Target-specific instruction encodings.
//===----------------------------------------------------------------------===//
//===----------------------------------------------------------------------===//
-// Base ENC_DS for GFX6, GFX7, GFX10.
+// Base ENC_DS for GFX6, GFX7, GFX10, GFX11.
//===----------------------------------------------------------------------===//
-class Base_DS_Real_gfx6_gfx7_gfx10<bits<8> op, DS_Pseudo ps, int ef> :
- DS_Real<ps>, SIMCInstr <ps.Mnemonic, ef> {
+class Base_DS_Real_gfx6_gfx7_gfx10_gfx11<bits<8> op, DS_Pseudo ps, int ef, string opName = ps.Mnemonic> :
+ DS_Real<ps, opName>, SIMCInstr <ps.Mnemonic, ef> {
let Inst{7-0} = !if(ps.has_offset0, offset0, 0);
let Inst{15-8} = !if(ps.has_offset1, offset1, 0);
@@ -997,19 +1163,89 @@ class Base_DS_Real_gfx6_gfx7_gfx10<bits<8> op, DS_Pseudo ps, int ef> :
}
//===----------------------------------------------------------------------===//
+// GFX11.
+//===----------------------------------------------------------------------===//
+
+let AssemblerPredicate = isGFX11Plus, DecoderNamespace = "GFX11" in {
+ multiclass DS_Real_gfx11<bits<8> op> {
+ def _gfx11 : Base_DS_Real_gfx6_gfx7_gfx10_gfx11<op, !cast<DS_Pseudo>(NAME),
+ SIEncodingFamily.GFX11>;
+ }
+
+ multiclass DS_Real_Renamed_gfx11<bits<8> op, DS_Pseudo backing_pseudo, string real_name> {
+ def _gfx11 : Base_DS_Real_gfx6_gfx7_gfx10_gfx11<op, backing_pseudo, SIEncodingFamily.GFX11, real_name>,
+ MnemonicAlias<backing_pseudo.Mnemonic, real_name>, Requires<[isGFX11Plus]>;
+ }
+} // End AssemblerPredicate = isGFX11Plus, DecoderNamespace = "GFX11"
+
+defm DS_STORE_B32 : DS_Real_Renamed_gfx11<0x00d, DS_WRITE_B32, "ds_store_b32">;
+defm DS_STORE_2ADDR_B32 : DS_Real_Renamed_gfx11<0x00e, DS_WRITE2_B32, "ds_store_2addr_b32">;
+defm DS_STORE_2ADDR_STRIDE64_B32 : DS_Real_Renamed_gfx11<0x00f, DS_WRITE2ST64_B32, "ds_store_2addr_stride64_b32">;
+defm DS_STORE_B8 : DS_Real_Renamed_gfx11<0x01e, DS_WRITE_B8, "ds_store_b8">;
+defm DS_STORE_B16 : DS_Real_Renamed_gfx11<0x01f, DS_WRITE_B16, "ds_store_b16">;
+defm DS_STOREXCHG_RTN_B32 : DS_Real_Renamed_gfx11<0x02d, DS_WRXCHG_RTN_B32, "ds_storexchg_rtn_b32">;
+defm DS_STOREXCHG_2ADDR_RTN_B32 : DS_Real_Renamed_gfx11<0x02e, DS_WRXCHG2_RTN_B32, "ds_storexchg_2addr_rtn_b32">;
+defm DS_STOREXCHG_2ADDR_STRIDE64_RTN_B32 : DS_Real_Renamed_gfx11<0x02f, DS_WRXCHG2ST64_RTN_B32, "ds_storexchg_2addr_stride64_rtn_b32">;
+defm DS_LOAD_B32 : DS_Real_Renamed_gfx11<0x036, DS_READ_B32, "ds_load_b32">;
+defm DS_LOAD_2ADDR_B32 : DS_Real_Renamed_gfx11<0x037, DS_READ2_B32, "ds_load_2addr_b32">;
+defm DS_LOAD_2ADDR_STRIDE64_B32 : DS_Real_Renamed_gfx11<0x038, DS_READ2ST64_B32, "ds_load_2addr_stride64_b32">;
+defm DS_LOAD_I8 : DS_Real_Renamed_gfx11<0x039, DS_READ_I8, "ds_load_i8">;
+defm DS_LOAD_U8 : DS_Real_Renamed_gfx11<0x03a, DS_READ_U8, "ds_load_u8">;
+defm DS_LOAD_I16 : DS_Real_Renamed_gfx11<0x03b, DS_READ_I16, "ds_load_i16">;
+defm DS_LOAD_U16 : DS_Real_Renamed_gfx11<0x03c, DS_READ_U16, "ds_load_u16">;
+defm DS_STORE_B64 : DS_Real_Renamed_gfx11<0x04d, DS_WRITE_B64, "ds_store_b64">;
+defm DS_STORE_2ADDR_B64 : DS_Real_Renamed_gfx11<0x04e, DS_WRITE2_B64, "ds_store_2addr_b64">;
+defm DS_STORE_2ADDR_STRIDE64_B64 : DS_Real_Renamed_gfx11<0x04f, DS_WRITE2ST64_B64, "ds_store_2addr_stride64_b64">;
+defm DS_STOREXCHG_RTN_B64 : DS_Real_Renamed_gfx11<0x06d, DS_WRXCHG_RTN_B64, "ds_storexchg_rtn_b64">;
+defm DS_STOREXCHG_2ADDR_RTN_B64 : DS_Real_Renamed_gfx11<0x06e, DS_WRXCHG2_RTN_B64, "ds_storexchg_2addr_rtn_b64">;
+defm DS_STOREXCHG_2ADDR_STRIDE64_RTN_B64 : DS_Real_Renamed_gfx11<0x06f, DS_WRXCHG2ST64_RTN_B64, "ds_storexchg_2addr_stride64_rtn_b64">;
+defm DS_LOAD_B64 : DS_Real_Renamed_gfx11<0x076, DS_READ_B64, "ds_load_b64">;
+defm DS_LOAD_2ADDR_B64 : DS_Real_Renamed_gfx11<0x077, DS_READ2_B64, "ds_load_2addr_b64">;
+defm DS_LOAD_2ADDR_STRIDE64_B64 : DS_Real_Renamed_gfx11<0x078, DS_READ2ST64_B64, "ds_load_2addr_stride64_b64">;
+defm DS_STORE_B8_D16_HI : DS_Real_Renamed_gfx11<0x0a0, DS_WRITE_B8_D16_HI, "ds_store_b8_d16_hi">;
+defm DS_STORE_B16_D16_HI : DS_Real_Renamed_gfx11<0x0a1, DS_WRITE_B16_D16_HI, "ds_store_b16_d16_hi">;
+defm DS_LOAD_U8_D16 : DS_Real_Renamed_gfx11<0x0a2, DS_READ_U8_D16, "ds_load_u8_d16">;
+defm DS_LOAD_U8_D16_HI : DS_Real_Renamed_gfx11<0x0a3, DS_READ_U8_D16_HI, "ds_load_u8_d16_hi">;
+defm DS_LOAD_I8_D16 : DS_Real_Renamed_gfx11<0x0a4, DS_READ_I8_D16, "ds_load_i8_d16">;
+defm DS_LOAD_I8_D16_HI : DS_Real_Renamed_gfx11<0x0a5, DS_READ_I8_D16_HI, "ds_load_i8_d16_hi">;
+defm DS_LOAD_U16_D16 : DS_Real_Renamed_gfx11<0x0a6, DS_READ_U16_D16, "ds_load_u16_d16">;
+defm DS_LOAD_U16_D16_HI : DS_Real_Renamed_gfx11<0x0a7, DS_READ_U16_D16_HI, "ds_load_u16_d16_hi">;
+defm DS_STORE_ADDTID_B32 : DS_Real_Renamed_gfx11<0x0b0, DS_WRITE_ADDTID_B32, "ds_store_addtid_b32">;
+defm DS_LOAD_ADDTID_B32 : DS_Real_Renamed_gfx11<0x0b1, DS_READ_ADDTID_B32, "ds_load_addtid_b32">;
+defm DS_STORE_B96 : DS_Real_Renamed_gfx11<0x0de, DS_WRITE_B96, "ds_store_b96">;
+defm DS_STORE_B128 : DS_Real_Renamed_gfx11<0x0df, DS_WRITE_B128, "ds_store_b128">;
+defm DS_LOAD_B96 : DS_Real_Renamed_gfx11<0x0fe, DS_READ_B96, "ds_load_b96">;
+defm DS_LOAD_B128 : DS_Real_Renamed_gfx11<0x0ff, DS_READ_B128, "ds_load_b128">;
+
+// DS_CMPST_* are renamed to DS_CMPSTORE_* in GFX11, and in addition the data operands (src and cmp) are
+// swapped compared to pre-GFX11.
+// Note: the mnemonic alias is not generated, to avoid a potential ambiguity due to the semantics change.
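+// For illustration only (the operand roles below are inferred from the selection
+// patterns in this file, not from separate documentation): a pre-GFX11
+//   ds_cmpst_rtn_b32    vdst, addr, cmp, data
+// corresponds on GFX11 to
+//   ds_cmpstore_rtn_b32 vdst, addr, data, cmp
+// with the compare value moved to the second data operand.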
+
+defm DS_CMPSTORE_B32 : DS_Real_gfx11<0x010>;
+defm DS_CMPSTORE_F32 : DS_Real_gfx11<0x011>;
+defm DS_CMPSTORE_RTN_B32 : DS_Real_gfx11<0x030>;
+defm DS_CMPSTORE_RTN_F32 : DS_Real_gfx11<0x031>;
+defm DS_CMPSTORE_B64 : DS_Real_gfx11<0x050>;
+defm DS_CMPSTORE_F64 : DS_Real_gfx11<0x051>;
+defm DS_CMPSTORE_RTN_B64 : DS_Real_gfx11<0x070>;
+defm DS_CMPSTORE_RTN_F64 : DS_Real_gfx11<0x071>;
+
+defm DS_ADD_RTN_F32 : DS_Real_gfx11<0x079>;
+defm DS_ADD_GS_REG_RTN : DS_Real_gfx11<0x07a>;
+defm DS_SUB_GS_REG_RTN : DS_Real_gfx11<0x07b>;
+
+//===----------------------------------------------------------------------===//
// GFX10.
//===----------------------------------------------------------------------===//
-let AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" in {
+let AssemblerPredicate = isGFX10Only, DecoderNamespace = "GFX10" in {
multiclass DS_Real_gfx10<bits<8> op> {
- def _gfx10 : Base_DS_Real_gfx6_gfx7_gfx10<op, !cast<DS_Pseudo>(NAME),
+ def _gfx10 : Base_DS_Real_gfx6_gfx7_gfx10_gfx11<op, !cast<DS_Pseudo>(NAME),
SIEncodingFamily.GFX10>;
}
-} // End AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10"
+} // End AssemblerPredicate = isGFX10Only, DecoderNamespace = "GFX10"
-defm DS_ADD_F32 : DS_Real_gfx10<0x015>;
defm DS_ADD_RTN_F32 : DS_Real_gfx10<0x055>;
-defm DS_ADD_SRC2_F32 : DS_Real_gfx10<0x095>;
defm DS_WRITE_B8_D16_HI : DS_Real_gfx10<0x0a0>;
defm DS_WRITE_B16_D16_HI : DS_Real_gfx10<0x0a1>;
defm DS_READ_U8_D16 : DS_Real_gfx10<0x0a2>;
@@ -1020,95 +1256,118 @@ defm DS_READ_U16_D16 : DS_Real_gfx10<0x0a6>;
defm DS_READ_U16_D16_HI : DS_Real_gfx10<0x0a7>;
defm DS_WRITE_ADDTID_B32 : DS_Real_gfx10<0x0b0>;
defm DS_READ_ADDTID_B32 : DS_Real_gfx10<0x0b1>;
-defm DS_PERMUTE_B32 : DS_Real_gfx10<0x0b2>;
-defm DS_BPERMUTE_B32 : DS_Real_gfx10<0x0b3>;
//===----------------------------------------------------------------------===//
-// GFX7, GFX10.
+// GFX10, GFX11.
+//===----------------------------------------------------------------------===//
+
+multiclass DS_Real_gfx10_gfx11<bits<8> op> :
+ DS_Real_gfx10<op>, DS_Real_gfx11<op>;
+
+defm DS_ADD_F32 : DS_Real_gfx10_gfx11<0x015>;
+defm DS_ADD_SRC2_F32 : DS_Real_gfx10<0x095>;
+defm DS_PERMUTE_B32 : DS_Real_gfx10_gfx11<0x0b2>;
+defm DS_BPERMUTE_B32 : DS_Real_gfx10_gfx11<0x0b3>;
+
+//===----------------------------------------------------------------------===//
+// GFX7, GFX10, GFX11.
//===----------------------------------------------------------------------===//
let AssemblerPredicate = isGFX7Only, DecoderNamespace = "GFX7" in {
multiclass DS_Real_gfx7<bits<8> op> {
- def _gfx7 : Base_DS_Real_gfx6_gfx7_gfx10<op, !cast<DS_Pseudo>(NAME),
+ def _gfx7 : Base_DS_Real_gfx6_gfx7_gfx10_gfx11<op, !cast<DS_Pseudo>(NAME),
SIEncodingFamily.SI>;
}
} // End AssemblerPredicate = isGFX7Only, DecoderNamespace = "GFX7"
+multiclass DS_Real_gfx7_gfx10_gfx11<bits<8> op> :
+ DS_Real_gfx7<op>, DS_Real_gfx10_gfx11<op>;
+
multiclass DS_Real_gfx7_gfx10<bits<8> op> :
DS_Real_gfx7<op>, DS_Real_gfx10<op>;
// FIXME-GFX7: Add tests when upstreaming this part.
-defm DS_GWS_SEMA_RELEASE_ALL : DS_Real_gfx7_gfx10<0x018>;
-defm DS_WRAP_RTN_B32 : DS_Real_gfx7_gfx10<0x034>;
-defm DS_CONDXCHG32_RTN_B64 : DS_Real_gfx7_gfx10<0x07e>;
+defm DS_GWS_SEMA_RELEASE_ALL : DS_Real_gfx7_gfx10_gfx11<0x018>;
+defm DS_WRAP_RTN_B32 : DS_Real_gfx7_gfx10_gfx11<0x034>;
+defm DS_CONDXCHG32_RTN_B64 : DS_Real_gfx7_gfx10_gfx11<0x07e>;
defm DS_WRITE_B96 : DS_Real_gfx7_gfx10<0x0de>;
defm DS_WRITE_B128 : DS_Real_gfx7_gfx10<0x0df>;
defm DS_READ_B96 : DS_Real_gfx7_gfx10<0x0fe>;
defm DS_READ_B128 : DS_Real_gfx7_gfx10<0x0ff>;
//===----------------------------------------------------------------------===//
-// GFX6, GFX7, GFX10.
+// GFX6, GFX7, GFX10, GFX11.
//===----------------------------------------------------------------------===//
let AssemblerPredicate = isGFX6GFX7, DecoderNamespace = "GFX6GFX7" in {
multiclass DS_Real_gfx6_gfx7<bits<8> op> {
- def _gfx6_gfx7 : Base_DS_Real_gfx6_gfx7_gfx10<op, !cast<DS_Pseudo>(NAME),
+ def _gfx6_gfx7 : Base_DS_Real_gfx6_gfx7_gfx10_gfx11<op, !cast<DS_Pseudo>(NAME),
SIEncodingFamily.SI>;
}
} // End AssemblerPredicate = isGFX6GFX7, DecoderNamespace = "GFX6GFX7"
+multiclass DS_Real_gfx6_gfx7_gfx10_gfx11<bits<8> op> :
+ DS_Real_gfx6_gfx7<op>, DS_Real_gfx10_gfx11<op>;
+
multiclass DS_Real_gfx6_gfx7_gfx10<bits<8> op> :
DS_Real_gfx6_gfx7<op>, DS_Real_gfx10<op>;
-defm DS_ADD_U32 : DS_Real_gfx6_gfx7_gfx10<0x000>;
-defm DS_SUB_U32 : DS_Real_gfx6_gfx7_gfx10<0x001>;
-defm DS_RSUB_U32 : DS_Real_gfx6_gfx7_gfx10<0x002>;
-defm DS_INC_U32 : DS_Real_gfx6_gfx7_gfx10<0x003>;
-defm DS_DEC_U32 : DS_Real_gfx6_gfx7_gfx10<0x004>;
-defm DS_MIN_I32 : DS_Real_gfx6_gfx7_gfx10<0x005>;
-defm DS_MAX_I32 : DS_Real_gfx6_gfx7_gfx10<0x006>;
-defm DS_MIN_U32 : DS_Real_gfx6_gfx7_gfx10<0x007>;
-defm DS_MAX_U32 : DS_Real_gfx6_gfx7_gfx10<0x008>;
-defm DS_AND_B32 : DS_Real_gfx6_gfx7_gfx10<0x009>;
-defm DS_OR_B32 : DS_Real_gfx6_gfx7_gfx10<0x00a>;
-defm DS_XOR_B32 : DS_Real_gfx6_gfx7_gfx10<0x00b>;
-defm DS_MSKOR_B32 : DS_Real_gfx6_gfx7_gfx10<0x00c>;
+defm DS_ADD_U32 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x000>;
+defm DS_SUB_U32 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x001>;
+defm DS_RSUB_U32 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x002>;
+defm DS_INC_U32 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x003>;
+defm DS_DEC_U32 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x004>;
+defm DS_MIN_I32 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x005>;
+defm DS_MAX_I32 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x006>;
+defm DS_MIN_U32 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x007>;
+defm DS_MAX_U32 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x008>;
+defm DS_AND_B32 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x009>;
+defm DS_OR_B32 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x00a>;
+defm DS_XOR_B32 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x00b>;
+defm DS_MSKOR_B32 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x00c>;
+
defm DS_WRITE_B32 : DS_Real_gfx6_gfx7_gfx10<0x00d>;
defm DS_WRITE2_B32 : DS_Real_gfx6_gfx7_gfx10<0x00e>;
defm DS_WRITE2ST64_B32 : DS_Real_gfx6_gfx7_gfx10<0x00f>;
defm DS_CMPST_B32 : DS_Real_gfx6_gfx7_gfx10<0x010>;
defm DS_CMPST_F32 : DS_Real_gfx6_gfx7_gfx10<0x011>;
-defm DS_MIN_F32 : DS_Real_gfx6_gfx7_gfx10<0x012>;
-defm DS_MAX_F32 : DS_Real_gfx6_gfx7_gfx10<0x013>;
-defm DS_NOP : DS_Real_gfx6_gfx7_gfx10<0x014>;
-defm DS_GWS_INIT : DS_Real_gfx6_gfx7_gfx10<0x019>;
-defm DS_GWS_SEMA_V : DS_Real_gfx6_gfx7_gfx10<0x01a>;
-defm DS_GWS_SEMA_BR : DS_Real_gfx6_gfx7_gfx10<0x01b>;
-defm DS_GWS_SEMA_P : DS_Real_gfx6_gfx7_gfx10<0x01c>;
-defm DS_GWS_BARRIER : DS_Real_gfx6_gfx7_gfx10<0x01d>;
+
+defm DS_MIN_F32 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x012>;
+defm DS_MAX_F32 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x013>;
+defm DS_NOP : DS_Real_gfx6_gfx7_gfx10_gfx11<0x014>;
+defm DS_GWS_INIT : DS_Real_gfx6_gfx7_gfx10_gfx11<0x019>;
+defm DS_GWS_SEMA_V : DS_Real_gfx6_gfx7_gfx10_gfx11<0x01a>;
+defm DS_GWS_SEMA_BR : DS_Real_gfx6_gfx7_gfx10_gfx11<0x01b>;
+defm DS_GWS_SEMA_P : DS_Real_gfx6_gfx7_gfx10_gfx11<0x01c>;
+defm DS_GWS_BARRIER : DS_Real_gfx6_gfx7_gfx10_gfx11<0x01d>;
+
defm DS_WRITE_B8 : DS_Real_gfx6_gfx7_gfx10<0x01e>;
defm DS_WRITE_B16 : DS_Real_gfx6_gfx7_gfx10<0x01f>;
-defm DS_ADD_RTN_U32 : DS_Real_gfx6_gfx7_gfx10<0x020>;
-defm DS_SUB_RTN_U32 : DS_Real_gfx6_gfx7_gfx10<0x021>;
-defm DS_RSUB_RTN_U32 : DS_Real_gfx6_gfx7_gfx10<0x022>;
-defm DS_INC_RTN_U32 : DS_Real_gfx6_gfx7_gfx10<0x023>;
-defm DS_DEC_RTN_U32 : DS_Real_gfx6_gfx7_gfx10<0x024>;
-defm DS_MIN_RTN_I32 : DS_Real_gfx6_gfx7_gfx10<0x025>;
-defm DS_MAX_RTN_I32 : DS_Real_gfx6_gfx7_gfx10<0x026>;
-defm DS_MIN_RTN_U32 : DS_Real_gfx6_gfx7_gfx10<0x027>;
-defm DS_MAX_RTN_U32 : DS_Real_gfx6_gfx7_gfx10<0x028>;
-defm DS_AND_RTN_B32 : DS_Real_gfx6_gfx7_gfx10<0x029>;
-defm DS_OR_RTN_B32 : DS_Real_gfx6_gfx7_gfx10<0x02a>;
-defm DS_XOR_RTN_B32 : DS_Real_gfx6_gfx7_gfx10<0x02b>;
-defm DS_MSKOR_RTN_B32 : DS_Real_gfx6_gfx7_gfx10<0x02c>;
+
+defm DS_ADD_RTN_U32 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x020>;
+defm DS_SUB_RTN_U32 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x021>;
+defm DS_RSUB_RTN_U32 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x022>;
+defm DS_INC_RTN_U32 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x023>;
+defm DS_DEC_RTN_U32 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x024>;
+defm DS_MIN_RTN_I32 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x025>;
+defm DS_MAX_RTN_I32 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x026>;
+defm DS_MIN_RTN_U32 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x027>;
+defm DS_MAX_RTN_U32 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x028>;
+defm DS_AND_RTN_B32 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x029>;
+defm DS_OR_RTN_B32 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x02a>;
+defm DS_XOR_RTN_B32 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x02b>;
+defm DS_MSKOR_RTN_B32 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x02c>;
+
defm DS_WRXCHG_RTN_B32 : DS_Real_gfx6_gfx7_gfx10<0x02d>;
defm DS_WRXCHG2_RTN_B32 : DS_Real_gfx6_gfx7_gfx10<0x02e>;
defm DS_WRXCHG2ST64_RTN_B32 : DS_Real_gfx6_gfx7_gfx10<0x02f>;
defm DS_CMPST_RTN_B32 : DS_Real_gfx6_gfx7_gfx10<0x030>;
defm DS_CMPST_RTN_F32 : DS_Real_gfx6_gfx7_gfx10<0x031>;
-defm DS_MIN_RTN_F32 : DS_Real_gfx6_gfx7_gfx10<0x032>;
-defm DS_MAX_RTN_F32 : DS_Real_gfx6_gfx7_gfx10<0x033>;
-defm DS_SWIZZLE_B32 : DS_Real_gfx6_gfx7_gfx10<0x035>;
+
+defm DS_MIN_RTN_F32 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x032>;
+defm DS_MAX_RTN_F32 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x033>;
+defm DS_SWIZZLE_B32 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x035>;
+
defm DS_READ_B32 : DS_Real_gfx6_gfx7_gfx10<0x036>;
defm DS_READ2_B32 : DS_Real_gfx6_gfx7_gfx10<0x037>;
defm DS_READ2ST64_B32 : DS_Real_gfx6_gfx7_gfx10<0x038>;
@@ -1116,49 +1375,55 @@ defm DS_READ_I8 : DS_Real_gfx6_gfx7_gfx10<0x039>;
defm DS_READ_U8 : DS_Real_gfx6_gfx7_gfx10<0x03a>;
defm DS_READ_I16 : DS_Real_gfx6_gfx7_gfx10<0x03b>;
defm DS_READ_U16 : DS_Real_gfx6_gfx7_gfx10<0x03c>;
-defm DS_CONSUME : DS_Real_gfx6_gfx7_gfx10<0x03d>;
-defm DS_APPEND : DS_Real_gfx6_gfx7_gfx10<0x03e>;
-defm DS_ORDERED_COUNT : DS_Real_gfx6_gfx7_gfx10<0x03f>;
-defm DS_ADD_U64 : DS_Real_gfx6_gfx7_gfx10<0x040>;
-defm DS_SUB_U64 : DS_Real_gfx6_gfx7_gfx10<0x041>;
-defm DS_RSUB_U64 : DS_Real_gfx6_gfx7_gfx10<0x042>;
-defm DS_INC_U64 : DS_Real_gfx6_gfx7_gfx10<0x043>;
-defm DS_DEC_U64 : DS_Real_gfx6_gfx7_gfx10<0x044>;
-defm DS_MIN_I64 : DS_Real_gfx6_gfx7_gfx10<0x045>;
-defm DS_MAX_I64 : DS_Real_gfx6_gfx7_gfx10<0x046>;
-defm DS_MIN_U64 : DS_Real_gfx6_gfx7_gfx10<0x047>;
-defm DS_MAX_U64 : DS_Real_gfx6_gfx7_gfx10<0x048>;
-defm DS_AND_B64 : DS_Real_gfx6_gfx7_gfx10<0x049>;
-defm DS_OR_B64 : DS_Real_gfx6_gfx7_gfx10<0x04a>;
-defm DS_XOR_B64 : DS_Real_gfx6_gfx7_gfx10<0x04b>;
-defm DS_MSKOR_B64 : DS_Real_gfx6_gfx7_gfx10<0x04c>;
+
+defm DS_CONSUME : DS_Real_gfx6_gfx7_gfx10_gfx11<0x03d>;
+defm DS_APPEND : DS_Real_gfx6_gfx7_gfx10_gfx11<0x03e>;
+defm DS_ORDERED_COUNT : DS_Real_gfx6_gfx7_gfx10_gfx11<0x03f>;
+defm DS_ADD_U64 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x040>;
+defm DS_SUB_U64 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x041>;
+defm DS_RSUB_U64 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x042>;
+defm DS_INC_U64 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x043>;
+defm DS_DEC_U64 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x044>;
+defm DS_MIN_I64 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x045>;
+defm DS_MAX_I64 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x046>;
+defm DS_MIN_U64 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x047>;
+defm DS_MAX_U64 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x048>;
+defm DS_AND_B64 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x049>;
+defm DS_OR_B64 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x04a>;
+defm DS_XOR_B64 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x04b>;
+defm DS_MSKOR_B64 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x04c>;
+
defm DS_WRITE_B64 : DS_Real_gfx6_gfx7_gfx10<0x04d>;
defm DS_WRITE2_B64 : DS_Real_gfx6_gfx7_gfx10<0x04e>;
defm DS_WRITE2ST64_B64 : DS_Real_gfx6_gfx7_gfx10<0x04f>;
defm DS_CMPST_B64 : DS_Real_gfx6_gfx7_gfx10<0x050>;
defm DS_CMPST_F64 : DS_Real_gfx6_gfx7_gfx10<0x051>;
-defm DS_MIN_F64 : DS_Real_gfx6_gfx7_gfx10<0x052>;
-defm DS_MAX_F64 : DS_Real_gfx6_gfx7_gfx10<0x053>;
-defm DS_ADD_RTN_U64 : DS_Real_gfx6_gfx7_gfx10<0x060>;
-defm DS_SUB_RTN_U64 : DS_Real_gfx6_gfx7_gfx10<0x061>;
-defm DS_RSUB_RTN_U64 : DS_Real_gfx6_gfx7_gfx10<0x062>;
-defm DS_INC_RTN_U64 : DS_Real_gfx6_gfx7_gfx10<0x063>;
-defm DS_DEC_RTN_U64 : DS_Real_gfx6_gfx7_gfx10<0x064>;
-defm DS_MIN_RTN_I64 : DS_Real_gfx6_gfx7_gfx10<0x065>;
-defm DS_MAX_RTN_I64 : DS_Real_gfx6_gfx7_gfx10<0x066>;
-defm DS_MIN_RTN_U64 : DS_Real_gfx6_gfx7_gfx10<0x067>;
-defm DS_MAX_RTN_U64 : DS_Real_gfx6_gfx7_gfx10<0x068>;
-defm DS_AND_RTN_B64 : DS_Real_gfx6_gfx7_gfx10<0x069>;
-defm DS_OR_RTN_B64 : DS_Real_gfx6_gfx7_gfx10<0x06a>;
-defm DS_XOR_RTN_B64 : DS_Real_gfx6_gfx7_gfx10<0x06b>;
-defm DS_MSKOR_RTN_B64 : DS_Real_gfx6_gfx7_gfx10<0x06c>;
+
+defm DS_MIN_F64 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x052>;
+defm DS_MAX_F64 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x053>;
+defm DS_ADD_RTN_U64 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x060>;
+defm DS_SUB_RTN_U64 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x061>;
+defm DS_RSUB_RTN_U64 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x062>;
+defm DS_INC_RTN_U64 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x063>;
+defm DS_DEC_RTN_U64 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x064>;
+defm DS_MIN_RTN_I64 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x065>;
+defm DS_MAX_RTN_I64 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x066>;
+defm DS_MIN_RTN_U64 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x067>;
+defm DS_MAX_RTN_U64 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x068>;
+defm DS_AND_RTN_B64 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x069>;
+defm DS_OR_RTN_B64 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x06a>;
+defm DS_XOR_RTN_B64 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x06b>;
+defm DS_MSKOR_RTN_B64 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x06c>;
+
defm DS_WRXCHG_RTN_B64 : DS_Real_gfx6_gfx7_gfx10<0x06d>;
defm DS_WRXCHG2_RTN_B64 : DS_Real_gfx6_gfx7_gfx10<0x06e>;
defm DS_WRXCHG2ST64_RTN_B64 : DS_Real_gfx6_gfx7_gfx10<0x06f>;
defm DS_CMPST_RTN_B64 : DS_Real_gfx6_gfx7_gfx10<0x070>;
defm DS_CMPST_RTN_F64 : DS_Real_gfx6_gfx7_gfx10<0x071>;
-defm DS_MIN_RTN_F64 : DS_Real_gfx6_gfx7_gfx10<0x072>;
-defm DS_MAX_RTN_F64 : DS_Real_gfx6_gfx7_gfx10<0x073>;
+
+defm DS_MIN_RTN_F64 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x072>;
+defm DS_MAX_RTN_F64 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x073>;
+
defm DS_READ_B64 : DS_Real_gfx6_gfx7_gfx10<0x076>;
defm DS_READ2_B64 : DS_Real_gfx6_gfx7_gfx10<0x077>;
defm DS_READ2ST64_B64 : DS_Real_gfx6_gfx7_gfx10<0x078>;
@@ -1381,3 +1646,10 @@ let SubtargetPredicate = isGFX90APlus in {
def DS_ADD_F64_vi : DS_Real_vi<0x5c, DS_ADD_F64>;
def DS_ADD_RTN_F64_vi : DS_Real_vi<0x7c, DS_ADD_RTN_F64>;
} // End SubtargetPredicate = isGFX90APlus
+
+let SubtargetPredicate = isGFX940Plus in {
+ def DS_PK_ADD_F16_vi : DS_Real_vi<0x17, DS_PK_ADD_F16>;
+ def DS_PK_ADD_RTN_F16_vi : DS_Real_vi<0xb7, DS_PK_ADD_RTN_F16>;
+ def DS_PK_ADD_BF16_vi : DS_Real_vi<0x18, DS_PK_ADD_BF16>;
+ def DS_PK_ADD_RTN_BF16_vi : DS_Real_vi<0xb8, DS_PK_ADD_RTN_BF16>;
+} // End SubtargetPredicate = isGFX940Plus
diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
index e2186d4d533e..ccaf646008b1 100644
--- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
+++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
@@ -18,15 +18,20 @@
#include "Disassembler/AMDGPUDisassembler.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "SIDefines.h"
+#include "SIRegisterInfo.h"
#include "TargetInfo/AMDGPUTargetInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm-c/DisassemblerTypes.h"
+#include "llvm/BinaryFormat/ELF.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCDecoderOps.h"
#include "llvm/MC/MCExpr.h"
-#include "llvm/MC/MCFixedLenDisassembler.h"
-#include "llvm/MC/TargetRegistry.h"
#include "llvm/MC/MCInstrDesc.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/TargetRegistry.h"
#include "llvm/Support/AMDHSAKernelDescriptor.h"
using namespace llvm;
@@ -70,7 +75,8 @@ static int insertNamedMCOperand(MCInst &MI, const MCOperand &Op,
}
static DecodeStatus decodeSoppBrTarget(MCInst &Inst, unsigned Imm,
- uint64_t Addr, const void *Decoder) {
+ uint64_t Addr,
+ const MCDisassembler *Decoder) {
auto DAsm = static_cast<const AMDGPUDisassembler*>(Decoder);
// Our branches take a simm16, but we need two extra bits to account for the
@@ -78,13 +84,13 @@ static DecodeStatus decodeSoppBrTarget(MCInst &Inst, unsigned Imm,
APInt SignedOffset(18, Imm * 4, true);
int64_t Offset = (SignedOffset.sext(64) + 4 + Addr).getSExtValue();
- if (DAsm->tryAddingSymbolicOperand(Inst, Offset, Addr, true, 2, 2))
+ if (DAsm->tryAddingSymbolicOperand(Inst, Offset, Addr, true, 2, 2, 0))
return MCDisassembler::Success;
return addOperand(Inst, MCOperand::createImm(Imm));
}
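The computation above amounts to target = Addr + 4 + sext18(Imm * 4): the 16-bit immediate is a signed word (4-byte) count relative to the instruction following the branch. For example, Imm = 0xFFFF (simm16 = -1) yields a byte offset of -4 and resolves back to Addr itself.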
-static DecodeStatus decodeSMEMOffset(MCInst &Inst, unsigned Imm,
- uint64_t Addr, const void *Decoder) {
+static DecodeStatus decodeSMEMOffset(MCInst &Inst, unsigned Imm, uint64_t Addr,
+ const MCDisassembler *Decoder) {
auto DAsm = static_cast<const AMDGPUDisassembler*>(Decoder);
int64_t Offset;
if (DAsm->isVI()) { // VI supports 20-bit unsigned offsets.
@@ -95,20 +101,19 @@ static DecodeStatus decodeSMEMOffset(MCInst &Inst, unsigned Imm,
return addOperand(Inst, MCOperand::createImm(Offset));
}
-static DecodeStatus decodeBoolReg(MCInst &Inst, unsigned Val,
- uint64_t Addr, const void *Decoder) {
+static DecodeStatus decodeBoolReg(MCInst &Inst, unsigned Val, uint64_t Addr,
+ const MCDisassembler *Decoder) {
auto DAsm = static_cast<const AMDGPUDisassembler*>(Decoder);
return addOperand(Inst, DAsm->decodeBoolReg(Val));
}
-#define DECODE_OPERAND(StaticDecoderName, DecoderName) \
-static DecodeStatus StaticDecoderName(MCInst &Inst, \
- unsigned Imm, \
- uint64_t /*Addr*/, \
- const void *Decoder) { \
- auto DAsm = static_cast<const AMDGPUDisassembler*>(Decoder); \
- return addOperand(Inst, DAsm->DecoderName(Imm)); \
-}
+#define DECODE_OPERAND(StaticDecoderName, DecoderName) \
+ static DecodeStatus StaticDecoderName(MCInst &Inst, unsigned Imm, \
+ uint64_t /*Addr*/, \
+ const MCDisassembler *Decoder) { \
+ auto DAsm = static_cast<const AMDGPUDisassembler *>(Decoder); \
+ return addOperand(Inst, DAsm->DecoderName(Imm)); \
+ }
#define DECODE_OPERAND_REG(RegClass) \
DECODE_OPERAND(Decode##RegClass##RegisterClass, decodeOperand_##RegClass)
@@ -144,155 +149,151 @@ DECODE_OPERAND_REG(AReg_512)
DECODE_OPERAND_REG(AReg_1024)
DECODE_OPERAND_REG(AV_32)
DECODE_OPERAND_REG(AV_64)
+DECODE_OPERAND_REG(AV_128)
+DECODE_OPERAND_REG(AVDst_128)
+DECODE_OPERAND_REG(AVDst_512)
-static DecodeStatus decodeOperand_VSrc16(MCInst &Inst,
- unsigned Imm,
+static DecodeStatus decodeOperand_VSrc16(MCInst &Inst, unsigned Imm,
uint64_t Addr,
- const void *Decoder) {
+ const MCDisassembler *Decoder) {
auto DAsm = static_cast<const AMDGPUDisassembler*>(Decoder);
return addOperand(Inst, DAsm->decodeOperand_VSrc16(Imm));
}
-static DecodeStatus decodeOperand_VSrcV216(MCInst &Inst,
- unsigned Imm,
- uint64_t Addr,
- const void *Decoder) {
+static DecodeStatus decodeOperand_VSrcV216(MCInst &Inst, unsigned Imm,
+ uint64_t Addr,
+ const MCDisassembler *Decoder) {
auto DAsm = static_cast<const AMDGPUDisassembler*>(Decoder);
return addOperand(Inst, DAsm->decodeOperand_VSrcV216(Imm));
}
-static DecodeStatus decodeOperand_VSrcV232(MCInst &Inst,
- unsigned Imm,
+static DecodeStatus decodeOperand_VSrcV232(MCInst &Inst, unsigned Imm,
uint64_t Addr,
- const void *Decoder) {
+ const MCDisassembler *Decoder) {
auto DAsm = static_cast<const AMDGPUDisassembler*>(Decoder);
return addOperand(Inst, DAsm->decodeOperand_VSrcV232(Imm));
}
-static DecodeStatus decodeOperand_VS_16(MCInst &Inst,
- unsigned Imm,
+static DecodeStatus decodeOperand_VS_16(MCInst &Inst, unsigned Imm,
uint64_t Addr,
- const void *Decoder) {
+ const MCDisassembler *Decoder) {
auto DAsm = static_cast<const AMDGPUDisassembler*>(Decoder);
return addOperand(Inst, DAsm->decodeOperand_VSrc16(Imm));
}
-static DecodeStatus decodeOperand_VS_32(MCInst &Inst,
- unsigned Imm,
+static DecodeStatus decodeOperand_VS_32(MCInst &Inst, unsigned Imm,
uint64_t Addr,
- const void *Decoder) {
+ const MCDisassembler *Decoder) {
auto DAsm = static_cast<const AMDGPUDisassembler*>(Decoder);
return addOperand(Inst, DAsm->decodeOperand_VS_32(Imm));
}
-static DecodeStatus decodeOperand_AReg_64(MCInst &Inst,
- unsigned Imm,
+static DecodeStatus decodeOperand_AReg_64(MCInst &Inst, unsigned Imm,
uint64_t Addr,
- const void *Decoder) {
+ const MCDisassembler *Decoder) {
auto DAsm = static_cast<const AMDGPUDisassembler*>(Decoder);
return addOperand(Inst, DAsm->decodeSrcOp(AMDGPUDisassembler::OPW64, Imm | 512));
}
-static DecodeStatus decodeOperand_AReg_128(MCInst &Inst,
- unsigned Imm,
+static DecodeStatus decodeOperand_AReg_128(MCInst &Inst, unsigned Imm,
uint64_t Addr,
- const void *Decoder) {
+ const MCDisassembler *Decoder) {
auto DAsm = static_cast<const AMDGPUDisassembler*>(Decoder);
return addOperand(Inst, DAsm->decodeSrcOp(AMDGPUDisassembler::OPW128, Imm | 512));
}
-static DecodeStatus decodeOperand_AReg_256(MCInst &Inst,
- unsigned Imm,
+static DecodeStatus decodeOperand_AReg_256(MCInst &Inst, unsigned Imm,
uint64_t Addr,
- const void *Decoder) {
+ const MCDisassembler *Decoder) {
auto DAsm = static_cast<const AMDGPUDisassembler*>(Decoder);
return addOperand(Inst, DAsm->decodeSrcOp(AMDGPUDisassembler::OPW256, Imm | 512));
}
-static DecodeStatus decodeOperand_AReg_512(MCInst &Inst,
- unsigned Imm,
+static DecodeStatus decodeOperand_AReg_512(MCInst &Inst, unsigned Imm,
uint64_t Addr,
- const void *Decoder) {
+ const MCDisassembler *Decoder) {
auto DAsm = static_cast<const AMDGPUDisassembler*>(Decoder);
return addOperand(Inst, DAsm->decodeSrcOp(AMDGPUDisassembler::OPW512, Imm | 512));
}
-static DecodeStatus decodeOperand_AReg_1024(MCInst &Inst,
- unsigned Imm,
+static DecodeStatus decodeOperand_AReg_1024(MCInst &Inst, unsigned Imm,
uint64_t Addr,
- const void *Decoder) {
+ const MCDisassembler *Decoder) {
auto DAsm = static_cast<const AMDGPUDisassembler*>(Decoder);
return addOperand(Inst, DAsm->decodeSrcOp(AMDGPUDisassembler::OPW1024, Imm | 512));
}
-static DecodeStatus decodeOperand_VReg_64(MCInst &Inst,
- unsigned Imm,
+static DecodeStatus decodeOperand_VReg_64(MCInst &Inst, unsigned Imm,
uint64_t Addr,
- const void *Decoder) {
+ const MCDisassembler *Decoder) {
auto DAsm = static_cast<const AMDGPUDisassembler*>(Decoder);
return addOperand(Inst, DAsm->decodeSrcOp(AMDGPUDisassembler::OPW64, Imm));
}
-static DecodeStatus decodeOperand_VReg_128(MCInst &Inst,
- unsigned Imm,
+static DecodeStatus decodeOperand_VReg_128(MCInst &Inst, unsigned Imm,
uint64_t Addr,
- const void *Decoder) {
+ const MCDisassembler *Decoder) {
auto DAsm = static_cast<const AMDGPUDisassembler*>(Decoder);
return addOperand(Inst, DAsm->decodeSrcOp(AMDGPUDisassembler::OPW128, Imm));
}
-static DecodeStatus decodeOperand_VReg_256(MCInst &Inst,
- unsigned Imm,
+static DecodeStatus decodeOperand_VReg_256(MCInst &Inst, unsigned Imm,
uint64_t Addr,
- const void *Decoder) {
+ const MCDisassembler *Decoder) {
auto DAsm = static_cast<const AMDGPUDisassembler*>(Decoder);
return addOperand(Inst, DAsm->decodeSrcOp(AMDGPUDisassembler::OPW256, Imm));
}
-static DecodeStatus decodeOperand_VReg_512(MCInst &Inst,
- unsigned Imm,
+static DecodeStatus decodeOperand_VReg_512(MCInst &Inst, unsigned Imm,
uint64_t Addr,
- const void *Decoder) {
+ const MCDisassembler *Decoder) {
auto DAsm = static_cast<const AMDGPUDisassembler*>(Decoder);
return addOperand(Inst, DAsm->decodeSrcOp(AMDGPUDisassembler::OPW512, Imm));
}
-static DecodeStatus decodeOperand_VReg_1024(MCInst &Inst,
- unsigned Imm,
+static DecodeStatus decodeOperand_VReg_1024(MCInst &Inst, unsigned Imm,
uint64_t Addr,
- const void *Decoder) {
+ const MCDisassembler *Decoder) {
auto DAsm = static_cast<const AMDGPUDisassembler*>(Decoder);
return addOperand(Inst, DAsm->decodeSrcOp(AMDGPUDisassembler::OPW1024, Imm));
}
static DecodeStatus decodeOperand_f32kimm(MCInst &Inst, unsigned Imm,
- uint64_t Addr, const void *Decoder) {
+ uint64_t Addr,
+ const MCDisassembler *Decoder) {
const auto *DAsm = static_cast<const AMDGPUDisassembler *>(Decoder);
return addOperand(Inst, DAsm->decodeMandatoryLiteralConstant(Imm));
}
static DecodeStatus decodeOperand_f16kimm(MCInst &Inst, unsigned Imm,
- uint64_t Addr, const void *Decoder) {
+ uint64_t Addr,
+ const MCDisassembler *Decoder) {
const auto *DAsm = static_cast<const AMDGPUDisassembler *>(Decoder);
return addOperand(Inst, DAsm->decodeMandatoryLiteralConstant(Imm));
}
-static DecodeStatus decodeOperand_VS_16_Deferred(MCInst &Inst, unsigned Imm,
- uint64_t Addr,
- const void *Decoder) {
+static DecodeStatus
+decodeOperand_VS_16_Deferred(MCInst &Inst, unsigned Imm, uint64_t Addr,
+ const MCDisassembler *Decoder) {
const auto *DAsm = static_cast<const AMDGPUDisassembler *>(Decoder);
return addOperand(
Inst, DAsm->decodeSrcOp(llvm::AMDGPUDisassembler::OPW16, Imm, true));
}
-static DecodeStatus decodeOperand_VS_32_Deferred(MCInst &Inst, unsigned Imm,
- uint64_t Addr,
- const void *Decoder) {
+static DecodeStatus
+decodeOperand_VS_32_Deferred(MCInst &Inst, unsigned Imm, uint64_t Addr,
+ const MCDisassembler *Decoder) {
const auto *DAsm = static_cast<const AMDGPUDisassembler *>(Decoder);
return addOperand(
Inst, DAsm->decodeSrcOp(llvm::AMDGPUDisassembler::OPW32, Imm, true));
}
+static DecodeStatus decodeOperandVOPDDstY(MCInst &Inst, unsigned Val,
+ uint64_t Addr, const void *Decoder) {
+ const auto *DAsm = static_cast<const AMDGPUDisassembler *>(Decoder);
+ return addOperand(Inst, DAsm->decodeVOPDDstYOp(Inst, Val));
+}
+
static bool IsAGPROperand(const MCInst &Inst, int OpIdx,
const MCRegisterInfo *MRI) {
if (OpIdx < 0)
@@ -307,10 +308,9 @@ static bool IsAGPROperand(const MCInst &Inst, int OpIdx,
return Reg >= AMDGPU::AGPR0 && Reg <= AMDGPU::AGPR255;
}
-static DecodeStatus decodeOperand_AVLdSt_Any(MCInst &Inst,
- unsigned Imm,
+static DecodeStatus decodeOperand_AVLdSt_Any(MCInst &Inst, unsigned Imm,
AMDGPUDisassembler::OpWidthTy Opw,
- const void *Decoder) {
+ const MCDisassembler *Decoder) {
auto DAsm = static_cast<const AMDGPUDisassembler*>(Decoder);
if (!DAsm->isGFX90A()) {
Imm &= 511;
@@ -342,54 +342,41 @@ static DecodeStatus decodeOperand_AVLdSt_Any(MCInst &Inst,
return addOperand(Inst, DAsm->decodeSrcOp(Opw, Imm | 256));
}
-static DecodeStatus DecodeAVLdSt_32RegisterClass(MCInst &Inst,
- unsigned Imm,
- uint64_t Addr,
- const void *Decoder) {
+static DecodeStatus
+DecodeAVLdSt_32RegisterClass(MCInst &Inst, unsigned Imm, uint64_t Addr,
+ const MCDisassembler *Decoder) {
return decodeOperand_AVLdSt_Any(Inst, Imm,
AMDGPUDisassembler::OPW32, Decoder);
}
-static DecodeStatus DecodeAVLdSt_64RegisterClass(MCInst &Inst,
- unsigned Imm,
- uint64_t Addr,
- const void *Decoder) {
+static DecodeStatus
+DecodeAVLdSt_64RegisterClass(MCInst &Inst, unsigned Imm, uint64_t Addr,
+ const MCDisassembler *Decoder) {
return decodeOperand_AVLdSt_Any(Inst, Imm,
AMDGPUDisassembler::OPW64, Decoder);
}
-static DecodeStatus DecodeAVLdSt_96RegisterClass(MCInst &Inst,
- unsigned Imm,
- uint64_t Addr,
- const void *Decoder) {
+static DecodeStatus
+DecodeAVLdSt_96RegisterClass(MCInst &Inst, unsigned Imm, uint64_t Addr,
+ const MCDisassembler *Decoder) {
return decodeOperand_AVLdSt_Any(Inst, Imm,
AMDGPUDisassembler::OPW96, Decoder);
}
-static DecodeStatus DecodeAVLdSt_128RegisterClass(MCInst &Inst,
- unsigned Imm,
- uint64_t Addr,
- const void *Decoder) {
+static DecodeStatus
+DecodeAVLdSt_128RegisterClass(MCInst &Inst, unsigned Imm, uint64_t Addr,
+ const MCDisassembler *Decoder) {
return decodeOperand_AVLdSt_Any(Inst, Imm,
AMDGPUDisassembler::OPW128, Decoder);
}
-static DecodeStatus decodeOperand_SReg_32(MCInst &Inst,
- unsigned Imm,
+static DecodeStatus decodeOperand_SReg_32(MCInst &Inst, unsigned Imm,
uint64_t Addr,
- const void *Decoder) {
+ const MCDisassembler *Decoder) {
auto DAsm = static_cast<const AMDGPUDisassembler*>(Decoder);
return addOperand(Inst, DAsm->decodeOperand_SReg_32(Imm));
}
-static DecodeStatus decodeOperand_VGPR_32(MCInst &Inst,
- unsigned Imm,
- uint64_t Addr,
- const void *Decoder) {
- auto DAsm = static_cast<const AMDGPUDisassembler*>(Decoder);
- return addOperand(Inst, DAsm->decodeSrcOp(AMDGPUDisassembler::OPW32, Imm));
-}
-
#define DECODE_SDWA(DecName) \
DECODE_OPERAND(decodeSDWA##DecName, decodeSDWA##DecName)
@@ -410,21 +397,15 @@ template <typename T> static inline T eatBytes(ArrayRef<uint8_t>& Bytes) {
return Res;
}
-DecodeStatus AMDGPUDisassembler::tryDecodeInst(const uint8_t* Table,
- MCInst &MI,
- uint64_t Inst,
- uint64_t Address) const {
- assert(MI.getOpcode() == 0);
- assert(MI.getNumOperands() == 0);
- MCInst TmpInst;
- HasLiteral = false;
- const auto SavedBytes = Bytes;
- if (decodeInstruction(Table, TmpInst, Inst, Address, this, STI)) {
- MI = TmpInst;
- return MCDisassembler::Success;
- }
- Bytes = SavedBytes;
- return MCDisassembler::Fail;
+static inline DecoderUInt128 eat12Bytes(ArrayRef<uint8_t> &Bytes) {
+ assert(Bytes.size() >= 12);
+ uint64_t Lo = support::endian::read<uint64_t, support::endianness::little>(
+ Bytes.data());
+ Bytes = Bytes.slice(8);
+ uint64_t Hi = support::endian::read<uint32_t, support::endianness::little>(
+ Bytes.data());
+ Bytes = Bytes.slice(4);
+ return DecoderUInt128(Lo, Hi);
}
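GFX11 adds 96-bit encodings (the DecoderTable*GFX1196 tables used below, where the suffix is the bit width), so the decode loop first consumes 12 bytes into a DecoderUInt128, the low 8 bytes followed by the high 4, both read little-endian. With b0..b11 the bytes in memory order, the value built is, conceptually:

    Lo   = b0 | b1<<8 | ... | b7<<56
    Hi   = b8 | b9<<8 | b10<<16 | b11<<24
    Insn = (Hi << 64) | Lo   // a 96-bit instruction word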
// The disassembler is greedy, so we need to check FI operand value to
@@ -457,6 +438,29 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
// Try to decode DPP and SDWA first to solve conflict with VOP1 and VOP2
// encodings
+ if (isGFX11Plus() && Bytes.size() >= 12 ) {
+ DecoderUInt128 DecW = eat12Bytes(Bytes);
+ Res = tryDecodeInst(DecoderTableDPP8GFX1196, MI, DecW,
+ Address);
+ if (Res && convertDPP8Inst(MI) == MCDisassembler::Success)
+ break;
+ MI = MCInst(); // clear
+ Res = tryDecodeInst(DecoderTableDPPGFX1196, MI, DecW,
+ Address);
+ if (Res) {
+ if (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::VOP3P)
+ convertVOP3PDPPInst(MI);
+ else if (AMDGPU::isVOPC64DPP(MI.getOpcode()))
+ convertVOPCDPPInst(MI);
+ break;
+ }
+ Res = tryDecodeInst(DecoderTableGFX1196, MI, DecW, Address);
+ if (Res)
+ break;
+ }
+ // Reinitialize Bytes
+ Bytes = Bytes_.slice(0, MaxInstBytesNum);
+
if (Bytes.size() >= 8) {
const uint64_t QW = eatBytes<uint64_t>(Bytes);
@@ -475,12 +479,23 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
Res = tryDecodeInst(DecoderTableDPP864, MI, QW, Address);
if (Res && convertDPP8Inst(MI) == MCDisassembler::Success)
break;
+ MI = MCInst(); // clear
+ Res = tryDecodeInst(DecoderTableDPP8GFX1164, MI, QW, Address);
+ if (Res && convertDPP8Inst(MI) == MCDisassembler::Success)
+ break;
MI = MCInst(); // clear
Res = tryDecodeInst(DecoderTableDPP64, MI, QW, Address);
if (Res) break;
+ Res = tryDecodeInst(DecoderTableDPPGFX1164, MI, QW, Address);
+ if (Res) {
+ if (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::VOPC)
+ convertVOPCDPPInst(MI);
+ break;
+ }
+
Res = tryDecodeInst(DecoderTableSDWA64, MI, QW, Address);
if (Res) { IsSDWA = true; break; }
@@ -535,6 +550,9 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
Res = tryDecodeInst(DecoderTableGFX1032, MI, DW, Address);
if (Res) break;
+ Res = tryDecodeInst(DecoderTableGFX1132, MI, DW, Address);
+ if (Res) break;
+
if (Bytes.size() < 4) break;
const uint64_t QW = ((uint64_t)eatBytes<uint32_t>(Bytes) << 32) | DW;
@@ -554,6 +572,13 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
if (Res) break;
Res = tryDecodeInst(DecoderTableGFX1064, MI, QW, Address);
+ if (Res) break;
+
+ Res = tryDecodeInst(DecoderTableGFX1164, MI, QW, Address);
+ if (Res)
+ break;
+
+ Res = tryDecodeInst(DecoderTableWMMAGFX1164, MI, QW, Address);
} while (false);
if (Res && (MI.getOpcode() == AMDGPU::V_MAC_F32_e64_vi ||
@@ -565,8 +590,11 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
MI.getOpcode() == AMDGPU::V_FMAC_F64_e64_gfx90a ||
MI.getOpcode() == AMDGPU::V_FMAC_F32_e64_vi ||
MI.getOpcode() == AMDGPU::V_FMAC_F32_e64_gfx10 ||
+ MI.getOpcode() == AMDGPU::V_FMAC_F32_e64_gfx11 ||
MI.getOpcode() == AMDGPU::V_FMAC_LEGACY_F32_e64_gfx10 ||
- MI.getOpcode() == AMDGPU::V_FMAC_F16_e64_gfx10)) {
+ MI.getOpcode() == AMDGPU::V_FMAC_DX9_ZERO_F32_e64_gfx11 ||
+ MI.getOpcode() == AMDGPU::V_FMAC_F16_e64_gfx10 ||
+ MI.getOpcode() == AMDGPU::V_FMAC_F16_e64_gfx11)) {
// Insert dummy unused src2_modifiers.
insertNamedMCOperand(MI, MCOperand::createImm(0),
AMDGPU::OpName::src2_modifiers);
@@ -625,8 +653,10 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
Res = MCDisassembler::Fail;
} else {
for (unsigned i = 0; i < NSAArgs; ++i) {
- MI.insert(MI.begin() + VAddr0Idx + 1 + i,
- decodeOperand_VGPR_32(Bytes[i]));
+ const unsigned VAddrIdx = VAddr0Idx + 1 + i;
+ auto VAddrRCID = MCII->get(MI.getOpcode()).OpInfo[VAddrIdx].RegClass;
+ MI.insert(MI.begin() + VAddrIdx,
+ createRegOperand(VAddrRCID, Bytes[i]));
}
Bytes = Bytes.slice(4 * NSAWords);
}
@@ -636,6 +666,12 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
Res = convertMIMGInst(MI);
}
+ if (Res && (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::EXP))
+ Res = convertEXPInst(MI);
+
+ if (Res && (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::VINTERP))
+ Res = convertVINTERPInst(MI);
+
if (Res && IsSDWA)
Res = convertSDWAInst(MI);
@@ -667,6 +703,28 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
return Res;
}
+DecodeStatus AMDGPUDisassembler::convertEXPInst(MCInst &MI) const {
+ if (STI.getFeatureBits()[AMDGPU::FeatureGFX11]) {
+ // The MCInst still has these fields even though they are no longer encoded
+ // in the GFX11 instruction.
+ insertNamedMCOperand(MI, MCOperand::createImm(0), AMDGPU::OpName::vm);
+ insertNamedMCOperand(MI, MCOperand::createImm(0), AMDGPU::OpName::compr);
+ }
+ return MCDisassembler::Success;
+}
+
+DecodeStatus AMDGPUDisassembler::convertVINTERPInst(MCInst &MI) const {
+ if (MI.getOpcode() == AMDGPU::V_INTERP_P10_F16_F32_inreg_gfx11 ||
+ MI.getOpcode() == AMDGPU::V_INTERP_P10_RTZ_F16_F32_inreg_gfx11 ||
+ MI.getOpcode() == AMDGPU::V_INTERP_P2_F16_F32_inreg_gfx11 ||
+ MI.getOpcode() == AMDGPU::V_INTERP_P2_RTZ_F16_F32_inreg_gfx11) {
+ // The MCInst has this field that is not directly encoded in the
+ // instruction.
+ insertNamedMCOperand(MI, MCOperand::createImm(0), AMDGPU::OpName::op_sel);
+ }
+ return MCDisassembler::Success;
+}
+
DecodeStatus AMDGPUDisassembler::convertSDWAInst(MCInst &MI) const {
if (STI.getFeatureBits()[AMDGPU::FeatureGFX9] ||
STI.getFeatureBits()[AMDGPU::FeatureGFX10]) {
@@ -692,18 +750,23 @@ DecodeStatus AMDGPUDisassembler::convertSDWAInst(MCInst &MI) const {
DecodeStatus AMDGPUDisassembler::convertDPP8Inst(MCInst &MI) const {
unsigned Opc = MI.getOpcode();
unsigned DescNumOps = MCII->get(Opc).getNumOperands();
+ if (MCII->get(Opc).TSFlags & SIInstrFlags::VOP3P) {
+ convertVOP3PDPPInst(MI);
+ } else if ((MCII->get(Opc).TSFlags & SIInstrFlags::VOPC) ||
+ AMDGPU::isVOPC64DPP(Opc)) {
+ convertVOPCDPPInst(MI);
+ } else {
+ // Insert dummy unused src modifiers.
+ if (MI.getNumOperands() < DescNumOps &&
+ AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0_modifiers) != -1)
+ insertNamedMCOperand(MI, MCOperand::createImm(0),
+ AMDGPU::OpName::src0_modifiers);
- // Insert dummy unused src modifiers.
- if (MI.getNumOperands() < DescNumOps &&
- AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0_modifiers) != -1)
- insertNamedMCOperand(MI, MCOperand::createImm(0),
- AMDGPU::OpName::src0_modifiers);
-
- if (MI.getNumOperands() < DescNumOps &&
- AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1_modifiers) != -1)
- insertNamedMCOperand(MI, MCOperand::createImm(0),
- AMDGPU::OpName::src1_modifiers);
-
+ if (MI.getNumOperands() < DescNumOps &&
+ AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1_modifiers) != -1)
+ insertNamedMCOperand(MI, MCOperand::createImm(0),
+ AMDGPU::OpName::src1_modifiers);
+ }
return isValidDPP8(MI) ? MCDisassembler::Success : MCDisassembler::SoftFail;
}
@@ -745,7 +808,7 @@ DecodeStatus AMDGPUDisassembler::convertMIMGInst(MCInst &MI) const {
bool IsNSA = false;
unsigned AddrSize = Info->VAddrDwords;
- if (STI.getFeatureBits()[AMDGPU::FeatureGFX10]) {
+ if (isGFX10Plus()) {
unsigned DimIdx =
AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::dim);
int A16Idx =
@@ -757,7 +820,8 @@ DecodeStatus AMDGPUDisassembler::convertMIMGInst(MCInst &MI) const {
AddrSize =
AMDGPU::getAddrSizeMIMGOp(BaseOpcode, Dim, IsA16, AMDGPU::hasG16(STI));
- IsNSA = Info->MIMGEncoding == AMDGPU::MIMGEncGfx10NSA;
+ IsNSA = Info->MIMGEncoding == AMDGPU::MIMGEncGfx10NSA ||
+ Info->MIMGEncoding == AMDGPU::MIMGEncGfx11NSA;
if (!IsNSA) {
if (AddrSize > 8)
AddrSize = 16;
@@ -808,9 +872,9 @@ DecodeStatus AMDGPUDisassembler::convertMIMGInst(MCInst &MI) const {
}
}
+ // If not using NSA on GFX10+, widen address register to correct size.
unsigned NewVAddr0 = AMDGPU::NoRegister;
- if (STI.getFeatureBits()[AMDGPU::FeatureGFX10] && !IsNSA &&
- AddrSize != Info->VAddrDwords) {
+ if (isGFX10Plus() && !IsNSA && AddrSize != Info->VAddrDwords) {
unsigned VAddr0 = MI.getOperand(VAddr0Idx).getReg();
unsigned VAddrSub0 = MRI.getSubReg(VAddr0, AMDGPU::sub0);
VAddr0 = (VAddrSub0 != 0) ? VAddrSub0 : VAddr0;
@@ -844,11 +908,84 @@ DecodeStatus AMDGPUDisassembler::convertMIMGInst(MCInst &MI) const {
return MCDisassembler::Success;
}
+// Opsel and neg bits are used in src_modifiers and standalone operands. Autogen
+// decoder only adds to src_modifiers, so manually add the bits to the other
+// operands.
+DecodeStatus AMDGPUDisassembler::convertVOP3PDPPInst(MCInst &MI) const {
+ unsigned Opc = MI.getOpcode();
+ unsigned DescNumOps = MCII->get(Opc).getNumOperands();
+
+ if (MI.getNumOperands() < DescNumOps &&
+ AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst_in) != -1)
+ insertNamedMCOperand(MI, MCOperand::createImm(0), AMDGPU::OpName::vdst_in);
+
+ const int ModOps[] = {AMDGPU::OpName::src0_modifiers,
+ AMDGPU::OpName::src1_modifiers,
+ AMDGPU::OpName::src2_modifiers};
+ unsigned OpSel = 0;
+ unsigned OpSelHi = 0;
+ unsigned NegLo = 0;
+ unsigned NegHi = 0;
+ for (int J = 0; J < 3; ++J) {
+ int OpIdx = AMDGPU::getNamedOperandIdx(Opc, ModOps[J]);
+ if (OpIdx == -1)
+ break;
+ unsigned Val = MI.getOperand(OpIdx).getImm();
+
+ OpSel |= !!(Val & SISrcMods::OP_SEL_0) << J;
+ OpSelHi |= !!(Val & SISrcMods::OP_SEL_1) << J;
+ NegLo |= !!(Val & SISrcMods::NEG) << J;
+ NegHi |= !!(Val & SISrcMods::NEG_HI) << J;
+ }
+
+ if (MI.getNumOperands() < DescNumOps &&
+ AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::op_sel) != -1)
+ insertNamedMCOperand(MI, MCOperand::createImm(OpSel),
+ AMDGPU::OpName::op_sel);
+ if (MI.getNumOperands() < DescNumOps &&
+ AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::op_sel_hi) != -1)
+ insertNamedMCOperand(MI, MCOperand::createImm(OpSelHi),
+ AMDGPU::OpName::op_sel_hi);
+ if (MI.getNumOperands() < DescNumOps &&
+ AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::neg_lo) != -1)
+ insertNamedMCOperand(MI, MCOperand::createImm(NegLo),
+ AMDGPU::OpName::neg_lo);
+ if (MI.getNumOperands() < DescNumOps &&
+ AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::neg_hi) != -1)
+ insertNamedMCOperand(MI, MCOperand::createImm(NegHi),
+ AMDGPU::OpName::neg_hi);
+
+ return MCDisassembler::Success;
+}
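A minimal standalone sketch of the bit-folding convertVOP3PDPPInst performs above; the modifier constants here are stand-ins with assumed values mirroring SISrcMods (the real definitions live in SIDefines.h), and the three modifier values are hypothetical:

    // Stand-in modifier bits (assumed values mirroring SISrcMods).
    constexpr unsigned NEG = 1u << 0, NEG_HI = 1u << 1;
    constexpr unsigned OP_SEL_0 = 1u << 2, OP_SEL_1 = 1u << 3;

    // Hypothetical src0/src1/src2_modifiers values for one instruction.
    unsigned Mods[3] = {OP_SEL_0 | NEG, OP_SEL_1, NEG_HI};
    unsigned OpSel = 0, OpSelHi = 0, NegLo = 0, NegHi = 0;
    for (int J = 0; J < 3; ++J) {
      OpSel   |= !!(Mods[J] & OP_SEL_0) << J;  // per-source bit J
      OpSelHi |= !!(Mods[J] & OP_SEL_1) << J;
      NegLo   |= !!(Mods[J] & NEG)      << J;
      NegHi   |= !!(Mods[J] & NEG_HI)   << J;
    }
    // Result: OpSel = 0b001, OpSelHi = 0b010, NegLo = 0b001, NegHi = 0b100,
    // which become the op_sel / op_sel_hi / neg_lo / neg_hi immediates.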
+
+// Create dummy old operand and insert optional operands
+DecodeStatus AMDGPUDisassembler::convertVOPCDPPInst(MCInst &MI) const {
+ unsigned Opc = MI.getOpcode();
+ unsigned DescNumOps = MCII->get(Opc).getNumOperands();
+
+ if (MI.getNumOperands() < DescNumOps &&
+ AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::old) != -1)
+ insertNamedMCOperand(MI, MCOperand::createReg(0), AMDGPU::OpName::old);
+
+ if (MI.getNumOperands() < DescNumOps &&
+ AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0_modifiers) != -1)
+ insertNamedMCOperand(MI, MCOperand::createImm(0),
+ AMDGPU::OpName::src0_modifiers);
+
+ if (MI.getNumOperands() < DescNumOps &&
+ AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1_modifiers) != -1)
+ insertNamedMCOperand(MI, MCOperand::createImm(0),
+ AMDGPU::OpName::src1_modifiers);
+ return MCDisassembler::Success;
+}
+
DecodeStatus AMDGPUDisassembler::convertFMAanyK(MCInst &MI,
int ImmLitIdx) const {
assert(HasLiteral && "Should have decoded a literal");
const MCInstrDesc &Desc = MCII->get(MI.getOpcode());
unsigned DescNumOps = Desc.getNumOperands();
+ insertNamedMCOperand(MI, MCOperand::createImm(Literal),
+ AMDGPU::OpName::immDeferred);
assert(DescNumOps == MI.getNumOperands());
for (unsigned I = 0; I < DescNumOps; ++I) {
auto &Op = MI.getOperand(I);
@@ -1001,6 +1138,22 @@ MCOperand AMDGPUDisassembler::decodeOperand_AV_64(unsigned Val) const {
return decodeSrcOp(OPW64, Val);
}
+MCOperand AMDGPUDisassembler::decodeOperand_AV_128(unsigned Val) const {
+ return decodeSrcOp(OPW128, Val);
+}
+
+MCOperand AMDGPUDisassembler::decodeOperand_AVDst_128(unsigned Val) const {
+ using namespace AMDGPU::EncValues;
+ assert((Val & IS_VGPR) == 0); // Val{8} is not encoded but assumed to be 1.
+ return decodeSrcOp(OPW128, Val | IS_VGPR);
+}
+
+MCOperand AMDGPUDisassembler::decodeOperand_AVDst_512(unsigned Val) const {
+ using namespace AMDGPU::EncValues;
+ assert((Val & IS_VGPR) == 0); // Val{8} is not encoded but assumed to be 1.
+ return decodeSrcOp(OPW512, Val | IS_VGPR);
+}
+
MCOperand AMDGPUDisassembler::decodeOperand_VReg_64(unsigned Val) const {
return createRegOperand(AMDGPU::VReg_64RegClassID, Val);
}
@@ -1075,6 +1228,9 @@ MCOperand AMDGPUDisassembler::decodeOperand_SReg_512(unsigned Val) const {
MCOperand
AMDGPUDisassembler::decodeMandatoryLiteralConstant(unsigned Val) const {
if (HasLiteral) {
+ assert(
+ AMDGPU::hasVOPD(STI) &&
+ "Should only decode multiple kimm with VOPD, check VSrc operand types");
if (Literal != Val)
return errOperand(Val, "More than one unique literal is illegal");
}
@@ -1367,6 +1523,20 @@ MCOperand AMDGPUDisassembler::decodeDstOp(const OpWidthTy Width, unsigned Val) c
llvm_unreachable("unknown dst register");
}
+// Bit 0 of DstY isn't stored in the instruction, because it's always the
+// opposite of bit 0 of DstX.
+MCOperand AMDGPUDisassembler::decodeVOPDDstYOp(MCInst &Inst,
+ unsigned Val) const {
+ int VDstXInd =
+ AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::vdstX);
+ assert(VDstXInd != -1);
+ assert(Inst.getOperand(VDstXInd).isReg());
+ unsigned XDstReg = MRI.getEncodingValue(Inst.getOperand(VDstXInd).getReg());
+ Val |= ~XDstReg & 1;
+ auto Width = llvm::AMDGPUDisassembler::OPW32;
+ return createRegOperand(getVgprClassId(Width), Val);
+}
+
MCOperand AMDGPUDisassembler::decodeSpecialReg32(unsigned Val) const {
using namespace AMDGPU;
@@ -1381,8 +1551,10 @@ MCOperand AMDGPUDisassembler::decodeSpecialReg32(unsigned Val) const {
case 109: return createRegOperand(TBA_HI);
case 110: return createRegOperand(TMA_LO);
case 111: return createRegOperand(TMA_HI);
- case 124: return createRegOperand(M0);
- case 125: return createRegOperand(SGPR_NULL);
+ case 124:
+ return isGFX11Plus() ? createRegOperand(SGPR_NULL) : createRegOperand(M0);
+ case 125:
+ return isGFX11Plus() ? createRegOperand(M0) : createRegOperand(SGPR_NULL);
case 126: return createRegOperand(EXEC_LO);
case 127: return createRegOperand(EXEC_HI);
case 235: return createRegOperand(SRC_SHARED_BASE);
@@ -1408,7 +1580,14 @@ MCOperand AMDGPUDisassembler::decodeSpecialReg64(unsigned Val) const {
case 106: return createRegOperand(VCC);
case 108: return createRegOperand(TBA);
case 110: return createRegOperand(TMA);
- case 125: return createRegOperand(SGPR_NULL);
+ case 124:
+ if (isGFX11Plus())
+ return createRegOperand(SGPR_NULL);
+ break;
+ case 125:
+ if (!isGFX11Plus())
+ return createRegOperand(SGPR_NULL);
+ break;
case 126: return createRegOperand(EXEC);
case 235: return createRegOperand(SRC_SHARED_BASE);
case 236: return createRegOperand(SRC_SHARED_LIMIT);
@@ -1522,6 +1701,15 @@ bool AMDGPUDisassembler::isGFX10Plus() const {
return AMDGPU::isGFX10Plus(STI);
}
+bool AMDGPUDisassembler::isGFX11() const {
+ return STI.getFeatureBits()[AMDGPU::FeatureGFX11];
+}
+
+bool AMDGPUDisassembler::isGFX11Plus() const {
+ return AMDGPU::isGFX11Plus(STI);
+}
+
+
bool AMDGPUDisassembler::hasArchitectedFlatScratch() const {
return STI.getFeatureBits()[AMDGPU::FeatureArchitectedFlatScratch];
}
@@ -1888,10 +2076,10 @@ AMDGPUDisassembler::onSymbolStart(SymbolInfoTy &Symbol, uint64_t &Size,
//===----------------------------------------------------------------------===//
// Try to find symbol name for specified label
-bool AMDGPUSymbolizer::tryAddingSymbolicOperand(MCInst &Inst,
- raw_ostream &/*cStream*/, int64_t Value,
- uint64_t /*Address*/, bool IsBranch,
- uint64_t /*Offset*/, uint64_t /*InstSize*/) {
+bool AMDGPUSymbolizer::tryAddingSymbolicOperand(
+ MCInst &Inst, raw_ostream & /*cStream*/, int64_t Value,
+ uint64_t /*Address*/, bool IsBranch, uint64_t /*Offset*/,
+ uint64_t /*OpSize*/, uint64_t /*InstSize*/) {
if (!IsBranch) {
return false;
diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
index eea6074d5281..31869f0917ae 100644
--- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
+++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
@@ -15,8 +15,10 @@
#ifndef LLVM_LIB_TARGET_AMDGPU_DISASSEMBLER_AMDGPUDISASSEMBLER_H
#define LLVM_LIB_TARGET_AMDGPU_DISASSEMBLER_AMDGPUDISASSEMBLER_H
+#include "llvm/ADT/APInt.h"
#include "llvm/MC/MCDisassembler/MCDisassembler.h"
#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCInst.h"
#include "llvm/Support/DataExtractor.h"
#include <memory>
@@ -27,6 +29,60 @@ class MCOperand;
class MCSubtargetInfo;
class Twine;
+// Exposes an interface expected by autogenerated code in
+// FixedLenDecoderEmitter
+class DecoderUInt128 {
+private:
+ uint64_t Lo = 0;
+ uint64_t Hi = 0;
+
+public:
+ DecoderUInt128() = default;
+ DecoderUInt128(uint64_t Lo, uint64_t Hi = 0) : Lo(Lo), Hi(Hi) {}
+ operator bool() const { return Lo || Hi; }
+ void insertBits(uint64_t SubBits, unsigned BitPosition, unsigned NumBits) {
+ assert(NumBits && NumBits <= 64);
+ assert(SubBits >> 1 >> (NumBits - 1) == 0);
+ assert(BitPosition < 128);
+ if (BitPosition < 64) {
+ Lo |= SubBits << BitPosition;
+ Hi |= SubBits >> 1 >> (63 - BitPosition);
+ } else {
+ Hi |= SubBits << (BitPosition - 64);
+ }
+ }
+ uint64_t extractBitsAsZExtValue(unsigned NumBits,
+ unsigned BitPosition) const {
+ assert(NumBits && NumBits <= 64);
+ assert(BitPosition < 128);
+ uint64_t Val;
+ if (BitPosition < 64)
+ Val = Lo >> BitPosition | Hi << 1 << (63 - BitPosition);
+ else
+ Val = Hi >> (BitPosition - 64);
+ return Val & ((uint64_t(2) << (NumBits - 1)) - 1);
+ }
+ DecoderUInt128 operator&(const DecoderUInt128 &RHS) const {
+ return DecoderUInt128(Lo & RHS.Lo, Hi & RHS.Hi);
+ }
+ DecoderUInt128 operator&(const uint64_t &RHS) const {
+ return *this & DecoderUInt128(RHS);
+ }
+ DecoderUInt128 operator~() const { return DecoderUInt128(~Lo, ~Hi); }
+ bool operator==(const DecoderUInt128 &RHS) {
+ return Lo == RHS.Lo && Hi == RHS.Hi;
+ }
+ bool operator!=(const DecoderUInt128 &RHS) {
+ return Lo != RHS.Lo || Hi != RHS.Hi;
+ }
+ bool operator!=(const int &RHS) {
+ return *this != DecoderUInt128(RHS);
+ }
+ friend raw_ostream &operator<<(raw_ostream &OS, const DecoderUInt128 &RHS) {
+ return OS << APInt(128, {RHS.Lo, RHS.Hi});
+ }
+};
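The split shifts (>> 1 >> (63 - BitPosition) and << 1 << (63 - BitPosition)) avoid an undefined 64-bit shift by 64 when BitPosition is 0. A small sketch, assuming the class above and <cassert> are in scope, of a field that straddles the Lo/Hi boundary:

    DecoderUInt128 Insn;
    Insn.insertBits(0xAB, /*BitPosition=*/60, /*NumBits=*/8);
    // The low nibble 0xB lands in Lo[63:60], the high nibble 0xA in Hi[3:0].
    assert(Insn.extractBitsAsZExtValue(/*NumBits=*/8, /*BitPosition=*/60) == 0xAB);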
+
//===----------------------------------------------------------------------===//
// AMDGPUDisassembler
//===----------------------------------------------------------------------===//
@@ -57,8 +113,21 @@ public:
MCOperand errOperand(unsigned V, const Twine& ErrMsg) const;
- DecodeStatus tryDecodeInst(const uint8_t* Table, MCInst &MI, uint64_t Inst,
- uint64_t Address) const;
+ template <typename InsnType>
+ DecodeStatus tryDecodeInst(const uint8_t *Table, MCInst &MI, InsnType Inst,
+ uint64_t Address) const {
+ assert(MI.getOpcode() == 0);
+ assert(MI.getNumOperands() == 0);
+ MCInst TmpInst;
+ HasLiteral = false;
+ const auto SavedBytes = Bytes;
+ if (decodeInstruction(Table, TmpInst, Inst, Address, this, STI)) {
+ MI = TmpInst;
+ return MCDisassembler::Success;
+ }
+ Bytes = SavedBytes;
+ return MCDisassembler::Fail;
+ }
Optional<DecodeStatus> onSymbolStart(SymbolInfoTy &Symbol, uint64_t &Size,
ArrayRef<uint8_t> Bytes,
@@ -87,10 +156,14 @@ public:
DecodeStatus decodeCOMPUTE_PGM_RSRC2(uint32_t FourByteBuffer,
raw_string_ostream &KdStream) const;
+ DecodeStatus convertEXPInst(MCInst &MI) const;
+ DecodeStatus convertVINTERPInst(MCInst &MI) const;
DecodeStatus convertFMAanyK(MCInst &MI, int ImmLitIdx) const;
DecodeStatus convertSDWAInst(MCInst &MI) const;
DecodeStatus convertDPP8Inst(MCInst &MI) const;
DecodeStatus convertMIMGInst(MCInst &MI) const;
+ DecodeStatus convertVOP3PDPPInst(MCInst &MI) const;
+ DecodeStatus convertVOPCDPPInst(MCInst &MI) const;
MCOperand decodeOperand_VGPR_32(unsigned Val) const;
MCOperand decodeOperand_VRegOrLds_32(unsigned Val) const;
@@ -127,6 +200,9 @@ public:
MCOperand decodeOperand_AReg_1024(unsigned Val) const;
MCOperand decodeOperand_AV_32(unsigned Val) const;
MCOperand decodeOperand_AV_64(unsigned Val) const;
+ MCOperand decodeOperand_AV_128(unsigned Val) const;
+ MCOperand decodeOperand_AVDst_128(unsigned Val) const;
+ MCOperand decodeOperand_AVDst_512(unsigned Val) const;
enum OpWidthTy {
OPW32,
@@ -157,6 +233,7 @@ public:
MCOperand decodeSrcOp(const OpWidthTy Width, unsigned Val,
bool MandatoryLiteral = false) const;
MCOperand decodeDstOp(const OpWidthTy Width, unsigned Val) const;
+ MCOperand decodeVOPDDstYOp(MCInst &Inst, unsigned Val) const;
MCOperand decodeSpecialReg32(unsigned Val) const;
MCOperand decodeSpecialReg64(unsigned Val) const;
@@ -177,6 +254,8 @@ public:
bool isGFX9Plus() const;
bool isGFX10() const;
bool isGFX10Plus() const;
+ bool isGFX11() const;
+ bool isGFX11Plus() const;
bool hasArchitectedFlatScratch() const;
};
@@ -196,8 +275,8 @@ public:
: MCSymbolizer(Ctx, std::move(RelInfo)), DisInfo(disInfo) {}
bool tryAddingSymbolicOperand(MCInst &Inst, raw_ostream &cStream,
- int64_t Value, uint64_t Address,
- bool IsBranch, uint64_t Offset,
+ int64_t Value, uint64_t Address, bool IsBranch,
+ uint64_t Offset, uint64_t OpSize,
uint64_t InstSize) override;
void tryAddingPcLoadReferenceComment(raw_ostream &cStream,
diff --git a/llvm/lib/Target/AMDGPU/EXPInstructions.td b/llvm/lib/Target/AMDGPU/EXPInstructions.td
index b3b55ddd2c97..14ba01f0d67c 100644
--- a/llvm/lib/Target/AMDGPU/EXPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/EXPInstructions.td
@@ -10,7 +10,7 @@
// EXP classes
//===----------------------------------------------------------------------===//
-class EXPCommon<bit done, string asm = ""> : InstSI<
+class EXPCommon<bit row, bit done, string asm = ""> : InstSI<
(outs),
(ins exp_tgt:$tgt,
ExpSrc0:$src0, ExpSrc1:$src1, ExpSrc2:$src2, ExpSrc3:$src3,
@@ -21,21 +21,30 @@ class EXPCommon<bit done, string asm = ""> : InstSI<
let mayLoad = done;
let mayStore = 1;
let UseNamedOperandTable = 1;
- let Uses = [EXEC];
+ let Uses = !if(row, [EXEC, M0], [EXEC]);
let SchedRW = [WriteExport];
let DisableWQM = 1;
}
-class EXP_Pseudo<bit done> : EXPCommon<done>,
- SIMCInstr <NAME, SIEncodingFamily.NONE> {
+class EXP_Pseudo<bit row, bit done>
+ : EXPCommon<row, done>, SIMCInstr<NAME, SIEncodingFamily.NONE> {
let isPseudo = 1;
let isCodeGenOnly = 1;
}
-class EXP_Real<bit done, string pseudo, int subtarget>
- : EXPCommon<done, "exp$tgt $src0, $src1, $src2, $src3"#!if(done, " done", "")
- #"$compr$vm">,
- SIMCInstr <pseudo, subtarget> {
+// Real instruction with optional asm operands "compr" and "vm".
+class EXP_Real_ComprVM<bit done, string pseudo, int subtarget>
+ : EXPCommon<0, done, "exp$tgt $src0, $src1, $src2, $src3"
+ #!if(done, " done", "")#"$compr$vm">,
+ SIMCInstr<pseudo, subtarget> {
+ let AsmMatchConverter = "cvtExp";
+}
+
+// Real instruction with optional asm operand "row_en".
+class EXP_Real_Row<bit row, bit done, string pseudo, int subtarget>
+ : EXPCommon<row, done, "exp$tgt $src0, $src1, $src2, $src3"
+ #!if(done, " done", "")#!if(row, " row_en", "")>,
+ SIMCInstr<pseudo, subtarget> {
let AsmMatchConverter = "cvtExp";
}
@@ -43,17 +52,21 @@ class EXP_Real<bit done, string pseudo, int subtarget>
// EXP Instructions
//===----------------------------------------------------------------------===//
-// Split EXP instruction into EXP and EXP_DONE so we can set
-// mayLoad for done=1.
-def EXP : EXP_Pseudo<0>;
-def EXP_DONE : EXP_Pseudo<1>;
+// DONE variants have mayLoad = 1.
+// ROW variants have an implicit use of M0.
+let SubtargetPredicate = isNotGFX90APlus in {
+def EXP : EXP_Pseudo<0, 0>;
+def EXP_DONE : EXP_Pseudo<0, 1>;
+def EXP_ROW : EXP_Pseudo<1, 0>;
+def EXP_ROW_DONE : EXP_Pseudo<1, 1>;
+} // let SubtargetPredicate = isNotGFX90APlus
//===----------------------------------------------------------------------===//
// SI
//===----------------------------------------------------------------------===//
class EXP_Real_si<bit _done, string pseudo>
- : EXP_Real<_done, pseudo, SIEncodingFamily.SI>, EXPe {
+ : EXP_Real_ComprVM<_done, pseudo, SIEncodingFamily.SI>, EXPe_ComprVM {
let AssemblerPredicate = isGFX6GFX7;
let DecoderNamespace = "GFX6GFX7";
let done = _done;
@@ -67,8 +80,9 @@ def EXP_DONE_si : EXP_Real_si<1, "EXP_DONE">;
//===----------------------------------------------------------------------===//
class EXP_Real_vi<bit _done, string pseudo>
- : EXP_Real<_done, pseudo, SIEncodingFamily.VI>, EXPe_vi {
+ : EXP_Real_ComprVM<_done, pseudo, SIEncodingFamily.VI>, EXPe_vi {
let AssemblerPredicate = isGFX8GFX9;
+ let SubtargetPredicate = isNotGFX90APlus;
let DecoderNamespace = "GFX8";
let done = _done;
}
@@ -77,12 +91,12 @@ def EXP_vi : EXP_Real_vi<0, "EXP">;
def EXP_DONE_vi : EXP_Real_vi<1, "EXP_DONE">;
//===----------------------------------------------------------------------===//
-// GFX10+
+// GFX10
//===----------------------------------------------------------------------===//
class EXP_Real_gfx10<bit _done, string pseudo>
- : EXP_Real<_done, pseudo, SIEncodingFamily.GFX10>, EXPe {
- let AssemblerPredicate = isGFX10Plus;
+ : EXP_Real_ComprVM<_done, pseudo, SIEncodingFamily.GFX10>, EXPe_ComprVM {
+ let AssemblerPredicate = isGFX10Only;
let DecoderNamespace = "GFX10";
let done = _done;
}
@@ -91,6 +105,23 @@ def EXP_gfx10 : EXP_Real_gfx10<0, "EXP">;
def EXP_DONE_gfx10 : EXP_Real_gfx10<1, "EXP_DONE">;
//===----------------------------------------------------------------------===//
+// GFX11+
+//===----------------------------------------------------------------------===//
+
+class EXP_Real_gfx11<bit _row, bit _done, string pseudo>
+ : EXP_Real_Row<_row, _done, pseudo, SIEncodingFamily.GFX11>, EXPe_Row {
+ let AssemblerPredicate = isGFX11Plus;
+ let DecoderNamespace = "GFX11";
+ let row = _row;
+ let done = _done;
+}
+
+def EXP_gfx11 : EXP_Real_gfx11<0, 0, "EXP">;
+def EXP_DONE_gfx11 : EXP_Real_gfx11<0, 1, "EXP_DONE">;
+def EXP_ROW_gfx11 : EXP_Real_gfx11<1, 0, "EXP_ROW">;
+def EXP_ROW_DONE_gfx11 : EXP_Real_gfx11<1, 1, "EXP_ROW_DONE">;
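Given the EXP_Real_Row asm string above, the row variants take a trailing row_en in place of compr/vm, so the done+row form assembles as something like "exp pos0 v0, v1, v2, v3 done row_en", with the row index supplied implicitly through M0 (per the Uses list and the ExpRowPattern below).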
+
+//===----------------------------------------------------------------------===//
// EXP Patterns
//===----------------------------------------------------------------------===//
@@ -103,6 +134,15 @@ class ExpPattern<ValueType vt, Instruction Inst, int done_val> : GCNPat<
ExpSrc2:$src2, ExpSrc3:$src3, timm:$vm, 0, timm:$en)
>;
+class ExpRowPattern<ValueType vt, Instruction Inst, int done_val> : GCNPat<
+ (int_amdgcn_exp_row timm:$tgt, timm:$en,
+ (vt ExpSrc0:$src0), (vt ExpSrc1:$src1),
+ (vt ExpSrc2:$src2), (vt ExpSrc3:$src3),
+ done_val, M0),
+ (Inst timm:$tgt, ExpSrc0:$src0, ExpSrc1:$src1,
+ ExpSrc2:$src2, ExpSrc3:$src3, 0, 0, timm:$en)
+>;
+
class ExpComprPattern<ValueType vt, Instruction Inst, int done_val> : GCNPat<
(int_amdgcn_exp_compr timm:$tgt, timm:$en,
(vt ExpSrc0:$src0), (vt ExpSrc1:$src1),
@@ -119,6 +159,11 @@ def : ExpPattern<i32, EXP_DONE, -1>;
def : ExpPattern<f32, EXP, 0>;
def : ExpPattern<f32, EXP_DONE, -1>;
+def : ExpRowPattern<i32, EXP_ROW, 0>;
+def : ExpRowPattern<i32, EXP_ROW_DONE, -1>;
+def : ExpRowPattern<f32, EXP_ROW, 0>;
+def : ExpRowPattern<f32, EXP_ROW_DONE, -1>;
+
def : ExpComprPattern<v2i16, EXP, 0>;
def : ExpComprPattern<v2i16, EXP_DONE, -1>;
def : ExpComprPattern<v2f16, EXP, 0>;
diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td
index c530d3cb49f0..cb2822818549 100644
--- a/llvm/lib/Target/AMDGPU/FLATInstructions.td
+++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td
@@ -12,6 +12,7 @@ def ScratchOffset : ComplexPattern<iPTR, 2, "SelectScratchOffset", [], [SDNPWant
def GlobalSAddr : ComplexPattern<iPTR, 3, "SelectGlobalSAddr", [], [SDNPWantRoot], -10>;
def ScratchSAddr : ComplexPattern<iPTR, 2, "SelectScratchSAddr", [], [SDNPWantRoot], -10>;
+def ScratchSVAddr : ComplexPattern<iPTR, 3, "SelectScratchSVAddr", [], [SDNPWantRoot], -10>;
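ScratchSVAddr backs the new scratch "SVS" forms defined further down (the _SVS pseudos), which take both a VGPR and an SGPR address component; roughly, and ignoring the hardware-managed flat-scratch base, the effective address is EA = SADDR + VADDR + offset, where offset is the instruction's immediate offset field.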
//===----------------------------------------------------------------------===//
// FLAT classes
@@ -56,6 +57,9 @@ class FLAT_Pseudo<string opName, dag outs, dag ins,
bits<1> dlcValue = 0;
bits<1> has_sccb = 1;
bits<1> sccbValue = 0;
+ bits<1> has_sve = 0; // Scratch VGPR Enable
+ bits<1> lds = 0;
+ bits<1> sve = 0;
let SubtargetPredicate = !if(is_flat_global, HasFlatGlobalInsts,
!if(is_flat_scratch, HasFlatScratchInsts, HasFlatAddressSpace));
@@ -74,8 +78,8 @@ class FLAT_Pseudo<string opName, dag outs, dag ins,
let FlatScratch = is_flat_scratch;
}
-class FLAT_Real <bits<7> op, FLAT_Pseudo ps> :
- InstSI <ps.OutOperandList, ps.InOperandList, ps.Mnemonic # ps.AsmOperands, []>,
+class FLAT_Real <bits<7> op, FLAT_Pseudo ps, string opName = ps.Mnemonic> :
+ InstSI <ps.OutOperandList, ps.InOperandList, opName # ps.AsmOperands, []>,
Enc64 {
let isPseudo = 0;
@@ -96,6 +100,7 @@ class FLAT_Real <bits<7> op, FLAT_Pseudo ps> :
let IsAtomicNoRet = ps.IsAtomicNoRet;
let VM_CNT = ps.VM_CNT;
let LGKM_CNT = ps.LGKM_CNT;
+ let VALU = ps.VALU;
// encoding fields
bits<8> vaddr;
@@ -106,7 +111,7 @@ class FLAT_Real <bits<7> op, FLAT_Pseudo ps> :
bits<5> cpol;
// Only valid on gfx9
- bits<1> lds = 0; // XXX - What does this actually do?
+ bits<1> lds = ps.lds; // LDS DMA for global and scratch
// Segment, 00=flat, 01=scratch, 10=global, 11=reserved
bits<2> seg = !if(ps.is_flat_global, 0b10,
@@ -123,7 +128,7 @@ class FLAT_Real <bits<7> op, FLAT_Pseudo ps> :
// Only valid on GFX9+
let Inst{12-0} = offset;
- let Inst{13} = lds;
+ let Inst{13} = !if(ps.has_sve, ps.sve, lds);
let Inst{15-14} = seg;
let Inst{16} = !if(ps.has_glc, cpol{CPolBit.GLC}, ps.glcValue);
@@ -240,6 +245,35 @@ multiclass FLAT_Global_Store_Pseudo<string opName, RegisterClass regClass> {
}
}
+class FLAT_Global_Load_LDS_Pseudo <string opName, bit EnableSaddr = 0> : FLAT_Pseudo<
+ opName,
+ (outs ),
+ !con(
+ !if(EnableSaddr, (ins SReg_64:$saddr, VGPR_32:$vaddr), (ins VReg_64:$vaddr)),
+ (ins flat_offset:$offset, CPol_0:$cpol)),
+ " $vaddr"#!if(EnableSaddr, ", $saddr", ", off")#"$offset$cpol"> {
+ let LGKM_CNT = 1;
+ let is_flat_global = 1;
+ let lds = 1;
+ let has_data = 0;
+ let has_vdst = 0;
+ let mayLoad = 1;
+ let mayStore = 1;
+ let has_saddr = 1;
+ let enabled_saddr = EnableSaddr;
+ let VALU = 1;
+ let PseudoInstr = opName#!if(EnableSaddr, "_SADDR", "");
+ let Uses = [M0, EXEC];
+ let SchedRW = [WriteVMEM, WriteLDS];
+}
+
+multiclass FLAT_Global_Load_LDS_Pseudo<string opName> {
+ def "" : FLAT_Global_Load_LDS_Pseudo<opName>,
+ GlobalSaddrTable<0, opName>;
+ def _SADDR : FLAT_Global_Load_LDS_Pseudo<opName, 1>,
+ GlobalSaddrTable<1, opName>;
+}
+
class FLAT_Global_Store_AddTid_Pseudo <string opName, RegisterClass vdataClass,
bit EnableSaddr = 0> : FLAT_Pseudo<
opName,
@@ -273,16 +307,19 @@ class FlatScratchInst <string sv_op, string mode> {
class FLAT_Scratch_Load_Pseudo <string opName, RegisterClass regClass,
bit HasTiedOutput = 0,
bit EnableSaddr = 0,
- bit EnableVaddr = !not(EnableSaddr)>
+ bit EnableSVE = 0,
+ bit EnableVaddr = !or(EnableSVE, !not(EnableSaddr))>
: FLAT_Pseudo<
opName,
(outs getLdStRegisterOperand<regClass>.ret:$vdst),
!con(
- !if(EnableSaddr,
- (ins SReg_32_XEXEC_HI:$saddr, flat_offset:$offset),
- !if(EnableVaddr,
- (ins VGPR_32:$vaddr, flat_offset:$offset),
- (ins flat_offset:$offset))),
+ !if(EnableSVE,
+ (ins VGPR_32:$vaddr, SReg_32_XEXEC_HI:$saddr, flat_offset:$offset),
+ !if(EnableSaddr,
+ (ins SReg_32_XEXEC_HI:$saddr, flat_offset:$offset),
+ !if(EnableVaddr,
+ (ins VGPR_32:$vaddr, flat_offset:$offset),
+ (ins flat_offset:$offset)))),
!if(HasTiedOutput, (ins CPol:$cpol, getLdStRegisterOperand<regClass>.ret:$vdst_in),
(ins CPol_0:$cpol))),
" $vdst, "#!if(EnableVaddr, "$vaddr, ", "off, ")#!if(EnableSaddr, "$saddr", "off")#"$offset$cpol"> {
@@ -291,7 +328,9 @@ class FLAT_Scratch_Load_Pseudo <string opName, RegisterClass regClass,
let has_saddr = 1;
let enabled_saddr = EnableSaddr;
let has_vaddr = EnableVaddr;
- let PseudoInstr = opName#!if(EnableSaddr, "_SADDR", !if(EnableVaddr, "", "_ST"));
+ let has_sve = EnableSVE;
+ let sve = EnableVaddr;
+ let PseudoInstr = opName#!if(EnableSVE, "_SVS", !if(EnableSaddr, "_SADDR", !if(EnableVaddr, "", "_ST")));
let maybeAtomic = 1;
let Constraints = !if(HasTiedOutput, "$vdst = $vdst_in", "");
@@ -299,15 +338,18 @@ class FLAT_Scratch_Load_Pseudo <string opName, RegisterClass regClass,
}
class FLAT_Scratch_Store_Pseudo <string opName, RegisterClass vdataClass, bit EnableSaddr = 0,
- bit EnableVaddr = !not(EnableSaddr),
+ bit EnableSVE = 0,
+ bit EnableVaddr = !or(EnableSVE, !not(EnableSaddr)),
RegisterOperand vdata_op = getLdStRegisterOperand<vdataClass>.ret> : FLAT_Pseudo<
opName,
(outs),
- !if(EnableSaddr,
- (ins vdata_op:$vdata, SReg_32_XEXEC_HI:$saddr, flat_offset:$offset, CPol_0:$cpol),
- !if(EnableVaddr,
- (ins vdata_op:$vdata, VGPR_32:$vaddr, flat_offset:$offset, CPol_0:$cpol),
- (ins vdata_op:$vdata, flat_offset:$offset, CPol_0:$cpol))),
+ !if(EnableSVE,
+ (ins vdata_op:$vdata, VGPR_32:$vaddr, SReg_32_XEXEC_HI:$saddr, flat_offset:$offset, CPol_0:$cpol),
+ !if(EnableSaddr,
+ (ins vdata_op:$vdata, SReg_32_XEXEC_HI:$saddr, flat_offset:$offset, CPol_0:$cpol),
+ !if(EnableVaddr,
+ (ins vdata_op:$vdata, VGPR_32:$vaddr, flat_offset:$offset, CPol_0:$cpol),
+ (ins vdata_op:$vdata, flat_offset:$offset, CPol_0:$cpol)))),
" "#!if(EnableVaddr, "$vaddr", "off")#", $vdata, "#!if(EnableSaddr, "$saddr", "off")#"$offset$cpol"> {
let mayLoad = 0;
let mayStore = 1;
@@ -315,7 +357,9 @@ class FLAT_Scratch_Store_Pseudo <string opName, RegisterClass vdataClass, bit En
let has_saddr = 1;
let enabled_saddr = EnableSaddr;
let has_vaddr = EnableVaddr;
- let PseudoInstr = opName#!if(EnableSaddr, "_SADDR", !if(EnableVaddr, "", "_ST"));
+ let has_sve = EnableSVE;
+ let sve = EnableVaddr;
+ let PseudoInstr = opName#!if(EnableSVE, "_SVS", !if(EnableSaddr, "_SADDR", !if(EnableVaddr, "", "_ST")));
let maybeAtomic = 1;
}
@@ -326,8 +370,12 @@ multiclass FLAT_Scratch_Load_Pseudo<string opName, RegisterClass regClass, bit H
def _SADDR : FLAT_Scratch_Load_Pseudo<opName, regClass, HasTiedOutput, 1>,
FlatScratchInst<opName, "SS">;
+ let SubtargetPredicate = HasFlatScratchSVSMode in
+ def _SVS : FLAT_Scratch_Load_Pseudo<opName, regClass, HasTiedOutput, 1, 1>,
+ FlatScratchInst<opName, "SVS">;
+
let SubtargetPredicate = HasFlatScratchSTMode in
- def _ST : FLAT_Scratch_Load_Pseudo<opName, regClass, HasTiedOutput, 0, 0>,
+ def _ST : FLAT_Scratch_Load_Pseudo<opName, regClass, HasTiedOutput, 0, 0, 0>,
FlatScratchInst<opName, "ST">;
}
}
@@ -339,12 +387,59 @@ multiclass FLAT_Scratch_Store_Pseudo<string opName, RegisterClass regClass> {
def _SADDR : FLAT_Scratch_Store_Pseudo<opName, regClass, 1>,
FlatScratchInst<opName, "SS">;
+ let SubtargetPredicate = HasFlatScratchSVSMode in
+ def _SVS : FLAT_Scratch_Store_Pseudo<opName, regClass, 1, 1>,
+ FlatScratchInst<opName, "SVS">;
+
let SubtargetPredicate = HasFlatScratchSTMode in
- def _ST : FLAT_Scratch_Store_Pseudo<opName, regClass, 0, 0>,
+ def _ST : FLAT_Scratch_Store_Pseudo<opName, regClass, 0, 0, 0>,
FlatScratchInst<opName, "ST">;
}
}
+class FLAT_Scratch_Load_LDS_Pseudo <string opName, bit EnableSaddr = 0,
+ bit EnableSVE = 0,
+ bit EnableVaddr = !or(EnableSVE, !not(EnableSaddr))> : FLAT_Pseudo<
+ opName,
+ (outs ),
+ !if(EnableSVE,
+ (ins VGPR_32:$vaddr, SReg_32_XEXEC_HI:$saddr, flat_offset:$offset, CPol:$cpol),
+ !if(EnableSaddr,
+ (ins SReg_32_XEXEC_HI:$saddr, flat_offset:$offset, CPol:$cpol),
+ !if(EnableVaddr,
+ (ins VGPR_32:$vaddr, flat_offset:$offset, CPol:$cpol),
+ (ins flat_offset:$offset, CPol:$cpol)))),
+ " "#!if(EnableVaddr, "$vaddr, ", "off, ")#!if(EnableSaddr, "$saddr", "off")#"$offset$cpol"> {
+
+ let LGKM_CNT = 1;
+ let is_flat_scratch = 1;
+ let lds = 1;
+ let has_data = 0;
+ let has_vdst = 0;
+ let mayLoad = 1;
+ let mayStore = 1;
+ let has_saddr = 1;
+ let enabled_saddr = EnableSaddr;
+ let has_vaddr = EnableVaddr;
+ let has_sve = EnableSVE;
+ let sve = EnableVaddr;
+ let VALU = 1;
+ let PseudoInstr = opName#!if(EnableSVE, "_SVS", !if(EnableSaddr, "_SADDR", !if(EnableVaddr, "", "_ST")));
+ let Uses = [M0, EXEC];
+ let SchedRW = [WriteVMEM, WriteLDS];
+}
+
+multiclass FLAT_Scratch_Load_LDS_Pseudo<string opName> {
+ def "" : FLAT_Scratch_Load_LDS_Pseudo<opName>,
+ FlatScratchInst<opName, "SV">;
+ def _SADDR : FLAT_Scratch_Load_LDS_Pseudo<opName, 1>,
+ FlatScratchInst<opName, "SS">;
+ def _SVS : FLAT_Scratch_Load_LDS_Pseudo<opName, 1, 1>,
+ FlatScratchInst<opName, "SVS">;
+ def _ST : FLAT_Scratch_Load_LDS_Pseudo<opName, 0, 0, 0>,
+ FlatScratchInst<opName, "ST">;
+}
+
class FLAT_AtomicNoRet_Pseudo<string opName, dag outs, dag ins,
string asm, list<dag> pattern = []> :
FLAT_Pseudo<opName, outs, ins, asm, pattern> {
@@ -375,7 +470,6 @@ multiclass FLAT_Atomic_Pseudo<
string opName,
RegisterClass vdst_rc,
ValueType vt,
- SDPatternOperator atomic = null_frag,
ValueType data_vt = vt,
RegisterClass data_rc = vdst_rc,
bit isFP = isFloatType<data_vt>.ret,
@@ -394,11 +488,9 @@ multiclass FLAT_Atomic_Pseudo<
def _RTN : FLAT_AtomicRet_Pseudo <opName,
(outs getLdStRegisterOperand<vdst_rc>.ret:$vdst),
(ins VReg_64:$vaddr, data_op:$vdata, flat_offset:$offset, CPol_GLC1:$cpol),
- " $vdst, $vaddr, $vdata$offset$cpol",
- [(set vt:$vdst,
- (atomic (FlatOffset i64:$vaddr, i16:$offset), data_vt:$vdata))]>,
- GlobalSaddrTable<0, opName#"_rtn">,
- AtomicNoRet <opName, 1>{
+ " $vdst, $vaddr, $vdata$offset$cpol">,
+ GlobalSaddrTable<0, opName#"_rtn">,
+ AtomicNoRet <opName, 1> {
let FPAtomic = isFP;
let AddedComplexity = -1; // Prefer global atomics if available
}
@@ -441,7 +533,6 @@ multiclass FLAT_Global_Atomic_Pseudo_RTN<
string opName,
RegisterClass vdst_rc,
ValueType vt,
- SDPatternOperator atomic = null_frag,
ValueType data_vt = vt,
RegisterClass data_rc = vdst_rc,
bit isFP = isFloatType<data_vt>.ret,
@@ -451,11 +542,9 @@ multiclass FLAT_Global_Atomic_Pseudo_RTN<
def _RTN : FLAT_AtomicRet_Pseudo <opName,
(outs vdst_op:$vdst),
(ins VReg_64:$vaddr, data_op:$vdata, flat_offset:$offset, CPol_GLC1:$cpol),
- " $vdst, $vaddr, $vdata, off$offset$cpol",
- [(set vt:$vdst,
- (atomic (GlobalOffset i64:$vaddr, i16:$offset), data_vt:$vdata))]>,
- GlobalSaddrTable<0, opName#"_rtn">,
- AtomicNoRet <opName, 1> {
+ " $vdst, $vaddr, $vdata, off$offset$cpol">,
+ GlobalSaddrTable<0, opName#"_rtn">,
+ AtomicNoRet <opName, 1> {
let has_saddr = 1;
let FPAtomic = isFP;
}
@@ -477,12 +566,11 @@ multiclass FLAT_Global_Atomic_Pseudo<
string opName,
RegisterClass vdst_rc,
ValueType vt,
- SDPatternOperator atomic_rtn = null_frag,
ValueType data_vt = vt,
RegisterClass data_rc = vdst_rc> {
let is_flat_global = 1, SubtargetPredicate = HasFlatGlobalInsts in {
defm "" : FLAT_Global_Atomic_Pseudo_NO_RTN<opName, vdst_rc, vt, data_vt, data_rc>;
- defm "" : FLAT_Global_Atomic_Pseudo_RTN<opName, vdst_rc, vt, atomic_rtn, data_vt, data_rc>;
+ defm "" : FLAT_Global_Atomic_Pseudo_RTN<opName, vdst_rc, vt, data_vt, data_rc>;
}
}
@@ -519,99 +607,88 @@ def FLAT_STORE_SHORT_D16_HI : FLAT_Store_Pseudo <"flat_store_short_d16_hi", VGPR
}
defm FLAT_ATOMIC_CMPSWAP : FLAT_Atomic_Pseudo <"flat_atomic_cmpswap",
- VGPR_32, i32, AMDGPUatomic_cmp_swap_flat_32,
- v2i32, VReg_64>;
+ VGPR_32, i32, v2i32, VReg_64>;
defm FLAT_ATOMIC_CMPSWAP_X2 : FLAT_Atomic_Pseudo <"flat_atomic_cmpswap_x2",
- VReg_64, i64, AMDGPUatomic_cmp_swap_flat_64,
- v2i64, VReg_128>;
+ VReg_64, i64, v2i64, VReg_128>;
defm FLAT_ATOMIC_SWAP : FLAT_Atomic_Pseudo <"flat_atomic_swap",
- VGPR_32, i32, atomic_swap_flat_32>;
+ VGPR_32, i32>;
defm FLAT_ATOMIC_SWAP_X2 : FLAT_Atomic_Pseudo <"flat_atomic_swap_x2",
- VReg_64, i64, atomic_swap_flat_64>;
+ VReg_64, i64>;
defm FLAT_ATOMIC_ADD : FLAT_Atomic_Pseudo <"flat_atomic_add",
- VGPR_32, i32, atomic_load_add_flat_32>;
+ VGPR_32, i32>;
defm FLAT_ATOMIC_SUB : FLAT_Atomic_Pseudo <"flat_atomic_sub",
- VGPR_32, i32, atomic_load_sub_flat_32>;
+ VGPR_32, i32>;
defm FLAT_ATOMIC_SMIN : FLAT_Atomic_Pseudo <"flat_atomic_smin",
- VGPR_32, i32, atomic_load_min_flat_32>;
+ VGPR_32, i32>;
defm FLAT_ATOMIC_UMIN : FLAT_Atomic_Pseudo <"flat_atomic_umin",
- VGPR_32, i32, atomic_load_umin_flat_32>;
+ VGPR_32, i32>;
defm FLAT_ATOMIC_SMAX : FLAT_Atomic_Pseudo <"flat_atomic_smax",
- VGPR_32, i32, atomic_load_max_flat_32>;
+ VGPR_32, i32>;
defm FLAT_ATOMIC_UMAX : FLAT_Atomic_Pseudo <"flat_atomic_umax",
- VGPR_32, i32, atomic_load_umax_flat_32>;
+ VGPR_32, i32>;
defm FLAT_ATOMIC_AND : FLAT_Atomic_Pseudo <"flat_atomic_and",
- VGPR_32, i32, atomic_load_and_flat_32>;
+ VGPR_32, i32>;
defm FLAT_ATOMIC_OR : FLAT_Atomic_Pseudo <"flat_atomic_or",
- VGPR_32, i32, atomic_load_or_flat_32>;
+ VGPR_32, i32>;
defm FLAT_ATOMIC_XOR : FLAT_Atomic_Pseudo <"flat_atomic_xor",
- VGPR_32, i32, atomic_load_xor_flat_32>;
+ VGPR_32, i32>;
defm FLAT_ATOMIC_INC : FLAT_Atomic_Pseudo <"flat_atomic_inc",
- VGPR_32, i32, atomic_inc_flat_32>;
+ VGPR_32, i32>;
defm FLAT_ATOMIC_DEC : FLAT_Atomic_Pseudo <"flat_atomic_dec",
- VGPR_32, i32, atomic_dec_flat_32>;
+ VGPR_32, i32>;
defm FLAT_ATOMIC_ADD_X2 : FLAT_Atomic_Pseudo <"flat_atomic_add_x2",
- VReg_64, i64, atomic_load_add_flat_64>;
+ VReg_64, i64>;
defm FLAT_ATOMIC_SUB_X2 : FLAT_Atomic_Pseudo <"flat_atomic_sub_x2",
- VReg_64, i64, atomic_load_sub_flat_64>;
+ VReg_64, i64>;
defm FLAT_ATOMIC_SMIN_X2 : FLAT_Atomic_Pseudo <"flat_atomic_smin_x2",
- VReg_64, i64, atomic_load_min_flat_64>;
+ VReg_64, i64>;
defm FLAT_ATOMIC_UMIN_X2 : FLAT_Atomic_Pseudo <"flat_atomic_umin_x2",
- VReg_64, i64, atomic_load_umin_flat_64>;
+ VReg_64, i64>;
defm FLAT_ATOMIC_SMAX_X2 : FLAT_Atomic_Pseudo <"flat_atomic_smax_x2",
- VReg_64, i64, atomic_load_max_flat_64>;
+ VReg_64, i64>;
defm FLAT_ATOMIC_UMAX_X2 : FLAT_Atomic_Pseudo <"flat_atomic_umax_x2",
- VReg_64, i64, atomic_load_umax_flat_64>;
+ VReg_64, i64>;
defm FLAT_ATOMIC_AND_X2 : FLAT_Atomic_Pseudo <"flat_atomic_and_x2",
- VReg_64, i64, atomic_load_and_flat_64>;
+ VReg_64, i64>;
defm FLAT_ATOMIC_OR_X2 : FLAT_Atomic_Pseudo <"flat_atomic_or_x2",
- VReg_64, i64, atomic_load_or_flat_64>;
+ VReg_64, i64>;
defm FLAT_ATOMIC_XOR_X2 : FLAT_Atomic_Pseudo <"flat_atomic_xor_x2",
- VReg_64, i64, atomic_load_xor_flat_64>;
+ VReg_64, i64>;
defm FLAT_ATOMIC_INC_X2 : FLAT_Atomic_Pseudo <"flat_atomic_inc_x2",
- VReg_64, i64, atomic_inc_flat_64>;
+ VReg_64, i64>;
defm FLAT_ATOMIC_DEC_X2 : FLAT_Atomic_Pseudo <"flat_atomic_dec_x2",
- VReg_64, i64, atomic_dec_flat_64>;
+ VReg_64, i64>;
// GFX7-, GFX10-only flat instructions.
let SubtargetPredicate = isGFX7GFX10 in {
-defm FLAT_ATOMIC_FCMPSWAP : FLAT_Atomic_Pseudo <"flat_atomic_fcmpswap",
- VGPR_32, f32, null_frag, v2f32, VReg_64>;
-
defm FLAT_ATOMIC_FCMPSWAP_X2 : FLAT_Atomic_Pseudo <"flat_atomic_fcmpswap_x2",
- VReg_64, f64, null_frag, v2f64, VReg_128>;
-
-defm FLAT_ATOMIC_FMIN : FLAT_Atomic_Pseudo <"flat_atomic_fmin",
- VGPR_32, f32>;
-
-defm FLAT_ATOMIC_FMAX : FLAT_Atomic_Pseudo <"flat_atomic_fmax",
- VGPR_32, f32>;
+ VReg_64, f64, v2f64, VReg_128>;
defm FLAT_ATOMIC_FMIN_X2 : FLAT_Atomic_Pseudo <"flat_atomic_fmin_x2",
VReg_64, f64>;
@@ -622,14 +699,39 @@ defm FLAT_ATOMIC_FMAX_X2 : FLAT_Atomic_Pseudo <"flat_atomic_fmax_x2",
} // End SubtargetPredicate = isGFX7GFX10
let SubtargetPredicate = isGFX90APlus in {
- defm FLAT_ATOMIC_ADD_F64 : FLAT_Atomic_Pseudo<"flat_atomic_add_f64", VReg_64, f64, int_amdgcn_flat_atomic_fadd>;
- defm FLAT_ATOMIC_MIN_F64 : FLAT_Atomic_Pseudo<"flat_atomic_min_f64", VReg_64, f64, int_amdgcn_flat_atomic_fmin>;
- defm FLAT_ATOMIC_MAX_F64 : FLAT_Atomic_Pseudo<"flat_atomic_max_f64", VReg_64, f64, int_amdgcn_flat_atomic_fmax>;
- defm GLOBAL_ATOMIC_ADD_F64 : FLAT_Global_Atomic_Pseudo<"global_atomic_add_f64", VReg_64, f64, int_amdgcn_global_atomic_fadd>;
- defm GLOBAL_ATOMIC_MIN_F64 : FLAT_Global_Atomic_Pseudo<"global_atomic_min_f64", VReg_64, f64, int_amdgcn_global_atomic_fmin>;
- defm GLOBAL_ATOMIC_MAX_F64 : FLAT_Global_Atomic_Pseudo<"global_atomic_max_f64", VReg_64, f64, int_amdgcn_global_atomic_fmax>;
+ defm FLAT_ATOMIC_ADD_F64 : FLAT_Atomic_Pseudo<"flat_atomic_add_f64", VReg_64, f64>;
+ defm FLAT_ATOMIC_MIN_F64 : FLAT_Atomic_Pseudo<"flat_atomic_min_f64", VReg_64, f64>;
+ defm FLAT_ATOMIC_MAX_F64 : FLAT_Atomic_Pseudo<"flat_atomic_max_f64", VReg_64, f64>;
+ defm GLOBAL_ATOMIC_ADD_F64 : FLAT_Global_Atomic_Pseudo<"global_atomic_add_f64", VReg_64, f64>;
+ defm GLOBAL_ATOMIC_MIN_F64 : FLAT_Global_Atomic_Pseudo<"global_atomic_min_f64", VReg_64, f64>;
+ defm GLOBAL_ATOMIC_MAX_F64 : FLAT_Global_Atomic_Pseudo<"global_atomic_max_f64", VReg_64, f64>;
} // End SubtargetPredicate = isGFX90APlus
+let SubtargetPredicate = isGFX940Plus in {
+ defm FLAT_ATOMIC_PK_ADD_F16 : FLAT_Atomic_Pseudo<"flat_atomic_pk_add_f16", VGPR_32, v2f16>;
+ defm FLAT_ATOMIC_PK_ADD_BF16 : FLAT_Atomic_Pseudo<"flat_atomic_pk_add_bf16", VGPR_32, v2f16>;
+ defm GLOBAL_ATOMIC_PK_ADD_BF16 : FLAT_Global_Atomic_Pseudo<"global_atomic_pk_add_bf16", VGPR_32, v2f16>;
+} // End SubtargetPredicate = isGFX940Plus
+
+// GFX7-, GFX10-, GFX11-only flat instructions.
+let SubtargetPredicate = isGFX7GFX10GFX11 in {
+
+defm FLAT_ATOMIC_FCMPSWAP : FLAT_Atomic_Pseudo <"flat_atomic_fcmpswap",
+ VGPR_32, f32, v2f32, VReg_64>;
+
+defm FLAT_ATOMIC_FMIN : FLAT_Atomic_Pseudo <"flat_atomic_fmin",
+ VGPR_32, f32>;
+
+defm FLAT_ATOMIC_FMAX : FLAT_Atomic_Pseudo <"flat_atomic_fmax",
+ VGPR_32, f32>;
+
+} // End SubtargetPredicate = isGFX7GFX10GFX11
+
+// GFX940-, GFX11-only flat instructions.
+let SubtargetPredicate = isGFX940GFX11Plus in {
+ defm FLAT_ATOMIC_ADD_F32 : FLAT_Atomic_Pseudo<"flat_atomic_add_f32", VGPR_32, f32>;
+} // End SubtargetPredicate = isGFX940GFX11Plus
+
defm GLOBAL_LOAD_UBYTE : FLAT_Global_Load_Pseudo <"global_load_ubyte", VGPR_32>;
defm GLOBAL_LOAD_SBYTE : FLAT_Global_Load_Pseudo <"global_load_sbyte", VGPR_32>;
defm GLOBAL_LOAD_USHORT : FLAT_Global_Load_Pseudo <"global_load_ushort", VGPR_32>;
@@ -662,88 +764,93 @@ defm GLOBAL_STORE_SHORT_D16_HI : FLAT_Global_Store_Pseudo <"global_store_short_d
let is_flat_global = 1 in {
defm GLOBAL_ATOMIC_CMPSWAP : FLAT_Global_Atomic_Pseudo <"global_atomic_cmpswap",
- VGPR_32, i32, AMDGPUatomic_cmp_swap_global_32,
- v2i32, VReg_64>;
+ VGPR_32, i32, v2i32, VReg_64>;
defm GLOBAL_ATOMIC_CMPSWAP_X2 : FLAT_Global_Atomic_Pseudo <"global_atomic_cmpswap_x2",
- VReg_64, i64, AMDGPUatomic_cmp_swap_global_64,
- v2i64, VReg_128>;
+ VReg_64, i64, v2i64, VReg_128>;
defm GLOBAL_ATOMIC_SWAP : FLAT_Global_Atomic_Pseudo <"global_atomic_swap",
- VGPR_32, i32, atomic_swap_global_32>;
+ VGPR_32, i32>;
defm GLOBAL_ATOMIC_SWAP_X2 : FLAT_Global_Atomic_Pseudo <"global_atomic_swap_x2",
- VReg_64, i64, atomic_swap_global_64>;
+ VReg_64, i64>;
defm GLOBAL_ATOMIC_ADD : FLAT_Global_Atomic_Pseudo <"global_atomic_add",
- VGPR_32, i32, atomic_load_add_global_32>;
+ VGPR_32, i32>;
defm GLOBAL_ATOMIC_SUB : FLAT_Global_Atomic_Pseudo <"global_atomic_sub",
- VGPR_32, i32, atomic_load_sub_global_32>;
+ VGPR_32, i32>;
defm GLOBAL_ATOMIC_SMIN : FLAT_Global_Atomic_Pseudo <"global_atomic_smin",
- VGPR_32, i32, atomic_load_min_global_32>;
+ VGPR_32, i32>;
defm GLOBAL_ATOMIC_UMIN : FLAT_Global_Atomic_Pseudo <"global_atomic_umin",
- VGPR_32, i32, atomic_load_umin_global_32>;
+ VGPR_32, i32>;
defm GLOBAL_ATOMIC_SMAX : FLAT_Global_Atomic_Pseudo <"global_atomic_smax",
- VGPR_32, i32, atomic_load_max_global_32>;
+ VGPR_32, i32>;
defm GLOBAL_ATOMIC_UMAX : FLAT_Global_Atomic_Pseudo <"global_atomic_umax",
- VGPR_32, i32, atomic_load_umax_global_32>;
+ VGPR_32, i32>;
defm GLOBAL_ATOMIC_AND : FLAT_Global_Atomic_Pseudo <"global_atomic_and",
- VGPR_32, i32, atomic_load_and_global_32>;
+ VGPR_32, i32>;
defm GLOBAL_ATOMIC_OR : FLAT_Global_Atomic_Pseudo <"global_atomic_or",
- VGPR_32, i32, atomic_load_or_global_32>;
+ VGPR_32, i32>;
defm GLOBAL_ATOMIC_XOR : FLAT_Global_Atomic_Pseudo <"global_atomic_xor",
- VGPR_32, i32, atomic_load_xor_global_32>;
+ VGPR_32, i32>;
defm GLOBAL_ATOMIC_INC : FLAT_Global_Atomic_Pseudo <"global_atomic_inc",
- VGPR_32, i32, atomic_inc_global_32>;
+ VGPR_32, i32>;
defm GLOBAL_ATOMIC_DEC : FLAT_Global_Atomic_Pseudo <"global_atomic_dec",
- VGPR_32, i32, atomic_dec_global_32>;
+ VGPR_32, i32>;
defm GLOBAL_ATOMIC_ADD_X2 : FLAT_Global_Atomic_Pseudo <"global_atomic_add_x2",
- VReg_64, i64, atomic_load_add_global_64>;
+ VReg_64, i64>;
defm GLOBAL_ATOMIC_SUB_X2 : FLAT_Global_Atomic_Pseudo <"global_atomic_sub_x2",
- VReg_64, i64, atomic_load_sub_global_64>;
+ VReg_64, i64>;
defm GLOBAL_ATOMIC_SMIN_X2 : FLAT_Global_Atomic_Pseudo <"global_atomic_smin_x2",
- VReg_64, i64, atomic_load_min_global_64>;
+ VReg_64, i64>;
defm GLOBAL_ATOMIC_UMIN_X2 : FLAT_Global_Atomic_Pseudo <"global_atomic_umin_x2",
- VReg_64, i64, atomic_load_umin_global_64>;
+ VReg_64, i64>;
defm GLOBAL_ATOMIC_SMAX_X2 : FLAT_Global_Atomic_Pseudo <"global_atomic_smax_x2",
- VReg_64, i64, atomic_load_max_global_64>;
+ VReg_64, i64>;
defm GLOBAL_ATOMIC_UMAX_X2 : FLAT_Global_Atomic_Pseudo <"global_atomic_umax_x2",
- VReg_64, i64, atomic_load_umax_global_64>;
+ VReg_64, i64>;
defm GLOBAL_ATOMIC_AND_X2 : FLAT_Global_Atomic_Pseudo <"global_atomic_and_x2",
- VReg_64, i64, atomic_load_and_global_64>;
+ VReg_64, i64>;
defm GLOBAL_ATOMIC_OR_X2 : FLAT_Global_Atomic_Pseudo <"global_atomic_or_x2",
- VReg_64, i64, atomic_load_or_global_64>;
+ VReg_64, i64>;
defm GLOBAL_ATOMIC_XOR_X2 : FLAT_Global_Atomic_Pseudo <"global_atomic_xor_x2",
- VReg_64, i64, atomic_load_xor_global_64>;
+ VReg_64, i64>;
defm GLOBAL_ATOMIC_INC_X2 : FLAT_Global_Atomic_Pseudo <"global_atomic_inc_x2",
- VReg_64, i64, atomic_inc_global_64>;
+ VReg_64, i64>;
defm GLOBAL_ATOMIC_DEC_X2 : FLAT_Global_Atomic_Pseudo <"global_atomic_dec_x2",
- VReg_64, i64, atomic_dec_global_64>;
+ VReg_64, i64>;
let SubtargetPredicate = HasGFX10_BEncoding in
defm GLOBAL_ATOMIC_CSUB : FLAT_Global_Atomic_Pseudo_RTN <"global_atomic_csub",
- VGPR_32, i32, int_amdgcn_global_atomic_csub>;
+ VGPR_32, i32>;
+
+defm GLOBAL_LOAD_LDS_UBYTE : FLAT_Global_Load_LDS_Pseudo <"global_load_lds_ubyte">;
+defm GLOBAL_LOAD_LDS_SBYTE : FLAT_Global_Load_LDS_Pseudo <"global_load_lds_sbyte">;
+defm GLOBAL_LOAD_LDS_USHORT : FLAT_Global_Load_LDS_Pseudo <"global_load_lds_ushort">;
+defm GLOBAL_LOAD_LDS_SSHORT : FLAT_Global_Load_LDS_Pseudo <"global_load_lds_sshort">;
+defm GLOBAL_LOAD_LDS_DWORD : FLAT_Global_Load_LDS_Pseudo <"global_load_lds_dword">;
+
} // End is_flat_global = 1
@@ -775,41 +882,46 @@ defm SCRATCH_STORE_DWORDX4 : FLAT_Scratch_Store_Pseudo <"scratch_store_dwordx4",
defm SCRATCH_STORE_BYTE_D16_HI : FLAT_Scratch_Store_Pseudo <"scratch_store_byte_d16_hi", VGPR_32>;
defm SCRATCH_STORE_SHORT_D16_HI : FLAT_Scratch_Store_Pseudo <"scratch_store_short_d16_hi", VGPR_32>;
+defm SCRATCH_LOAD_LDS_UBYTE : FLAT_Scratch_Load_LDS_Pseudo <"scratch_load_lds_ubyte">;
+defm SCRATCH_LOAD_LDS_SBYTE : FLAT_Scratch_Load_LDS_Pseudo <"scratch_load_lds_sbyte">;
+defm SCRATCH_LOAD_LDS_USHORT : FLAT_Scratch_Load_LDS_Pseudo <"scratch_load_lds_ushort">;
+defm SCRATCH_LOAD_LDS_SSHORT : FLAT_Scratch_Load_LDS_Pseudo <"scratch_load_lds_sshort">;
+defm SCRATCH_LOAD_LDS_DWORD : FLAT_Scratch_Load_LDS_Pseudo <"scratch_load_lds_dword">;
+
} // End SubtargetPredicate = HasFlatScratchInsts
let SubtargetPredicate = isGFX10Plus, is_flat_global = 1 in {
defm GLOBAL_ATOMIC_FCMPSWAP :
- FLAT_Global_Atomic_Pseudo<"global_atomic_fcmpswap", VGPR_32, f32, null_frag, v2f32, VReg_64>;
+ FLAT_Global_Atomic_Pseudo<"global_atomic_fcmpswap", VGPR_32, f32, v2f32, VReg_64>;
defm GLOBAL_ATOMIC_FMIN :
- FLAT_Global_Atomic_Pseudo<"global_atomic_fmin", VGPR_32, f32, int_amdgcn_global_atomic_fmin>;
+ FLAT_Global_Atomic_Pseudo<"global_atomic_fmin", VGPR_32, f32>;
defm GLOBAL_ATOMIC_FMAX :
- FLAT_Global_Atomic_Pseudo<"global_atomic_fmax", VGPR_32, f32, int_amdgcn_global_atomic_fmax>;
+ FLAT_Global_Atomic_Pseudo<"global_atomic_fmax", VGPR_32, f32>;
defm GLOBAL_ATOMIC_FCMPSWAP_X2 :
- FLAT_Global_Atomic_Pseudo<"global_atomic_fcmpswap_x2", VReg_64, f64, null_frag, v2f64, VReg_128>;
+ FLAT_Global_Atomic_Pseudo<"global_atomic_fcmpswap_x2", VReg_64, f64, v2f64, VReg_128>;
defm GLOBAL_ATOMIC_FMIN_X2 :
- FLAT_Global_Atomic_Pseudo<"global_atomic_fmin_x2", VReg_64, f64, int_amdgcn_global_atomic_fmin>;
+ FLAT_Global_Atomic_Pseudo<"global_atomic_fmin_x2", VReg_64, f64>;
defm GLOBAL_ATOMIC_FMAX_X2 :
- FLAT_Global_Atomic_Pseudo<"global_atomic_fmax_x2", VReg_64, f64, int_amdgcn_global_atomic_fmax>;
+ FLAT_Global_Atomic_Pseudo<"global_atomic_fmax_x2", VReg_64, f64>;
} // End SubtargetPredicate = isGFX10Plus, is_flat_global = 1
let is_flat_global = 1 in {
-let OtherPredicates = [HasAtomicFaddInsts] in {
+let OtherPredicates = [HasAtomicFaddNoRtnInsts] in
defm GLOBAL_ATOMIC_ADD_F32 : FLAT_Global_Atomic_Pseudo_NO_RTN <
"global_atomic_add_f32", VGPR_32, f32
>;
+let OtherPredicates = [HasAtomicPkFaddNoRtnInsts] in
defm GLOBAL_ATOMIC_PK_ADD_F16 : FLAT_Global_Atomic_Pseudo_NO_RTN <
"global_atomic_pk_add_f16", VGPR_32, v2f16
>;
-} // End OtherPredicates = [HasAtomicFaddInsts]
-
-let OtherPredicates = [isGFX90APlus] in {
+let OtherPredicates = [HasAtomicFaddRtnInsts] in
defm GLOBAL_ATOMIC_ADD_F32 : FLAT_Global_Atomic_Pseudo_RTN <
- "global_atomic_add_f32", VGPR_32, f32, int_amdgcn_global_atomic_fadd
+ "global_atomic_add_f32", VGPR_32, f32
>;
+let OtherPredicates = [isGFX90APlus] in
defm GLOBAL_ATOMIC_PK_ADD_F16 : FLAT_Global_Atomic_Pseudo_RTN <
- "global_atomic_pk_add_f16", VGPR_32, v2f16, int_amdgcn_global_atomic_fadd
+ "global_atomic_pk_add_f16", VGPR_32, v2f16
>;
-} // End OtherPredicates = [isGFX90APlus]
} // End is_flat_global = 1
//===----------------------------------------------------------------------===//
@@ -896,24 +1008,47 @@ class FlatStoreSignedAtomicPat <FLAT_Pseudo inst, SDPatternOperator node,
(inst $vaddr, getVregSrcForVT<data_vt>.ret:$data, $offset)
>;
-class FlatAtomicPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt,
- ValueType data_vt = vt> : GCNPat <
- (vt (node (FlatOffset i64:$vaddr, i16:$offset), data_vt:$data)),
- (inst $vaddr, $data, $offset)
->;
-
class FlatAtomicPatNoRtn <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
(node (FlatOffset i64:$vaddr, i16:$offset), vt:$data),
(inst VReg_64:$vaddr, getVregSrcForVT<vt>.ret:$data, $offset)
>;
+multiclass FlatAtomicPat <string inst, string node, ValueType vt,
+ ValueType data_vt = vt> {
+ defvar rtnNode = !cast<PatFrags>(node#"_ret_"#vt.Size);
+ defvar noRtnNode = !cast<PatFrags>(node#"_noret_"#vt.Size);
+
+ def : GCNPat <(vt (rtnNode (FlatOffset i64:$vaddr, i16:$offset), data_vt:$data)),
+ (!cast<FLAT_Pseudo>(inst#"_RTN") VReg_64:$vaddr, getVregSrcForVT<data_vt>.ret:$data, $offset)>;
+
+ def : GCNPat <(vt (noRtnNode (FlatOffset i64:$vaddr, i16:$offset), data_vt:$data)),
+ (!cast<FLAT_Pseudo>(inst) VReg_64:$vaddr, getVregSrcForVT<data_vt>.ret:$data, $offset)>;
+}
+
+multiclass FlatSignedAtomicPat <string inst, string node, ValueType vt,
+ ValueType data_vt = vt, bit isIntr = 0> {
+ defvar rtnNode = !cast<PatFrags>(node # "_ret" # !if(isIntr, "", "_" # vt.Size));
+ defvar noRtnNode = !cast<PatFrags>(node # "_noret" # !if(isIntr, "", "_" # vt.Size));
+
+ def : GCNPat <(vt (rtnNode (GlobalOffset i64:$vaddr, i16:$offset), data_vt:$data)),
+ (!cast<FLAT_Pseudo>(inst#"_RTN") VReg_64:$vaddr, getVregSrcForVT<data_vt>.ret:$data, $offset)>;
+
+ def : GCNPat <(vt (noRtnNode (GlobalOffset i64:$vaddr, i16:$offset), data_vt:$data)),
+ (!cast<FLAT_Pseudo>(inst) VReg_64:$vaddr, getVregSrcForVT<data_vt>.ret:$data, $offset)>;
+}
+
+multiclass FlatSignedAtomicIntrPat <string inst, string node, ValueType vt,
+ ValueType data_vt = vt> {
+ defm : FlatSignedAtomicPat<inst, node, vt, data_vt, /* isIntr */ 1>;
+}
+
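// Illustrative sketch only (not a definition added by this patch): with the
// naming scheme above, an instantiation such as
//   defm : FlatAtomicPat <"FLAT_ATOMIC_ADD", "atomic_load_add_flat", i32>;
// resolves rtnNode/noRtnNode to the PatFrags atomic_load_add_flat_ret_32 and
// atomic_load_add_flat_noret_32 and emits two GCNPats, selecting
// FLAT_ATOMIC_ADD_RTN and FLAT_ATOMIC_ADD respectively. The *IntrPat variant
// drops the "_"#vt.Size suffix, so intrinsic PatFrags (e.g.
// int_amdgcn_flat_atomic_fadd_ret / _noret) are looked up by name directly.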
class FlatSignedAtomicPatNoRtn <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
(node (GlobalOffset i64:$vaddr, i16:$offset), vt:$data),
(inst VReg_64:$vaddr, getVregSrcForVT<vt>.ret:$data, $offset)
>;
-class FlatSignedAtomicPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt,
- ValueType data_vt = vt> : GCNPat <
+class FlatSignedAtomicPatRtn <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt,
+ ValueType data_vt = vt> : GCNPat <
(vt (node (GlobalOffset i64:$vaddr, i16:$offset), data_vt:$data)),
(inst VReg_64:$vaddr, getVregSrcForVT<data_vt>.ret:$data, $offset)
>;
@@ -949,8 +1084,28 @@ class ScratchStoreSaddrPat <FLAT_Pseudo inst, SDPatternOperator node,
(inst getVregSrcForVT<vt>.ret:$data, $saddr, $offset)
>;
+class ScratchLoadSVaddrPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
+ (vt (node (ScratchSVAddr (i32 VGPR_32:$vaddr), (i32 SGPR_32:$saddr), i16:$offset))),
+ (inst $vaddr, $saddr, $offset, 0)
+>;
+
+class ScratchStoreSVaddrPat <FLAT_Pseudo inst, SDPatternOperator node,
+ ValueType vt> : GCNPat <
+ (node vt:$data, (ScratchSVAddr (i32 VGPR_32:$vaddr), (i32 SGPR_32:$saddr), i16:$offset)),
+ (inst getVregSrcForVT<vt>.ret:$data, $vaddr, $saddr, $offset)
+>;
+
+class ScratchLoadSVaddrPat_D16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
+ (vt (node (ScratchSVAddr (i32 VGPR_32:$vaddr), (i32 SGPR_32:$saddr), i16:$offset), vt:$in)),
+ (inst $vaddr, $saddr, $offset, 0, $in)
+>;
+
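// Illustrative sketch: these SVS classes match the ScratchSVAddr complex
// pattern (VGPR index plus SGPR base) onto the "_SVS" flat-scratch reals. A
// hypothetical standalone instantiation would look like
//   def : ScratchLoadSVaddrPat <SCRATCH_LOAD_DWORD_SVS, load_private, i32>;
// in practice they are emitted from the ScratchFLAT*Pats multiclasses below,
// gated on HasFlatScratchSVSMode and placed above the SADDR patterns at
// AddedComplexity = 27.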
let OtherPredicates = [HasFlatAddressSpace] in {
+def : FlatLoadPat <FLAT_LOAD_UBYTE, atomic_load_8_flat, i32>;
+def : FlatLoadPat <FLAT_LOAD_UBYTE, atomic_load_8_flat, i16>;
+def : FlatLoadPat <FLAT_LOAD_USHORT, atomic_load_16_flat, i32>;
+def : FlatLoadPat <FLAT_LOAD_USHORT, atomic_load_16_flat, i16>;
def : FlatLoadPat <FLAT_LOAD_UBYTE, extloadi8_flat, i32>;
def : FlatLoadPat <FLAT_LOAD_UBYTE, zextloadi8_flat, i32>;
def : FlatLoadPat <FLAT_LOAD_SBYTE, sextloadi8_flat, i32>;
@@ -986,44 +1141,52 @@ def : FlatLoadPat <FLAT_LOAD_DWORDX4, load_flat, vt>;
def : FlatStorePat <FLAT_STORE_DWORDX4, store_flat, vt>;
}
-def : FlatStoreAtomicPat <FLAT_STORE_DWORD, atomic_store_flat_32, i32>;
-def : FlatStoreAtomicPat <FLAT_STORE_DWORDX2, atomic_store_flat_64, i64>;
+def : FlatStoreAtomicPat <FLAT_STORE_DWORD, atomic_store_32_flat, i32>;
+def : FlatStoreAtomicPat <FLAT_STORE_DWORDX2, atomic_store_64_flat, i64>;
+def : FlatStoreAtomicPat <FLAT_STORE_BYTE, atomic_store_8_flat, i32>;
+def : FlatStoreAtomicPat <FLAT_STORE_BYTE, atomic_store_8_flat, i16>;
+def : FlatStoreAtomicPat <FLAT_STORE_SHORT, atomic_store_16_flat, i32>;
+def : FlatStoreAtomicPat <FLAT_STORE_SHORT, atomic_store_16_flat, i16>;
-def : FlatAtomicPat <FLAT_ATOMIC_ADD_RTN, atomic_load_add_global_32, i32>;
-def : FlatAtomicPat <FLAT_ATOMIC_SUB_RTN, atomic_load_sub_global_32, i32>;
-def : FlatAtomicPat <FLAT_ATOMIC_INC_RTN, atomic_inc_global_32, i32>;
-def : FlatAtomicPat <FLAT_ATOMIC_DEC_RTN, atomic_dec_global_32, i32>;
-def : FlatAtomicPat <FLAT_ATOMIC_AND_RTN, atomic_load_and_global_32, i32>;
-def : FlatAtomicPat <FLAT_ATOMIC_SMAX_RTN, atomic_load_max_global_32, i32>;
-def : FlatAtomicPat <FLAT_ATOMIC_UMAX_RTN, atomic_load_umax_global_32, i32>;
-def : FlatAtomicPat <FLAT_ATOMIC_SMIN_RTN, atomic_load_min_global_32, i32>;
-def : FlatAtomicPat <FLAT_ATOMIC_UMIN_RTN, atomic_load_umin_global_32, i32>;
-def : FlatAtomicPat <FLAT_ATOMIC_OR_RTN, atomic_load_or_global_32, i32>;
-def : FlatAtomicPat <FLAT_ATOMIC_SWAP_RTN, atomic_swap_global_32, i32>;
-def : FlatAtomicPat <FLAT_ATOMIC_CMPSWAP_RTN, AMDGPUatomic_cmp_swap_global_32, i32, v2i32>;
-def : FlatAtomicPat <FLAT_ATOMIC_XOR_RTN, atomic_load_xor_global_32, i32>;
+foreach as = [ "flat", "global" ] in {
+defm : FlatAtomicPat <"FLAT_ATOMIC_ADD", "atomic_load_add_"#as, i32>;
+defm : FlatAtomicPat <"FLAT_ATOMIC_SUB", "atomic_load_sub_"#as, i32>;
+defm : FlatAtomicPat <"FLAT_ATOMIC_INC", "atomic_inc_"#as, i32>;
+defm : FlatAtomicPat <"FLAT_ATOMIC_DEC", "atomic_dec_"#as, i32>;
+defm : FlatAtomicPat <"FLAT_ATOMIC_AND", "atomic_load_and_"#as, i32>;
+defm : FlatAtomicPat <"FLAT_ATOMIC_SMAX", "atomic_load_max_"#as, i32>;
+defm : FlatAtomicPat <"FLAT_ATOMIC_UMAX", "atomic_load_umax_"#as, i32>;
+defm : FlatAtomicPat <"FLAT_ATOMIC_SMIN", "atomic_load_min_"#as, i32>;
+defm : FlatAtomicPat <"FLAT_ATOMIC_UMIN", "atomic_load_umin_"#as, i32>;
+defm : FlatAtomicPat <"FLAT_ATOMIC_OR", "atomic_load_or_"#as, i32>;
+defm : FlatAtomicPat <"FLAT_ATOMIC_SWAP", "atomic_swap_"#as, i32>;
+defm : FlatAtomicPat <"FLAT_ATOMIC_CMPSWAP", "AMDGPUatomic_cmp_swap_"#as, i32, v2i32>;
+defm : FlatAtomicPat <"FLAT_ATOMIC_XOR", "atomic_load_xor_"#as, i32>;
-def : FlatAtomicPat <FLAT_ATOMIC_ADD_X2_RTN, atomic_load_add_global_64, i64>;
-def : FlatAtomicPat <FLAT_ATOMIC_SUB_X2_RTN, atomic_load_sub_global_64, i64>;
-def : FlatAtomicPat <FLAT_ATOMIC_INC_X2_RTN, atomic_inc_global_64, i64>;
-def : FlatAtomicPat <FLAT_ATOMIC_DEC_X2_RTN, atomic_dec_global_64, i64>;
-def : FlatAtomicPat <FLAT_ATOMIC_AND_X2_RTN, atomic_load_and_global_64, i64>;
-def : FlatAtomicPat <FLAT_ATOMIC_SMAX_X2_RTN, atomic_load_max_global_64, i64>;
-def : FlatAtomicPat <FLAT_ATOMIC_UMAX_X2_RTN, atomic_load_umax_global_64, i64>;
-def : FlatAtomicPat <FLAT_ATOMIC_SMIN_X2_RTN, atomic_load_min_global_64, i64>;
-def : FlatAtomicPat <FLAT_ATOMIC_UMIN_X2_RTN, atomic_load_umin_global_64, i64>;
-def : FlatAtomicPat <FLAT_ATOMIC_OR_X2_RTN, atomic_load_or_global_64, i64>;
-def : FlatAtomicPat <FLAT_ATOMIC_SWAP_X2_RTN, atomic_swap_global_64, i64>;
-def : FlatAtomicPat <FLAT_ATOMIC_CMPSWAP_X2_RTN, AMDGPUatomic_cmp_swap_global_64, i64, v2i64>;
-def : FlatAtomicPat <FLAT_ATOMIC_XOR_X2_RTN, atomic_load_xor_global_64, i64>;
+defm : FlatAtomicPat <"FLAT_ATOMIC_ADD_X2", "atomic_load_add_"#as, i64>;
+defm : FlatAtomicPat <"FLAT_ATOMIC_SUB_X2", "atomic_load_sub_"#as, i64>;
+defm : FlatAtomicPat <"FLAT_ATOMIC_INC_X2", "atomic_inc_"#as, i64>;
+defm : FlatAtomicPat <"FLAT_ATOMIC_DEC_X2", "atomic_dec_"#as, i64>;
+defm : FlatAtomicPat <"FLAT_ATOMIC_AND_X2", "atomic_load_and_"#as, i64>;
+defm : FlatAtomicPat <"FLAT_ATOMIC_SMAX_X2", "atomic_load_max_"#as, i64>;
+defm : FlatAtomicPat <"FLAT_ATOMIC_UMAX_X2", "atomic_load_umax_"#as, i64>;
+defm : FlatAtomicPat <"FLAT_ATOMIC_SMIN_X2", "atomic_load_min_"#as, i64>;
+defm : FlatAtomicPat <"FLAT_ATOMIC_UMIN_X2", "atomic_load_umin_"#as, i64>;
+defm : FlatAtomicPat <"FLAT_ATOMIC_OR_X2", "atomic_load_or_"#as, i64>;
+defm : FlatAtomicPat <"FLAT_ATOMIC_SWAP_X2", "atomic_swap_"#as, i64>;
+defm : FlatAtomicPat <"FLAT_ATOMIC_CMPSWAP_X2", "AMDGPUatomic_cmp_swap_"#as, i64, v2i64>;
+defm : FlatAtomicPat <"FLAT_ATOMIC_XOR_X2", "atomic_load_xor_"#as, i64>;
+} // end foreach as
def : FlatStorePat <FLAT_STORE_BYTE, truncstorei8_flat, i16>;
def : FlatStorePat <FLAT_STORE_SHORT, store_flat, i16>;
-let OtherPredicates = [D16PreservesUnusedBits] in {
+let OtherPredicates = [HasD16LoadStore] in {
def : FlatStorePat <FLAT_STORE_SHORT_D16_HI, truncstorei16_hi16_flat, i32>;
def : FlatStorePat <FLAT_STORE_BYTE_D16_HI, truncstorei8_hi16_flat, i32>;
+}
+let OtherPredicates = [D16PreservesUnusedBits] in {
def : FlatLoadPat_D16 <FLAT_LOAD_UBYTE_D16_HI, az_extloadi8_d16_hi_flat, v2i16>;
def : FlatLoadPat_D16 <FLAT_LOAD_UBYTE_D16_HI, az_extloadi8_d16_hi_flat, v2f16>;
def : FlatLoadPat_D16 <FLAT_LOAD_SBYTE_D16_HI, sextloadi8_d16_hi_flat, v2i16>;
@@ -1084,9 +1247,9 @@ multiclass GlobalFLATAtomicStorePats<FLAT_Pseudo inst, SDPatternOperator node, V
}
}
-multiclass GlobalFLATAtomicPats<string nortn_inst_name, SDPatternOperator node,
- ValueType vt, ValueType data_vt = vt> {
- def : FlatSignedAtomicPat <!cast<FLAT_Pseudo>(nortn_inst_name#"_RTN"), node, vt, data_vt> {
+multiclass GlobalFLATAtomicPatsRtn<string nortn_inst_name, SDPatternOperator node,
+ ValueType vt, ValueType data_vt = vt> {
+ def : FlatSignedAtomicPatRtn <!cast<FLAT_Pseudo>(nortn_inst_name#"_RTN"), node, vt, data_vt> {
let AddedComplexity = 10;
}
@@ -1095,6 +1258,26 @@ multiclass GlobalFLATAtomicPats<string nortn_inst_name, SDPatternOperator node,
}
}
+multiclass GlobalFLATAtomicPats<string inst, string node, ValueType vt,
+ ValueType data_vt = vt, bit isIntr = 0> {
+ defvar rtnNode = !cast<PatFrags>(node # "_ret" # !if(isIntr, "", "_" # vt.Size));
+ defvar noRtnNode = !cast<PatFrags>(node # "_noret" # !if(isIntr, "", "_" # vt.Size));
+
+ let AddedComplexity = 10 in {
+ defm : FlatSignedAtomicPat <inst, node, vt, data_vt, isIntr>;
+ }
+
+ let AddedComplexity = 11 in {
+ def : GlobalAtomicSaddrPat<!cast<FLAT_Pseudo>(inst#"_SADDR"), noRtnNode, vt, data_vt>;
+ def : GlobalAtomicSaddrPat<!cast<FLAT_Pseudo>(inst#"_SADDR_RTN"), rtnNode, vt, data_vt>;
+ }
+}
+
+multiclass GlobalFLATAtomicIntrPats<string inst, string node, ValueType vt,
+ ValueType data_vt = vt> {
+ defm : GlobalFLATAtomicPats<inst, node, vt, data_vt, /* isIntr */ 1>;
+}
+
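// Illustrative sketch: GlobalFLATAtomicPats layers the plain VADDR patterns
// (via FlatSignedAtomicPat, AddedComplexity = 10) under the SADDR patterns
// (AddedComplexity = 11), so the scalar-base form is preferred when the
// address splits into an SGPR base plus a VGPR offset. One name string, e.g.
//   defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_ADD", "atomic_load_add_global", i32>;
// thus covers GLOBAL_ATOMIC_ADD, _RTN, _SADDR and _SADDR_RTN at once.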
multiclass GlobalFLATNoRtnAtomicPats<FLAT_Pseudo inst, SDPatternOperator node,
ValueType vt> {
def : FlatSignedAtomicPatNoRtn <inst, node, vt> {
@@ -1114,6 +1297,11 @@ multiclass ScratchFLATLoadPats<FLAT_Pseudo inst, SDPatternOperator node, ValueTy
def : ScratchLoadSaddrPat<!cast<FLAT_Pseudo>(!cast<string>(inst)#"_SADDR"), node, vt> {
let AddedComplexity = 26;
}
+
+ def : ScratchLoadSVaddrPat<!cast<FLAT_Pseudo>(!cast<string>(inst)#"_SVS"), node, vt> {
+ let SubtargetPredicate = HasFlatScratchSVSMode;
+ let AddedComplexity = 27;
+ }
}
multiclass ScratchFLATStorePats<FLAT_Pseudo inst, SDPatternOperator node,
@@ -1125,6 +1313,11 @@ multiclass ScratchFLATStorePats<FLAT_Pseudo inst, SDPatternOperator node,
def : ScratchStoreSaddrPat<!cast<FLAT_Pseudo>(!cast<string>(inst)#"_SADDR"), node, vt> {
let AddedComplexity = 26;
}
+
+ def : ScratchStoreSVaddrPat<!cast<FLAT_Pseudo>(!cast<string>(inst)#"_SVS"), node, vt> {
+ let SubtargetPredicate = HasFlatScratchSVSMode;
+ let AddedComplexity = 27;
+ }
}
multiclass ScratchFLATLoadPats_D16<FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> {
@@ -1135,10 +1328,19 @@ multiclass ScratchFLATLoadPats_D16<FLAT_Pseudo inst, SDPatternOperator node, Val
def : ScratchLoadSaddrPat_D16<!cast<FLAT_Pseudo>(!cast<string>(inst)#"_SADDR"), node, vt> {
let AddedComplexity = 26;
}
+
+ def : ScratchLoadSVaddrPat_D16 <!cast<FLAT_Pseudo>(!cast<string>(inst)#"_SVS"), node, vt> {
+ let SubtargetPredicate = HasFlatScratchSVSMode;
+ let AddedComplexity = 27;
+ }
}
let OtherPredicates = [HasFlatGlobalInsts] in {
+defm : GlobalFLATLoadPats <GLOBAL_LOAD_UBYTE, atomic_load_8_global, i32>;
+defm : GlobalFLATLoadPats <GLOBAL_LOAD_UBYTE, atomic_load_8_global, i16>;
+defm : GlobalFLATLoadPats <GLOBAL_LOAD_USHORT, atomic_load_16_global, i32>;
+defm : GlobalFLATLoadPats <GLOBAL_LOAD_USHORT, atomic_load_16_global, i16>;
defm : GlobalFLATLoadPats <GLOBAL_LOAD_UBYTE, extloadi8_global, i32>;
defm : GlobalFLATLoadPats <GLOBAL_LOAD_UBYTE, zextloadi8_global, i32>;
defm : GlobalFLATLoadPats <GLOBAL_LOAD_SBYTE, sextloadi8_global, i32>;
@@ -1179,10 +1381,12 @@ defm : GlobalFLATStorePats <GLOBAL_STORE_SHORT, truncstorei16_global, i32>;
defm : GlobalFLATStorePats <GLOBAL_STORE_SHORT, store_global, i16>;
defm : GlobalFLATStorePats <GLOBAL_STORE_DWORDX3, store_global, v3i32>;
-let OtherPredicates = [D16PreservesUnusedBits] in {
+let OtherPredicates = [HasD16LoadStore] in {
defm : GlobalFLATStorePats <GLOBAL_STORE_SHORT_D16_HI, truncstorei16_hi16_global, i32>;
defm : GlobalFLATStorePats <GLOBAL_STORE_BYTE_D16_HI, truncstorei8_hi16_global, i32>;
+}
+let OtherPredicates = [D16PreservesUnusedBits] in {
defm : GlobalFLATLoadPats_D16 <GLOBAL_LOAD_UBYTE_D16_HI, az_extloadi8_d16_hi_global, v2i16>;
defm : GlobalFLATLoadPats_D16 <GLOBAL_LOAD_UBYTE_D16_HI, az_extloadi8_d16_hi_global, v2f16>;
defm : GlobalFLATLoadPats_D16 <GLOBAL_LOAD_SBYTE_D16_HI, sextloadi8_d16_hi_global, v2i16>;
@@ -1198,59 +1402,84 @@ defm : GlobalFLATLoadPats_D16 <GLOBAL_LOAD_SHORT_D16, load_d16_lo_global, v2i16>
defm : GlobalFLATLoadPats_D16 <GLOBAL_LOAD_SHORT_D16, load_d16_lo_global, v2f16>;
}
-defm : GlobalFLATAtomicStorePats <GLOBAL_STORE_DWORD, atomic_store_global_32, i32>;
-defm : GlobalFLATAtomicStorePats <GLOBAL_STORE_DWORDX2, atomic_store_global_64, i64>;
+defm : GlobalFLATAtomicStorePats <GLOBAL_STORE_BYTE, atomic_store_8_global, i32>;
+defm : GlobalFLATAtomicStorePats <GLOBAL_STORE_BYTE, atomic_store_8_global, i16>;
+defm : GlobalFLATAtomicStorePats <GLOBAL_STORE_SHORT, atomic_store_16_global, i32>;
+defm : GlobalFLATAtomicStorePats <GLOBAL_STORE_SHORT, atomic_store_16_global, i16>;
+defm : GlobalFLATAtomicStorePats <GLOBAL_STORE_DWORD, atomic_store_32_global, i32>;
+defm : GlobalFLATAtomicStorePats <GLOBAL_STORE_DWORDX2, atomic_store_64_global, i64>;
-defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_ADD", atomic_load_add_global_32, i32>;
-defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_SUB", atomic_load_sub_global_32, i32>;
-defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_INC", atomic_inc_global_32, i32>;
-defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_DEC", atomic_dec_global_32, i32>;
-defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_AND", atomic_load_and_global_32, i32>;
-defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_SMAX", atomic_load_max_global_32, i32>;
-defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_UMAX", atomic_load_umax_global_32, i32>;
-defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_SMIN", atomic_load_min_global_32, i32>;
-defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_UMIN", atomic_load_umin_global_32, i32>;
-defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_OR", atomic_load_or_global_32, i32>;
-defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_SWAP", atomic_swap_global_32, i32>;
-defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_CMPSWAP", AMDGPUatomic_cmp_swap_global_32, i32, v2i32>;
-defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_XOR", atomic_load_xor_global_32, i32>;
-defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_CSUB", int_amdgcn_global_atomic_csub, i32>;
+defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_ADD", "atomic_load_add_global", i32>;
+defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_SUB", "atomic_load_sub_global", i32>;
+defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_INC", "atomic_inc_global", i32>;
+defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_DEC", "atomic_dec_global", i32>;
+defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_AND", "atomic_load_and_global", i32>;
+defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_SMAX", "atomic_load_max_global", i32>;
+defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_UMAX", "atomic_load_umax_global", i32>;
+defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_SMIN", "atomic_load_min_global", i32>;
+defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_UMIN", "atomic_load_umin_global", i32>;
+defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_OR", "atomic_load_or_global", i32>;
+defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_SWAP", "atomic_swap_global", i32>;
+defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_CMPSWAP", "AMDGPUatomic_cmp_swap_global", i32, v2i32>;
+defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_XOR", "atomic_load_xor_global", i32>;
+defm : GlobalFLATAtomicPatsRtn <"GLOBAL_ATOMIC_CSUB", int_amdgcn_global_atomic_csub, i32>;
-defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_ADD_X2", atomic_load_add_global_64, i64>;
-defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_SUB_X2", atomic_load_sub_global_64, i64>;
-defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_INC_X2", atomic_inc_global_64, i64>;
-defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_DEC_X2", atomic_dec_global_64, i64>;
-defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_AND_X2", atomic_load_and_global_64, i64>;
-defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_SMAX_X2", atomic_load_max_global_64, i64>;
-defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_UMAX_X2", atomic_load_umax_global_64, i64>;
-defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_SMIN_X2", atomic_load_min_global_64, i64>;
-defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_UMIN_X2", atomic_load_umin_global_64, i64>;
-defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_OR_X2", atomic_load_or_global_64, i64>;
-defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_SWAP_X2", atomic_swap_global_64, i64>;
-defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_CMPSWAP_X2", AMDGPUatomic_cmp_swap_global_64, i64, v2i64>;
-defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_XOR_X2", atomic_load_xor_global_64, i64>;
+defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_ADD_X2", "atomic_load_add_global", i64>;
+defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_SUB_X2", "atomic_load_sub_global", i64>;
+defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_INC_X2", "atomic_inc_global", i64>;
+defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_DEC_X2", "atomic_dec_global", i64>;
+defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_AND_X2", "atomic_load_and_global", i64>;
+defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_SMAX_X2", "atomic_load_max_global", i64>;
+defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_UMAX_X2", "atomic_load_umax_global", i64>;
+defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_SMIN_X2", "atomic_load_min_global", i64>;
+defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_UMIN_X2", "atomic_load_umin_global", i64>;
+defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_OR_X2", "atomic_load_or_global", i64>;
+defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_SWAP_X2", "atomic_swap_global", i64>;
+defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_CMPSWAP_X2", "AMDGPUatomic_cmp_swap_global", i64, v2i64>;
+defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_XOR_X2", "atomic_load_xor_global", i64>;
let OtherPredicates = [isGFX10Plus] in {
-defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_FMIN", atomic_load_fmin_global_32, f32>;
-defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_FMAX", atomic_load_fmax_global_32, f32>;
-defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_FMIN_X2", atomic_load_fmin_global_64, f64>;
-defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_FMAX_X2", atomic_load_fmax_global_64, f64>;
+defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_FMIN", "atomic_load_fmin_global", f32>;
+defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_FMAX", "atomic_load_fmax_global", f32>;
+defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_FMIN_X2", "atomic_load_fmin_global", f64>;
+defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_FMAX_X2", "atomic_load_fmax_global", f64>;
+defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_FMIN", "int_amdgcn_global_atomic_fmin", f32>;
+defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_FMAX", "int_amdgcn_global_atomic_fmax", f32>;
+defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_FMIN_X2", "int_amdgcn_global_atomic_fmin", f64>;
+defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_FMAX_X2", "int_amdgcn_global_atomic_fmax", f64>;
}
-let OtherPredicates = [HasAtomicFaddInsts] in {
+let OtherPredicates = [HasAtomicFaddNoRtnInsts] in
defm : GlobalFLATNoRtnAtomicPats <GLOBAL_ATOMIC_ADD_F32, atomic_load_fadd_global_noret_32, f32>;
+let OtherPredicates = [HasAtomicPkFaddNoRtnInsts] in
defm : GlobalFLATNoRtnAtomicPats <GLOBAL_ATOMIC_PK_ADD_F16, atomic_load_fadd_v2f16_global_noret_32, v2f16>;
-}
let OtherPredicates = [isGFX90APlus] in {
-defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_ADD_F32", atomic_load_fadd_global_32, f32>;
-defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_PK_ADD_F16", atomic_load_fadd_v2f16_global_32, v2f16>;
-defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_ADD_F64", atomic_load_fadd_global_64, f64>;
-defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_MIN_F64", atomic_load_fmin_global_64, f64>;
-defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_MAX_F64", atomic_load_fmax_global_64, f64>;
-def : FlatSignedAtomicPat <FLAT_ATOMIC_ADD_F64_RTN, atomic_load_fadd_flat_64, f64>;
-def : FlatSignedAtomicPat <FLAT_ATOMIC_MIN_F64_RTN, atomic_load_fmin_flat_64, f64>;
-def : FlatSignedAtomicPat <FLAT_ATOMIC_MAX_F64_RTN, atomic_load_fmax_flat_64, f64>;
+defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_ADD_F32", "atomic_load_fadd_global", f32>;
+defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_PK_ADD_F16", "atomic_load_fadd_v2f16_global", v2f16>;
+defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_ADD_F64", "atomic_load_fadd_global", f64>;
+defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_MIN_F64", "atomic_load_fmin_global", f64>;
+defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_MAX_F64", "atomic_load_fmax_global", f64>;
+defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_ADD_F32", "int_amdgcn_global_atomic_fadd", f32>;
+defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_ADD_F64", "int_amdgcn_global_atomic_fadd", f64>;
+defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_PK_ADD_F16", "int_amdgcn_global_atomic_fadd", v2f16>;
+defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_MIN_F64", "int_amdgcn_global_atomic_fmin", f64>;
+defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_MAX_F64", "int_amdgcn_global_atomic_fmax", f64>;
+defm : FlatSignedAtomicPat <"FLAT_ATOMIC_ADD_F64", "atomic_load_fadd_flat", f64>;
+defm : FlatSignedAtomicPat <"FLAT_ATOMIC_MIN_F64", "atomic_load_fmin_flat", f64>;
+defm : FlatSignedAtomicPat <"FLAT_ATOMIC_MAX_F64", "atomic_load_fmax_flat", f64>;
+defm : FlatSignedAtomicIntrPat <"FLAT_ATOMIC_ADD_F64", "int_amdgcn_flat_atomic_fadd", f64>;
+defm : FlatSignedAtomicIntrPat <"FLAT_ATOMIC_MIN_F64", "int_amdgcn_flat_atomic_fmin", f64>;
+defm : FlatSignedAtomicIntrPat <"FLAT_ATOMIC_MAX_F64", "int_amdgcn_flat_atomic_fmax", f64>;
+}
+
+let OtherPredicates = [isGFX940Plus] in {
+defm : FlatSignedAtomicPat <"FLAT_ATOMIC_ADD_F32", "atomic_load_fadd_flat", f32>;
+defm : FlatSignedAtomicPat <"FLAT_ATOMIC_PK_ADD_F16", "atomic_load_fadd_v2f16_flat", v2f16>;
+defm : FlatSignedAtomicIntrPat <"FLAT_ATOMIC_ADD_F32", "int_amdgcn_flat_atomic_fadd", f32>;
+defm : FlatSignedAtomicIntrPat <"FLAT_ATOMIC_PK_ADD_F16", "int_amdgcn_flat_atomic_fadd", v2f16>;
+defm : FlatSignedAtomicIntrPat <"FLAT_ATOMIC_PK_ADD_BF16", "int_amdgcn_flat_atomic_fadd_v2bf16", v2i16>;
+defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_PK_ADD_BF16", "int_amdgcn_global_atomic_fadd_v2bf16", v2i16>;
}
} // End OtherPredicates = [HasFlatGlobalInsts], AddedComplexity = 10
@@ -1291,10 +1520,12 @@ defm : ScratchFLATStorePats <SCRATCH_STORE_SHORT, truncstorei16_private, i32>;
defm : ScratchFLATStorePats <SCRATCH_STORE_SHORT, store_private, i16>;
defm : ScratchFLATStorePats <SCRATCH_STORE_DWORDX3, store_private, v3i32>;
-let OtherPredicates = [D16PreservesUnusedBits, HasFlatScratchInsts, EnableFlatScratch] in {
+let OtherPredicates = [HasD16LoadStore, HasFlatScratchInsts, EnableFlatScratch] in {
defm : ScratchFLATStorePats <SCRATCH_STORE_SHORT_D16_HI, truncstorei16_hi16_private, i32>;
defm : ScratchFLATStorePats <SCRATCH_STORE_BYTE_D16_HI, truncstorei8_hi16_private, i32>;
+}
+let OtherPredicates = [D16PreservesUnusedBits, HasFlatScratchInsts, EnableFlatScratch] in {
defm : ScratchFLATLoadPats_D16 <SCRATCH_LOAD_UBYTE_D16_HI, az_extloadi8_d16_hi_private, v2i16>;
defm : ScratchFLATLoadPats_D16 <SCRATCH_LOAD_UBYTE_D16_HI, az_extloadi8_d16_hi_private, v2f16>;
defm : ScratchFLATLoadPats_D16 <SCRATCH_LOAD_SBYTE_D16_HI, sextloadi8_d16_hi_private, v2i16>;
@@ -1405,6 +1636,57 @@ multiclass FLAT_Real_AllAddr_vi<bits<7> op,
def _SADDR_vi : FLAT_Real_vi<op, !cast<FLAT_Pseudo>(NAME#"_SADDR"), has_sccb>;
}
+class FLAT_Real_gfx940 <bits<7> op, FLAT_Pseudo ps> :
+ FLAT_Real <op, ps>,
+ SIMCInstr <ps.PseudoInstr, SIEncodingFamily.GFX940> {
+ let AssemblerPredicate = isGFX940Plus;
+ let DecoderNamespace = "GFX9";
+ let Inst{13} = ps.sve;
+ let Inst{25} = !if(ps.has_sccb, cpol{CPolBit.SCC}, ps.sccbValue);
+}
+
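// Illustrative sketch: the gfx940 reals defined further down, e.g.
//   defm GLOBAL_ATOMIC_ADD_F32 : FLAT_Global_Real_Atomics_gfx940 <0x04d>;
// instantiate this class, which stays in the GFX9 decoder namespace but drives
// Inst{13} from the pseudo's sve flag and Inst{25} from the SCC cache-policy
// bit (falling back to the pseudo's sccbValue when has_sccb is clear).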
+multiclass FLAT_Real_AllAddr_SVE_vi<bits<7> op> {
+ def _vi : FLAT_Real_vi<op, !cast<FLAT_Pseudo>(NAME)> {
+ let AssemblerPredicate = isGFX8GFX9NotGFX940;
+ let OtherPredicates = [isGFX8GFX9NotGFX940];
+ }
+ def _SADDR_vi : FLAT_Real_vi<op, !cast<FLAT_Pseudo>(NAME#"_SADDR")> {
+ let DecoderNamespace = "GFX9";
+ }
+ let AssemblerPredicate = isGFX940Plus, SubtargetPredicate = isGFX940Plus in {
+ def _VE_gfx940 : FLAT_Real_gfx940<op, !cast<FLAT_Pseudo>(NAME)>;
+ def _SVS_gfx940 : FLAT_Real_gfx940<op, !cast<FLAT_Pseudo>(NAME#"_SVS")>;
+ def _ST_gfx940 : FLAT_Real_gfx940<op, !cast<FLAT_Pseudo>(NAME#"_ST")>;
+ }
+}
+
+multiclass FLAT_Real_AllAddr_LDS<bits<7> op, bits<7> pre_gfx940_op,
+ string pre_gfx940_name = !subst("_lds", "", !cast<FLAT_Pseudo>(NAME).PseudoInstr),
+ bit has_sccb = !cast<FLAT_Pseudo>(NAME).has_sccb> {
+
+ let OtherPredicates = [isGFX8GFX9NotGFX940] in {
+ def _vi : FLAT_Real_vi<pre_gfx940_op, !cast<FLAT_Pseudo>(NAME), has_sccb> {
+ let AsmString = pre_gfx940_name # !cast<FLAT_Pseudo>(NAME).AsmOperands # " lds";
+ }
+ def _SADDR_vi : FLAT_Real_vi<pre_gfx940_op, !cast<FLAT_Pseudo>(NAME#"_SADDR"), has_sccb> {
+ let AsmString = pre_gfx940_name # !cast<FLAT_Pseudo>(NAME#"_SADDR").AsmOperands # " lds";
+ }
+ }
+
+ let SubtargetPredicate = isGFX940Plus in {
+ def _gfx940 : FLAT_Real_gfx940<op, !cast<FLAT_Pseudo>(NAME)>;
+ def _SADDR_gfx940 : FLAT_Real_gfx940<op, !cast<FLAT_Pseudo>(NAME#"_SADDR")>;
+ }
+}
+
+multiclass FLAT_Real_AllAddr_SVE_LDS<bits<7> op, bits<7> pre_gfx940_op> {
+ defm "" : FLAT_Real_AllAddr_LDS<op, pre_gfx940_op>;
+ let SubtargetPredicate = isGFX940Plus in {
+ def _SVS_gfx940 : FLAT_Real_gfx940<op, !cast<FLAT_Pseudo>(NAME#"_SVS")>;
+ def _ST_gfx940 : FLAT_Real_gfx940<op, !cast<FLAT_Pseudo>(NAME#"_ST")>;
+ }
+}
+
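// Illustrative sketch: FLAT_Real_AllAddr_LDS reuses the pre-gfx940 opcode and
// appends " lds" to the base mnemonic, while gfx940 gets a dedicated opcode.
// For example,
//   defm GLOBAL_LOAD_LDS_UBYTE : FLAT_Real_AllAddr_LDS <0x026, 0x10>;
// assembles as "global_load_ubyte ... lds" (op 0x10) on gfx8/gfx9 and as
// global_load_lds_ubyte (op 0x026) on gfx940; the _SVE_LDS variant adds the
// _SVS and _ST flat-scratch reals on top.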
def FLAT_LOAD_UBYTE_vi : FLAT_Real_vi <0x10, FLAT_LOAD_UBYTE>;
def FLAT_LOAD_SBYTE_vi : FLAT_Real_vi <0x11, FLAT_LOAD_SBYTE>;
def FLAT_LOAD_USHORT_vi : FLAT_Real_vi <0x12, FLAT_LOAD_USHORT>;
@@ -1496,6 +1778,11 @@ defm GLOBAL_STORE_DWORDX2 : FLAT_Real_AllAddr_vi <0x1d>;
defm GLOBAL_STORE_DWORDX3 : FLAT_Real_AllAddr_vi <0x1e>;
defm GLOBAL_STORE_DWORDX4 : FLAT_Real_AllAddr_vi <0x1f>;
+defm GLOBAL_LOAD_LDS_UBYTE : FLAT_Real_AllAddr_LDS <0x026, 0x10>;
+defm GLOBAL_LOAD_LDS_SBYTE : FLAT_Real_AllAddr_LDS <0x027, 0x11>;
+defm GLOBAL_LOAD_LDS_USHORT : FLAT_Real_AllAddr_LDS <0x028, 0x12>;
+defm GLOBAL_LOAD_LDS_SSHORT : FLAT_Real_AllAddr_LDS <0x029, 0x13>;
+defm GLOBAL_LOAD_LDS_DWORD : FLAT_Real_AllAddr_LDS <0x02a, 0x14>;
defm GLOBAL_ATOMIC_SWAP : FLAT_Global_Real_Atomics_vi <0x40>;
defm GLOBAL_ATOMIC_CMPSWAP : FLAT_Global_Real_Atomics_vi <0x41>;
@@ -1524,32 +1811,39 @@ defm GLOBAL_ATOMIC_XOR_X2 : FLAT_Global_Real_Atomics_vi <0x6a>;
defm GLOBAL_ATOMIC_INC_X2 : FLAT_Global_Real_Atomics_vi <0x6b>;
defm GLOBAL_ATOMIC_DEC_X2 : FLAT_Global_Real_Atomics_vi <0x6c>;
-defm SCRATCH_LOAD_UBYTE : FLAT_Real_AllAddr_vi <0x10>;
-defm SCRATCH_LOAD_SBYTE : FLAT_Real_AllAddr_vi <0x11>;
-defm SCRATCH_LOAD_USHORT : FLAT_Real_AllAddr_vi <0x12>;
-defm SCRATCH_LOAD_SSHORT : FLAT_Real_AllAddr_vi <0x13>;
-defm SCRATCH_LOAD_DWORD : FLAT_Real_AllAddr_vi <0x14>;
-defm SCRATCH_LOAD_DWORDX2 : FLAT_Real_AllAddr_vi <0x15>;
-defm SCRATCH_LOAD_DWORDX3 : FLAT_Real_AllAddr_vi <0x16>;
-defm SCRATCH_LOAD_DWORDX4 : FLAT_Real_AllAddr_vi <0x17>;
-defm SCRATCH_STORE_BYTE : FLAT_Real_AllAddr_vi <0x18>;
-defm SCRATCH_STORE_BYTE_D16_HI : FLAT_Real_AllAddr_vi <0x19>;
-defm SCRATCH_LOAD_UBYTE_D16 : FLAT_Real_AllAddr_vi <0x20>;
-defm SCRATCH_LOAD_UBYTE_D16_HI : FLAT_Real_AllAddr_vi <0x21>;
-defm SCRATCH_LOAD_SBYTE_D16 : FLAT_Real_AllAddr_vi <0x22>;
-defm SCRATCH_LOAD_SBYTE_D16_HI : FLAT_Real_AllAddr_vi <0x23>;
-defm SCRATCH_LOAD_SHORT_D16 : FLAT_Real_AllAddr_vi <0x24>;
-defm SCRATCH_LOAD_SHORT_D16_HI : FLAT_Real_AllAddr_vi <0x25>;
-defm SCRATCH_STORE_SHORT : FLAT_Real_AllAddr_vi <0x1a>;
-defm SCRATCH_STORE_SHORT_D16_HI : FLAT_Real_AllAddr_vi <0x1b>;
-defm SCRATCH_STORE_DWORD : FLAT_Real_AllAddr_vi <0x1c>;
-defm SCRATCH_STORE_DWORDX2 : FLAT_Real_AllAddr_vi <0x1d>;
-defm SCRATCH_STORE_DWORDX3 : FLAT_Real_AllAddr_vi <0x1e>;
-defm SCRATCH_STORE_DWORDX4 : FLAT_Real_AllAddr_vi <0x1f>;
+defm SCRATCH_LOAD_LDS_UBYTE : FLAT_Real_AllAddr_SVE_LDS <0x026, 0x10>;
+defm SCRATCH_LOAD_LDS_SBYTE : FLAT_Real_AllAddr_SVE_LDS <0x027, 0x11>;
+defm SCRATCH_LOAD_LDS_USHORT : FLAT_Real_AllAddr_SVE_LDS <0x028, 0x12>;
+defm SCRATCH_LOAD_LDS_SSHORT : FLAT_Real_AllAddr_SVE_LDS <0x029, 0x13>;
+defm SCRATCH_LOAD_LDS_DWORD : FLAT_Real_AllAddr_SVE_LDS <0x02a, 0x14>;
+
+defm SCRATCH_LOAD_UBYTE : FLAT_Real_AllAddr_SVE_vi <0x10>;
+defm SCRATCH_LOAD_SBYTE : FLAT_Real_AllAddr_SVE_vi <0x11>;
+defm SCRATCH_LOAD_USHORT : FLAT_Real_AllAddr_SVE_vi <0x12>;
+defm SCRATCH_LOAD_SSHORT : FLAT_Real_AllAddr_SVE_vi <0x13>;
+defm SCRATCH_LOAD_DWORD : FLAT_Real_AllAddr_SVE_vi <0x14>;
+defm SCRATCH_LOAD_DWORDX2 : FLAT_Real_AllAddr_SVE_vi <0x15>;
+defm SCRATCH_LOAD_DWORDX3 : FLAT_Real_AllAddr_SVE_vi <0x16>;
+defm SCRATCH_LOAD_DWORDX4 : FLAT_Real_AllAddr_SVE_vi <0x17>;
+defm SCRATCH_STORE_BYTE : FLAT_Real_AllAddr_SVE_vi <0x18>;
+defm SCRATCH_STORE_BYTE_D16_HI : FLAT_Real_AllAddr_SVE_vi <0x19>;
+defm SCRATCH_LOAD_UBYTE_D16 : FLAT_Real_AllAddr_SVE_vi <0x20>;
+defm SCRATCH_LOAD_UBYTE_D16_HI : FLAT_Real_AllAddr_SVE_vi <0x21>;
+defm SCRATCH_LOAD_SBYTE_D16 : FLAT_Real_AllAddr_SVE_vi <0x22>;
+defm SCRATCH_LOAD_SBYTE_D16_HI : FLAT_Real_AllAddr_SVE_vi <0x23>;
+defm SCRATCH_LOAD_SHORT_D16 : FLAT_Real_AllAddr_SVE_vi <0x24>;
+defm SCRATCH_LOAD_SHORT_D16_HI : FLAT_Real_AllAddr_SVE_vi <0x25>;
+defm SCRATCH_STORE_SHORT : FLAT_Real_AllAddr_SVE_vi <0x1a>;
+defm SCRATCH_STORE_SHORT_D16_HI : FLAT_Real_AllAddr_SVE_vi <0x1b>;
+defm SCRATCH_STORE_DWORD : FLAT_Real_AllAddr_SVE_vi <0x1c>;
+defm SCRATCH_STORE_DWORDX2 : FLAT_Real_AllAddr_SVE_vi <0x1d>;
+defm SCRATCH_STORE_DWORDX3 : FLAT_Real_AllAddr_SVE_vi <0x1e>;
+defm SCRATCH_STORE_DWORDX4 : FLAT_Real_AllAddr_SVE_vi <0x1f>;
-let SubtargetPredicate = HasAtomicFaddInsts in {
-defm GLOBAL_ATOMIC_ADD_F32 : FLAT_Global_Real_Atomics_vi <0x04d, 0>;
-defm GLOBAL_ATOMIC_PK_ADD_F16 : FLAT_Global_Real_Atomics_vi <0x04e, 0>;
+let SubtargetPredicate = isGFX8GFX9NotGFX940 in {
+ // These instructions are encoded differently on gfx90* and gfx940.
+ defm GLOBAL_ATOMIC_ADD_F32 : FLAT_Global_Real_Atomics_vi <0x04d, 0>;
+ defm GLOBAL_ATOMIC_PK_ADD_F16 : FLAT_Global_Real_Atomics_vi <0x04e, 0>;
}
let SubtargetPredicate = isGFX90AOnly in {
@@ -1561,13 +1855,46 @@ let SubtargetPredicate = isGFX90AOnly in {
defm GLOBAL_ATOMIC_MAX_F64 : FLAT_Global_Real_Atomics_vi<0x51, 0>;
} // End SubtargetPredicate = isGFX90AOnly
+multiclass FLAT_Real_AllAddr_gfx940<bits<7> op> {
+ def _gfx940 : FLAT_Real_gfx940<op, !cast<FLAT_Pseudo>(NAME)>;
+ def _SADDR_gfx940 : FLAT_Real_gfx940<op, !cast<FLAT_Pseudo>(NAME#"_SADDR")>;
+}
+
+multiclass FLAT_Real_Atomics_gfx940 <bits<7> op, FLAT_Pseudo ps> {
+ def _gfx940 : FLAT_Real_gfx940<op, !cast<FLAT_Pseudo>(ps.PseudoInstr)>;
+ def _RTN_gfx940 : FLAT_Real_gfx940<op, !cast<FLAT_Pseudo>(ps.PseudoInstr # "_RTN")>;
+}
+
+multiclass FLAT_Global_Real_Atomics_gfx940<bits<7> op> :
+ FLAT_Real_AllAddr_gfx940<op> {
+ def _RTN_gfx940 : FLAT_Real_gfx940 <op, !cast<FLAT_Pseudo>(NAME#"_RTN")>;
+ def _SADDR_RTN_gfx940 : FLAT_Real_gfx940 <op, !cast<FLAT_Pseudo>(NAME#"_SADDR_RTN")>;
+}
+
+let SubtargetPredicate = isGFX940Plus in {
+ // These instructions are encoded differently on gfx90* and gfx940.
+ defm GLOBAL_ATOMIC_ADD_F32 : FLAT_Global_Real_Atomics_gfx940 <0x04d>;
+ defm GLOBAL_ATOMIC_PK_ADD_F16 : FLAT_Global_Real_Atomics_gfx940 <0x04e>;
+
+ defm FLAT_ATOMIC_ADD_F64 : FLAT_Real_Atomics_gfx940<0x4f, FLAT_ATOMIC_ADD_F64>;
+ defm FLAT_ATOMIC_MIN_F64 : FLAT_Real_Atomics_gfx940<0x50, FLAT_ATOMIC_MIN_F64>;
+ defm FLAT_ATOMIC_MAX_F64 : FLAT_Real_Atomics_gfx940<0x51, FLAT_ATOMIC_MAX_F64>;
+ defm GLOBAL_ATOMIC_ADD_F64 : FLAT_Global_Real_Atomics_gfx940<0x4f>;
+ defm GLOBAL_ATOMIC_MIN_F64 : FLAT_Global_Real_Atomics_gfx940<0x50>;
+ defm GLOBAL_ATOMIC_MAX_F64 : FLAT_Global_Real_Atomics_gfx940<0x51>;
+ defm FLAT_ATOMIC_ADD_F32 : FLAT_Real_Atomics_vi<0x4d, FLAT_ATOMIC_ADD_F32>;
+ defm FLAT_ATOMIC_PK_ADD_F16 : FLAT_Real_Atomics_vi<0x4e, FLAT_ATOMIC_PK_ADD_F16>;
+ defm FLAT_ATOMIC_PK_ADD_BF16 : FLAT_Real_Atomics_vi<0x52, FLAT_ATOMIC_PK_ADD_BF16>;
+ defm GLOBAL_ATOMIC_PK_ADD_BF16 : FLAT_Global_Real_Atomics_vi<0x52>;
+} // End SubtargetPredicate = isGFX940Plus
+
//===----------------------------------------------------------------------===//
// GFX10.
//===----------------------------------------------------------------------===//
class FLAT_Real_gfx10<bits<7> op, FLAT_Pseudo ps> :
FLAT_Real<op, ps>, SIMCInstr<ps.PseudoInstr, SIEncodingFamily.GFX10> {
- let AssemblerPredicate = isGFX10Plus;
+ let AssemblerPredicate = isGFX10Only;
let DecoderNamespace = "GFX10";
let Inst{11-0} = offset{11-0};
@@ -1627,6 +1954,23 @@ multiclass FLAT_Real_ScratchAllAddr_gfx10<bits<7> op> :
FLAT_Real_SADDR_gfx10<op>,
FLAT_Real_ST_gfx10<op>;
+multiclass FLAT_Real_AllAddr_LDS_gfx10<bits<7> op,
+ string opname = !subst("_lds", "", !cast<FLAT_Pseudo>(NAME).PseudoInstr)> {
+ let AsmString = opname # !cast<FLAT_Pseudo>(NAME).AsmOperands # " lds" in
+ defm "" : FLAT_Real_Base_gfx10<op>;
+
+ let AsmString = opname # !cast<FLAT_Pseudo>(NAME#"_SADDR").AsmOperands # " lds" in
+ defm "" : FLAT_Real_SADDR_gfx10<op>;
+}
+
+multiclass FLAT_Real_ScratchAllAddr_LDS_gfx10<bits<7> op,
+ string opname = !subst("_lds", "", !cast<FLAT_Pseudo>(NAME).PseudoInstr)> {
+ defm "" : FLAT_Real_AllAddr_LDS_gfx10<op>;
+
+ let AsmString = opname # !cast<FLAT_Pseudo>(NAME#"_ST").AsmOperands # " lds" in
+ defm "" : FLAT_Real_ST_gfx10<op>;
+}
+
// ENC_FLAT.
defm FLAT_LOAD_UBYTE : FLAT_Real_Base_gfx10<0x008>;
defm FLAT_LOAD_SBYTE : FLAT_Real_Base_gfx10<0x009>;
@@ -1743,6 +2087,12 @@ defm GLOBAL_ATOMIC_FMAX_X2 : FLAT_Real_GlblAtomics_gfx10<0x060>;
defm GLOBAL_LOAD_DWORD_ADDTID : FLAT_Real_AllAddr_gfx10<0x016>;
defm GLOBAL_STORE_DWORD_ADDTID : FLAT_Real_AllAddr_gfx10<0x017>;
+defm GLOBAL_LOAD_LDS_UBYTE : FLAT_Real_AllAddr_LDS_gfx10 <0x008>;
+defm GLOBAL_LOAD_LDS_SBYTE : FLAT_Real_AllAddr_LDS_gfx10 <0x009>;
+defm GLOBAL_LOAD_LDS_USHORT : FLAT_Real_AllAddr_LDS_gfx10 <0x00a>;
+defm GLOBAL_LOAD_LDS_SSHORT : FLAT_Real_AllAddr_LDS_gfx10 <0x00b>;
+defm GLOBAL_LOAD_LDS_DWORD : FLAT_Real_AllAddr_LDS_gfx10 <0x00c>;
+
// ENC_FLAT_SCRATCH.
defm SCRATCH_LOAD_UBYTE : FLAT_Real_ScratchAllAddr_gfx10<0x008>;
defm SCRATCH_LOAD_SBYTE : FLAT_Real_ScratchAllAddr_gfx10<0x009>;
@@ -1766,3 +2116,219 @@ defm SCRATCH_LOAD_SBYTE_D16 : FLAT_Real_ScratchAllAddr_gfx10<0x022>;
defm SCRATCH_LOAD_SBYTE_D16_HI : FLAT_Real_ScratchAllAddr_gfx10<0x023>;
defm SCRATCH_LOAD_SHORT_D16 : FLAT_Real_ScratchAllAddr_gfx10<0x024>;
defm SCRATCH_LOAD_SHORT_D16_HI : FLAT_Real_ScratchAllAddr_gfx10<0x025>;
+
+defm SCRATCH_LOAD_LDS_UBYTE : FLAT_Real_ScratchAllAddr_LDS_gfx10 <0x008>;
+defm SCRATCH_LOAD_LDS_SBYTE : FLAT_Real_ScratchAllAddr_LDS_gfx10 <0x009>;
+defm SCRATCH_LOAD_LDS_USHORT : FLAT_Real_ScratchAllAddr_LDS_gfx10 <0x00a>;
+defm SCRATCH_LOAD_LDS_SSHORT : FLAT_Real_ScratchAllAddr_LDS_gfx10 <0x00b>;
+defm SCRATCH_LOAD_LDS_DWORD : FLAT_Real_ScratchAllAddr_LDS_gfx10 <0x00c>;
+
+//===----------------------------------------------------------------------===//
+// GFX11
+//===----------------------------------------------------------------------===//
+
+class FLAT_Real_gfx11 <bits<7> op, FLAT_Pseudo ps, string opName = ps.Mnemonic> :
+ FLAT_Real <op, ps, opName>,
+ SIMCInstr <ps.PseudoInstr, SIEncodingFamily.GFX11> {
+ let AssemblerPredicate = isGFX11Plus;
+ let DecoderNamespace = "GFX11";
+
+ let Inst{13} = !if(ps.has_dlc, cpol{CPolBit.DLC}, ps.dlcValue);
+ let Inst{14} = !if(ps.has_glc, cpol{CPolBit.GLC}, ps.glcValue);
+ let Inst{15} = cpol{CPolBit.SLC};
+ let Inst{17-16} = seg;
+ let Inst{55} = ps.sve;
+}
+
+multiclass FLAT_Real_Base_gfx11<bits<7> op, string ps, string opName, int renamed = false> {
+ def _gfx11 : FLAT_Real_gfx11<op, !cast<FLAT_Pseudo>(ps), opName> {
+ let Inst{54-48} = !cast<int>(SGPR_NULL_gfx11plus.HWEncoding);
+ }
+ if renamed then
+ def _renamed_gfx11 : MnemonicAlias<!cast<FLAT_Pseudo>(ps).Mnemonic, opName>, Requires<[isGFX11Plus]>;
+}
+
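// Illustrative sketch: when "renamed" is set, FLAT_Real_Base_gfx11 also emits
// a MnemonicAlias so the legacy spelling keeps assembling on GFX11, e.g.
//   defm FLAT_LOAD_U8 : FLAT_Real_Base_gfx11<0x010, "FLAT_LOAD_UBYTE", "flat_load_u8", true>;
// accepts both flat_load_u8 and the old flat_load_ubyte under isGFX11Plus.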
+multiclass FLAT_Real_RTN_gfx11<bits<7> op, string ps, string opName> {
+ def _RTN_gfx11 : FLAT_Real_gfx11<op, !cast<FLAT_Pseudo>(ps#"_RTN"), opName> {
+ let Inst{54-48} = !cast<int>(SGPR_NULL_gfx11plus.HWEncoding);
+ }
+}
+
+multiclass FLAT_Real_SADDR_gfx11<bits<7> op, string ps, string opName> {
+ def _SADDR_gfx11 : FLAT_Real_gfx11<op, !cast<FLAT_Pseudo>(ps#"_SADDR"), opName>;
+}
+
+multiclass FLAT_Real_SADDR_RTN_gfx11<bits<7> op, string ps, string opName> {
+ def _SADDR_RTN_gfx11 : FLAT_Real_gfx11<op, !cast<FLAT_Pseudo>(ps#"_SADDR_RTN"), opName>;
+}
+
+multiclass FLAT_Real_ST_gfx11<bits<7> op, string ps, string opName> {
+ def _ST_gfx11 : FLAT_Real_gfx11<op, !cast<FLAT_Pseudo>(ps#"_ST"), opName> {
+ let Inst{54-48} = !cast<int>(SGPR_NULL_gfx11plus.HWEncoding);
+ let OtherPredicates = [HasFlatScratchSTMode];
+ }
+}
+
+multiclass FLAT_Real_SVS_gfx11<bits<7> op, string ps, string opName> {
+ def _SVS_gfx11 : FLAT_Real_gfx11<op, !cast<FLAT_Pseudo>(ps#"_SVS"), opName> {
+ let OtherPredicates = [HasFlatScratchSVSMode];
+ }
+}
+
+multiclass FLAT_Real_AllAddr_gfx11<bits<7> op, string ps, string opName, int renamed = false> :
+ FLAT_Real_Base_gfx11<op, ps, opName, renamed>,
+ FLAT_Real_SADDR_gfx11<op, ps, opName>;
+
+multiclass FLAT_Real_Atomics_gfx11<bits<7> op, string ps, string opName, int renamed = false> :
+ FLAT_Real_Base_gfx11<op, ps, opName, renamed>,
+ FLAT_Real_RTN_gfx11<op, ps, opName>;
+
+multiclass FLAT_Real_GlblAtomics_gfx11<bits<7> op, string ps, string opName, int renamed = false> :
+ FLAT_Real_AllAddr_gfx11<op, ps, opName, renamed>,
+ FLAT_Real_RTN_gfx11<op, ps, opName>,
+ FLAT_Real_SADDR_RTN_gfx11<op, ps, opName>;
+
+multiclass FLAT_Real_GlblAtomics_RTN_gfx11<bits<7> op, string ps, string opName> :
+ FLAT_Real_RTN_gfx11<op, ps, opName>,
+ FLAT_Real_SADDR_RTN_gfx11<op, ps, opName>;
+
+multiclass FLAT_Real_ScratchAllAddr_gfx11<bits<7> op, string ps, string opName, int renamed = false> :
+ FLAT_Real_Base_gfx11<op, ps, opName, renamed>,
+ FLAT_Real_SADDR_gfx11<op, ps, opName>,
+ FLAT_Real_ST_gfx11<op, ps, opName>,
+ FLAT_Real_SVS_gfx11<op, ps, opName>;
+
+// ENC_FLAT.
+defm FLAT_LOAD_U8 : FLAT_Real_Base_gfx11<0x010, "FLAT_LOAD_UBYTE", "flat_load_u8", true>;
+defm FLAT_LOAD_I8 : FLAT_Real_Base_gfx11<0x011, "FLAT_LOAD_SBYTE", "flat_load_i8", true>;
+defm FLAT_LOAD_U16 : FLAT_Real_Base_gfx11<0x012, "FLAT_LOAD_USHORT", "flat_load_u16", true>;
+defm FLAT_LOAD_I16 : FLAT_Real_Base_gfx11<0x013, "FLAT_LOAD_SSHORT", "flat_load_i16", true>;
+defm FLAT_LOAD_B32 : FLAT_Real_Base_gfx11<0x014, "FLAT_LOAD_DWORD", "flat_load_b32", true>;
+defm FLAT_LOAD_B64 : FLAT_Real_Base_gfx11<0x015, "FLAT_LOAD_DWORDX2", "flat_load_b64", true>;
+defm FLAT_LOAD_B96 : FLAT_Real_Base_gfx11<0x016, "FLAT_LOAD_DWORDX3", "flat_load_b96", true>;
+defm FLAT_LOAD_B128 : FLAT_Real_Base_gfx11<0x017, "FLAT_LOAD_DWORDX4", "flat_load_b128", true>;
+defm FLAT_STORE_B8 : FLAT_Real_Base_gfx11<0x018, "FLAT_STORE_BYTE", "flat_store_b8", true>;
+defm FLAT_STORE_B16 : FLAT_Real_Base_gfx11<0x019, "FLAT_STORE_SHORT", "flat_store_b16", true>;
+defm FLAT_STORE_B32 : FLAT_Real_Base_gfx11<0x01a, "FLAT_STORE_DWORD", "flat_store_b32", true>;
+defm FLAT_STORE_B64 : FLAT_Real_Base_gfx11<0x01b, "FLAT_STORE_DWORDX2", "flat_store_b64", true>;
+defm FLAT_STORE_B96 : FLAT_Real_Base_gfx11<0x01c, "FLAT_STORE_DWORDX3", "flat_store_b96", true>;
+defm FLAT_STORE_B128 : FLAT_Real_Base_gfx11<0x01d, "FLAT_STORE_DWORDX4", "flat_store_b128", true>;
+defm FLAT_LOAD_D16_U8 : FLAT_Real_Base_gfx11<0x01e, "FLAT_LOAD_UBYTE_D16", "flat_load_d16_u8">;
+defm FLAT_LOAD_D16_I8 : FLAT_Real_Base_gfx11<0x01f, "FLAT_LOAD_SBYTE_D16", "flat_load_d16_i8">;
+defm FLAT_LOAD_D16_B16 : FLAT_Real_Base_gfx11<0x020, "FLAT_LOAD_SHORT_D16", "flat_load_d16_b16">;
+defm FLAT_LOAD_D16_HI_U8 : FLAT_Real_Base_gfx11<0x021, "FLAT_LOAD_UBYTE_D16_HI", "flat_load_d16_hi_u8">;
+defm FLAT_LOAD_D16_HI_I8 : FLAT_Real_Base_gfx11<0x022, "FLAT_LOAD_SBYTE_D16_HI", "flat_load_d16_hi_i8">;
+defm FLAT_LOAD_D16_HI_B16 : FLAT_Real_Base_gfx11<0x023, "FLAT_LOAD_SHORT_D16_HI", "flat_load_d16_hi_b16">;
+defm FLAT_STORE_D16_HI_B8 : FLAT_Real_Base_gfx11<0x024, "FLAT_STORE_BYTE_D16_HI", "flat_store_d16_hi_b8">;
+defm FLAT_STORE_D16_HI_B16 : FLAT_Real_Base_gfx11<0x025, "FLAT_STORE_SHORT_D16_HI", "flat_store_d16_hi_b16">;
+defm FLAT_ATOMIC_SWAP_B32 : FLAT_Real_Atomics_gfx11<0x033, "FLAT_ATOMIC_SWAP", "flat_atomic_swap_b32", true>;
+defm FLAT_ATOMIC_CMPSWAP_B32 : FLAT_Real_Atomics_gfx11<0x034, "FLAT_ATOMIC_CMPSWAP", "flat_atomic_cmpswap_b32", true>;
+defm FLAT_ATOMIC_ADD_U32 : FLAT_Real_Atomics_gfx11<0x035, "FLAT_ATOMIC_ADD", "flat_atomic_add_u32", true>;
+defm FLAT_ATOMIC_SUB_U32 : FLAT_Real_Atomics_gfx11<0x036, "FLAT_ATOMIC_SUB", "flat_atomic_sub_u32", true>;
+defm FLAT_ATOMIC_MIN_I32 : FLAT_Real_Atomics_gfx11<0x038, "FLAT_ATOMIC_SMIN", "flat_atomic_min_i32", true>;
+defm FLAT_ATOMIC_MIN_U32 : FLAT_Real_Atomics_gfx11<0x039, "FLAT_ATOMIC_UMIN", "flat_atomic_min_u32", true>;
+defm FLAT_ATOMIC_MAX_I32 : FLAT_Real_Atomics_gfx11<0x03a, "FLAT_ATOMIC_SMAX", "flat_atomic_max_i32", true>;
+defm FLAT_ATOMIC_MAX_U32 : FLAT_Real_Atomics_gfx11<0x03b, "FLAT_ATOMIC_UMAX", "flat_atomic_max_u32", true>;
+defm FLAT_ATOMIC_AND_B32 : FLAT_Real_Atomics_gfx11<0x03c, "FLAT_ATOMIC_AND", "flat_atomic_and_b32", true>;
+defm FLAT_ATOMIC_OR_B32 : FLAT_Real_Atomics_gfx11<0x03d, "FLAT_ATOMIC_OR", "flat_atomic_or_b32", true>;
+defm FLAT_ATOMIC_XOR_B32 : FLAT_Real_Atomics_gfx11<0x03e, "FLAT_ATOMIC_XOR", "flat_atomic_xor_b32", true>;
+defm FLAT_ATOMIC_INC_U32 : FLAT_Real_Atomics_gfx11<0x03f, "FLAT_ATOMIC_INC", "flat_atomic_inc_u32", true>;
+defm FLAT_ATOMIC_DEC_U32 : FLAT_Real_Atomics_gfx11<0x040, "FLAT_ATOMIC_DEC", "flat_atomic_dec_u32", true>;
+defm FLAT_ATOMIC_SWAP_B64 : FLAT_Real_Atomics_gfx11<0x041, "FLAT_ATOMIC_SWAP_X2", "flat_atomic_swap_b64", true>;
+defm FLAT_ATOMIC_CMPSWAP_B64 : FLAT_Real_Atomics_gfx11<0x042, "FLAT_ATOMIC_CMPSWAP_X2", "flat_atomic_cmpswap_b64", true>;
+defm FLAT_ATOMIC_ADD_U64 : FLAT_Real_Atomics_gfx11<0x043, "FLAT_ATOMIC_ADD_X2", "flat_atomic_add_u64", true>;
+defm FLAT_ATOMIC_SUB_U64 : FLAT_Real_Atomics_gfx11<0x044, "FLAT_ATOMIC_SUB_X2", "flat_atomic_sub_u64", true>;
+defm FLAT_ATOMIC_MIN_I64 : FLAT_Real_Atomics_gfx11<0x045, "FLAT_ATOMIC_SMIN_X2", "flat_atomic_min_i64", true>;
+defm FLAT_ATOMIC_MIN_U64 : FLAT_Real_Atomics_gfx11<0x046, "FLAT_ATOMIC_UMIN_X2", "flat_atomic_min_u64", true>;
+defm FLAT_ATOMIC_MAX_I64 : FLAT_Real_Atomics_gfx11<0x047, "FLAT_ATOMIC_SMAX_X2", "flat_atomic_max_i64", true>;
+defm FLAT_ATOMIC_MAX_U64 : FLAT_Real_Atomics_gfx11<0x048, "FLAT_ATOMIC_UMAX_X2", "flat_atomic_max_u64", true>;
+defm FLAT_ATOMIC_AND_B64 : FLAT_Real_Atomics_gfx11<0x049, "FLAT_ATOMIC_AND_X2", "flat_atomic_and_b64", true>;
+defm FLAT_ATOMIC_OR_B64 : FLAT_Real_Atomics_gfx11<0x04a, "FLAT_ATOMIC_OR_X2", "flat_atomic_or_b64", true>;
+defm FLAT_ATOMIC_XOR_B64 : FLAT_Real_Atomics_gfx11<0x04b, "FLAT_ATOMIC_XOR_X2", "flat_atomic_xor_b64", true>;
+defm FLAT_ATOMIC_INC_U64 : FLAT_Real_Atomics_gfx11<0x04c, "FLAT_ATOMIC_INC_X2", "flat_atomic_inc_u64", true>;
+defm FLAT_ATOMIC_DEC_U64 : FLAT_Real_Atomics_gfx11<0x04d, "FLAT_ATOMIC_DEC_X2", "flat_atomic_dec_u64", true>;
+defm FLAT_ATOMIC_CMPSWAP_F32 : FLAT_Real_Atomics_gfx11<0x050, "FLAT_ATOMIC_FCMPSWAP", "flat_atomic_cmpswap_f32">;
+defm FLAT_ATOMIC_MIN_F32 : FLAT_Real_Atomics_gfx11<0x051, "FLAT_ATOMIC_FMIN", "flat_atomic_min_f32">;
+defm FLAT_ATOMIC_MAX_F32 : FLAT_Real_Atomics_gfx11<0x052, "FLAT_ATOMIC_FMAX", "flat_atomic_max_f32">;
+defm FLAT_ATOMIC_ADD_F32 : FLAT_Real_Atomics_gfx11<0x056, "FLAT_ATOMIC_ADD_F32", "flat_atomic_add_f32">;
+
+// ENC_FLAT_GLBL.
+defm GLOBAL_LOAD_U8 : FLAT_Real_AllAddr_gfx11<0x010, "GLOBAL_LOAD_UBYTE", "global_load_u8", true>;
+defm GLOBAL_LOAD_I8 : FLAT_Real_AllAddr_gfx11<0x011, "GLOBAL_LOAD_SBYTE", "global_load_i8", true>;
+defm GLOBAL_LOAD_U16 : FLAT_Real_AllAddr_gfx11<0x012, "GLOBAL_LOAD_USHORT", "global_load_u16", true>;
+defm GLOBAL_LOAD_I16 : FLAT_Real_AllAddr_gfx11<0x013, "GLOBAL_LOAD_SSHORT", "global_load_i16", true>;
+defm GLOBAL_LOAD_B32 : FLAT_Real_AllAddr_gfx11<0x014, "GLOBAL_LOAD_DWORD", "global_load_b32", true>;
+defm GLOBAL_LOAD_B64 : FLAT_Real_AllAddr_gfx11<0x015, "GLOBAL_LOAD_DWORDX2", "global_load_b64", true>;
+defm GLOBAL_LOAD_B96 : FLAT_Real_AllAddr_gfx11<0x016, "GLOBAL_LOAD_DWORDX3", "global_load_b96", true>;
+defm GLOBAL_LOAD_B128 : FLAT_Real_AllAddr_gfx11<0x017, "GLOBAL_LOAD_DWORDX4", "global_load_b128", true>;
+defm GLOBAL_STORE_B8 : FLAT_Real_AllAddr_gfx11<0x018, "GLOBAL_STORE_BYTE", "global_store_b8", true>;
+defm GLOBAL_STORE_B16 : FLAT_Real_AllAddr_gfx11<0x019, "GLOBAL_STORE_SHORT", "global_store_b16", true>;
+defm GLOBAL_STORE_B32 : FLAT_Real_AllAddr_gfx11<0x01a, "GLOBAL_STORE_DWORD", "global_store_b32", true>;
+defm GLOBAL_STORE_B64 : FLAT_Real_AllAddr_gfx11<0x01b, "GLOBAL_STORE_DWORDX2", "global_store_b64", true>;
+defm GLOBAL_STORE_B96 : FLAT_Real_AllAddr_gfx11<0x01c, "GLOBAL_STORE_DWORDX3", "global_store_b96", true>;
+defm GLOBAL_STORE_B128 : FLAT_Real_AllAddr_gfx11<0x01d, "GLOBAL_STORE_DWORDX4", "global_store_b128", true>;
+defm GLOBAL_LOAD_D16_U8 : FLAT_Real_AllAddr_gfx11<0x01e, "GLOBAL_LOAD_UBYTE_D16", "global_load_d16_u8">;
+defm GLOBAL_LOAD_D16_I8 : FLAT_Real_AllAddr_gfx11<0x01f, "GLOBAL_LOAD_SBYTE_D16", "global_load_d16_i8">;
+defm GLOBAL_LOAD_D16_B16 : FLAT_Real_AllAddr_gfx11<0x020, "GLOBAL_LOAD_SHORT_D16", "global_load_d16_b16">;
+defm GLOBAL_LOAD_D16_HI_U8 : FLAT_Real_AllAddr_gfx11<0x021, "GLOBAL_LOAD_UBYTE_D16_HI", "global_load_d16_hi_u8">;
+defm GLOBAL_LOAD_D16_HI_I8 : FLAT_Real_AllAddr_gfx11<0x022, "GLOBAL_LOAD_SBYTE_D16_HI", "global_load_d16_hi_i8">;
+defm GLOBAL_LOAD_D16_HI_B16 : FLAT_Real_AllAddr_gfx11<0x023, "GLOBAL_LOAD_SHORT_D16_HI", "global_load_d16_hi_b16">;
+defm GLOBAL_STORE_D16_HI_B8 : FLAT_Real_AllAddr_gfx11<0x024, "GLOBAL_STORE_BYTE_D16_HI", "global_store_d16_hi_b8">;
+defm GLOBAL_STORE_D16_HI_B16 : FLAT_Real_AllAddr_gfx11<0x025, "GLOBAL_STORE_SHORT_D16_HI", "global_store_d16_hi_b16">;
+defm GLOBAL_LOAD_ADDTID_B32 : FLAT_Real_AllAddr_gfx11<0x028, "GLOBAL_LOAD_DWORD_ADDTID", "global_load_addtid_b32">;
+defm GLOBAL_STORE_ADDTID_B32 : FLAT_Real_AllAddr_gfx11<0x029, "GLOBAL_STORE_DWORD_ADDTID", "global_store_addtid_b32">;
+defm GLOBAL_ATOMIC_SWAP_B32 : FLAT_Real_GlblAtomics_gfx11<0x033, "GLOBAL_ATOMIC_SWAP", "global_atomic_swap_b32", true>;
+defm GLOBAL_ATOMIC_CMPSWAP_B32 : FLAT_Real_GlblAtomics_gfx11<0x034, "GLOBAL_ATOMIC_CMPSWAP", "global_atomic_cmpswap_b32", true>;
+defm GLOBAL_ATOMIC_ADD_U32 : FLAT_Real_GlblAtomics_gfx11<0x035, "GLOBAL_ATOMIC_ADD", "global_atomic_add_u32", true>;
+defm GLOBAL_ATOMIC_SUB_U32 : FLAT_Real_GlblAtomics_gfx11<0x036, "GLOBAL_ATOMIC_SUB", "global_atomic_sub_u32", true>;
+defm GLOBAL_ATOMIC_CSUB_U32 : FLAT_Real_GlblAtomics_RTN_gfx11<0x037, "GLOBAL_ATOMIC_CSUB", "global_atomic_csub_u32">;
+defm GLOBAL_ATOMIC_MIN_I32 : FLAT_Real_GlblAtomics_gfx11<0x038, "GLOBAL_ATOMIC_SMIN", "global_atomic_min_i32", true>;
+defm GLOBAL_ATOMIC_MIN_U32 : FLAT_Real_GlblAtomics_gfx11<0x039, "GLOBAL_ATOMIC_UMIN", "global_atomic_min_u32", true>;
+defm GLOBAL_ATOMIC_MAX_I32 : FLAT_Real_GlblAtomics_gfx11<0x03a, "GLOBAL_ATOMIC_SMAX", "global_atomic_max_i32", true>;
+defm GLOBAL_ATOMIC_MAX_U32 : FLAT_Real_GlblAtomics_gfx11<0x03b, "GLOBAL_ATOMIC_UMAX", "global_atomic_max_u32", true>;
+defm GLOBAL_ATOMIC_AND_B32 : FLAT_Real_GlblAtomics_gfx11<0x03c, "GLOBAL_ATOMIC_AND", "global_atomic_and_b32", true>;
+defm GLOBAL_ATOMIC_OR_B32 : FLAT_Real_GlblAtomics_gfx11<0x03d, "GLOBAL_ATOMIC_OR", "global_atomic_or_b32", true>;
+defm GLOBAL_ATOMIC_XOR_B32 : FLAT_Real_GlblAtomics_gfx11<0x03e, "GLOBAL_ATOMIC_XOR", "global_atomic_xor_b32", true>;
+defm GLOBAL_ATOMIC_INC_U32 : FLAT_Real_GlblAtomics_gfx11<0x03f, "GLOBAL_ATOMIC_INC", "global_atomic_inc_u32", true>;
+defm GLOBAL_ATOMIC_DEC_U32 : FLAT_Real_GlblAtomics_gfx11<0x040, "GLOBAL_ATOMIC_DEC", "global_atomic_dec_u32", true>;
+defm GLOBAL_ATOMIC_SWAP_B64 : FLAT_Real_GlblAtomics_gfx11<0x041, "GLOBAL_ATOMIC_SWAP_X2", "global_atomic_swap_b64", true>;
+defm GLOBAL_ATOMIC_CMPSWAP_B64 : FLAT_Real_GlblAtomics_gfx11<0x042, "GLOBAL_ATOMIC_CMPSWAP_X2", "global_atomic_cmpswap_b64", true>;
+defm GLOBAL_ATOMIC_ADD_U64 : FLAT_Real_GlblAtomics_gfx11<0x043, "GLOBAL_ATOMIC_ADD_X2", "global_atomic_add_u64", true>;
+defm GLOBAL_ATOMIC_SUB_U64 : FLAT_Real_GlblAtomics_gfx11<0x044, "GLOBAL_ATOMIC_SUB_X2", "global_atomic_sub_u64", true>;
+defm GLOBAL_ATOMIC_MIN_I64 : FLAT_Real_GlblAtomics_gfx11<0x045, "GLOBAL_ATOMIC_SMIN_X2", "global_atomic_min_i64", true>;
+defm GLOBAL_ATOMIC_MIN_U64 : FLAT_Real_GlblAtomics_gfx11<0x046, "GLOBAL_ATOMIC_UMIN_X2", "global_atomic_min_u64", true>;
+defm GLOBAL_ATOMIC_MAX_I64 : FLAT_Real_GlblAtomics_gfx11<0x047, "GLOBAL_ATOMIC_SMAX_X2", "global_atomic_max_i64", true>;
+defm GLOBAL_ATOMIC_MAX_U64 : FLAT_Real_GlblAtomics_gfx11<0x048, "GLOBAL_ATOMIC_UMAX_X2", "global_atomic_max_u64", true>;
+defm GLOBAL_ATOMIC_AND_B64 : FLAT_Real_GlblAtomics_gfx11<0x049, "GLOBAL_ATOMIC_AND_X2", "global_atomic_and_b64", true>;
+defm GLOBAL_ATOMIC_OR_B64 : FLAT_Real_GlblAtomics_gfx11<0x04a, "GLOBAL_ATOMIC_OR_X2", "global_atomic_or_b64", true>;
+defm GLOBAL_ATOMIC_XOR_B64 : FLAT_Real_GlblAtomics_gfx11<0x04b, "GLOBAL_ATOMIC_XOR_X2", "global_atomic_xor_b64", true>;
+defm GLOBAL_ATOMIC_INC_U64 : FLAT_Real_GlblAtomics_gfx11<0x04c, "GLOBAL_ATOMIC_INC_X2", "global_atomic_inc_u64", true>;
+defm GLOBAL_ATOMIC_DEC_U64 : FLAT_Real_GlblAtomics_gfx11<0x04d, "GLOBAL_ATOMIC_DEC_X2", "global_atomic_dec_u64", true>;
+defm GLOBAL_ATOMIC_CMPSWAP_F32 : FLAT_Real_GlblAtomics_gfx11<0x050, "GLOBAL_ATOMIC_FCMPSWAP", "global_atomic_cmpswap_f32">;
+defm GLOBAL_ATOMIC_MIN_F32 : FLAT_Real_GlblAtomics_gfx11<0x051, "GLOBAL_ATOMIC_FMIN", "global_atomic_min_f32">;
+defm GLOBAL_ATOMIC_MAX_F32 : FLAT_Real_GlblAtomics_gfx11<0x052, "GLOBAL_ATOMIC_FMAX", "global_atomic_max_f32">;
+defm GLOBAL_ATOMIC_ADD_F32 : FLAT_Real_GlblAtomics_gfx11<0x056, "GLOBAL_ATOMIC_ADD_F32", "global_atomic_add_f32">;
+
+// ENC_FLAT_SCRATCH.
+defm SCRATCH_LOAD_U8 : FLAT_Real_ScratchAllAddr_gfx11<0x10, "SCRATCH_LOAD_UBYTE", "scratch_load_u8", true>;
+defm SCRATCH_LOAD_I8 : FLAT_Real_ScratchAllAddr_gfx11<0x11, "SCRATCH_LOAD_SBYTE", "scratch_load_i8", true>;
+defm SCRATCH_LOAD_U16 : FLAT_Real_ScratchAllAddr_gfx11<0x12, "SCRATCH_LOAD_USHORT", "scratch_load_u16", true>;
+defm SCRATCH_LOAD_I16 : FLAT_Real_ScratchAllAddr_gfx11<0x13, "SCRATCH_LOAD_SSHORT", "scratch_load_i16", true>;
+defm SCRATCH_LOAD_B32 : FLAT_Real_ScratchAllAddr_gfx11<0x14, "SCRATCH_LOAD_DWORD", "scratch_load_b32", true>;
+defm SCRATCH_LOAD_B64 : FLAT_Real_ScratchAllAddr_gfx11<0x15, "SCRATCH_LOAD_DWORDX2", "scratch_load_b64", true>;
+defm SCRATCH_LOAD_B96 : FLAT_Real_ScratchAllAddr_gfx11<0x16, "SCRATCH_LOAD_DWORDX3", "scratch_load_b96", true>;
+defm SCRATCH_LOAD_B128 : FLAT_Real_ScratchAllAddr_gfx11<0x17, "SCRATCH_LOAD_DWORDX4", "scratch_load_b128", true>;
+defm SCRATCH_STORE_B8 : FLAT_Real_ScratchAllAddr_gfx11<0x18, "SCRATCH_STORE_BYTE", "scratch_store_b8", true>;
+defm SCRATCH_STORE_B16 : FLAT_Real_ScratchAllAddr_gfx11<0x19, "SCRATCH_STORE_SHORT", "scratch_store_b16", true>;
+defm SCRATCH_STORE_B32 : FLAT_Real_ScratchAllAddr_gfx11<0x1a, "SCRATCH_STORE_DWORD", "scratch_store_b32", true>;
+defm SCRATCH_STORE_B64 : FLAT_Real_ScratchAllAddr_gfx11<0x1b, "SCRATCH_STORE_DWORDX2", "scratch_store_b64", true>;
+defm SCRATCH_STORE_B96 : FLAT_Real_ScratchAllAddr_gfx11<0x1c, "SCRATCH_STORE_DWORDX3", "scratch_store_b96", true>;
+defm SCRATCH_STORE_B128 : FLAT_Real_ScratchAllAddr_gfx11<0x1d, "SCRATCH_STORE_DWORDX4", "scratch_store_b128", true>;
+defm SCRATCH_LOAD_D16_U8 : FLAT_Real_ScratchAllAddr_gfx11<0x1e, "SCRATCH_LOAD_UBYTE_D16", "scratch_load_d16_u8">;
+defm SCRATCH_LOAD_D16_I8 : FLAT_Real_ScratchAllAddr_gfx11<0x1f, "SCRATCH_LOAD_SBYTE_D16", "scratch_load_d16_i8">;
+defm SCRATCH_LOAD_D16_B16 : FLAT_Real_ScratchAllAddr_gfx11<0x20, "SCRATCH_LOAD_SHORT_D16", "scratch_load_d16_b16">;
+defm SCRATCH_LOAD_D16_HI_U8 : FLAT_Real_ScratchAllAddr_gfx11<0x21, "SCRATCH_LOAD_UBYTE_D16_HI", "scratch_load_d16_hi_u8">;
+defm SCRATCH_LOAD_D16_HI_I8 : FLAT_Real_ScratchAllAddr_gfx11<0x22, "SCRATCH_LOAD_SBYTE_D16_HI", "scratch_load_d16_hi_i8">;
+defm SCRATCH_LOAD_D16_HI_B16 : FLAT_Real_ScratchAllAddr_gfx11<0x23, "SCRATCH_LOAD_SHORT_D16_HI", "scratch_load_d16_hi_b16">;
+defm SCRATCH_STORE_D16_HI_B8 : FLAT_Real_ScratchAllAddr_gfx11<0x24, "SCRATCH_STORE_BYTE_D16_HI", "scratch_store_d16_hi_b8">;
+defm SCRATCH_STORE_D16_HI_B16 : FLAT_Real_ScratchAllAddr_gfx11<0x25, "SCRATCH_STORE_SHORT_D16_HI", "scratch_store_d16_hi_b16">;
diff --git a/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp b/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp
index a8c85ec4e5ea..1cd880eaa48e 100644
--- a/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp
@@ -167,7 +167,9 @@ MachineOperand *GCNDPPCombine::getOldOpndValue(MachineOperand &OldOpnd) const {
return nullptr;
case AMDGPU::COPY:
case AMDGPU::V_MOV_B32_e32:
- case AMDGPU::V_MOV_B64_PSEUDO: {
+ case AMDGPU::V_MOV_B64_PSEUDO:
+ case AMDGPU::V_MOV_B64_e32:
+ case AMDGPU::V_MOV_B64_e64: {
auto &Op1 = Def->getOperand(1);
if (Op1.isImm())
return &Op1;
@@ -183,6 +185,7 @@ MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI,
bool CombBCZ,
bool IsShrinkable) const {
assert(MovMI.getOpcode() == AMDGPU::V_MOV_B32_dpp ||
+ MovMI.getOpcode() == AMDGPU::V_MOV_B64_dpp ||
MovMI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO);
auto OrigOp = OrigMI.getOpcode();
@@ -383,6 +386,7 @@ bool GCNDPPCombine::hasNoImmOrEqual(MachineInstr &MI, unsigned OpndName,
bool GCNDPPCombine::combineDPPMov(MachineInstr &MovMI) const {
assert(MovMI.getOpcode() == AMDGPU::V_MOV_B32_dpp ||
+ MovMI.getOpcode() == AMDGPU::V_MOV_B64_dpp ||
MovMI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO);
LLVM_DEBUG(dbgs() << "\nDPP combine: " << MovMI);
@@ -399,7 +403,8 @@ bool GCNDPPCombine::combineDPPMov(MachineInstr &MovMI) const {
return false;
}
- if (MovMI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO) {
+ if (MovMI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO ||
+ MovMI.getOpcode() == AMDGPU::V_MOV_B64_dpp) {
auto *DppCtrl = TII->getNamedOperand(MovMI, AMDGPU::OpName::dpp_ctrl);
assert(DppCtrl && DppCtrl->isImm());
if (!AMDGPU::isLegal64BitDPPControl(DppCtrl->getImm())) {
@@ -447,12 +452,6 @@ bool GCNDPPCombine::combineDPPMov(MachineInstr &MovMI) const {
return false;
}
- if (OldOpndValue->getParent()->getParent() != MovMI.getParent()) {
- LLVM_DEBUG(dbgs() <<
- " failed: old reg def and mov should be in the same BB\n");
- return false;
- }
-
if (OldOpndValue->getImm() == 0) {
if (MaskAllLanes) {
assert(!BoundCtrlZero); // by check [1]
@@ -616,7 +615,8 @@ bool GCNDPPCombine::runOnMachineFunction(MachineFunction &MF) {
if (MI.getOpcode() == AMDGPU::V_MOV_B32_dpp && combineDPPMov(MI)) {
Changed = true;
++NumDPPMovsCombined;
- } else if (MI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO) {
+ } else if (MI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO ||
+ MI.getOpcode() == AMDGPU::V_MOV_B64_dpp) {
if (ST->has64BitDPP() && combineDPPMov(MI)) {
Changed = true;
++NumDPPMovsCombined;
diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
index c0592f6f3c7a..b6d16009e776 100644
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
@@ -13,14 +13,38 @@
#include "GCNHazardRecognizer.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/ScheduleDAG.h"
#include "llvm/Support/TargetParser.h"
using namespace llvm;
+namespace {
+
+struct MFMAPaddingRatioParser : public cl::parser<unsigned> {
+ MFMAPaddingRatioParser(cl::Option &O) : cl::parser<unsigned>(O) {}
+
+ bool parse(cl::Option &O, StringRef ArgName, StringRef Arg, unsigned &Value) {
+ if (Arg.getAsInteger(0, Value))
+ return O.error("'" + Arg + "' value invalid for uint argument!");
+
+ if (Value > 100)
+ return O.error("'" + Arg + "' value must be in the range [0, 100]!");
+
+ return false;
+ }
+};
+
+} // end anonymous namespace
+
+static cl::opt<unsigned, false, MFMAPaddingRatioParser>
+ MFMAPaddingRatio("amdgpu-mfma-padding-ratio", cl::init(0), cl::Hidden,
+ cl::desc("Fill a percentage of the latency between "
+                     "neighboring MFMAs with s_nops."));
+
//===----------------------------------------------------------------------===//
-// Hazard Recoginizer Implementation
+// Hazard Recognizer Implementation
//===----------------------------------------------------------------------===//
static bool shouldRunLdsBranchVmemWARHazardFixup(const MachineFunction &MF,
@@ -92,12 +116,7 @@ static bool isSMovRel(unsigned Opcode) {
}
static bool isDGEMM(unsigned Opcode) {
- return Opcode == AMDGPU::V_MFMA_F64_4X4X4F64_e64 ||
- Opcode == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64 ||
- Opcode == AMDGPU::V_MFMA_F64_16X16X4F64_e64 ||
- Opcode == AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64 ||
- Opcode == AMDGPU::V_MFMA_F64_16X16X4F64_mac_e64 ||
- Opcode == AMDGPU::V_MFMA_F64_16X16X4F64_mac_vgprcd_e64;
+ return AMDGPU::getMAIIsDGEMM(Opcode);
}
static bool isXDL(const GCNSubtarget &ST, const MachineInstr &MI) {
@@ -109,7 +128,10 @@ static bool isXDL(const GCNSubtarget &ST, const MachineInstr &MI) {
Opcode == AMDGPU::V_ACCVGPR_READ_B32_e64)
return false;
- return true;
+ if (!ST.hasGFX940Insts())
+ return true;
+
+ return AMDGPU::getMAIIsGFX940XDL(Opcode);
}
static bool isSendMsgTraceDataOrGDS(const SIInstrInfo &TII,
@@ -144,6 +166,11 @@ static bool isPermlane(const MachineInstr &MI) {
Opcode == AMDGPU::V_PERMLANEX16_B32_e64;
}
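+// Direct-to-LDS (LDS DMA) loads are the MUBUF/FLAT instructions that also
+// carry the VALU flag; they take part in the read-M0 hazard checks below.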
+static bool isLdsDma(const MachineInstr &MI) {
+ return SIInstrInfo::isVALU(MI) &&
+ (SIInstrInfo::isMUBUF(MI) || SIInstrInfo::isFLAT(MI));
+}
+
static unsigned getHWReg(const SIInstrInfo *TII, const MachineInstr &RegInstr) {
const MachineOperand *RegOp = TII->getNamedOperand(RegInstr,
AMDGPU::OpName::simm16);
@@ -204,12 +231,12 @@ GCNHazardRecognizer::getHazardType(SUnit *SU, int Stalls) {
if (isRFE(MI->getOpcode()) && checkRFEHazards(MI) > 0)
return HazardType;
- if (ST.hasReadM0MovRelInterpHazard() &&
- (TII.isVINTRP(*MI) || isSMovRel(MI->getOpcode())) &&
- checkReadM0Hazards(MI) > 0)
- return HazardType;
-
- if (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, *MI) &&
+ if (((ST.hasReadM0MovRelInterpHazard() &&
+ (TII.isVINTRP(*MI) || isSMovRel(MI->getOpcode()))) ||
+ (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, *MI)) ||
+ (ST.hasReadM0LdsDmaHazard() && isLdsDma(*MI)) ||
+ (ST.hasReadM0LdsDirectHazard() &&
+ MI->readsRegister(AMDGPU::LDS_DIRECT))) &&
checkReadM0Hazards(MI) > 0)
return HazardType;
@@ -237,6 +264,14 @@ static void insertNoopsInBundle(MachineInstr *MI, const SIInstrInfo &TII,
}
}
+unsigned
+GCNHazardRecognizer::getMFMAPipelineWaitStates(const MachineInstr &MI) const {
+ const MCSchedClassDesc *SC = TSchedModel.resolveSchedClass(&MI);
+ assert(TSchedModel.getWriteProcResBegin(SC) !=
+ TSchedModel.getWriteProcResEnd(SC));
+ return TSchedModel.getWriteProcResBegin(SC)->Cycles;
+}
+
void GCNHazardRecognizer::processBundle() {
MachineBasicBlock::instr_iterator MI = std::next(CurrCycleInstr->getIterator());
MachineBasicBlock::instr_iterator E = CurrCycleInstr->getParent()->instr_end();
@@ -321,11 +356,11 @@ unsigned GCNHazardRecognizer::PreEmitNoopsCommon(MachineInstr *MI) {
if (isRFE(MI->getOpcode()))
return std::max(WaitStates, checkRFEHazards(MI));
- if (ST.hasReadM0MovRelInterpHazard() && (TII.isVINTRP(*MI) ||
- isSMovRel(MI->getOpcode())))
- return std::max(WaitStates, checkReadM0Hazards(MI));
-
- if (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, *MI))
+ if ((ST.hasReadM0MovRelInterpHazard() &&
+ (TII.isVINTRP(*MI) || isSMovRel(MI->getOpcode()))) ||
+ (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, *MI)) ||
+ (ST.hasReadM0LdsDmaHazard() && isLdsDma(*MI)) ||
+ (ST.hasReadM0LdsDirectHazard() && MI->readsRegister(AMDGPU::LDS_DIRECT)))
return std::max(WaitStates, checkReadM0Hazards(MI));
if (SIInstrInfo::isMAI(*MI))
@@ -389,16 +424,61 @@ void GCNHazardRecognizer::RecedeCycle() {
// Helper Functions
//===----------------------------------------------------------------------===//
+typedef enum { HazardFound, HazardExpired, NoHazardFound } HazardFnResult;
+
typedef function_ref<bool(const MachineInstr &, int WaitStates)> IsExpiredFn;
+typedef function_ref<unsigned int(const MachineInstr &)> GetNumWaitStatesFn;
+
+// Search for a hazard in a block and its predecessors.
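+// IsHazard reports whether the hazard is found, has expired, or the search
+// should continue; UpdateState accumulates per-path search state (each
+// predecessor block is scanned with its own copy of State).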
+template <typename StateT>
+static bool
+hasHazard(StateT State,
+ function_ref<HazardFnResult(StateT &, const MachineInstr &)> IsHazard,
+ function_ref<void(StateT &, const MachineInstr &)> UpdateState,
+ const MachineBasicBlock *MBB,
+ MachineBasicBlock::const_reverse_instr_iterator I,
+ DenseSet<const MachineBasicBlock *> &Visited) {
+ for (auto E = MBB->instr_rend(); I != E; ++I) {
+ // No need to look at parent BUNDLE instructions.
+ if (I->isBundle())
+ continue;
+
+ switch (IsHazard(State, *I)) {
+ case HazardFound:
+ return true;
+ case HazardExpired:
+ return false;
+ default:
+ // Continue search
+ break;
+ }
+
+ if (I->isInlineAsm() || I->isMetaInstruction())
+ continue;
+
+ UpdateState(State, *I);
+ }
+
+ for (MachineBasicBlock *Pred : MBB->predecessors()) {
+ if (!Visited.insert(Pred).second)
+ continue;
+
+ if (hasHazard(State, IsHazard, UpdateState, Pred, Pred->instr_rbegin(),
+ Visited))
+ return true;
+ }
+
+ return false;
+}
// Returns a minimum wait states since \p I walking all predecessors.
// Only scans until \p IsExpired does not return true.
// Can only be run in a hazard recognizer mode.
-static int getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard,
- const MachineBasicBlock *MBB,
- MachineBasicBlock::const_reverse_instr_iterator I,
- int WaitStates, IsExpiredFn IsExpired,
- DenseSet<const MachineBasicBlock *> &Visited) {
+static int getWaitStatesSince(
+ GCNHazardRecognizer::IsHazardFn IsHazard, const MachineBasicBlock *MBB,
+ MachineBasicBlock::const_reverse_instr_iterator I, int WaitStates,
+ IsExpiredFn IsExpired, DenseSet<const MachineBasicBlock *> &Visited,
+ GetNumWaitStatesFn GetNumWaitStates = SIInstrInfo::getNumWaitStates) {
for (auto E = MBB->instr_rend(); I != E; ++I) {
// Don't add WaitStates for parent BUNDLE instructions.
if (I->isBundle())
@@ -410,7 +490,7 @@ static int getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard,
if (I->isInlineAsm())
continue;
- WaitStates += SIInstrInfo::getNumWaitStates(*I);
+ WaitStates += GetNumWaitStates(*I);
if (IsExpired(*I, WaitStates))
return std::numeric_limits<int>::max();
@@ -421,8 +501,8 @@ static int getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard,
if (!Visited.insert(Pred).second)
continue;
- int W = getWaitStatesSince(IsHazard, Pred, Pred->instr_rbegin(),
- WaitStates, IsExpired, Visited);
+ int W = getWaitStatesSince(IsHazard, Pred, Pred->instr_rbegin(), WaitStates,
+ IsExpired, Visited, GetNumWaitStates);
MinWaitStates = std::min(MinWaitStates, W);
}
@@ -534,7 +614,7 @@ int GCNHazardRecognizer::checkSoftClauseHazards(MachineInstr *MEM) {
// In order to handle these situations correctly we need to make sure that
// when a clause has more than one instruction, no instruction in the clause
// writes to a register that is read by another instruction in the clause
- // (including itself). If we encounter this situaion, we need to break the
+ // (including itself). If we encounter this situation, we need to break the
// clause by inserting a non SMEM instruction.
for (MachineInstr *MI : EmittedInstrs) {
@@ -764,7 +844,7 @@ GCNHazardRecognizer::checkVALUHazardsHelper(const MachineOperand &Def,
// 8 bytes can have there store data over written by the next instruction.
const SIRegisterInfo *TRI = ST.getRegisterInfo();
- const int VALUWaitStates = 1;
+ const int VALUWaitStates = ST.hasGFX940Insts() ? 2 : 1;
int WaitStatesNeeded = 0;
if (!TRI->isVectorRegister(MRI, Def.getReg()))
@@ -783,13 +863,136 @@ GCNHazardRecognizer::checkVALUHazardsHelper(const MachineOperand &Def,
}
int GCNHazardRecognizer::checkVALUHazards(MachineInstr *VALU) {
+ int WaitStatesNeeded = 0;
+
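+  // A result forwarded from a transcendental (TRANS) op, e.g. v_rcp_f32, into
+  // a non-TRANS VALU consumer needs one intervening wait state here.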
+ if (ST.hasTransForwardingHazard() && !SIInstrInfo::isTRANS(*VALU)) {
+ const int TransDefWaitstates = 1;
+
+ auto IsTransDefFn = [this, VALU](const MachineInstr &MI) {
+ if (!SIInstrInfo::isTRANS(MI))
+ return false;
+ const SIRegisterInfo *TRI = ST.getRegisterInfo();
+ const SIInstrInfo *TII = ST.getInstrInfo();
+ Register Def = TII->getNamedOperand(MI, AMDGPU::OpName::vdst)->getReg();
+
+ for (const MachineOperand &Use : VALU->explicit_uses()) {
+ if (Use.isReg() && TRI->regsOverlap(Def, Use.getReg()))
+ return true;
+ }
+
+ return false;
+ };
+
+ int WaitStatesNeededForDef =
+ TransDefWaitstates -
+ getWaitStatesSince(IsTransDefFn, TransDefWaitstates);
+ WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
+ }
+
+ if (ST.hasDstSelForwardingHazard()) {
+ const int Shift16DefWaitstates = 1;
+
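+    // Look for a prior VALU that writes only part of a VGPR (SDWA with
+    // dst_sel != DWORD, or VOP3 with the destination op_sel bit set) whose
+    // result this instruction reads.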
+ auto IsShift16BitDefFn = [this, VALU](const MachineInstr &MI) {
+ if (!SIInstrInfo::isVALU(MI))
+ return false;
+ const SIInstrInfo *TII = ST.getInstrInfo();
+ if (SIInstrInfo::isSDWA(MI)) {
+ if (auto *DstSel = TII->getNamedOperand(MI, AMDGPU::OpName::dst_sel))
+ if (DstSel->getImm() == AMDGPU::SDWA::DWORD)
+ return false;
+ } else {
+ if ((AMDGPU::getNamedOperandIdx(MI.getOpcode(),
+ AMDGPU::OpName::op_sel) == -1) ||
+ !(TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)
+ ->getImm() &
+ SISrcMods::DST_OP_SEL))
+ return false;
+ }
+ const SIRegisterInfo *TRI = ST.getRegisterInfo();
+ if (auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst)) {
+ Register Def = Dst->getReg();
+
+ for (const MachineOperand &Use : VALU->explicit_uses()) {
+ if (Use.isReg() && TRI->regsOverlap(Def, Use.getReg()))
+ return true;
+ }
+ }
+
+ return false;
+ };
+
+ int WaitStatesNeededForDef =
+ Shift16DefWaitstates -
+ getWaitStatesSince(IsShift16BitDefFn, Shift16DefWaitstates);
+ WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
+ }
+
+ if (ST.hasVDecCoExecHazard()) {
+ const int VALUWriteSGPRVALUReadWaitstates = 2;
+ const int VALUWriteEXECRWLane = 4;
+ const int VALUWriteVGPRReadlaneRead = 1;
+
+ const SIRegisterInfo *TRI = ST.getRegisterInfo();
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
+ Register UseReg;
+ auto IsVALUDefSGPRFn = [&UseReg, TRI](const MachineInstr &MI) {
+ if (!SIInstrInfo::isVALU(MI))
+ return false;
+ return MI.modifiesRegister(UseReg, TRI);
+ };
+
+ for (const MachineOperand &Use : VALU->explicit_uses()) {
+ if (!Use.isReg())
+ continue;
+
+ UseReg = Use.getReg();
+ if (TRI->isSGPRReg(MRI, UseReg)) {
+ int WaitStatesNeededForDef =
+ VALUWriteSGPRVALUReadWaitstates -
+ getWaitStatesSince(IsVALUDefSGPRFn,
+ VALUWriteSGPRVALUReadWaitstates);
+ WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
+ }
+ }
+
+ if (VALU->readsRegister(AMDGPU::VCC, TRI)) {
+ UseReg = AMDGPU::VCC;
+ int WaitStatesNeededForDef =
+ VALUWriteSGPRVALUReadWaitstates -
+ getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteSGPRVALUReadWaitstates);
+ WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
+ }
+
+ switch (VALU->getOpcode()) {
+ case AMDGPU::V_READLANE_B32:
+ case AMDGPU::V_READFIRSTLANE_B32: {
+ MachineOperand *Src = TII.getNamedOperand(*VALU, AMDGPU::OpName::src0);
+ UseReg = Src->getReg();
+ int WaitStatesNeededForDef =
+ VALUWriteVGPRReadlaneRead -
+ getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteVGPRReadlaneRead);
+ WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
+ }
+ LLVM_FALLTHROUGH;
+ case AMDGPU::V_WRITELANE_B32: {
+ UseReg = AMDGPU::EXEC;
+ int WaitStatesNeededForDef =
+ VALUWriteEXECRWLane -
+ getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteEXECRWLane);
+ WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
+ break;
+ }
+ default:
+ break;
+ }
+ }
+
// This checks for the hazard where VMEM instructions that store more than
// 8 bytes can have there store data over written by the next instruction.
if (!ST.has12DWordStoreHazard())
- return 0;
+ return WaitStatesNeeded;
const MachineRegisterInfo &MRI = MF.getRegInfo();
- int WaitStatesNeeded = 0;
for (const MachineOperand &Def : VALU->defs()) {
WaitStatesNeeded = std::max(WaitStatesNeeded, checkVALUHazardsHelper(Def, MRI));
@@ -861,10 +1064,10 @@ int GCNHazardRecognizer::checkRFEHazards(MachineInstr *RFE) {
int GCNHazardRecognizer::checkReadM0Hazards(MachineInstr *MI) {
const SIInstrInfo *TII = ST.getInstrInfo();
- const int SMovRelWaitStates = 1;
+ const int ReadM0WaitStates = 1;
auto IsHazardFn = [TII](const MachineInstr &MI) { return TII->isSALU(MI); };
- return SMovRelWaitStates - getWaitStatesSinceDef(AMDGPU::M0, IsHazardFn,
- SMovRelWaitStates);
+ return ReadM0WaitStates -
+ getWaitStatesSinceDef(AMDGPU::M0, IsHazardFn, ReadM0WaitStates);
}
void GCNHazardRecognizer::fixHazards(MachineInstr *MI) {
@@ -873,6 +1076,13 @@ void GCNHazardRecognizer::fixHazards(MachineInstr *MI) {
fixSMEMtoVectorWriteHazards(MI);
fixVcmpxExecWARHazard(MI);
fixLdsBranchVmemWARHazard(MI);
+ if (ST.hasLdsDirect()) {
+ fixLdsDirectVALUHazard(MI);
+ fixLdsDirectVMEMHazard(MI);
+ }
+ fixVALUPartialForwardingHazard(MI);
+ fixVALUTransUseHazard(MI);
+ fixWMMAHazards(MI);
}
bool GCNHazardRecognizer::fixVcmpxPermlaneHazards(MachineInstr *MI) {
@@ -880,7 +1090,12 @@ bool GCNHazardRecognizer::fixVcmpxPermlaneHazards(MachineInstr *MI) {
return false;
const SIInstrInfo *TII = ST.getInstrInfo();
- auto IsHazardFn = [TII](const MachineInstr &MI) { return TII->isVOPC(MI); };
+ const SIRegisterInfo *TRI = ST.getRegisterInfo();
+ auto IsHazardFn = [TII, TRI](const MachineInstr &MI) {
+ return (TII->isVOPC(MI) ||
+ ((TII->isVOP3(MI) || TII->isSDWA(MI)) && MI.isCompare())) &&
+ MI.modifiesRegister(AMDGPU::EXEC, TRI);
+ };
auto IsExpiredFn = [](const MachineInstr &MI, int) {
unsigned Opc = MI.getOpcode();
@@ -893,7 +1108,7 @@ bool GCNHazardRecognizer::fixVcmpxPermlaneHazards(MachineInstr *MI) {
return false;
// V_NOP will be discarded by SQ.
- // Use V_MOB_B32 v?, v?. Register must be alive so use src0 of V_PERMLANE*
+ // Use V_MOV_B32 v?, v?. Register must be alive so use src0 of V_PERMLANE*
// which is always a VGPR and available.
auto *Src0 = TII->getNamedOperand(*MI, AMDGPU::OpName::src0);
Register Reg = Src0->getReg();
@@ -1157,6 +1372,369 @@ bool GCNHazardRecognizer::fixLdsBranchVmemWARHazard(MachineInstr *MI) {
return true;
}
+bool GCNHazardRecognizer::fixLdsDirectVALUHazard(MachineInstr *MI) {
+ if (!SIInstrInfo::isLDSDIR(*MI))
+ return false;
+
+ const int NoHazardWaitStates = 15;
+ const MachineOperand *VDST = TII.getNamedOperand(*MI, AMDGPU::OpName::vdst);
+ const Register VDSTReg = VDST->getReg();
+
+ bool VisitedTrans = false;
+ auto IsHazardFn = [this, VDSTReg, &VisitedTrans](const MachineInstr &I) {
+ if (!SIInstrInfo::isVALU(I))
+ return false;
+ VisitedTrans = VisitedTrans || SIInstrInfo::isTRANS(I);
+ // Cover both WAR and WAW
+ return I.readsRegister(VDSTReg, &TRI) || I.modifiesRegister(VDSTReg, &TRI);
+ };
+ auto IsExpiredFn = [&](const MachineInstr &I, int WaitStates) {
+ if (WaitStates >= NoHazardWaitStates)
+ return true;
+    // Instructions which cause va_vdst==0 expire the hazard
+ return SIInstrInfo::isVMEM(I) || SIInstrInfo::isFLAT(I) ||
+ SIInstrInfo::isDS(I) || SIInstrInfo::isEXP(I);
+ };
+ auto GetWaitStatesFn = [](const MachineInstr &MI) {
+ return SIInstrInfo::isVALU(MI) ? 1 : 0;
+ };
+
+ DenseSet<const MachineBasicBlock *> Visited;
+ auto Count = ::getWaitStatesSince(IsHazardFn, MI->getParent(),
+ std::next(MI->getReverseIterator()), 0,
+ IsExpiredFn, Visited, GetWaitStatesFn);
+
+ // Transcendentals can execute in parallel to other VALUs.
+ // This makes va_vdst count unusable with a mixture of VALU and TRANS.
+ if (VisitedTrans)
+ Count = 0;
+
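+  // Record the distance in the LDSDIR's waitvdst field; the maximum value
+  // (NoHazardWaitStates, 15) effectively means no wait is required.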
+ MachineOperand *WaitVdstOp =
+ TII.getNamedOperand(*MI, AMDGPU::OpName::waitvdst);
+ WaitVdstOp->setImm(std::min(Count, NoHazardWaitStates));
+
+ return true;
+}
+
+bool GCNHazardRecognizer::fixLdsDirectVMEMHazard(MachineInstr *MI) {
+ if (!SIInstrInfo::isLDSDIR(*MI))
+ return false;
+
+ const MachineOperand *VDST = TII.getNamedOperand(*MI, AMDGPU::OpName::vdst);
+ const Register VDSTReg = VDST->getReg();
+
+ auto IsHazardFn = [this, VDSTReg](const MachineInstr &I) {
+ if (!SIInstrInfo::isVMEM(I) && !SIInstrInfo::isFLAT(I) &&
+ !SIInstrInfo::isDS(I))
+ return false;
+ return I.readsRegister(VDSTReg, &TRI) || I.modifiesRegister(VDSTReg, &TRI);
+ };
+ auto IsExpiredFn = [](const MachineInstr &I, int) {
+ return SIInstrInfo::isVALU(I) || SIInstrInfo::isEXP(I) ||
+ (I.getOpcode() == AMDGPU::S_WAITCNT && !I.getOperand(0).getImm()) ||
+ (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
+ I.getOperand(0).getImm() == 0xffe3);
+ };
+
+ if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
+ std::numeric_limits<int>::max())
+ return false;
+
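+  // Resolve the hazard by waiting until in-flight VMEM/DS instructions have
+  // consumed their VGPR sources: 0xffe3 encodes a depctr wait for
+  // vm_vsrc == 0 with all other fields left at their no-wait values.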
+ BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
+ TII.get(AMDGPU::S_WAITCNT_DEPCTR))
+ .addImm(0xffe3);
+
+ return true;
+}
+
+bool GCNHazardRecognizer::fixVALUPartialForwardingHazard(MachineInstr *MI) {
+ if (!ST.isWave64())
+ return false;
+ if (!ST.hasVALUPartialForwardingHazard())
+ return false;
+ if (!SIInstrInfo::isVALU(*MI))
+ return false;
+
+ SmallSetVector<Register, 4> SrcVGPRs;
+
+ for (const MachineOperand &Use : MI->explicit_uses()) {
+ if (Use.isReg() && TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
+ SrcVGPRs.insert(Use.getReg());
+ }
+
+ // Only applies with >= 2 unique VGPR sources
+ if (SrcVGPRs.size() <= 1)
+ return false;
+
+ // Look for the following pattern:
+ // Va <- VALU [PreExecPos]
+ // intv1
+ // Exec <- SALU [ExecPos]
+ // intv2
+ // Vb <- VALU [PostExecPos]
+ // intv3
+ // MI Va, Vb (WaitState = 0)
+ //
+ // Where:
+ // intv1 + intv2 <= 2 VALUs
+ // intv3 <= 4 VALUs
+ //
+ // If found, insert an appropriate S_WAITCNT_DEPCTR before MI.
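+  //
+  // Illustrative (hypothetical) sequence matching the pattern:
+  //   v_cndmask_b32 v0, ...        ; Va
+  //   s_mov_b64 exec, s[2:3]       ; exec write
+  //   v_add_f32 v1, ...            ; Vb
+  //   v_fma_f32 v2, v0, v1, v2     ; MI reads both Va and Vb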
+
+ const int Intv1plus2MaxVALUs = 2;
+ const int Intv3MaxVALUs = 4;
+ const int IntvMaxVALUs = 6;
+ const int NoHazardVALUWaitStates = IntvMaxVALUs + 2;
+
+ struct StateType {
+ SmallDenseMap<Register, int, 4> DefPos;
+ int ExecPos = std::numeric_limits<int>::max();
+ int VALUs = 0;
+ };
+
+ StateType State;
+
+  // This lambda handles both the expiry test and the hazard detection
+ auto IsHazardFn = [&, this](StateType &State, const MachineInstr &I) {
+ // Too many VALU states have passed
+ if (State.VALUs > NoHazardVALUWaitStates)
+ return HazardExpired;
+
+    // Instructions which cause va_vdst==0 expire the hazard
+ if (SIInstrInfo::isVMEM(I) || SIInstrInfo::isFLAT(I) ||
+ SIInstrInfo::isDS(I) || SIInstrInfo::isEXP(I) ||
+ (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
+ I.getOperand(0).getImm() == 0x0fff))
+ return HazardExpired;
+
+    // Track register writes
+ bool Changed = false;
+ if (SIInstrInfo::isVALU(I)) {
+ for (Register Src : SrcVGPRs) {
+ if (!State.DefPos.count(Src) && I.modifiesRegister(Src, &TRI)) {
+ State.DefPos[Src] = State.VALUs;
+ Changed = true;
+ }
+ }
+ } else if (SIInstrInfo::isSALU(I)) {
+ if (State.ExecPos == std::numeric_limits<int>::max()) {
+ if (!State.DefPos.empty() && I.modifiesRegister(AMDGPU::EXEC, &TRI)) {
+ State.ExecPos = State.VALUs;
+ Changed = true;
+ }
+ }
+ }
+
+ // Early expiration: too many VALUs in intv3
+ if (State.VALUs > Intv3MaxVALUs && State.DefPos.empty())
+ return HazardExpired;
+
+ // Only evaluate state if something changed
+ if (!Changed)
+ return NoHazardFound;
+
+ // Determine positions of VALUs pre/post exec change
+ if (State.ExecPos == std::numeric_limits<int>::max())
+ return NoHazardFound;
+
+ int PreExecPos = std::numeric_limits<int>::max();
+ int PostExecPos = std::numeric_limits<int>::max();
+
+ for (auto Entry : State.DefPos) {
+ int DefVALUs = Entry.second;
+ if (DefVALUs != std::numeric_limits<int>::max()) {
+ if (DefVALUs >= State.ExecPos)
+ PreExecPos = std::min(PreExecPos, DefVALUs);
+ else if (DefVALUs < State.ExecPos)
+ PostExecPos = std::min(PostExecPos, DefVALUs);
+ }
+ }
+
+    // Need a VALU write after the exec change
+ if (PostExecPos == std::numeric_limits<int>::max())
+ return NoHazardFound;
+
+ // Too many VALUs in intv3?
+ int Intv3VALUs = PostExecPos;
+ if (Intv3VALUs > Intv3MaxVALUs)
+ return HazardExpired;
+
+ // Too many VALUs in intv2?
+ int Intv2VALUs = (State.ExecPos - PostExecPos) - 1;
+ if (Intv2VALUs > Intv1plus2MaxVALUs)
+ return HazardExpired;
+
+    // Need a VALU write before the exec change
+ if (PreExecPos == std::numeric_limits<int>::max())
+ return NoHazardFound;
+
+ // Too many VALUs in intv1?
+ int Intv1VALUs = PreExecPos - State.ExecPos;
+ if (Intv1VALUs > Intv1plus2MaxVALUs)
+ return HazardExpired;
+
+ // Too many VALUs in intv1 + intv2
+ if (Intv1VALUs + Intv2VALUs > Intv1plus2MaxVALUs)
+ return HazardExpired;
+
+ return HazardFound;
+ };
+ auto UpdateStateFn = [](StateType &State, const MachineInstr &MI) {
+ if (SIInstrInfo::isVALU(MI))
+ State.VALUs += 1;
+ };
+
+ DenseSet<const MachineBasicBlock *> Visited;
+ if (!hasHazard<StateType>(State, IsHazardFn, UpdateStateFn, MI->getParent(),
+ std::next(MI->getReverseIterator()), Visited))
+ return false;
+
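+  // Hazard found: force a wait for va_vdst == 0 (depctr mask 0x0fff) so the
+  // partially forwarded VALU results are committed before MI issues.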
+ BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
+ TII.get(AMDGPU::S_WAITCNT_DEPCTR))
+ .addImm(0x0fff);
+
+ return true;
+}
+
+bool GCNHazardRecognizer::fixVALUTransUseHazard(MachineInstr *MI) {
+ if (!ST.hasVALUTransUseHazard())
+ return false;
+ if (!SIInstrInfo::isVALU(*MI))
+ return false;
+
+ SmallSet<Register, 4> SrcVGPRs;
+
+ for (const MachineOperand &Use : MI->explicit_uses()) {
+ if (Use.isReg() && TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
+ SrcVGPRs.insert(Use.getReg());
+ }
+
+ // Look for the following pattern:
+ // Va <- TRANS VALU
+ // intv
+ // MI Va (WaitState = 0)
+ //
+ // Where:
+ // intv <= 5 VALUs / 1 TRANS
+ //
+ // If found, insert an appropriate S_WAITCNT_DEPCTR before MI.
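+  //
+  // Illustrative (hypothetical) sequence:
+  //   v_rcp_f32 v0, v1       ; TRANS producer
+  //   v_mul_f32 v2, v0, v3   ; MI uses Va with no intervening wait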
+
+ const int IntvMaxVALUs = 5;
+ const int IntvMaxTRANS = 1;
+
+ struct StateType {
+ int VALUs = 0;
+ int TRANS = 0;
+ };
+
+ StateType State;
+
+  // This lambda handles both the expiry test and the hazard detection
+ auto IsHazardFn = [&, this](StateType &State, const MachineInstr &I) {
+ // Too many VALU states have passed
+ if (State.VALUs > IntvMaxVALUs || State.TRANS > IntvMaxTRANS)
+ return HazardExpired;
+
+    // Instructions which cause va_vdst==0 expire the hazard
+ if (SIInstrInfo::isVMEM(I) || SIInstrInfo::isFLAT(I) ||
+ SIInstrInfo::isDS(I) || SIInstrInfo::isEXP(I) ||
+ (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
+ I.getOperand(0).getImm() == 0x0fff))
+ return HazardExpired;
+
+    // Track register writes
+ if (SIInstrInfo::isTRANS(I)) {
+ for (Register Src : SrcVGPRs) {
+ if (I.modifiesRegister(Src, &TRI)) {
+ return HazardFound;
+ }
+ }
+ }
+
+ return NoHazardFound;
+ };
+ auto UpdateStateFn = [](StateType &State, const MachineInstr &MI) {
+ if (SIInstrInfo::isVALU(MI))
+ State.VALUs += 1;
+ if (SIInstrInfo::isTRANS(MI))
+ State.TRANS += 1;
+ };
+
+ DenseSet<const MachineBasicBlock *> Visited;
+ if (!hasHazard<StateType>(State, IsHazardFn, UpdateStateFn, MI->getParent(),
+ std::next(MI->getReverseIterator()), Visited))
+ return false;
+
+  // Hazard is observed - insert a wait on the va_vdst counter to ensure the
+  // hazard is avoided (mask 0x0fff achieves this).
+ BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
+ TII.get(AMDGPU::S_WAITCNT_DEPCTR))
+ .addImm(0x0fff);
+
+ return true;
+}
+
+bool GCNHazardRecognizer::fixWMMAHazards(MachineInstr *MI) {
+ if (!SIInstrInfo::isWMMA(*MI))
+ return false;
+
+ const SIInstrInfo *TII = ST.getInstrInfo();
+ const SIRegisterInfo *TRI = ST.getRegisterInfo();
+
+ auto IsHazardFn = [MI, TII, TRI](const MachineInstr &I) {
+ if (!SIInstrInfo::isWMMA(I))
+ return false;
+
+ // Src0 or Src1 of the current wmma instruction overlaps with the dest of
+ // the previous wmma.
+ const Register CurSrc0Reg =
+ TII->getNamedOperand(*MI, AMDGPU::OpName::src0)->getReg();
+ const Register CurSrc1Reg =
+ TII->getNamedOperand(*MI, AMDGPU::OpName::src1)->getReg();
+
+ const Register PrevDstReg =
+ TII->getNamedOperand(I, AMDGPU::OpName::vdst)->getReg();
+
+ if (TRI->regsOverlap(PrevDstReg, CurSrc0Reg) ||
+ TRI->regsOverlap(PrevDstReg, CurSrc1Reg)) {
+ return true;
+ }
+
+ // Src2 of the current wmma instruction overlaps with the dest of the
+ // previous wmma.
+ const MachineOperand *Src2 =
+ TII->getNamedOperand(*MI, AMDGPU::OpName::src2);
+ const Register CurSrc2Reg = Src2->isReg() ? Src2->getReg() : Register();
+
+ if (CurSrc2Reg != AMDGPU::NoRegister &&
+ TRI->regsOverlap(PrevDstReg, CurSrc2Reg)) {
+
+ const MachineOperand *Src2Mods =
+ TII->getNamedOperand(*MI, AMDGPU::OpName::src2_modifiers);
+ const bool NoSrc2Mods =
+ (Src2Mods->getImm() & (SISrcMods::NEG | SISrcMods::NEG_HI)) == 0;
+ // Exception: there is no hazard if the wmma instructions are of the same
+ // type and there is no input modifier on src2 of the current instruction.
+ return !(NoSrc2Mods && (TII->pseudoToMCOpcode(I.getOpcode()) ==
+ TII->pseudoToMCOpcode(MI->getOpcode())));
+ }
+
+ return false;
+ };
+
+ auto IsExpiredFn = [](const MachineInstr &I, int) {
+ return SIInstrInfo::isVALU(I);
+ };
+
+ if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
+ std::numeric_limits<int>::max())
+ return false;
+
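+  // Break the dependency between the two WMMA instructions with a V_NOP.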
+ BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII->get(AMDGPU::V_NOP_e32));
+
+ return true;
+}
+
int GCNHazardRecognizer::checkNSAtoVMEMHazard(MachineInstr *MI) {
int NSAtoVMEMWaitStates = 1;
@@ -1223,6 +1801,36 @@ int GCNHazardRecognizer::checkMAIHazards(MachineInstr *MI) {
return ST.hasGFX90AInsts() ? checkMAIHazards90A(MI) : checkMAIHazards908(MI);
}
+int GCNHazardRecognizer::checkMFMAPadding(MachineInstr *MI) {
+ // Early exit if no padding is requested.
+ if (MFMAPaddingRatio == 0)
+ return 0;
+
+ const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+ if (!SIInstrInfo::isMFMA(*MI) || MFI->getOccupancy() < 2)
+ return 0;
+
+ int NeighborMFMALatency = 0;
+ auto IsNeighboringMFMA = [&NeighborMFMALatency,
+ this](const MachineInstr &MI) {
+ if (!SIInstrInfo::isMFMA(MI))
+ return false;
+
+ NeighborMFMALatency = this->getMFMAPipelineWaitStates(MI);
+ return true;
+ };
+
+ const int MaxMFMAPipelineWaitStates = 16;
+ int WaitStatesSinceNeighborMFMA =
+ getWaitStatesSince(IsNeighboringMFMA, MaxMFMAPipelineWaitStates);
+
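+  // E.g. a 16 cycle MFMA with -amdgpu-mfma-padding-ratio=50 requests up to 8
+  // wait states of padding, less whatever has already elapsed.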
+ int NeighborMFMAPaddingNeeded =
+ (NeighborMFMALatency * MFMAPaddingRatio / 100) -
+ WaitStatesSinceNeighborMFMA;
+
+ return std::max(0, NeighborMFMAPaddingNeeded);
+}
+
int GCNHazardRecognizer::checkMAIHazards908(MachineInstr *MI) {
int WaitStatesNeeded = 0;
unsigned Opc = MI->getOpcode();
@@ -1257,12 +1865,6 @@ int GCNHazardRecognizer::checkMAIHazards908(MachineInstr *MI) {
}
}
- auto IsMFMAFn = [](const MachineInstr &MI) {
- return SIInstrInfo::isMAI(MI) &&
- MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64 &&
- MI.getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64;
- };
-
for (const MachineOperand &Op : MI->explicit_operands()) {
if (!Op.isReg() || !TRI.isAGPR(MF.getRegInfo(), Op.getReg()))
continue;
@@ -1282,9 +1884,9 @@ int GCNHazardRecognizer::checkMAIHazards908(MachineInstr *MI) {
Register Reg = Op.getReg();
unsigned HazardDefLatency = 0;
- auto IsOverlappedMFMAFn = [Reg, &IsMFMAFn, &HazardDefLatency,
+ auto IsOverlappedMFMAFn = [Reg, &HazardDefLatency,
this](const MachineInstr &MI) {
- if (!IsMFMAFn(MI))
+ if (!SIInstrInfo::isMFMA(MI))
return false;
Register DstReg = MI.getOperand(0).getReg();
if (DstReg == Reg)
@@ -1361,9 +1963,9 @@ int GCNHazardRecognizer::checkMAIHazards908(MachineInstr *MI) {
Register DstReg = MI->getOperand(0).getReg();
unsigned HazardDefLatency = 0;
- auto IsSrcCMFMAFn = [DstReg, &IsMFMAFn, &HazardDefLatency,
+ auto IsSrcCMFMAFn = [DstReg, &HazardDefLatency,
this](const MachineInstr &MI) {
- if (!IsMFMAFn(MI))
+ if (!SIInstrInfo::isMFMA(MI))
return false;
Register Reg = TII.getNamedOperand(MI, AMDGPU::OpName::src2)->getReg();
HazardDefLatency =
@@ -1387,6 +1989,9 @@ int GCNHazardRecognizer::checkMAIHazards908(MachineInstr *MI) {
WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
}
+ // Pad neighboring MFMA with noops for better inter-wave performance.
+ WaitStatesNeeded = std::max(WaitStatesNeeded, checkMFMAPadding(MI));
+
return WaitStatesNeeded;
}
@@ -1394,21 +1999,16 @@ int GCNHazardRecognizer::checkMAIHazards90A(MachineInstr *MI) {
int WaitStatesNeeded = 0;
unsigned Opc = MI->getOpcode();
- auto IsMFMAFn = [](const MachineInstr &MI) {
- return SIInstrInfo::isMAI(MI) &&
- MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64 &&
- MI.getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64;
- };
-
- auto IsLegacyVALUFn = [&IsMFMAFn](const MachineInstr &MI) {
- return SIInstrInfo::isVALU(MI) && !IsMFMAFn(MI);
+ auto IsLegacyVALUFn = [](const MachineInstr &MI) {
+ return SIInstrInfo::isVALU(MI) && !SIInstrInfo::isMFMA(MI);
};
- auto IsLegacyVALUNotDotFn = [&IsMFMAFn](const MachineInstr &MI) {
- return SIInstrInfo::isVALU(MI) && !IsMFMAFn(MI) && !SIInstrInfo::isDOT(MI);
+ auto IsLegacyVALUNotDotFn = [](const MachineInstr &MI) {
+ return SIInstrInfo::isVALU(MI) && !SIInstrInfo::isMFMA(MI) &&
+ !SIInstrInfo::isDOT(MI);
};
- if (!IsMFMAFn(*MI))
+ if (!SIInstrInfo::isMFMA(*MI))
return WaitStatesNeeded;
const int VALUWritesExecWaitStates = 4;
@@ -1423,6 +2023,13 @@ int GCNHazardRecognizer::checkMAIHazards90A(MachineInstr *MI) {
for (const MachineOperand &Use : MI->explicit_uses()) {
const int LegacyVALUNotDotWritesVGPRWaitStates = 2;
const int SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates = 2;
+ const int GFX940_XDL2PassWritesVGPROverlappedSMFMASrcCWaitStates = 3;
+ const int GFX940_XDL4PassWritesVGPROverlappedSMFMASrcCWaitStates = 5;
+ const int GFX940_SMFMA4PassWritesVGPROverlappedSMFMASrcCWaitStates = 4;
+ const int GFX940_XDL8PassWritesVGPROverlappedSMFMASrcCWaitStates = 9;
+ const int GFX940_SMFMA8PassWritesVGPROverlappedSMFMASrcCWaitStates = 8;
+ const int GFX940_XDL16PassWritesVGPROverlappedSMFMASrcCWaitStates = 17;
+ const int GFX940_SMFMA16PassWritesVGPROverlappedSMFMASrcCWaitStates = 16;
const int SMFMA16x16WritesVGPROverlappedSMFMASrcCWaitStates = 8;
const int SMFMA32x32WritesVGPROverlappedSMFMASrcCWaitStates = 16;
const int SMFMA4x4WritesVGPROverlappedDMFMASrcCWaitStates = 3;
@@ -1433,9 +2040,18 @@ int GCNHazardRecognizer::checkMAIHazards90A(MachineInstr *MI) {
const int SMFMA4x4WritesVGPROverlappedSrcABWaitStates = 5;
const int SMFMA16x16WritesVGPROverlappedSrcABWaitStates = 11;
const int SMFMA32x32WritesVGPROverlappedSrcABWaitStates = 19;
+ const int GFX940_SMFMA2PassWritesVGPROverlappedSrcABWaitStates = 4;
+ const int GFX940_SMFMA4PassWritesVGPROverlappedSrcABWaitStates = 6;
+ const int GFX940_SMFMA8PassWritesVGPROverlappedSrcABWaitStates = 10;
+ const int GFX940_SMFMA16PassWritesVGPROverlappedSrcABWaitStates = 18;
+ const int GFX940_XDL2PassWritesVGPROverlappedSrcABWaitStates = 5;
+ const int GFX940_XDL4PassWritesVGPROverlappedSrcABWaitStates = 7;
+ const int GFX940_XDL8PassWritesVGPROverlappedSrcABWaitStates = 11;
+ const int GFX940_XDL16PassWritesVGPROverlappedSrcABWaitStates = 19;
const int DMFMA4x4WritesVGPROverlappedMFMASrcABWaitStates = 6;
const int DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates = 11;
const int DMFMA4x4WritesVGPRFullSrcCWaitStates = 4;
+ const int GFX940_SMFMA4x4WritesVGPRFullSrcCWaitStates = 2;
const int MaxWaitStates = 19;
if (!Use.isReg())
@@ -1444,9 +2060,9 @@ int GCNHazardRecognizer::checkMAIHazards90A(MachineInstr *MI) {
bool FullReg;
const MachineInstr *MI1;
- auto IsOverlappedMFMAFn = [Reg, &IsMFMAFn, &FullReg, &MI1,
+ auto IsOverlappedMFMAFn = [Reg, &FullReg, &MI1,
this](const MachineInstr &MI) {
- if (!IsMFMAFn(MI))
+ if (!SIInstrInfo::isMFMA(MI))
return false;
Register DstReg = MI.getOperand(0).getReg();
FullReg = (DstReg == Reg);
@@ -1467,7 +2083,7 @@ int GCNHazardRecognizer::checkMAIHazards90A(MachineInstr *MI) {
unsigned Opc1 = MI1->getOpcode();
int NeedWaitStates = 0;
if (OpNo == SrcCIdx) {
- if (!isDGEMM(Opc) && isDGEMM(Opc1)) {
+ if (!isDGEMM(Opc) && (!ST.hasGFX940Insts() && isDGEMM(Opc1))) {
NeedWaitStates = 0;
} else if (FullReg) {
if ((Opc == AMDGPU::V_MFMA_F64_4X4X4F64_e64 ||
@@ -1475,6 +2091,9 @@ int GCNHazardRecognizer::checkMAIHazards90A(MachineInstr *MI) {
(Opc1 == AMDGPU::V_MFMA_F64_4X4X4F64_e64 ||
Opc1 == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64))
NeedWaitStates = DMFMA4x4WritesVGPRFullSrcCWaitStates;
+ else if (ST.hasGFX940Insts() &&
+ TSchedModel.computeInstrLatency(MI1) == 2)
+ NeedWaitStates = GFX940_SMFMA4x4WritesVGPRFullSrcCWaitStates;
} else {
switch (Opc1) {
case AMDGPU::V_MFMA_F64_16X16X4F64_e64:
@@ -1490,22 +2109,42 @@ int GCNHazardRecognizer::checkMAIHazards90A(MachineInstr *MI) {
NeedWaitStates = DMFMA4x4WritesVGPROverlappedSrcCWaitStates;
break;
default:
+ if (ST.hasGFX940Insts() && isXDL(ST, *MI) && !isXDL(ST, *MI1))
+ break;
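+          // For GFX940 the wait count is keyed off the producer MFMA's pass
+          // count (latency 2/4/8/16) and whether it is an XDL op.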
switch (TSchedModel.computeInstrLatency(MI1)) {
case 2:
- NeedWaitStates = isDGEMM(Opc)
- ? SMFMA4x4WritesVGPROverlappedDMFMASrcCWaitStates
- : SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates;
+ NeedWaitStates = ST.hasGFX940Insts()
+ ? isXDL(ST, *MI1)
+ ? GFX940_XDL2PassWritesVGPROverlappedSMFMASrcCWaitStates
+ : SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates
+ : isDGEMM(Opc)
+ ? SMFMA4x4WritesVGPROverlappedDMFMASrcCWaitStates
+ : SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates;
+ break;
+ case 4:
+ assert(ST.hasGFX940Insts());
+ NeedWaitStates = isXDL(ST, *MI1)
+ ? GFX940_XDL4PassWritesVGPROverlappedSMFMASrcCWaitStates
+ : GFX940_SMFMA4PassWritesVGPROverlappedSMFMASrcCWaitStates;
break;
case 8:
- NeedWaitStates = isDGEMM(Opc)
- ? SMFMA16x16WritesVGPROverlappedDMFMASrcCWaitStates
- : SMFMA16x16WritesVGPROverlappedSMFMASrcCWaitStates;
+ NeedWaitStates = ST.hasGFX940Insts()
+ ? isXDL(ST, *MI1)
+ ? GFX940_XDL8PassWritesVGPROverlappedSMFMASrcCWaitStates
+ : GFX940_SMFMA8PassWritesVGPROverlappedSMFMASrcCWaitStates
+ : isDGEMM(Opc)
+ ? SMFMA16x16WritesVGPROverlappedDMFMASrcCWaitStates
+ : SMFMA16x16WritesVGPROverlappedSMFMASrcCWaitStates;
break;
case 16: LLVM_FALLTHROUGH;
default:
- NeedWaitStates = isDGEMM(Opc)
- ? SMFMA32x32WritesVGPROverlappedDMFMASrcCWaitStates
- : SMFMA32x32WritesVGPROverlappedSMFMASrcCWaitStates;
+ NeedWaitStates = ST.hasGFX940Insts()
+ ? isXDL(ST, *MI1)
+ ? GFX940_XDL16PassWritesVGPROverlappedSMFMASrcCWaitStates
+ : GFX940_SMFMA16PassWritesVGPROverlappedSMFMASrcCWaitStates
+ : isDGEMM(Opc)
+ ? SMFMA32x32WritesVGPROverlappedDMFMASrcCWaitStates
+ : SMFMA32x32WritesVGPROverlappedSMFMASrcCWaitStates;
}
}
}
@@ -1524,14 +2163,32 @@ int GCNHazardRecognizer::checkMAIHazards90A(MachineInstr *MI) {
default:
switch (TSchedModel.computeInstrLatency(MI1)) {
case 2:
- NeedWaitStates = SMFMA4x4WritesVGPROverlappedSrcABWaitStates;
+ NeedWaitStates = ST.hasGFX940Insts()
+ ? isXDL(ST, *MI1)
+ ? GFX940_XDL2PassWritesVGPROverlappedSrcABWaitStates
+ : GFX940_SMFMA2PassWritesVGPROverlappedSrcABWaitStates
+ : SMFMA4x4WritesVGPROverlappedSrcABWaitStates;
+ break;
+ case 4:
+ assert(ST.hasGFX940Insts());
+ NeedWaitStates = isXDL(ST, *MI1)
+ ? GFX940_XDL4PassWritesVGPROverlappedSrcABWaitStates
+ : GFX940_SMFMA4PassWritesVGPROverlappedSrcABWaitStates;
break;
case 8:
- NeedWaitStates = SMFMA16x16WritesVGPROverlappedSrcABWaitStates;
+ NeedWaitStates = ST.hasGFX940Insts()
+ ? isXDL(ST, *MI1)
+ ? GFX940_XDL8PassWritesVGPROverlappedSrcABWaitStates
+ : GFX940_SMFMA8PassWritesVGPROverlappedSrcABWaitStates
+ : SMFMA16x16WritesVGPROverlappedSrcABWaitStates;
break;
case 16: LLVM_FALLTHROUGH;
default:
- NeedWaitStates = SMFMA32x32WritesVGPROverlappedSrcABWaitStates;
+ NeedWaitStates = ST.hasGFX940Insts()
+ ? isXDL(ST, *MI1)
+ ? GFX940_XDL16PassWritesVGPROverlappedSrcABWaitStates
+ : GFX940_SMFMA16PassWritesVGPROverlappedSrcABWaitStates
+ : SMFMA32x32WritesVGPROverlappedSrcABWaitStates;
}
}
}
@@ -1599,18 +2256,12 @@ int GCNHazardRecognizer::checkMAIVALUHazards(MachineInstr *MI) {
if (!ST.hasGFX90AInsts())
return 0;
- auto IsMFMAFn = [](const MachineInstr &MI) -> bool {
- return SIInstrInfo::isMAI(MI) &&
- MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64 &&
- MI.getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64;
- };
-
auto IsDGEMMFn = [](const MachineInstr &MI) -> bool {
return isDGEMM(MI.getOpcode());
};
// This is checked in checkMAIHazards90A()
- if (IsMFMAFn(*MI))
+ if (SIInstrInfo::isMFMA(*MI))
return 0;
int WaitStatesNeeded = 0;
@@ -1623,8 +2274,9 @@ int GCNHazardRecognizer::checkMAIVALUHazards(MachineInstr *MI) {
const MachineInstr *MFMA = nullptr;
unsigned Reg;
- auto IsMFMAWriteFn = [&Reg, &IsMFMAFn, &MFMA, this](const MachineInstr &MI) {
- if (!IsMFMAFn(MI) || !TRI.regsOverlap(MI.getOperand(0).getReg(), Reg))
+ auto IsMFMAWriteFn = [&Reg, &MFMA, this](const MachineInstr &MI) {
+ if (!SIInstrInfo::isMFMA(MI) ||
+ !TRI.regsOverlap(MI.getOperand(0).getReg(), Reg))
return false;
MFMA = &MI;
return true;
@@ -1646,6 +2298,14 @@ int GCNHazardRecognizer::checkMAIVALUHazards(MachineInstr *MI) {
const int SMFMA4x4WriteVgprVALUMemExpReadWaitStates = 5;
const int SMFMA16x16WriteVgprVALUMemExpReadWaitStates = 11;
const int SMFMA32x32WriteVgprVALUMemExpReadWaitStates = 19;
+ const int GFX940_SMFMA2PassWriteVgprVALUMemExpReadWaitStates = 4;
+ const int GFX940_SMFMA4PassWriteVgprVALUMemExpReadWaitStates = 6;
+ const int GFX940_SMFMA8PassWriteVgprVALUMemExpReadWaitStates = 10;
+ const int GFX940_SMFMA16PassWriteVgprVALUMemExpReadWaitStates = 18;
+ const int GFX940_XDL2PassWriteVgprVALUMemExpReadWaitStates = 5;
+ const int GFX940_XDL4PassWriteVgprVALUMemExpReadWaitStates = 7;
+ const int GFX940_XDL8PassWriteVgprVALUMemExpReadWaitStates = 11;
+ const int GFX940_XDL16PassWriteVgprVALUMemExpReadWaitStates = 19;
const int DMFMA4x4WriteVgprMemExpReadWaitStates = 9;
const int DMFMA16x16WriteVgprMemExpReadWaitStates = 18;
const int DMFMA4x4WriteVgprVALUReadWaitStates = 6;
@@ -1685,16 +2345,30 @@ int GCNHazardRecognizer::checkMAIVALUHazards(MachineInstr *MI) {
int NeedWaitStates = MaxWaitStates;
switch (HazardDefLatency) {
case 2:
- NeedWaitStates = SMFMA4x4WriteVgprVALUMemExpReadWaitStates;
+ NeedWaitStates =
+ ST.hasGFX940Insts()
+ ? isXDL(ST, *MFMA)
+ ? GFX940_XDL2PassWriteVgprVALUMemExpReadWaitStates
+ : GFX940_SMFMA2PassWriteVgprVALUMemExpReadWaitStates
+ : SMFMA4x4WriteVgprVALUMemExpReadWaitStates;
break;
case 4:
- assert(isDGEMM(MFMA->getOpcode()));
+ assert(isDGEMM(MFMA->getOpcode()) || ST.hasGFX940Insts());
NeedWaitStates =
- IsMemOrExport ? DMFMA4x4WriteVgprMemExpReadWaitStates
- : DMFMA4x4WriteVgprVALUReadWaitStates;
+ isDGEMM(MFMA->getOpcode())
+ ? IsMemOrExport ? DMFMA4x4WriteVgprMemExpReadWaitStates
+ : DMFMA4x4WriteVgprVALUReadWaitStates
+ : isXDL(ST, *MFMA)
+ ? GFX940_XDL4PassWriteVgprVALUMemExpReadWaitStates
+ : GFX940_SMFMA4PassWriteVgprVALUMemExpReadWaitStates;
break;
case 8:
- NeedWaitStates = SMFMA16x16WriteVgprVALUMemExpReadWaitStates;
+ NeedWaitStates =
+ ST.hasGFX940Insts()
+ ? isXDL(ST, *MFMA)
+ ? GFX940_XDL8PassWriteVgprVALUMemExpReadWaitStates
+ : GFX940_SMFMA8PassWriteVgprVALUMemExpReadWaitStates
+ : SMFMA16x16WriteVgprVALUMemExpReadWaitStates;
break;
case 16: LLVM_FALLTHROUGH;
default:
@@ -1702,7 +2376,11 @@ int GCNHazardRecognizer::checkMAIVALUHazards(MachineInstr *MI) {
isDGEMM(MFMA->getOpcode())
? IsMemOrExport ? DMFMA16x16WriteVgprMemExpReadWaitStates
: DMFMA16x16WriteVgprVALUReadWaitStates
- : SMFMA32x32WriteVgprVALUMemExpReadWaitStates;
+ : ST.hasGFX940Insts()
+ ? isXDL(ST, *MFMA)
+ ? GFX940_XDL16PassWriteVgprVALUMemExpReadWaitStates
+ : GFX940_SMFMA16PassWriteVgprVALUMemExpReadWaitStates
+ : SMFMA32x32WriteVgprVALUMemExpReadWaitStates;
break;
}
@@ -1732,7 +2410,16 @@ int GCNHazardRecognizer::checkMAIVALUHazards(MachineInstr *MI) {
const int SMFMA4x4WriteVgprVALUWawWaitStates = 5;
const int SMFMA16x16WriteVgprVALUWawWaitStates = 11;
const int SMFMA32x32WriteVgprVALUWawWaitStates = 19;
+ const int GFX940_SMFMA2PassWriteVgprVALUWawWaitStates = 4;
+ const int GFX940_SMFMA4PassWriteVgprVALUWawWaitStates = 6;
+ const int GFX940_SMFMA8PassWriteVgprVALUWawWaitStates = 10;
+ const int GFX940_SMFMA16PassWriteVgprVALUWawWaitStates = 18;
+ const int GFX940_XDL2PassWriteVgprVALUWawWaitStates = 5;
+ const int GFX940_XDL4PassWriteVgprVALUWawWaitStates = 7;
+ const int GFX940_XDL8PassWriteVgprVALUWawWaitStates = 11;
+ const int GFX940_XDL16PassWriteVgprVALUWawWaitStates = 19;
const int SMFMA4x4ReadVgprVALUWarWaitStates = 1;
+ const int GFX940_XDL4PassReadVgprVALUWarWaitStates = 3;
const int SMFMA16x16ReadVgprVALUWarWaitStates = 7;
const int SMFMA32x32ReadVgprVALUWarWaitStates = 15;
const int DMFMA4x4WriteVgprVALUWriteWaitStates = 6;
@@ -1757,19 +2444,35 @@ int GCNHazardRecognizer::checkMAIVALUHazards(MachineInstr *MI) {
int NeedWaitStates = MaxWaitStates;
switch (TSchedModel.computeInstrLatency(MFMA)) {
case 2:
- NeedWaitStates = SMFMA4x4WriteVgprVALUWawWaitStates;
+ NeedWaitStates = ST.hasGFX940Insts()
+ ? isXDL(ST, *MFMA)
+ ? GFX940_XDL2PassWriteVgprVALUWawWaitStates
+ : GFX940_SMFMA2PassWriteVgprVALUWawWaitStates
+ : SMFMA4x4WriteVgprVALUWawWaitStates;
break;
case 4:
- assert(isDGEMM(MFMA->getOpcode()));
- NeedWaitStates = DMFMA4x4WriteVgprVALUWriteWaitStates;
+ assert(isDGEMM(MFMA->getOpcode()) || ST.hasGFX940Insts());
+ NeedWaitStates = isDGEMM(MFMA->getOpcode())
+ ? DMFMA4x4WriteVgprVALUWriteWaitStates
+ : isXDL(ST, *MFMA)
+ ? GFX940_XDL4PassWriteVgprVALUWawWaitStates
+ : GFX940_SMFMA4PassWriteVgprVALUWawWaitStates;
break;
case 8:
- NeedWaitStates = SMFMA16x16WriteVgprVALUWawWaitStates;
+ NeedWaitStates = ST.hasGFX940Insts()
+ ? isXDL(ST, *MFMA)
+ ? GFX940_XDL8PassWriteVgprVALUWawWaitStates
+ : GFX940_SMFMA8PassWriteVgprVALUWawWaitStates
+ : SMFMA16x16WriteVgprVALUWawWaitStates;
break;
case 16: LLVM_FALLTHROUGH;
default:
NeedWaitStates = isDGEMM(MFMA->getOpcode())
? DMFMA16x16WriteVgprVALUWriteWaitStates
+ : ST.hasGFX940Insts()
+ ? isXDL(ST, *MFMA)
+ ? GFX940_XDL16PassWriteVgprVALUWawWaitStates
+ : GFX940_SMFMA16PassWriteVgprVALUWawWaitStates
: SMFMA32x32WriteVgprVALUWawWaitStates;
break;
}
@@ -1781,12 +2484,14 @@ int GCNHazardRecognizer::checkMAIVALUHazards(MachineInstr *MI) {
break;
}
- auto IsSMFMAReadAsCFn = [&Reg, &IsMFMAFn, &MFMA,
- this](const MachineInstr &MI) {
- if (!IsMFMAFn(MI) || isDGEMM(MI.getOpcode()) ||
+ auto IsSMFMAReadAsCFn = [&Reg, &MFMA, this](const MachineInstr &MI) {
+ if (!SIInstrInfo::isMFMA(MI) || isDGEMM(MI.getOpcode()) ||
!MI.readsRegister(Reg, &TRI))
return false;
+ if (ST.hasGFX940Insts() && !isXDL(ST, MI))
+ return false;
+
const MachineOperand *SrcC =
TII.getNamedOperand(MI, AMDGPU::OpName::src2);
assert(SrcC);
@@ -1808,6 +2513,9 @@ int GCNHazardRecognizer::checkMAIVALUHazards(MachineInstr *MI) {
switch (HazardDefLatency) {
case 2: NeedWaitStates = SMFMA4x4ReadVgprVALUWarWaitStates;
break;
+ case 4: assert(ST.hasGFX940Insts());
+ NeedWaitStates = GFX940_XDL4PassReadVgprVALUWarWaitStates;
+ break;
case 8: NeedWaitStates = SMFMA16x16ReadVgprVALUWarWaitStates;
break;
case 16: LLVM_FALLTHROUGH;
@@ -1827,11 +2535,10 @@ bool GCNHazardRecognizer::ShouldPreferAnother(SUnit *SU) {
return false;
const MachineInstr *MAI = nullptr;
+
auto IsMFMAFn = [&MAI](const MachineInstr &MI) {
MAI = nullptr;
- if (SIInstrInfo::isMAI(MI) &&
- MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64 &&
- MI.getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64)
+ if (SIInstrInfo::isMFMA(MI))
MAI = &MI;
return MAI != nullptr;
};
diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h
index 716bc027a894..57f5a04c6eda 100644
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h
@@ -62,6 +62,10 @@ private:
void addClauseInst(const MachineInstr &MI);
+ /// \returns the number of wait states before another MFMA instruction can be
+ /// issued after \p MI.
+ unsigned getMFMAPipelineWaitStates(const MachineInstr &MI) const;
+
// Advance over a MachineInstr bundle. Look for hazards in the bundled
// instructions.
void processBundle();
@@ -92,10 +96,31 @@ private:
bool fixSMEMtoVectorWriteHazards(MachineInstr *MI);
bool fixVcmpxExecWARHazard(MachineInstr *MI);
bool fixLdsBranchVmemWARHazard(MachineInstr *MI);
+ bool fixLdsDirectVALUHazard(MachineInstr *MI);
+ bool fixLdsDirectVMEMHazard(MachineInstr *MI);
+ bool fixVALUPartialForwardingHazard(MachineInstr *MI);
+ bool fixVALUTransUseHazard(MachineInstr *MI);
+ bool fixWMMAHazards(MachineInstr *MI);
int checkMAIHazards(MachineInstr *MI);
int checkMAIHazards908(MachineInstr *MI);
int checkMAIHazards90A(MachineInstr *MI);
+ /// Pad the latency between neighboring MFMA instructions with s_nops. The
+ /// percentage of wait states to fill with s_nops is specified by the command
+ /// line option '-amdgpu-mfma-padding-ratio'.
+ ///
+ /// For example, with '-amdgpu-mfma-padding-ratio=100':
+ ///
+ /// 2 pass MFMA instructions have a latency of 2 wait states. Therefore, a
+ /// 'S_NOP 1' will be added between sequential MFMA instructions.
+ ///
+ /// V_MFMA_F32_4X4X1F32
+ /// V_MFMA_F32_4X4X1F32
+ ///-->
+ /// V_MFMA_F32_4X4X1F32
+ /// S_NOP 1
+ /// V_MFMA_F32_4X4X1F32
+ int checkMFMAPadding(MachineInstr *MI);
int checkMAIVALUHazards(MachineInstr *MI);
int checkMAILdStHazards(MachineInstr *MI);
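The checkMFMAPadding comment above explains the padding only through an example. As a rough, hypothetical sketch (not the in-tree implementation) of the arithmetic that example implies, assuming the ratio is applied to the MFMA's wait-state latency and using the fact that S_NOP N stalls for N+1 cycles:

#include <algorithm>
#include <cstdio>

// Hypothetical helper for illustration only: translate an MFMA latency and an
// -amdgpu-mfma-padding-ratio percentage into the immediate of the s_nop to
// insert between sequential MFMAs (a negative result means no padding).
static int sNopOperandForPadding(int MfmaLatencyWaitStates, int PaddingRatio) {
  int WaitStatesToFill = MfmaLatencyWaitStates * PaddingRatio / 100;
  return std::max(WaitStatesToFill - 1, -1); // S_NOP N waits N+1 cycles.
}

int main() {
  // The example from the comment: ratio 100 and a 2-pass MFMA (2 wait states).
  std::printf("S_NOP %d\n", sNopOperandForPadding(2, 100)); // prints S_NOP 1
  return 0;
}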
diff --git a/llvm/lib/Target/AMDGPU/GCNNSAReassign.cpp b/llvm/lib/Target/AMDGPU/GCNNSAReassign.cpp
index 9f98f9ada802..6f82148854c4 100644
--- a/llvm/lib/Target/AMDGPU/GCNNSAReassign.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNNSAReassign.cpp
@@ -1,4 +1,4 @@
-//===-- GCNNSAReassign.cpp - Reassign registers in NSA unstructions -------===//
+//===-- GCNNSAReassign.cpp - Reassign registers in NSA instructions -------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
@@ -8,7 +8,7 @@
//
/// \file
/// \brief Try to reassign registers on GFX10+ from non-sequential to sequential
-/// in NSA image instructions. Later SIShrinkInstructions pass will relace NSA
+/// in NSA image instructions. Later SIShrinkInstructions pass will replace NSA
/// with sequential versions where possible.
///
//===----------------------------------------------------------------------===//
@@ -16,10 +16,12 @@
#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "SIMachineFunctionInfo.h"
+#include "SIRegisterInfo.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/LiveRegMatrix.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/VirtRegMap.h"
#include "llvm/InitializePasses.h"
using namespace llvm;
@@ -159,15 +161,23 @@ GCNNSAReassign::scavengeRegs(SmallVectorImpl<LiveInterval *> &Intervals) const {
GCNNSAReassign::NSA_Status
GCNNSAReassign::CheckNSA(const MachineInstr &MI, bool Fast) const {
const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(MI.getOpcode());
- if (!Info || Info->MIMGEncoding != AMDGPU::MIMGEncGfx10NSA)
+ if (!Info)
return NSA_Status::NOT_NSA;
+ switch (Info->MIMGEncoding) {
+ case AMDGPU::MIMGEncGfx10NSA:
+ case AMDGPU::MIMGEncGfx11NSA:
+ break;
+ default:
+ return NSA_Status::NOT_NSA;
+ }
+
int VAddr0Idx =
AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vaddr0);
unsigned VgprBase = 0;
bool NSA = false;
- for (unsigned I = 0; I < Info->VAddrDwords; ++I) {
+ for (unsigned I = 0; I < Info->VAddrOperands; ++I) {
const MachineOperand &Op = MI.getOperand(VAddr0Idx + I);
Register Reg = Op.getReg();
if (Reg.isPhysical() || !VRM->isAssignedReg(Reg))
@@ -179,15 +189,16 @@ GCNNSAReassign::CheckNSA(const MachineInstr &MI, bool Fast) const {
if (!PhysReg)
return NSA_Status::FIXED;
+ // TODO: address the below limitation to handle GFX11 BVH instructions
// Bail if address is not a VGPR32. That should be possible to extend the
// optimization to work with subregs of a wider register tuples, but the
// logic to find free registers will be much more complicated with much
// less chances for success. That seems reasonable to assume that in most
// cases a tuple is used because a vector variable contains different
- // parts of an address and it is either already consequitive or cannot
+ // parts of an address and it is either already consecutive or cannot
// be reassigned if not. If needed it is better to rely on register
// coalescer to process such address tuples.
- if (MRI->getRegClass(Reg) != &AMDGPU::VGPR_32RegClass || Op.getSubReg())
+ if (TRI->getRegSizeInBits(*MRI->getRegClass(Reg)) != 32 || Op.getSubReg())
return NSA_Status::FIXED;
// InlineSpiller does not call LRM::assign() after an LI split leaving
@@ -278,7 +289,7 @@ bool GCNNSAReassign::runOnMachineFunction(MachineFunction &MF) {
SmallVector<LiveInterval *, 16> Intervals;
SmallVector<MCRegister, 16> OrigRegs;
SlotIndex MinInd, MaxInd;
- for (unsigned I = 0; I < Info->VAddrDwords; ++I) {
+ for (unsigned I = 0; I < Info->VAddrOperands; ++I) {
const MachineOperand &Op = MI->getOperand(VAddr0Idx + I);
Register Reg = Op.getReg();
LiveInterval *LI = &LIS->getInterval(Reg);
@@ -331,11 +342,11 @@ bool GCNNSAReassign::runOnMachineFunction(MachineFunction &MF) {
}
if (!Success) {
- for (unsigned I = 0; I < Info->VAddrDwords; ++I)
+ for (unsigned I = 0; I < Info->VAddrOperands; ++I)
if (VRM->hasPhys(Intervals[I]->reg()))
LRM->unassign(*Intervals[I]);
- for (unsigned I = 0; I < Info->VAddrDwords; ++I)
+ for (unsigned I = 0; I < Info->VAddrOperands; ++I)
LRM->assign(*Intervals[I], OrigRegs[I]);
continue;
diff --git a/llvm/lib/Target/AMDGPU/GCNProcessors.td b/llvm/lib/Target/AMDGPU/GCNProcessors.td
index 3a68ed1934e1..281474994bca 100644
--- a/llvm/lib/Target/AMDGPU/GCNProcessors.td
+++ b/llvm/lib/Target/AMDGPU/GCNProcessors.td
@@ -192,6 +192,10 @@ def : ProcessorModel<"gfx90c", SIQuarterSpeedModel,
FeatureISAVersion9_0_C.Features
>;
+def : ProcessorModel<"gfx940", SIDPGFX940FullSpeedModel,
+ FeatureISAVersion9_4_0.Features
+>;
+
//===----------------------------------------------------------------------===//
// GCN GFX10.
//===----------------------------------------------------------------------===//
@@ -235,3 +239,27 @@ def : ProcessorModel<"gfx1034", GFX10SpeedModel,
def : ProcessorModel<"gfx1035", GFX10SpeedModel,
FeatureISAVersion10_3_0.Features
>;
+
+def : ProcessorModel<"gfx1036", GFX10SpeedModel,
+ FeatureISAVersion10_3_0.Features
+>;
+
+//===----------------------------------------------------------------------===//
+// GCN GFX11.
+//===----------------------------------------------------------------------===//
+
+def : ProcessorModel<"gfx1100", GFX11SpeedModel,
+ FeatureISAVersion11_0.Features
+>;
+
+def : ProcessorModel<"gfx1101", GFX11SpeedModel,
+ FeatureISAVersion11_0.Features
+>;
+
+def : ProcessorModel<"gfx1102", GFX11SpeedModel,
+ FeatureISAVersion11_0_2.Features
+>;
+
+def : ProcessorModel<"gfx1103", GFX11SpeedModel,
+ FeatureISAVersion11_0_2.Features
+>;
diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.h b/llvm/lib/Target/AMDGPU/GCNRegPressure.h
index 257561cb8430..c41548d19c8e 100644
--- a/llvm/lib/Target/AMDGPU/GCNRegPressure.h
+++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.h
@@ -10,7 +10,7 @@
/// This file defines the GCNRegPressure class, which tracks registry pressure
/// by bookkeeping number of SGPR/VGPRs used, weights for large SGPR/VGPRs. It
/// also implements a compare function, which compares different register
-/// pressures, and declares one with max occupance as winner.
+/// pressures, and declares one with max occupancy as winner.
///
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index 75855a7a4f9c..100410bb7644 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -13,6 +13,7 @@
#include "GCNSchedStrategy.h"
#include "SIMachineFunctionInfo.h"
+#include "llvm/CodeGen/RegisterClassInfo.h"
#define DEBUG_TYPE "machine-scheduler"
@@ -362,6 +363,9 @@ void GCNScheduleDAGMILive::schedule() {
if (PressureAfter.getSGPRNum() <= S.SGPRCriticalLimit &&
PressureAfter.getVGPRNum(ST.hasGFX90AInsts()) <= S.VGPRCriticalLimit) {
Pressure[RegionIdx] = PressureAfter;
+ RegionsWithMinOcc[RegionIdx] =
+ PressureAfter.getOccupancy(ST) == MinOccupancy;
+
LLVM_DEBUG(dbgs() << "Pressure in desired limits, done.\n");
return;
}
@@ -378,6 +382,7 @@ void GCNScheduleDAGMILive::schedule() {
// occupancy before was higher, or if the current schedule has register
// pressure higher than the excess limits which could lead to more spilling.
unsigned NewOccupancy = std::max(WavesAfter, WavesBefore);
+
// Allow memory bound functions to drop to 4 waves if not limited by an
// attribute.
if (WavesAfter < WavesBefore && WavesAfter < MinOccupancy &&
@@ -390,6 +395,7 @@ void GCNScheduleDAGMILive::schedule() {
if (NewOccupancy < MinOccupancy) {
MinOccupancy = NewOccupancy;
MFI.limitOccupancy(MinOccupancy);
+ RegionsWithMinOcc.reset();
LLVM_DEBUG(dbgs() << "Occupancy lowered for the function to "
<< MinOccupancy << ".\n");
}
@@ -416,6 +422,8 @@ void GCNScheduleDAGMILive::schedule() {
PressureAfter.less(ST, PressureBefore) ||
!RescheduleRegions[RegionIdx]) {
Pressure[RegionIdx] = PressureAfter;
+ RegionsWithMinOcc[RegionIdx] =
+ PressureAfter.getOccupancy(ST) == MinOccupancy;
if (!RegionsWithClusters[RegionIdx] &&
(Stage + 1) == UnclusteredReschedule)
RescheduleRegions[RegionIdx] = false;
@@ -425,13 +433,18 @@ void GCNScheduleDAGMILive::schedule() {
}
}
+ RegionsWithMinOcc[RegionIdx] =
+ PressureBefore.getOccupancy(ST) == MinOccupancy;
LLVM_DEBUG(dbgs() << "Attempting to revert scheduling.\n");
RescheduleRegions[RegionIdx] = RegionsWithClusters[RegionIdx] ||
(Stage + 1) != UnclusteredReschedule;
RegionEnd = RegionBegin;
+ int SkippedDebugInstr = 0;
for (MachineInstr *MI : Unsched) {
- if (MI->isDebugInstr())
+ if (MI->isDebugInstr()) {
+ ++SkippedDebugInstr;
continue;
+ }
if (MI->getIterator() != RegionEnd) {
BB->remove(MI);
@@ -459,10 +472,31 @@ void GCNScheduleDAGMILive::schedule() {
++RegionEnd;
LLVM_DEBUG(dbgs() << "Scheduling " << *MI);
}
+
+ // After reverting schedule, debug instrs will now be at the end of the block
+ // and RegionEnd will point to the first debug instr. Increment RegionEnd
+ // past debug instrs to the actual end of the scheduling region.
+ while (SkippedDebugInstr-- > 0)
+ ++RegionEnd;
+
+ // If Unsched.front() instruction is a debug instruction, this will actually
+ // shrink the region since we moved all debug instructions to the end of the
+ // block. Find the first instruction that is not a debug instruction.
RegionBegin = Unsched.front()->getIterator();
- Regions[RegionIdx] = std::make_pair(RegionBegin, RegionEnd);
+ if (RegionBegin->isDebugInstr()) {
+ for (MachineInstr *MI : Unsched) {
+ if (MI->isDebugInstr())
+ continue;
+ RegionBegin = MI->getIterator();
+ break;
+ }
+ }
+ // Then move the debug instructions back into their correct place and set
+ // RegionBegin and RegionEnd if needed.
placeDebugValues();
+
+ Regions[RegionIdx] = std::make_pair(RegionBegin, RegionEnd);
}
GCNRegPressure GCNScheduleDAGMILive::getRealRegPressure() const {
@@ -493,14 +527,14 @@ void GCNScheduleDAGMILive::computeBlockPressure(const MachineBasicBlock *MBB) {
auto I = MBB->begin();
auto LiveInIt = MBBLiveIns.find(MBB);
+ auto &Rgn = Regions[CurRegion];
+ auto *NonDbgMI = &*skipDebugInstructionsForward(Rgn.first, Rgn.second);
if (LiveInIt != MBBLiveIns.end()) {
auto LiveIn = std::move(LiveInIt->second);
RPTracker.reset(*MBB->begin(), &LiveIn);
MBBLiveIns.erase(LiveInIt);
} else {
- auto &Rgn = Regions[CurRegion];
I = Rgn.first;
- auto *NonDbgMI = &*skipDebugInstructionsForward(Rgn.first, Rgn.second);
auto LRS = BBLiveInMap.lookup(NonDbgMI);
#ifdef EXPENSIVE_CHECKS
assert(isEqual(getLiveRegsBefore(*NonDbgMI, *LIS), LRS));
@@ -511,7 +545,7 @@ void GCNScheduleDAGMILive::computeBlockPressure(const MachineBasicBlock *MBB) {
for ( ; ; ) {
I = RPTracker.getNext();
- if (Regions[CurRegion].first == I) {
+ if (Regions[CurRegion].first == I || NonDbgMI == I) {
LiveIns[CurRegion] = RPTracker.getLiveRegs();
RPTracker.clearMaxPressure();
}
@@ -561,9 +595,11 @@ void GCNScheduleDAGMILive::finalizeSchedule() {
RescheduleRegions.resize(Regions.size());
RegionsWithClusters.resize(Regions.size());
RegionsWithHighRP.resize(Regions.size());
+ RegionsWithMinOcc.resize(Regions.size());
RescheduleRegions.set();
RegionsWithClusters.reset();
RegionsWithHighRP.reset();
+ RegionsWithMinOcc.reset();
if (!Regions.empty())
BBLiveInMap = getBBLiveInMap();
@@ -600,13 +636,41 @@ void GCNScheduleDAGMILive::finalizeSchedule() {
<< "Retrying function scheduling with lowest recorded occupancy "
<< MinOccupancy << ".\n");
}
+
+ if (Stage == PreRARematerialize) {
+ if (RegionsWithMinOcc.none() || Regions.size() == 1)
+ break;
+
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+ const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
+ // Check maximum occupancy
+ if (ST.computeOccupancy(MF.getFunction(), MFI.getLDSSize()) ==
+ MinOccupancy)
+ break;
+
+ // FIXME: This pass will invalidate cached MBBLiveIns for regions
+ // in between the defs and the regions we sink defs to. Cached pressure
+ // for regions where a def is sunk from will also be invalidated. Will
+ // need to be fixed if there is another pass after this pass.
+ static_assert(LastStage == PreRARematerialize,
+ "Passes after PreRARematerialize are not supported");
+
+ collectRematerializableInstructions();
+ if (RematerializableInsts.empty() || !sinkTriviallyRematInsts(ST, TII))
+ break;
+
+ LLVM_DEBUG(
+ dbgs() << "Retrying function scheduling with improved occupancy of "
+ << MinOccupancy << " from rematerializing\n");
+ }
}
if (Stage == UnclusteredReschedule)
SavedMutations.swap(Mutations);
for (auto Region : Regions) {
- if ((Stage == UnclusteredReschedule && !RescheduleRegions[RegionIdx]) ||
+ if (((Stage == UnclusteredReschedule || Stage == PreRARematerialize) &&
+ !RescheduleRegions[RegionIdx]) ||
(Stage == ClusteredLowOccupancyReschedule &&
!RegionsWithClusters[RegionIdx] && !RegionsWithHighRP[RegionIdx])) {
@@ -631,6 +695,7 @@ void GCNScheduleDAGMILive::finalizeSchedule() {
// Skip empty scheduling regions (0 or 1 schedulable instructions).
if (begin() == end() || begin() == std::prev(end())) {
exitRegion();
+ ++RegionIdx;
continue;
}
@@ -653,3 +718,282 @@ void GCNScheduleDAGMILive::finalizeSchedule() {
SavedMutations.swap(Mutations);
} while (Stage != LastStage);
}
+
+void GCNScheduleDAGMILive::collectRematerializableInstructions() {
+ const SIRegisterInfo *SRI = static_cast<const SIRegisterInfo *>(TRI);
+ for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) {
+ Register Reg = Register::index2VirtReg(I);
+ if (!LIS->hasInterval(Reg))
+ continue;
+
+ // TODO: Handle AGPR and SGPR rematerialization
+ if (!SRI->isVGPRClass(MRI.getRegClass(Reg)) || !MRI.hasOneDef(Reg) ||
+ !MRI.hasOneNonDBGUse(Reg))
+ continue;
+
+ MachineOperand *Op = MRI.getOneDef(Reg);
+ MachineInstr *Def = Op->getParent();
+ if (Op->getSubReg() != 0 || !isTriviallyReMaterializable(*Def, AA))
+ continue;
+
+ MachineInstr *UseI = &*MRI.use_instr_nodbg_begin(Reg);
+ if (Def->getParent() == UseI->getParent())
+ continue;
+
+ // We are only collecting defs that are defined in another block and are
+ // live-through or used inside regions at MinOccupancy. This means that the
+ // register must be in the live-in set for the region.
+ bool AddedToRematList = false;
+ for (unsigned I = 0, E = Regions.size(); I != E; ++I) {
+ auto It = LiveIns[I].find(Reg);
+ if (It != LiveIns[I].end() && !It->second.none()) {
+ if (RegionsWithMinOcc[I]) {
+ RematerializableInsts[I][Def] = UseI;
+ AddedToRematList = true;
+ }
+
+ // Collect regions with rematerializable reg as live-in to avoid
+ // searching later when updating RP.
+ RematDefToLiveInRegions[Def].push_back(I);
+ }
+ }
+ if (!AddedToRematList)
+ RematDefToLiveInRegions.erase(Def);
+ }
+}
+
+bool GCNScheduleDAGMILive::sinkTriviallyRematInsts(const GCNSubtarget &ST,
+ const TargetInstrInfo *TII) {
+ // Temporary copies of cached variables we will be modifying and replacing if
+ // sinking succeeds.
+ SmallVector<
+ std::pair<MachineBasicBlock::iterator, MachineBasicBlock::iterator>, 32>
+ NewRegions;
+ DenseMap<unsigned, GCNRPTracker::LiveRegSet> NewLiveIns;
+ DenseMap<unsigned, GCNRegPressure> NewPressure;
+ BitVector NewRescheduleRegions;
+
+ NewRegions.resize(Regions.size());
+ NewRescheduleRegions.resize(Regions.size());
+
+ // Collect only regions that have a rematerializable def as a live-in.
+ SmallSet<unsigned, 16> ImpactedRegions;
+ for (const auto &It : RematDefToLiveInRegions)
+ ImpactedRegions.insert(It.second.begin(), It.second.end());
+
+ // Make copies of register pressure and live-ins cache that will be updated
+ // as we rematerialize.
+ for (auto Idx : ImpactedRegions) {
+ NewPressure[Idx] = Pressure[Idx];
+ NewLiveIns[Idx] = LiveIns[Idx];
+ }
+ NewRegions = Regions;
+ NewRescheduleRegions.reset();
+
+ DenseMap<MachineInstr *, MachineInstr *> InsertedMIToOldDef;
+ bool Improved = false;
+ for (auto I : ImpactedRegions) {
+ if (!RegionsWithMinOcc[I])
+ continue;
+
+ Improved = false;
+ int VGPRUsage = NewPressure[I].getVGPRNum(ST.hasGFX90AInsts());
+ int SGPRUsage = NewPressure[I].getSGPRNum();
+
+ // TODO: Handle occupancy drop due to AGPR and SGPR.
+ // Check if cause of occupancy drop is due to VGPR usage and not SGPR.
+ if (ST.getOccupancyWithNumSGPRs(SGPRUsage) == MinOccupancy)
+ break;
+
+ // The occupancy of this region could have been improved by a previous
+ // iteration's sinking of defs.
+ if (NewPressure[I].getOccupancy(ST) > MinOccupancy) {
+ NewRescheduleRegions[I] = true;
+ Improved = true;
+ continue;
+ }
+
+ // First check if we have enough trivially rematerializable instructions to
+ // improve occupancy. Optimistically assume all instructions we are able to
+ // sink will decrease RP.
+ int TotalSinkableRegs = 0;
+ for (const auto &It : RematerializableInsts[I]) {
+ MachineInstr *Def = It.first;
+ Register DefReg = Def->getOperand(0).getReg();
+ TotalSinkableRegs +=
+ SIRegisterInfo::getNumCoveredRegs(NewLiveIns[I][DefReg]);
+ }
+ int VGPRsAfterSink = VGPRUsage - TotalSinkableRegs;
+ unsigned OptimisticOccupancy = ST.getOccupancyWithNumVGPRs(VGPRsAfterSink);
+ // If in the most optimistic scenario, we cannot improve occupancy, then do
+ // not attempt to sink any instructions.
+ if (OptimisticOccupancy <= MinOccupancy)
+ break;
+
+ unsigned ImproveOccupancy = 0;
+ SmallVector<MachineInstr *, 4> SinkedDefs;
+ for (auto &It : RematerializableInsts[I]) {
+ MachineInstr *Def = It.first;
+ MachineBasicBlock::iterator InsertPos =
+ MachineBasicBlock::iterator(It.second);
+ Register Reg = Def->getOperand(0).getReg();
+ // Rematerialize MI to its use block. Since we are only rematerializing
+ // instructions that do not have any virtual reg uses, we do not need to
+ // call LiveRangeEdit::allUsesAvailableAt() and
+ // LiveRangeEdit::canRematerializeAt().
+ TII->reMaterialize(*InsertPos->getParent(), InsertPos, Reg,
+ Def->getOperand(0).getSubReg(), *Def, *TRI);
+ MachineInstr *NewMI = &*(--InsertPos);
+ LIS->InsertMachineInstrInMaps(*NewMI);
+ LIS->removeInterval(Reg);
+ LIS->createAndComputeVirtRegInterval(Reg);
+ InsertedMIToOldDef[NewMI] = Def;
+
+ // Update region boundaries in the scheduling region we sunk from, since we
+ // may sink an instruction that was at the beginning or end of its region.
+ updateRegionBoundaries(NewRegions, Def, /*NewMI =*/nullptr,
+ /*Removing =*/true);
+
+ // Update region boundaries in the region we sunk to.
+ updateRegionBoundaries(NewRegions, InsertPos, NewMI);
+
+ LaneBitmask PrevMask = NewLiveIns[I][Reg];
+ // FIXME: Also update cached pressure for where the def was sunk from.
+ // Update RP for all regions that have this reg as a live-in and remove
+ // the reg from all regions as a live-in.
+ for (auto Idx : RematDefToLiveInRegions[Def]) {
+ NewLiveIns[Idx].erase(Reg);
+ if (InsertPos->getParent() != Regions[Idx].first->getParent()) {
+ // Def is live-through and not used in this block.
+ NewPressure[Idx].inc(Reg, PrevMask, LaneBitmask::getNone(), MRI);
+ } else {
+ // Def is used and rematerialized into this block.
+ GCNDownwardRPTracker RPT(*LIS);
+ auto *NonDbgMI = &*skipDebugInstructionsForward(
+ NewRegions[Idx].first, NewRegions[Idx].second);
+ RPT.reset(*NonDbgMI, &NewLiveIns[Idx]);
+ RPT.advance(NewRegions[Idx].second);
+ NewPressure[Idx] = RPT.moveMaxPressure();
+ }
+ }
+
+ SinkedDefs.push_back(Def);
+ ImproveOccupancy = NewPressure[I].getOccupancy(ST);
+ if (ImproveOccupancy > MinOccupancy)
+ break;
+ }
+
+ // Remove defs we just sunk from all regions' lists of sinkable defs.
+ for (auto &Def : SinkedDefs)
+ for (auto TrackedIdx : RematDefToLiveInRegions[Def])
+ RematerializableInsts[TrackedIdx].erase(Def);
+
+ if (ImproveOccupancy <= MinOccupancy)
+ break;
+
+ NewRescheduleRegions[I] = true;
+ Improved = true;
+ }
+
+ if (!Improved) {
+ // Occupancy was not improved for all regions that were at MinOccupancy.
+ // Undo sinking and remove newly rematerialized instructions.
+ for (auto &Entry : InsertedMIToOldDef) {
+ MachineInstr *MI = Entry.first;
+ MachineInstr *OldMI = Entry.second;
+ Register Reg = MI->getOperand(0).getReg();
+ LIS->RemoveMachineInstrFromMaps(*MI);
+ MI->eraseFromParent();
+ OldMI->clearRegisterDeads(Reg);
+ LIS->removeInterval(Reg);
+ LIS->createAndComputeVirtRegInterval(Reg);
+ }
+ return false;
+ }
+
+ // Occupancy was improved for all regions.
+ for (auto &Entry : InsertedMIToOldDef) {
+ MachineInstr *MI = Entry.first;
+ MachineInstr *OldMI = Entry.second;
+
+ // Remove OldMI from BBLiveInMap since we are sinking it from its MBB.
+ BBLiveInMap.erase(OldMI);
+
+ // Remove OldMI and update LIS
+ Register Reg = MI->getOperand(0).getReg();
+ LIS->RemoveMachineInstrFromMaps(*OldMI);
+ OldMI->eraseFromParent();
+ LIS->removeInterval(Reg);
+ LIS->createAndComputeVirtRegInterval(Reg);
+ }
+
+ // Update live-ins, register pressure, and regions caches.
+ for (auto Idx : ImpactedRegions) {
+ LiveIns[Idx] = NewLiveIns[Idx];
+ Pressure[Idx] = NewPressure[Idx];
+ MBBLiveIns.erase(Regions[Idx].first->getParent());
+ }
+ Regions = NewRegions;
+ RescheduleRegions = NewRescheduleRegions;
+
+ SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
+ MFI.increaseOccupancy(MF, ++MinOccupancy);
+
+ return true;
+}
+
+// Copied from MachineLICM
+bool GCNScheduleDAGMILive::isTriviallyReMaterializable(const MachineInstr &MI,
+ AAResults *AA) {
+ if (!TII->isTriviallyReMaterializable(MI, AA))
+ return false;
+
+ for (const MachineOperand &MO : MI.operands())
+ if (MO.isReg() && MO.isUse() && MO.getReg().isVirtual())
+ return false;
+
+ return true;
+}
+
+// When removing, we will have to check both beginning and ending of the region.
+// When inserting, we will only have to check if we are inserting NewMI in front
+// of a scheduling region and do not need to check the ending since we will only
+// ever be inserting before an already existing MI.
+void GCNScheduleDAGMILive::updateRegionBoundaries(
+ SmallVectorImpl<std::pair<MachineBasicBlock::iterator,
+ MachineBasicBlock::iterator>> &RegionBoundaries,
+ MachineBasicBlock::iterator MI, MachineInstr *NewMI, bool Removing) {
+ unsigned I = 0, E = RegionBoundaries.size();
+ // Search for the first region of the block where MI is located.
+ while (I != E && MI->getParent() != RegionBoundaries[I].first->getParent())
+ ++I;
+
+ for (; I != E; ++I) {
+ if (MI->getParent() != RegionBoundaries[I].first->getParent())
+ return;
+
+ if (Removing && MI == RegionBoundaries[I].first &&
+ MI == RegionBoundaries[I].second) {
+ // MI is in a region with size 1, after removing, the region will be
+ // size 0, set RegionBegin and RegionEnd to pass end of block iterator.
+ RegionBoundaries[I] =
+ std::make_pair(MI->getParent()->end(), MI->getParent()->end());
+ return;
+ }
+ if (MI == RegionBoundaries[I].first) {
+ if (Removing)
+ RegionBoundaries[I] =
+ std::make_pair(std::next(MI), RegionBoundaries[I].second);
+ else
+ // Inserted NewMI in front of region, set new RegionBegin to NewMI
+ RegionBoundaries[I] = std::make_pair(MachineBasicBlock::iterator(NewMI),
+ RegionBoundaries[I].second);
+ return;
+ }
+ if (Removing && MI == RegionBoundaries[I].second) {
+ RegionBoundaries[I] =
+ std::make_pair(RegionBoundaries[I].first, std::prev(MI));
+ return;
+ }
+ }
+}
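sinkTriviallyRematInsts above first gates on an optimistic estimate: assume every rematerializable live-in frees all the VGPRs it covers and check whether that could beat the current minimum occupancy. A minimal self-contained sketch of that check, where occupancyForVGPRs stands in for GCNSubtarget::getOccupancyWithNumVGPRs and CoveredRegsPerDef for the getNumCoveredRegs sums (both names are mine, for illustration):

#include <functional>
#include <numeric>
#include <vector>

// Sketch of the gating logic, not the real code: CoveredRegsPerDef holds, for
// each rematerializable def live into the region, the number of VGPRs its
// live-in mask covers.
static bool worthAttemptingSink(
    int VGPRUsage, const std::vector<int> &CoveredRegsPerDef,
    unsigned MinOccupancy,
    const std::function<unsigned(int)> &occupancyForVGPRs) {
  int TotalSinkableRegs =
      std::accumulate(CoveredRegsPerDef.begin(), CoveredRegsPerDef.end(), 0);
  int VGPRsAfterSink = VGPRUsage - TotalSinkableRegs;
  // If even the most optimistic scenario cannot beat MinOccupancy, give up.
  return occupancyForVGPRs(VGPRsAfterSink) > MinOccupancy;
}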
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
index a6e42ad3dfca..97f94f69b70e 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
@@ -14,6 +14,7 @@
#define LLVM_LIB_TARGET_AMDGPU_GCNSCHEDSTRATEGY_H
#include "GCNRegPressure.h"
+#include "llvm/ADT/MapVector.h"
#include "llvm/CodeGen/MachineScheduler.h"
namespace llvm {
@@ -77,7 +78,8 @@ class GCNScheduleDAGMILive final : public ScheduleDAGMILive {
InitialSchedule,
UnclusteredReschedule,
ClusteredLowOccupancyReschedule,
- LastStage = ClusteredLowOccupancyReschedule
+ PreRARematerialize,
+ LastStage = PreRARematerialize
};
const GCNSubtarget &ST;
@@ -110,24 +112,56 @@ class GCNScheduleDAGMILive final : public ScheduleDAGMILive {
// Record regions with high register pressure.
BitVector RegionsWithHighRP;
+ // Regions that have the same occupancy as the latest MinOccupancy.
+ BitVector RegionsWithMinOcc;
+
// Region live-in cache.
SmallVector<GCNRPTracker::LiveRegSet, 32> LiveIns;
// Region pressure cache.
SmallVector<GCNRegPressure, 32> Pressure;
+ // Each region at MinOccupancy will have its own list of trivially
+ // rematerializable instructions we can remat to reduce RP. The list maps an
+ // instruction to the position we should remat before, usually the MI using
+ // the rematerializable instruction.
+ MapVector<unsigned, MapVector<MachineInstr *, MachineInstr *>>
+ RematerializableInsts;
+
+ // Map a trivially rematerializable def to a list of regions at MinOccupancy
+ // that have the defined reg as a live-in.
+ DenseMap<MachineInstr *, SmallVector<unsigned, 4>> RematDefToLiveInRegions;
+
// Temporary basic block live-in cache.
DenseMap<const MachineBasicBlock*, GCNRPTracker::LiveRegSet> MBBLiveIns;
DenseMap<MachineInstr *, GCNRPTracker::LiveRegSet> BBLiveInMap;
DenseMap<MachineInstr *, GCNRPTracker::LiveRegSet> getBBLiveInMap() const;
+ // Collect all trivially rematerializable VGPR instructions with a single def
+ // and single use outside the defining block into RematerializableInsts.
+ void collectRematerializableInstructions();
+
+ bool isTriviallyReMaterializable(const MachineInstr &MI, AAResults *AA);
+
+ // TODO: Should also attempt to reduce RP of SGPRs and AGPRs
+ // Attempt to reduce RP of VGPR by sinking trivially rematerializable
+ // instructions. Returns true if we were able to sink instruction(s).
+ bool sinkTriviallyRematInsts(const GCNSubtarget &ST,
+ const TargetInstrInfo *TII);
+
// Return current region pressure.
GCNRegPressure getRealRegPressure() const;
// Compute and cache live-ins and pressure for all regions in block.
void computeBlockPressure(const MachineBasicBlock *MBB);
+ // Update region boundaries when removing MI or inserting NewMI before MI.
+ void updateRegionBoundaries(
+ SmallVectorImpl<std::pair<MachineBasicBlock::iterator,
+ MachineBasicBlock::iterator>> &RegionBoundaries,
+ MachineBasicBlock::iterator MI, MachineInstr *NewMI,
+ bool Removing = false);
public:
GCNScheduleDAGMILive(MachineSchedContext *C,
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index 0cd2cfa2f0e7..d269d0945f3b 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -58,133 +58,142 @@ protected:
// Basic subtarget description.
Triple TargetTriple;
AMDGPU::IsaInfo::AMDGPUTargetID TargetID;
- unsigned Gen;
+ unsigned Gen = INVALID;
InstrItineraryData InstrItins;
- int LDSBankCount;
- unsigned MaxPrivateElementSize;
+ int LDSBankCount = 0;
+ unsigned MaxPrivateElementSize = 0;
// Possibly statically set by tablegen, but may want to be overridden.
- bool FastFMAF32;
- bool FastDenormalF32;
- bool HalfRate64Ops;
- bool FullRate64Ops;
+ bool FastFMAF32 = false;
+ bool FastDenormalF32 = false;
+ bool HalfRate64Ops = false;
+ bool FullRate64Ops = false;
// Dynamically set bits that enable features.
- bool FlatForGlobal;
- bool AutoWaitcntBeforeBarrier;
- bool UnalignedScratchAccess;
- bool UnalignedAccessMode;
- bool HasApertureRegs;
- bool SupportsXNACK;
+ bool FlatForGlobal = false;
+ bool AutoWaitcntBeforeBarrier = false;
+ bool UnalignedScratchAccess = false;
+ bool UnalignedAccessMode = false;
+ bool HasApertureRegs = false;
+ bool SupportsXNACK = false;
// This should not be used directly. 'TargetID' tracks the dynamic settings
// for XNACK.
- bool EnableXNACK;
+ bool EnableXNACK = false;
- bool EnableTgSplit;
- bool EnableCuMode;
- bool TrapHandler;
+ bool EnableTgSplit = false;
+ bool EnableCuMode = false;
+ bool TrapHandler = false;
// Used as options.
- bool EnableLoadStoreOpt;
- bool EnableUnsafeDSOffsetFolding;
- bool EnableSIScheduler;
- bool EnableDS128;
- bool EnablePRTStrictNull;
- bool DumpCode;
+ bool EnableLoadStoreOpt = false;
+ bool EnableUnsafeDSOffsetFolding = false;
+ bool EnableSIScheduler = false;
+ bool EnableDS128 = false;
+ bool EnablePRTStrictNull = false;
+ bool DumpCode = false;
// Subtarget statically properties set by tablegen
- bool FP64;
- bool FMA;
- bool MIMG_R128;
- bool CIInsts;
- bool GFX8Insts;
- bool GFX9Insts;
- bool GFX90AInsts;
- bool GFX10Insts;
- bool GFX10_3Insts;
- bool GFX7GFX8GFX9Insts;
- bool SGPRInitBug;
- bool NegativeScratchOffsetBug;
- bool NegativeUnalignedScratchOffsetBug;
- bool HasSMemRealTime;
- bool HasIntClamp;
- bool HasFmaMixInsts;
- bool HasMovrel;
- bool HasVGPRIndexMode;
- bool HasScalarStores;
- bool HasScalarAtomics;
- bool HasSDWAOmod;
- bool HasSDWAScalar;
- bool HasSDWASdst;
- bool HasSDWAMac;
- bool HasSDWAOutModsVOPC;
- bool HasDPP;
- bool HasDPP8;
- bool Has64BitDPP;
- bool HasPackedFP32Ops;
- bool HasExtendedImageInsts;
- bool HasR128A16;
- bool HasGFX10A16;
- bool HasG16;
- bool HasNSAEncoding;
- unsigned NSAMaxSize;
- bool GFX10_AEncoding;
- bool GFX10_BEncoding;
- bool HasDLInsts;
- bool HasDot1Insts;
- bool HasDot2Insts;
- bool HasDot3Insts;
- bool HasDot4Insts;
- bool HasDot5Insts;
- bool HasDot6Insts;
- bool HasDot7Insts;
- bool HasMAIInsts;
- bool HasPkFmacF16Inst;
- bool HasAtomicFaddInsts;
- bool SupportsSRAMECC;
+ bool FP64 = false;
+ bool FMA = false;
+ bool MIMG_R128 = false;
+ bool CIInsts = false;
+ bool GFX8Insts = false;
+ bool GFX9Insts = false;
+ bool GFX90AInsts = false;
+ bool GFX940Insts = false;
+ bool GFX10Insts = false;
+ bool GFX11Insts = false;
+ bool GFX10_3Insts = false;
+ bool GFX7GFX8GFX9Insts = false;
+ bool SGPRInitBug = false;
+ bool UserSGPRInit16Bug = false;
+ bool NegativeScratchOffsetBug = false;
+ bool NegativeUnalignedScratchOffsetBug = false;
+ bool HasSMemRealTime = false;
+ bool HasIntClamp = false;
+ bool HasFmaMixInsts = false;
+ bool HasMovrel = false;
+ bool HasVGPRIndexMode = false;
+ bool HasScalarStores = false;
+ bool HasScalarAtomics = false;
+ bool HasSDWAOmod = false;
+ bool HasSDWAScalar = false;
+ bool HasSDWASdst = false;
+ bool HasSDWAMac = false;
+ bool HasSDWAOutModsVOPC = false;
+ bool HasDPP = false;
+ bool HasDPP8 = false;
+ bool Has64BitDPP = false;
+ bool HasPackedFP32Ops = false;
+ bool HasImageInsts = false;
+ bool HasExtendedImageInsts = false;
+ bool HasR128A16 = false;
+ bool HasGFX10A16 = false;
+ bool HasG16 = false;
+ bool HasNSAEncoding = false;
+ unsigned NSAMaxSize = 0;
+ bool GFX10_AEncoding = false;
+ bool GFX10_BEncoding = false;
+ bool HasDLInsts = false;
+ bool HasDot1Insts = false;
+ bool HasDot2Insts = false;
+ bool HasDot3Insts = false;
+ bool HasDot4Insts = false;
+ bool HasDot5Insts = false;
+ bool HasDot6Insts = false;
+ bool HasDot7Insts = false;
+ bool HasDot8Insts = false;
+ bool HasMAIInsts = false;
+ bool HasPkFmacF16Inst = false;
+ bool HasAtomicFaddRtnInsts = false;
+ bool HasAtomicFaddNoRtnInsts = false;
+ bool HasAtomicPkFaddNoRtnInsts = false;
+ bool SupportsSRAMECC = false;
// This should not be used directly. 'TargetID' tracks the dynamic settings
// for SRAMECC.
- bool EnableSRAMECC;
+ bool EnableSRAMECC = false;
- bool HasNoSdstCMPX;
- bool HasVscnt;
- bool HasGetWaveIdInst;
- bool HasSMemTimeInst;
- bool HasShaderCyclesRegister;
- bool HasVOP3Literal;
- bool HasNoDataDepHazard;
- bool FlatAddressSpace;
- bool FlatInstOffsets;
- bool FlatGlobalInsts;
- bool FlatScratchInsts;
- bool ScalarFlatScratchInsts;
- bool HasArchitectedFlatScratch;
- bool AddNoCarryInsts;
- bool HasUnpackedD16VMem;
- bool LDSMisalignedBug;
- bool HasMFMAInlineLiteralBug;
- bool UnalignedBufferAccess;
- bool UnalignedDSAccess;
- bool HasPackedTID;
- bool ScalarizeGlobal;
+ bool HasNoSdstCMPX = false;
+ bool HasVscnt = false;
+ bool HasGetWaveIdInst = false;
+ bool HasSMemTimeInst = false;
+ bool HasShaderCyclesRegister = false;
+ bool HasVOP3Literal = false;
+ bool HasNoDataDepHazard = false;
+ bool FlatAddressSpace = false;
+ bool FlatInstOffsets = false;
+ bool FlatGlobalInsts = false;
+ bool FlatScratchInsts = false;
+ bool ScalarFlatScratchInsts = false;
+ bool HasArchitectedFlatScratch = false;
+ bool EnableFlatScratch = false;
+ bool AddNoCarryInsts = false;
+ bool HasUnpackedD16VMem = false;
+ bool LDSMisalignedBug = false;
+ bool HasMFMAInlineLiteralBug = false;
+ bool UnalignedBufferAccess = false;
+ bool UnalignedDSAccess = false;
+ bool HasPackedTID = false;
+ bool ScalarizeGlobal = false;
- bool HasVcmpxPermlaneHazard;
- bool HasVMEMtoScalarWriteHazard;
- bool HasSMEMtoVectorWriteHazard;
- bool HasInstFwdPrefetchBug;
- bool HasVcmpxExecWARHazard;
- bool HasLdsBranchVmemWARHazard;
- bool HasNSAtoVMEMBug;
- bool HasNSAClauseBug;
- bool HasOffset3fBug;
- bool HasFlatSegmentOffsetBug;
- bool HasImageStoreD16Bug;
- bool HasImageGather4D16Bug;
+ bool HasVcmpxPermlaneHazard = false;
+ bool HasVMEMtoScalarWriteHazard = false;
+ bool HasSMEMtoVectorWriteHazard = false;
+ bool HasInstFwdPrefetchBug = false;
+ bool HasVcmpxExecWARHazard = false;
+ bool HasLdsBranchVmemWARHazard = false;
+ bool HasNSAtoVMEMBug = false;
+ bool HasNSAClauseBug = false;
+ bool HasOffset3fBug = false;
+ bool HasFlatSegmentOffsetBug = false;
+ bool HasImageStoreD16Bug = false;
+ bool HasImageGather4D16Bug = false;
+ bool HasVOPDInsts = false;
// Dummy feature to use for assembler in tablegen.
- bool FeatureDisable;
+ bool FeatureDisable = false;
SelectionDAGTargetInfo TSInfo;
private:
@@ -193,9 +202,6 @@ private:
SIFrameLowering FrameLowering;
public:
- // See COMPUTE_TMPRING_SIZE.WAVESIZE, 13-bit field in units of 256-dword.
- static const unsigned MaxWaveScratchSize = (256 * 4) * ((1 << 13) - 1);
-
GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
const GCNTargetMachine &TM);
~GCNSubtarget() override;
@@ -258,9 +264,19 @@ public:
return (Generation)Gen;
}
+ unsigned getMaxWaveScratchSize() const {
+ // See COMPUTE_TMPRING_SIZE.WAVESIZE.
+ if (getGeneration() < GFX11) {
+ // 13-bit field in units of 256-dword.
+ return (256 * 4) * ((1 << 13) - 1);
+ }
+ // 15-bit field in units of 64-dword.
+ return (64 * 4) * ((1 << 15) - 1);
+ }
+
/// Return the number of high bits known to be zero for a frame index.
unsigned getKnownHighZeroBitsForFrameIndex() const {
- return countLeadingZeros(MaxWaveScratchSize) + getWavefrontSizeLog2();
+ return countLeadingZeros(getMaxWaveScratchSize()) + getWavefrontSizeLog2();
}
int getLDSBankCount() const {
@@ -558,13 +574,20 @@ public:
// The ST addressing mode means no registers are used, either VGPR or SGPR,
// but only immediate offset is swizzled and added to the FLAT scratch base.
bool hasFlatScratchSTMode() const {
- return hasFlatScratchInsts() && hasGFX10_3Insts();
+ return hasFlatScratchInsts() && (hasGFX10_3Insts() || hasGFX940Insts());
}
+ bool hasFlatScratchSVSMode() const { return GFX940Insts || GFX11Insts; }
+
bool hasScalarFlatScratchInsts() const {
return ScalarFlatScratchInsts;
}
+ bool enableFlatScratch() const {
+ return flatScratchIsArchitected() ||
+ (EnableFlatScratch && hasFlatScratchInsts());
+ }
+
bool hasGlobalAddTidInsts() const {
return GFX10_BEncoding;
}
@@ -690,6 +713,10 @@ public:
return HasDot7Insts;
}
+ bool hasDot8Insts() const {
+ return HasDot8Insts;
+ }
+
bool hasMAIInsts() const {
return HasMAIInsts;
}
@@ -699,9 +726,15 @@ public:
}
bool hasAtomicFaddInsts() const {
- return HasAtomicFaddInsts;
+ return HasAtomicFaddRtnInsts || HasAtomicFaddNoRtnInsts;
}
+ bool hasAtomicFaddRtnInsts() const { return HasAtomicFaddRtnInsts; }
+
+ bool hasAtomicFaddNoRtnInsts() const { return HasAtomicFaddNoRtnInsts; }
+
+ bool hasAtomicPkFaddNoRtnInsts() const { return HasAtomicPkFaddNoRtnInsts; }
+
bool hasNoSdstCMPX() const {
return HasNoSdstCMPX;
}
@@ -765,8 +798,6 @@ public:
return true;
}
- bool enableFlatScratch() const;
-
void overrideSchedPolicy(MachineSchedPolicy &Policy,
unsigned NumRegionInstrs) const override;
@@ -805,6 +836,9 @@ public:
/// \returns true if the subtarget has the v_permlanex16_b32 instruction.
bool hasPermLaneX16() const { return getGeneration() >= GFX10; }
+ /// \returns true if the subtarget has the v_permlane64_b32 instruction.
+ bool hasPermLane64() const { return getGeneration() >= GFX11; }
+
bool hasDPP() const {
return HasDPP;
}
@@ -830,7 +864,11 @@ public:
}
bool hasFmaakFmamkF32Insts() const {
- return getGeneration() >= GFX10;
+ return getGeneration() >= GFX10 || hasGFX940Insts();
+ }
+
+ bool hasImageInsts() const {
+ return HasImageInsts;
}
bool hasExtendedImageInsts() const {
@@ -875,6 +913,10 @@ public:
bool hasMadF16() const;
+ bool hasMovB64() const { return GFX940Insts; }
+
+ bool hasLshlAddB64() const { return GFX940Insts; }
+
bool enableSIScheduler() const {
return EnableSIScheduler;
}
@@ -887,6 +929,10 @@ public:
return SGPRInitBug;
}
+ bool hasUserSGPRInit16Bug() const {
+ return UserSGPRInit16Bug;
+ }
+
bool hasNegativeScratchOffsetBug() const { return NegativeScratchOffsetBug; }
bool hasNegativeUnalignedScratchOffsetBug() const {
@@ -915,6 +961,14 @@ public:
getGeneration() <= AMDGPUSubtarget::GFX9;
}
+ bool hasReadM0LdsDmaHazard() const {
+ return getGeneration() == AMDGPUSubtarget::GFX9;
+ }
+
+ bool hasReadM0LdsDirectHazard() const {
+ return getGeneration() == AMDGPUSubtarget::GFX9;
+ }
+
bool hasVcmpxPermlaneHazard() const {
return HasVcmpxPermlaneHazard;
}
@@ -943,6 +997,22 @@ public:
return HasLdsBranchVmemWARHazard;
}
+ // Has a one-cycle hazard on a transcendental instruction feeding a
+ // non-transcendental VALU.
+ bool hasTransForwardingHazard() const { return GFX940Insts; }
+
+ // Has a one-cycle hazard on a VALU instruction partially writing dst with
+ // a shift of result bits feeding another VALU instruction.
+ bool hasDstSelForwardingHazard() const { return GFX940Insts; }
+
+ // Cannot use op_sel with v_dot instructions.
+ bool hasDOTOpSelHazard() const { return GFX940Insts; }
+
+ // Does not have HW interlocks for VALU writing and then reading SGPRs.
+ bool hasVDecCoExecHazard() const {
+ return GFX940Insts;
+ }
+
bool hasNSAtoVMEMBug() const {
return HasNSAtoVMEMBug;
}
@@ -953,11 +1023,43 @@ public:
bool hasGFX90AInsts() const { return GFX90AInsts; }
+ bool hasVOP3DPP() const { return getGeneration() >= GFX11; }
+
+ bool hasLdsDirect() const { return getGeneration() >= GFX11; }
+
+ bool hasVALUPartialForwardingHazard() const {
+ return getGeneration() >= GFX11;
+ }
+
+ bool hasVALUTransUseHazard() const { return getGeneration() >= GFX11; }
+
/// Return if operations acting on VGPR tuples require even alignment.
bool needsAlignedVGPRs() const { return GFX90AInsts; }
+ /// Return true if the target has the S_PACK_HL_B32_B16 instruction.
+ bool hasSPackHL() const { return GFX11Insts; }
+
+ /// Return true if the target's EXP instruction has the COMPR flag, which
+ /// affects the meaning of the EN (enable) bits.
+ bool hasCompressedExport() const { return !GFX11Insts; }
+
+ /// Return true if the target's EXP instruction supports the NULL export
+ /// target.
+ bool hasNullExportTarget() const { return !GFX11Insts; }
+
+ bool hasVOPDInsts() const { return HasVOPDInsts; }
+
+ bool hasFlatScratchSVSSwizzleBug() const { return getGeneration() == GFX11; }
+
+ /// Return true if the target has the S_DELAY_ALU instruction.
+ bool hasDelayAlu() const { return GFX11Insts; }
+
bool hasPackedTID() const { return HasPackedTID; }
+ // GFX940 is a derivative of GFX90A. hasGFX940Insts() being true implies that
+ // hasGFX90AInsts is also true.
+ bool hasGFX940Insts() const { return GFX940Insts; }
+
/// Return the maximum number of waves per SIMD for kernels using \p SGPRs
/// SGPRs
unsigned getOccupancyWithNumSGPRs(unsigned SGPRs) const;
@@ -989,6 +1091,9 @@ public:
return getGeneration() >= GFX9;
}
+ // \returns true if the target supports the pre-NGG legacy geometry path.
+ bool hasLegacyGeometry() const { return getGeneration() < GFX11; }
+
/// \returns SGPR allocation granularity supported by the subtarget.
unsigned getSGPRAllocGranule() const {
return AMDGPU::IsaInfo::getSGPRAllocGranule(this);
@@ -1105,6 +1210,10 @@ public:
/// unit requirement.
unsigned getMaxNumVGPRs(const Function &F) const;
+ unsigned getMaxNumAGPRs(const Function &F) const {
+ return getMaxNumVGPRs(F);
+ }
+
/// \returns Maximum number of VGPRs that meets number of waves per execution
/// unit requirement for function \p MF, or number of VGPRs explicitly
/// requested using "amdgpu-num-vgpr" attribute attached to function \p MF.
@@ -1165,6 +1274,10 @@ public:
void adjustSchedDependency(SUnit *Def, int DefOpIdx, SUnit *Use, int UseOpIdx,
SDep &Dep) const override;
+
+ // \returns true if it's beneficial on this subtarget for the scheduler to
+ // cluster stores as well as loads.
+ bool shouldClusterStores() const { return getGeneration() >= GFX11; }
};
} // end namespace llvm
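For the getMaxWaveScratchSize change above, the two formulas in the hunk evaluate as follows; a quick self-contained check (the byte interpretation follows from the dword-to-byte factor of 4 in the expressions):

#include <cstdio>

// Worked example of the two limits returned by getMaxWaveScratchSize():
// pre-GFX11: 13-bit COMPUTE_TMPRING_SIZE.WAVESIZE field in 256-dword units,
// GFX11+:    15-bit field in 64-dword units. Both results are bytes per wave.
int main() {
  unsigned PreGFX11 = (256 * 4) * ((1u << 13) - 1); // 8,387,584 bytes
  unsigned GFX11Plus = (64 * 4) * ((1u << 15) - 1); // 8,388,352 bytes
  std::printf("pre-GFX11 max: %u bytes, GFX11+ max: %u bytes\n", PreGFX11,
              GFX11Plus);
  return 0;
}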
diff --git a/llvm/lib/Target/AMDGPU/LDSDIRInstructions.td b/llvm/lib/Target/AMDGPU/LDSDIRInstructions.td
new file mode 100644
index 000000000000..1f65376890da
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/LDSDIRInstructions.td
@@ -0,0 +1,116 @@
+//===-- LDSDIRInstructions.td - LDS Direct Instruction Definitions --------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// LDSDIR encoding
+//===----------------------------------------------------------------------===//
+
+class LDSDIRe<bits<2> op, bit is_direct> : Enc32 {
+ // encoding fields
+ bits<2> attrchan;
+ bits<6> attr;
+ bits<4> waitvdst;
+ bits<8> vdst;
+
+ // encoding
+ let Inst{31-24} = 0xce; // encoding
+ let Inst{23-22} = 0x0; // reserved
+ let Inst{21-20} = op;
+ let Inst{19-16} = waitvdst;
+ let Inst{15-10} = !if(is_direct, ?, attr);
+ let Inst{9-8} = !if(is_direct, ?, attrchan);
+ let Inst{7-0} = vdst;
+}
+
+//===----------------------------------------------------------------------===//
+// LDSDIR Classes
+//===----------------------------------------------------------------------===//
+
+class LDSDIR_getIns<bit direct> {
+ dag ret = !if(direct,
+ (ins wait_vdst:$waitvdst),
+ (ins Attr:$attr, AttrChan:$attrchan, wait_vdst:$waitvdst)
+ );
+}
+
+class LDSDIR_Common<string opName, string asm = "", bit direct> : InstSI<
+ (outs VGPR_32:$vdst),
+ LDSDIR_getIns<direct>.ret,
+ asm> {
+ let LDSDIR = 1;
+ let EXP_CNT = 1;
+
+ let hasSideEffects = 0;
+ let mayLoad = 1;
+ let mayStore = 0;
+
+ string Mnemonic = opName;
+ let UseNamedOperandTable = 1;
+
+ let Uses = [M0, EXEC];
+ let DisableWQM = 0;
+ let SchedRW = [WriteLDS];
+
+ bit is_direct;
+ let is_direct = direct;
+}
+
+class LDSDIR_Pseudo<string opName, bit direct> :
+ LDSDIR_Common<opName, "", direct>,
+ SIMCInstr<opName, SIEncodingFamily.NONE> {
+ let isPseudo = 1;
+ let isCodeGenOnly = 1;
+}
+
+class LDSDIR_getAsm<bit direct> {
+ string ret = !if(direct,
+ " $vdst$waitvdst",
+ " $vdst, $attr$attrchan$waitvdst"
+ );
+}
+
+class LDSDIR_Real<bits<2> op, LDSDIR_Pseudo lds, int subtarget> :
+ LDSDIR_Common<lds.Mnemonic,
+ lds.Mnemonic # LDSDIR_getAsm<lds.is_direct>.ret,
+ lds.is_direct>,
+ SIMCInstr <lds.Mnemonic, subtarget>,
+ LDSDIRe<op, lds.is_direct> {
+ let isPseudo = 0;
+ let isCodeGenOnly = 0;
+}
+
+//===----------------------------------------------------------------------===//
+// LDS Direct Instructions
+//===----------------------------------------------------------------------===//
+
+def LDS_DIRECT_LOAD : LDSDIR_Pseudo<"lds_direct_load", 1>;
+def LDS_PARAM_LOAD : LDSDIR_Pseudo<"lds_param_load", 0>;
+
+def : GCNPat <
+ (f32 (int_amdgcn_lds_direct_load M0)),
+ (LDS_DIRECT_LOAD 0)
+>;
+
+def : GCNPat <
+ (f32 (int_amdgcn_lds_param_load timm:$attrchan, timm:$attr, M0)),
+ (LDS_PARAM_LOAD timm:$attr, timm:$attrchan, 0)
+>;
+
+//===----------------------------------------------------------------------===//
+// GFX11+
+//===----------------------------------------------------------------------===//
+
+multiclass LDSDIR_Real_gfx11<bits<2> op, LDSDIR_Pseudo lds = !cast<LDSDIR_Pseudo>(NAME)> {
+ def _gfx11 : LDSDIR_Real<op, lds, SIEncodingFamily.GFX11> {
+ let AssemblerPredicate = isGFX11Plus;
+ let DecoderNamespace = "GFX11";
+ }
+}
+
+defm LDS_PARAM_LOAD : LDSDIR_Real_gfx11<0x0>;
+defm LDS_DIRECT_LOAD : LDSDIR_Real_gfx11<0x1>;
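The LDSDIRe class above pins down the 32-bit word layout of the new LDS direct instructions. A small illustrative C++ packer for that layout (the function name is mine, not part of the patch):

#include <cstdint>

// Packs the LDSDIR fields exactly as LDSDIRe lays them out; attr and attrchan
// are only meaningful for lds_param_load (is_direct = 0) and are simply
// ignored by lds_direct_load.
static uint32_t encodeLDSDIR(uint32_t op, uint32_t waitvdst, uint32_t attr,
                             uint32_t attrchan, uint32_t vdst) {
  uint32_t Inst = 0;
  Inst |= 0xceu << 24;              // Inst{31-24}: fixed encoding
  Inst |= (op & 0x3u) << 20;        // Inst{21-20}: 0 = param load, 1 = direct load
  Inst |= (waitvdst & 0xfu) << 16;  // Inst{19-16}: wait_vdst
  Inst |= (attr & 0x3fu) << 10;     // Inst{15-10}: attribute
  Inst |= (attrchan & 0x3u) << 8;   // Inst{9-8}:   attribute channel
  Inst |= vdst & 0xffu;             // Inst{7-0}:   destination VGPR
  return Inst;
}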
diff --git a/llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.cpp b/llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.cpp
index 912bcc792e4d..24c9cc2d7dd2 100644
--- a/llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.cpp
+++ b/llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.cpp
@@ -239,9 +239,9 @@ void AMDGPUCustomBehaviour::generateWaitCntInfo() {
AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(STI.getCPU());
InstrWaitCntInfo.resize(SrcMgr.size());
- int Index = 0;
- for (auto I = SrcMgr.begin(), E = SrcMgr.end(); I != E; ++I, ++Index) {
- const std::unique_ptr<Instruction> &Inst = *I;
+ for (const auto &EN : llvm::enumerate(SrcMgr.getInstructions())) {
+ const std::unique_ptr<Instruction> &Inst = EN.value();
+ unsigned Index = EN.index();
unsigned Opcode = Inst->getOpcode();
const MCInstrDesc &MCID = MCII.get(Opcode);
if ((MCID.TSFlags & SIInstrFlags::DS) &&
diff --git a/llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.h b/llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.h
index 56650515bd0a..7a0d454c3578 100644
--- a/llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.h
+++ b/llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.h
@@ -31,7 +31,7 @@ public:
AMDGPUInstrPostProcess(const MCSubtargetInfo &STI, const MCInstrInfo &MCII)
: InstrPostProcess(STI, MCII) {}
- ~AMDGPUInstrPostProcess() {}
+ ~AMDGPUInstrPostProcess() = default;
void postProcessInstruction(std::unique_ptr<Instruction> &Inst,
const MCInst &MCI) override;
@@ -86,7 +86,7 @@ public:
AMDGPUCustomBehaviour(const MCSubtargetInfo &STI,
const mca::SourceMgr &SrcMgr, const MCInstrInfo &MCII);
- ~AMDGPUCustomBehaviour() {}
+ ~AMDGPUCustomBehaviour() = default;
/// This method is used to determine if an instruction
/// should be allowed to be dispatched. The return value is
/// how many cycles until the instruction can be dispatched.
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp
index 50318a59225d..bda3c25e956b 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp
@@ -10,13 +10,16 @@
#include "MCTargetDesc/AMDGPUFixupKinds.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "Utils/AMDGPUBaseInfo.h"
+#include "llvm/BinaryFormat/ELF.h"
#include "llvm/MC/MCAsmBackend.h"
#include "llvm/MC/MCAssembler.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCFixupKindInfo.h"
#include "llvm/MC/MCObjectWriter.h"
+#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/MC/TargetRegistry.h"
#include "llvm/Support/EndianStream.h"
+#include "llvm/Support/TargetParser.h"
using namespace llvm;
using namespace llvm::AMDGPU;
@@ -47,7 +50,10 @@ public:
bool writeNopData(raw_ostream &OS, uint64_t Count,
const MCSubtargetInfo *STI) const override;
+ Optional<MCFixupKind> getFixupKind(StringRef Name) const override;
const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const override;
+ bool shouldForceRelocation(const MCAssembler &Asm, const MCFixup &Fixup,
+ const MCValue &Target) override;
};
} //End anonymous namespace
@@ -134,6 +140,9 @@ void AMDGPUAsmBackend::applyFixup(const MCAssembler &Asm, const MCFixup &Fixup,
MutableArrayRef<char> Data, uint64_t Value,
bool IsResolved,
const MCSubtargetInfo *STI) const {
+ if (Fixup.getKind() >= FirstLiteralRelocationKind)
+ return;
+
Value = adjustFixupValue(Fixup, Value, &Asm.getContext());
if (!Value)
return; // Doesn't change encoding.
@@ -153,6 +162,15 @@ void AMDGPUAsmBackend::applyFixup(const MCAssembler &Asm, const MCFixup &Fixup,
Data[Offset + i] |= static_cast<uint8_t>((Value >> (i * 8)) & 0xff);
}
+Optional<MCFixupKind> AMDGPUAsmBackend::getFixupKind(StringRef Name) const {
+ return StringSwitch<Optional<MCFixupKind>>(Name)
+#define ELF_RELOC(Name, Value) \
+ .Case(#Name, MCFixupKind(FirstLiteralRelocationKind + Value))
+#include "llvm/BinaryFormat/ELFRelocs/AMDGPU.def"
+#undef ELF_RELOC
+ .Default(None);
+}
+
const MCFixupKindInfo &AMDGPUAsmBackend::getFixupKindInfo(
MCFixupKind Kind) const {
const static MCFixupKindInfo Infos[AMDGPU::NumTargetFixupKinds] = {
@@ -160,12 +178,21 @@ const MCFixupKindInfo &AMDGPUAsmBackend::getFixupKindInfo(
{ "fixup_si_sopp_br", 0, 16, MCFixupKindInfo::FKF_IsPCRel },
};
+ if (Kind >= FirstLiteralRelocationKind)
+ return MCAsmBackend::getFixupKindInfo(FK_NONE);
+
if (Kind < FirstTargetFixupKind)
return MCAsmBackend::getFixupKindInfo(Kind);
return Infos[Kind - FirstTargetFixupKind];
}
+bool AMDGPUAsmBackend::shouldForceRelocation(const MCAssembler &,
+ const MCFixup &Fixup,
+ const MCValue &) {
+ return Fixup.getKind() >= FirstLiteralRelocationKind;
+}
+
unsigned AMDGPUAsmBackend::getMinimumNopSize() const {
return 4;
}
@@ -236,5 +263,5 @@ MCAsmBackend *llvm::createAMDGPUAsmBackend(const Target &T,
const MCRegisterInfo &MRI,
const MCTargetOptions &Options) {
return new ELFAMDGPUAsmBackend(T, STI.getTargetTriple(),
- getHsaAbiVersion(&STI).getValueOr(0));
+ getHsaAbiVersion(&STI).value_or(0));
}
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp
index bb2c298c2850..066b36622a16 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp
@@ -65,7 +65,10 @@ unsigned AMDGPUELFObjectWriter::getRelocType(MCContext &Ctx,
return ELF::R_AMDGPU_REL64;
}
- switch (Fixup.getKind()) {
+ MCFixupKind Kind = Fixup.getKind();
+ if (Kind >= FirstLiteralRelocationKind)
+ return Kind - FirstLiteralRelocationKind;
+ switch (Kind) {
default: break;
case FK_PCRel_4:
return ELF::R_AMDGPU_REL32;
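The AMDGPUAsmBackend and AMDGPUELFObjectWriter hunks above implement a round trip for relocations requested by their ELF name: getFixupKind stores the raw relocation value offset past FirstLiteralRelocationKind, applyFixup skips such fixups, shouldForceRelocation keeps them, and getRelocType subtracts the offset back out. A sketch of that round trip; the constant below is a stand-in, not LLVM's actual enumerator value, and the example relocation value is arbitrary.

#include <cassert>

// Stand-in for llvm::FirstLiteralRelocationKind; only the offset relationship
// matters here, not the concrete number.
static constexpr unsigned FirstLiteralRelocationKind = 0x100000;

// As in AMDGPUAsmBackend::getFixupKind(): carry the raw ELF relocation value
// inside the fixup kind.
static unsigned fixupKindForReloc(unsigned ELFRelocValue) {
  return FirstLiteralRelocationKind + ELFRelocValue;
}

// As in AMDGPUELFObjectWriter::getRelocType(): recover the ELF relocation.
static unsigned relocTypeForFixup(unsigned Kind) {
  assert(Kind >= FirstLiteralRelocationKind);
  return Kind - FirstLiteralRelocationKind;
}

int main() {
  unsigned ExampleRelocValue = 4; // hypothetical relocation value
  unsigned Kind = fixupKindForReloc(ExampleRelocValue);
  assert(relocTypeForFixup(Kind) == ExampleRelocValue);
  return 0;
}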
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
index 76663b563150..bd938d829953 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
@@ -120,14 +120,6 @@ void AMDGPUInstPrinter::printAddr64(const MCInst *MI, unsigned OpNo,
printNamedBit(MI, OpNo, O, "addr64");
}
-void AMDGPUInstPrinter::printMBUFOffset(const MCInst *MI, unsigned OpNo,
- raw_ostream &O) {
- if (MI->getOperand(OpNo).getImm()) {
- O << " offset:";
- printU16ImmDecOperand(MI, OpNo, O);
- }
-}
-
void AMDGPUInstPrinter::printOffset(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI,
raw_ostream &O) {
@@ -152,7 +144,7 @@ void AMDGPUInstPrinter::printFlatOffset(const MCInst *MI, unsigned OpNo,
if (IsFlatSeg) { // Unsigned offset
printU16ImmDecOperand(MI, OpNo, O);
} else { // Signed offset
- if (AMDGPU::isGFX10Plus(STI)) {
+ if (AMDGPU::isGFX10(STI)) {
O << formatDec(SignExtend32<12>(MI->getOperand(OpNo).getImm()));
} else {
O << formatDec(SignExtend32<13>(MI->getOperand(OpNo).getImm()));
@@ -191,6 +183,13 @@ void AMDGPUInstPrinter::printSMEMOffset(const MCInst *MI, unsigned OpNo,
O << formatHex(MI->getOperand(OpNo).getImm());
}
+void AMDGPUInstPrinter::printSMEMOffsetMod(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI,
+ raw_ostream &O) {
+ O << " offset:";
+ printSMEMOffset(MI, OpNo, STI, O);
+}
+
void AMDGPUInstPrinter::printSMRDLiteralOffset(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI,
raw_ostream &O) {
@@ -206,13 +205,15 @@ void AMDGPUInstPrinter::printCPol(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI, raw_ostream &O) {
auto Imm = MI->getOperand(OpNo).getImm();
if (Imm & CPol::GLC)
- O << " glc";
+ O << ((AMDGPU::isGFX940(STI) &&
+ !(MII.get(MI->getOpcode()).TSFlags & SIInstrFlags::SMRD)) ? " sc0"
+ : " glc");
if (Imm & CPol::SLC)
- O << " slc";
+ O << (AMDGPU::isGFX940(STI) ? " nt" : " slc");
if ((Imm & CPol::DLC) && AMDGPU::isGFX10Plus(STI))
O << " dlc";
if ((Imm & CPol::SCC) && AMDGPU::isGFX90A(STI))
- O << " scc";
+ O << (AMDGPU::isGFX940(STI) ? " sc1" : " scc");
if (Imm & ~CPol::ALL)
O << " /* unexpected cache policy bit */";
}
@@ -309,8 +310,8 @@ void AMDGPUInstPrinter::printSymbolicFormat(const MCInst *MI,
if (AMDGPU::isGFX10Plus(STI)) {
if (Val == UFMT_DEFAULT)
return;
- if (isValidUnifiedFormat(Val)) {
- O << " format:[" << getUnifiedFormatName(Val) << ']';
+ if (isValidUnifiedFormat(Val, STI)) {
+ O << " format:[" << getUnifiedFormatName(Val, STI) << ']';
} else {
O << " format:" << Val;
}
@@ -362,27 +363,26 @@ void AMDGPUInstPrinter::printRegOperand(unsigned RegNo, raw_ostream &O,
}
void AMDGPUInstPrinter::printVOPDst(const MCInst *MI, unsigned OpNo,
- const MCSubtargetInfo &STI,
- raw_ostream &O) {
+ const MCSubtargetInfo &STI, raw_ostream &O) {
auto Opcode = MI->getOpcode();
auto Flags = MII.get(Opcode).TSFlags;
-
if (OpNo == 0) {
- if (Flags & SIInstrFlags::VOP3) {
+ if (Flags & SIInstrFlags::VOP3 && Flags & SIInstrFlags::DPP)
+ O << "_e64_dpp";
+ else if (Flags & SIInstrFlags::VOP3) {
if (!getVOP3IsSingle(Opcode))
O << "_e64";
- } else if (Flags & SIInstrFlags::DPP) {
+ } else if (Flags & SIInstrFlags::DPP)
O << "_dpp";
- } else if (Flags & SIInstrFlags::SDWA) {
+ else if (Flags & SIInstrFlags::SDWA)
O << "_sdwa";
- } else if (((Flags & SIInstrFlags::VOP1) && !getVOP1IsSingle(Opcode)) ||
- ((Flags & SIInstrFlags::VOP2) && !getVOP2IsSingle(Opcode))) {
+ else if (((Flags & SIInstrFlags::VOP1) && !getVOP1IsSingle(Opcode)) ||
+ ((Flags & SIInstrFlags::VOP2) && !getVOP2IsSingle(Opcode)))
O << "_e32";
- }
O << " ";
}
- printOperand(MI, OpNo, STI, O);
+ printRegularOperand(MI, OpNo, STI, O);
// Print default vcc/vcc_lo operand.
switch (Opcode) {
@@ -400,7 +400,16 @@ void AMDGPUInstPrinter::printVOPDst(const MCInst *MI, unsigned OpNo,
case AMDGPU::V_ADD_CO_CI_U32_dpp8_gfx10:
case AMDGPU::V_SUB_CO_CI_U32_dpp8_gfx10:
case AMDGPU::V_SUBREV_CO_CI_U32_dpp8_gfx10:
- printDefaultVccOperand(1, STI, O);
+ case AMDGPU::V_ADD_CO_CI_U32_e32_gfx11:
+ case AMDGPU::V_SUB_CO_CI_U32_e32_gfx11:
+ case AMDGPU::V_SUBREV_CO_CI_U32_e32_gfx11:
+ case AMDGPU::V_ADD_CO_CI_U32_dpp_gfx11:
+ case AMDGPU::V_SUB_CO_CI_U32_dpp_gfx11:
+ case AMDGPU::V_SUBREV_CO_CI_U32_dpp_gfx11:
+ case AMDGPU::V_ADD_CO_CI_U32_dpp8_gfx11:
+ case AMDGPU::V_SUB_CO_CI_U32_dpp8_gfx11:
+ case AMDGPU::V_SUBREV_CO_CI_U32_dpp8_gfx11:
+ printDefaultVccOperand(false, STI, O);
break;
}
}
@@ -412,7 +421,7 @@ void AMDGPUInstPrinter::printVINTRPDst(const MCInst *MI, unsigned OpNo,
else
O << "_e32 ";
- printOperand(MI, OpNo, STI, O);
+ printRegularOperand(MI, OpNo, STI, O);
}
void AMDGPUInstPrinter::printImmediateInt16(uint32_t Imm,
@@ -533,7 +542,7 @@ void AMDGPUInstPrinter::printImmediate64(uint64_t Imm,
STI.getFeatureBits()[AMDGPU::FeatureInv2PiInlineImm])
O << "0.15915494309189532";
else {
- assert(isUInt<32>(Imm) || Imm == 0x3fc45f306dc9c882);
+ assert(isUInt<32>(Imm) || isInt<32>(Imm));
// In rare situations, we will have a 32-bit literal in a 64-bit
// operand. This is technically allowed for the encoding of s_mov_b64.
@@ -548,6 +557,18 @@ void AMDGPUInstPrinter::printBLGP(const MCInst *MI, unsigned OpNo,
if (!Imm)
return;
+ if (AMDGPU::isGFX940(STI)) {
+ switch (MI->getOpcode()) {
+ case AMDGPU::V_MFMA_F64_16X16X4F64_gfx940_acd:
+ case AMDGPU::V_MFMA_F64_16X16X4F64_gfx940_vcd:
+ case AMDGPU::V_MFMA_F64_4X4X4F64_gfx940_acd:
+ case AMDGPU::V_MFMA_F64_4X4X4F64_gfx940_vcd:
+ O << " neg:[" << (Imm & 1) << ',' << ((Imm >> 1) & 1) << ','
+ << ((Imm >> 2) & 1) << ']';
+ return;
+ }
+ }
+
O << " blgp:" << Imm;
}
@@ -571,26 +592,73 @@ void AMDGPUInstPrinter::printABID(const MCInst *MI, unsigned OpNo,
O << " abid:" << Imm;
}
-void AMDGPUInstPrinter::printDefaultVccOperand(unsigned OpNo,
+void AMDGPUInstPrinter::printDefaultVccOperand(bool FirstOperand,
const MCSubtargetInfo &STI,
raw_ostream &O) {
- if (OpNo > 0)
+ if (!FirstOperand)
O << ", ";
- printRegOperand(STI.getFeatureBits()[AMDGPU::FeatureWavefrontSize64] ?
- AMDGPU::VCC : AMDGPU::VCC_LO, O, MRI);
- if (OpNo == 0)
+ printRegOperand(STI.getFeatureBits()[AMDGPU::FeatureWavefrontSize64]
+ ? AMDGPU::VCC
+ : AMDGPU::VCC_LO,
+ O, MRI);
+ if (FirstOperand)
O << ", ";
}
+void AMDGPUInstPrinter::printWaitVDST(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI,
+ raw_ostream &O) {
+ uint8_t Imm = MI->getOperand(OpNo).getImm();
+ if (Imm != 0) {
+ O << " wait_vdst:";
+ printU4ImmDecOperand(MI, OpNo, O);
+ }
+}
+
+void AMDGPUInstPrinter::printWaitEXP(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI,
+ raw_ostream &O) {
+ uint8_t Imm = MI->getOperand(OpNo).getImm();
+ if (Imm != 0) {
+ O << " wait_exp:";
+ printU4ImmDecOperand(MI, OpNo, O);
+ }
+}
+
+bool AMDGPUInstPrinter::needsImpliedVcc(const MCInstrDesc &Desc,
+ unsigned OpNo) const {
+ return OpNo == 1 && (Desc.TSFlags & SIInstrFlags::DPP) &&
+ (Desc.TSFlags & SIInstrFlags::VOPC) &&
+ (Desc.hasImplicitDefOfPhysReg(AMDGPU::VCC) ||
+ Desc.hasImplicitDefOfPhysReg(AMDGPU::VCC_LO));
+}
+
+// Print default vcc/vcc_lo operand of VOPC.
void AMDGPUInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI,
raw_ostream &O) {
- // Print default vcc/vcc_lo operand of VOPC.
- const MCInstrDesc &Desc = MII.get(MI->getOpcode());
- if (OpNo == 0 && (Desc.TSFlags & SIInstrFlags::VOPC) &&
+ unsigned Opc = MI->getOpcode();
+ const MCInstrDesc &Desc = MII.get(Opc);
+ int ModIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0_modifiers);
+ // 0, 1 and 2 are the first printed operands in different cases
+ // If there are printed modifiers, printOperandAndFPInputMods or
+ // printOperandAndIntInputMods will be called instead
+ if ((OpNo == 0 ||
+ (OpNo == 1 && (Desc.TSFlags & SIInstrFlags::DPP)) ||
+ (OpNo == 2 && (Desc.TSFlags & SIInstrFlags::DPP) && ModIdx != -1)) &&
+ (Desc.TSFlags & SIInstrFlags::VOPC) &&
(Desc.hasImplicitDefOfPhysReg(AMDGPU::VCC) ||
Desc.hasImplicitDefOfPhysReg(AMDGPU::VCC_LO)))
- printDefaultVccOperand(OpNo, STI, O);
+ printDefaultVccOperand(true, STI, O);
+
+ printRegularOperand(MI, OpNo, STI, O);
+}
+
+// Print operands after vcc or modifier handling.
+void AMDGPUInstPrinter::printRegularOperand(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI,
+ raw_ostream &O) {
+ const MCInstrDesc &Desc = MII.get(MI->getOpcode());
if (OpNo >= MI->getNumOperands()) {
O << "/*Missing OP" << OpNo << "*/";
@@ -710,12 +778,24 @@ void AMDGPUInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
case AMDGPU::V_ADD_CO_CI_U32_dpp8_gfx10:
case AMDGPU::V_SUB_CO_CI_U32_dpp8_gfx10:
case AMDGPU::V_SUBREV_CO_CI_U32_dpp8_gfx10:
+ case AMDGPU::V_CNDMASK_B32_e32_gfx11:
+ case AMDGPU::V_ADD_CO_CI_U32_e32_gfx11:
+ case AMDGPU::V_SUB_CO_CI_U32_e32_gfx11:
+ case AMDGPU::V_SUBREV_CO_CI_U32_e32_gfx11:
+ case AMDGPU::V_CNDMASK_B32_dpp_gfx11:
+ case AMDGPU::V_ADD_CO_CI_U32_dpp_gfx11:
+ case AMDGPU::V_SUB_CO_CI_U32_dpp_gfx11:
+ case AMDGPU::V_SUBREV_CO_CI_U32_dpp_gfx11:
+ case AMDGPU::V_CNDMASK_B32_dpp8_gfx11:
+ case AMDGPU::V_ADD_CO_CI_U32_dpp8_gfx11:
+ case AMDGPU::V_SUB_CO_CI_U32_dpp8_gfx11:
+ case AMDGPU::V_SUBREV_CO_CI_U32_dpp8_gfx11:
case AMDGPU::V_CNDMASK_B32_e32_gfx6_gfx7:
case AMDGPU::V_CNDMASK_B32_e32_vi:
if ((int)OpNo == AMDGPU::getNamedOperandIdx(MI->getOpcode(),
AMDGPU::OpName::src1))
- printDefaultVccOperand(OpNo, STI, O);
+ printDefaultVccOperand(OpNo == 0, STI, O);
break;
}
@@ -732,6 +812,10 @@ void AMDGPUInstPrinter::printOperandAndFPInputMods(const MCInst *MI,
unsigned OpNo,
const MCSubtargetInfo &STI,
raw_ostream &O) {
+ const MCInstrDesc &Desc = MII.get(MI->getOpcode());
+ if (needsImpliedVcc(Desc, OpNo))
+ printDefaultVccOperand(true, STI, O);
+
unsigned InputModifiers = MI->getOperand(OpNo).getImm();
// Use 'neg(...)' instead of '-' to avoid ambiguity.
@@ -754,7 +838,7 @@ void AMDGPUInstPrinter::printOperandAndFPInputMods(const MCInst *MI,
if (InputModifiers & SISrcMods::ABS)
O << '|';
- printOperand(MI, OpNo + 1, STI, O);
+ printRegularOperand(MI, OpNo + 1, STI, O);
if (InputModifiers & SISrcMods::ABS)
O << '|';
@@ -767,10 +851,14 @@ void AMDGPUInstPrinter::printOperandAndIntInputMods(const MCInst *MI,
unsigned OpNo,
const MCSubtargetInfo &STI,
raw_ostream &O) {
+ const MCInstrDesc &Desc = MII.get(MI->getOpcode());
+ if (needsImpliedVcc(Desc, OpNo))
+ printDefaultVccOperand(true, STI, O);
+
unsigned InputModifiers = MI->getOperand(OpNo).getImm();
if (InputModifiers & SISrcMods::SEXT)
O << "sext(";
- printOperand(MI, OpNo + 1, STI, O);
+ printRegularOperand(MI, OpNo + 1, STI, O);
if (InputModifiers & SISrcMods::SEXT)
O << ')';
@@ -784,7 +872,7 @@ void AMDGPUInstPrinter::printOperandAndIntInputMods(const MCInst *MI,
case AMDGPU::V_SUBREV_CO_CI_U32_sdwa_gfx10:
if ((int)OpNo + 1 == AMDGPU::getNamedOperandIdx(MI->getOpcode(),
AMDGPU::OpName::src1))
- printDefaultVccOperand(OpNo, STI, O);
+ printDefaultVccOperand(OpNo == 0, STI, O);
break;
}
}
@@ -1203,9 +1291,9 @@ void AMDGPUInstPrinter::printVGPRIndexMode(const MCInst *MI, unsigned OpNo,
void AMDGPUInstPrinter::printMemOperand(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI,
raw_ostream &O) {
- printOperand(MI, OpNo, STI, O);
+ printRegularOperand(MI, OpNo, STI, O);
O << ", ";
- printOperand(MI, OpNo + 1, STI, O);
+ printRegularOperand(MI, OpNo + 1, STI, O);
}
void AMDGPUInstPrinter::printIfSet(const MCInst *MI, unsigned OpNo,
@@ -1262,15 +1350,16 @@ void AMDGPUInstPrinter::printSendMsg(const MCInst *MI, unsigned OpNo,
uint16_t MsgId;
uint16_t OpId;
uint16_t StreamId;
- decodeMsg(Imm16, MsgId, OpId, StreamId);
+ decodeMsg(Imm16, MsgId, OpId, StreamId, STI);
+
+ StringRef MsgName = getMsgName(MsgId, STI);
- if (isValidMsgId(MsgId, STI) &&
- isValidMsgOp(MsgId, OpId, STI) &&
+ if (!MsgName.empty() && isValidMsgOp(MsgId, OpId, STI) &&
isValidMsgStream(MsgId, OpId, StreamId, STI)) {
- O << "sendmsg(" << getMsgName(MsgId);
- if (msgRequiresOp(MsgId)) {
- O << ", " << getMsgOpName(MsgId, OpId);
- if (msgSupportsStream(MsgId, OpId)) {
+ O << "sendmsg(" << MsgName;
+ if (msgRequiresOp(MsgId, STI)) {
+ O << ", " << getMsgOpName(MsgId, OpId, STI);
+ if (msgSupportsStream(MsgId, OpId, STI)) {
O << ", " << StreamId;
}
}
@@ -1423,6 +1512,76 @@ void AMDGPUInstPrinter::printWaitFlag(const MCInst *MI, unsigned OpNo,
}
}
+void AMDGPUInstPrinter::printDepCtr(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI,
+ raw_ostream &O) {
+ using namespace llvm::AMDGPU::DepCtr;
+
+ uint64_t Imm16 = MI->getOperand(OpNo).getImm() & 0xffff;
+
+ bool HasNonDefaultVal = false;
+ if (isSymbolicDepCtrEncoding(Imm16, HasNonDefaultVal, STI)) {
+ int Id = 0;
+ StringRef Name;
+ unsigned Val;
+ bool IsDefault;
+ bool NeedSpace = false;
+ while (decodeDepCtr(Imm16, Id, Name, Val, IsDefault, STI)) {
+ if (!IsDefault || !HasNonDefaultVal) {
+ if (NeedSpace)
+ O << ' ';
+ O << Name << '(' << Val << ')';
+ NeedSpace = true;
+ }
+ }
+ } else {
+ O << formatHex(Imm16);
+ }
+}
+
+void AMDGPUInstPrinter::printDelayFlag(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI,
+ raw_ostream &O) {
+ const char *BadInstId = "/* invalid instid value */";
+ static const std::array<const char *, 12> InstIds = {
+ "NO_DEP", "VALU_DEP_1", "VALU_DEP_2",
+ "VALU_DEP_3", "VALU_DEP_4", "TRANS32_DEP_1",
+ "TRANS32_DEP_2", "TRANS32_DEP_3", "FMA_ACCUM_CYCLE_1",
+ "SALU_CYCLE_1", "SALU_CYCLE_2", "SALU_CYCLE_3"};
+
+ const char *BadInstSkip = "/* invalid instskip value */";
+ static const std::array<const char *, 6> InstSkips = {
+ "SAME", "NEXT", "SKIP_1", "SKIP_2", "SKIP_3", "SKIP_4"};
+
+ unsigned SImm16 = MI->getOperand(OpNo).getImm();
+ const char *Prefix = "";
+
+ unsigned Value = SImm16 & 0xF;
+ if (Value) {
+ const char *Name = Value < InstIds.size() ? InstIds[Value] : BadInstId;
+ O << Prefix << "instid0(" << Name << ')';
+ Prefix = " | ";
+ }
+
+ Value = (SImm16 >> 4) & 7;
+ if (Value) {
+ const char *Name =
+ Value < InstSkips.size() ? InstSkips[Value] : BadInstSkip;
+ O << Prefix << "instskip(" << Name << ')';
+ Prefix = " | ";
+ }
+
+ Value = (SImm16 >> 7) & 0xF;
+ if (Value) {
+ const char *Name = Value < InstIds.size() ? InstIds[Value] : BadInstId;
+ O << Prefix << "instid1(" << Name << ')';
+ Prefix = " | ";
+ }
+
+ if (!*Prefix)
+ O << "0";
+}
+
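Note on the new printDelayFlag above: the 16-bit s_delay_alu immediate packs instid0 in bits 3:0, instskip in bits 6:4 and instid1 in bits 10:7. A minimal standalone sketch of the same field split (illustrative only; the helper name and sample value are not part of the patch):

    #include <cstdio>

    // Illustrative decode matching the bit layout used by printDelayFlag above.
    static void decodeDelay(unsigned SImm16) {
      unsigned InstId0  = SImm16 & 0xF;        // bits 3:0  -> instid0
      unsigned InstSkip = (SImm16 >> 4) & 0x7; // bits 6:4  -> instskip
      unsigned InstId1  = (SImm16 >> 7) & 0xF; // bits 10:7 -> instid1
      std::printf("instid0=%u instskip=%u instid1=%u\n", InstId0, InstSkip, InstId1);
    }

    int main() {
      decodeDelay(0x91); // prints instid0=1 instskip=1 instid1=1, i.e.
                         // instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
      return 0;
    }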
void AMDGPUInstPrinter::printHwreg(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI, raw_ostream &O) {
unsigned Id;
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h
index 71db0beba0b6..202edeee3cb3 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h
@@ -15,6 +15,7 @@
#include "llvm/MC/MCInstPrinter.h"
namespace llvm {
+class MCInstrDesc;
class AMDGPUInstPrinter : public MCInstPrinter {
public:
@@ -50,7 +51,6 @@ private:
void printOffen(const MCInst *MI, unsigned OpNo, raw_ostream &O);
void printIdxen(const MCInst *MI, unsigned OpNo, raw_ostream &O);
void printAddr64(const MCInst *MI, unsigned OpNo, raw_ostream &O);
- void printMBUFOffset(const MCInst *MI, unsigned OpNo, raw_ostream &O);
void printOffset(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
raw_ostream &O);
void printFlatOffset(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
@@ -64,6 +64,8 @@ private:
const MCSubtargetInfo &STI, raw_ostream &O);
void printSMEMOffset(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI, raw_ostream &O);
+ void printSMEMOffsetMod(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI, raw_ostream &O);
void printSMRDLiteralOffset(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI, raw_ostream &O);
void printGDS(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
@@ -116,6 +118,8 @@ private:
raw_ostream &O);
void printOperand(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
raw_ostream &O);
+ void printRegularOperand(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI, raw_ostream &O);
void printOperand(const MCInst *MI, uint64_t /*Address*/, unsigned OpNum,
const MCSubtargetInfo &STI, raw_ostream &O) {
printOperand(MI, OpNum, STI, O);
@@ -172,8 +176,13 @@ private:
raw_ostream &O);
void printABID(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
raw_ostream &O);
- void printDefaultVccOperand(unsigned OpNo, const MCSubtargetInfo &STI,
+ bool needsImpliedVcc(const MCInstrDesc &Desc, unsigned OpNo) const;
+ void printDefaultVccOperand(bool FirstOperand, const MCSubtargetInfo &STI,
raw_ostream &O);
+ void printWaitVDST(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
+ raw_ostream &O);
+ void printWaitEXP(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
+ raw_ostream &O);
void printExpSrcN(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
raw_ostream &O, unsigned N);
@@ -234,6 +243,10 @@ protected:
raw_ostream &O);
void printWaitFlag(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI, raw_ostream &O);
+ void printDepCtr(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
+ raw_ostream &O);
+ void printDelayFlag(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI, raw_ostream &O);
void printHwreg(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
raw_ostream &O);
void printEndpgm(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h
index 53c724f2211a..02c213f90f89 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h
@@ -14,8 +14,8 @@
#ifndef LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUMCCODEEMITTER_H
#define LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUMCCODEEMITTER_H
+#include "llvm/ADT/APInt.h"
#include "llvm/MC/MCCodeEmitter.h"
-#include <cstdint>
namespace llvm {
@@ -34,46 +34,34 @@ protected:
AMDGPUMCCodeEmitter(const MCInstrInfo &mcii) : MCII(mcii) {}
public:
+ void getBinaryCodeForInstr(const MCInst &MI, SmallVectorImpl<MCFixup> &Fixups,
+ APInt &Inst, APInt &Scratch,
+ const MCSubtargetInfo &STI) const;
- uint64_t getBinaryCodeForInstr(const MCInst &MI,
- SmallVectorImpl<MCFixup> &Fixups,
- const MCSubtargetInfo &STI) const;
+ virtual void getMachineOpValue(const MCInst &MI, const MCOperand &MO,
+ APInt &Op, SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const = 0;
- virtual uint64_t getMachineOpValue(const MCInst &MI, const MCOperand &MO,
- SmallVectorImpl<MCFixup> &Fixups,
- const MCSubtargetInfo &STI) const {
- return 0;
- }
+ virtual void getSOPPBrEncoding(const MCInst &MI, unsigned OpNo, APInt &Op,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const = 0;
- virtual unsigned getSOPPBrEncoding(const MCInst &MI, unsigned OpNo,
+ virtual void getSMEMOffsetEncoding(const MCInst &MI, unsigned OpNo, APInt &Op,
SmallVectorImpl<MCFixup> &Fixups,
- const MCSubtargetInfo &STI) const {
- return 0;
- }
+ const MCSubtargetInfo &STI) const = 0;
- virtual unsigned getSMEMOffsetEncoding(const MCInst &MI, unsigned OpNo,
- SmallVectorImpl<MCFixup> &Fixups,
- const MCSubtargetInfo &STI) const {
- return 0;
- }
+ virtual void getSDWASrcEncoding(const MCInst &MI, unsigned OpNo, APInt &Op,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const = 0;
- virtual unsigned getSDWASrcEncoding(const MCInst &MI, unsigned OpNo,
+ virtual void getSDWAVopcDstEncoding(const MCInst &MI, unsigned OpNo,
+ APInt &Op,
SmallVectorImpl<MCFixup> &Fixups,
- const MCSubtargetInfo &STI) const {
- return 0;
- }
-
- virtual unsigned getSDWAVopcDstEncoding(const MCInst &MI, unsigned OpNo,
- SmallVectorImpl<MCFixup> &Fixups,
- const MCSubtargetInfo &STI) const {
- return 0;
- }
+ const MCSubtargetInfo &STI) const = 0;
- virtual unsigned getAVOperandEncoding(const MCInst &MI, unsigned OpNo,
- SmallVectorImpl<MCFixup> &Fixups,
- const MCSubtargetInfo &STI) const {
- return 0;
- }
+ virtual void getAVOperandEncoding(const MCInst &MI, unsigned OpNo, APInt &Op,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const = 0;
protected:
FeatureBitset computeAvailableFeatures(const FeatureBitset &FB) const;
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp
index 1f917cd91b47..11fe3f9ef058 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp
@@ -19,6 +19,7 @@
#include "R600InstPrinter.h"
#include "R600MCTargetDesc.h"
#include "TargetInfo/AMDGPUTargetInfo.h"
+#include "llvm/MC/LaneBitmask.h"
#include "llvm/MC/MCAsmBackend.h"
#include "llvm/MC/MCCodeEmitter.h"
#include "llvm/MC/MCELFStreamer.h"
@@ -27,6 +28,7 @@
#include "llvm/MC/MCInstrDesc.h"
#include "llvm/MC/MCInstrInfo.h"
#include "llvm/MC/MCObjectWriter.h"
+#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/MC/TargetRegistry.h"
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h
index e5cce6045c8c..060d4b660632 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h
@@ -33,7 +33,6 @@ enum AMDGPUDwarfFlavour : unsigned { Wave64 = 0, Wave32 = 1 };
MCRegisterInfo *createGCNMCRegisterInfo(AMDGPUDwarfFlavour DwarfFlavour);
MCCodeEmitter *createSIMCCodeEmitter(const MCInstrInfo &MCII,
- const MCRegisterInfo &MRI,
MCContext &Ctx);
MCAsmBackend *createAMDGPUAsmBackend(const Target &T,
@@ -51,7 +50,6 @@ createAMDGPUELFObjectWriter(bool Is64Bit, uint8_t OSABI,
#define GET_INSTRINFO_ENUM
#define GET_INSTRINFO_OPERAND_ENUM
-#define GET_INSTRINFO_SCHED_ENUM
#include "AMDGPUGenInstrInfo.inc"
#define GET_SUBTARGETINFO_ENUM
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
index 7aa5f1abf65b..078133469549 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
@@ -17,12 +17,16 @@
#include "Utils/AMDKernelCodeTUtils.h"
#include "llvm/BinaryFormat/AMDGPUMetadataVerifier.h"
#include "llvm/BinaryFormat/ELF.h"
+#include "llvm/MC/MCAssembler.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCELFStreamer.h"
#include "llvm/MC/MCSectionELF.h"
+#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/Support/AMDGPUMetadata.h"
#include "llvm/Support/AMDHSAKernelDescriptor.h"
+#include "llvm/Support/Casting.h"
#include "llvm/Support/FormattedStream.h"
+#include "llvm/Support/TargetParser.h"
using namespace llvm;
using namespace llvm::AMDGPU;
@@ -102,6 +106,7 @@ StringRef AMDGPUTargetStreamer::getArchNameFromElfMach(unsigned ElfMach) {
case ELF::EF_AMDGPU_MACH_AMDGCN_GFX909: AK = GK_GFX909; break;
case ELF::EF_AMDGPU_MACH_AMDGCN_GFX90A: AK = GK_GFX90A; break;
case ELF::EF_AMDGPU_MACH_AMDGCN_GFX90C: AK = GK_GFX90C; break;
+ case ELF::EF_AMDGPU_MACH_AMDGCN_GFX940: AK = GK_GFX940; break;
case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1010: AK = GK_GFX1010; break;
case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1011: AK = GK_GFX1011; break;
case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1012: AK = GK_GFX1012; break;
@@ -112,6 +117,11 @@ StringRef AMDGPUTargetStreamer::getArchNameFromElfMach(unsigned ElfMach) {
case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1033: AK = GK_GFX1033; break;
case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1034: AK = GK_GFX1034; break;
case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1035: AK = GK_GFX1035; break;
+ case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1036: AK = GK_GFX1036; break;
+ case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1100: AK = GK_GFX1100; break;
+ case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1101: AK = GK_GFX1101; break;
+ case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1102: AK = GK_GFX1102; break;
+ case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1103: AK = GK_GFX1103; break;
case ELF::EF_AMDGPU_MACH_NONE: AK = GK_NONE; break;
}
@@ -165,6 +175,7 @@ unsigned AMDGPUTargetStreamer::getElfMach(StringRef GPU) {
case GK_GFX909: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX909;
case GK_GFX90A: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX90A;
case GK_GFX90C: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX90C;
+ case GK_GFX940: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX940;
case GK_GFX1010: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1010;
case GK_GFX1011: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1011;
case GK_GFX1012: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1012;
@@ -175,6 +186,11 @@ unsigned AMDGPUTargetStreamer::getElfMach(StringRef GPU) {
case GK_GFX1033: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1033;
case GK_GFX1034: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1034;
case GK_GFX1035: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1035;
+ case GK_GFX1036: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1036;
+ case GK_GFX1100: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1100;
+ case GK_GFX1101: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1101;
+ case GK_GFX1102: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1102;
+ case GK_GFX1103: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1103;
case GK_NONE: return ELF::EF_AMDGPU_MACH_NONE;
}
@@ -285,7 +301,7 @@ bool AMDGPUTargetAsmStreamer::EmitCodeEnd(const MCSubtargetInfo &STI) {
uint32_t Encoded_pad = Encoded_s_code_end;
// Instruction cache line size in bytes.
- const unsigned Log2CacheLineSize = 6;
+ const unsigned Log2CacheLineSize = AMDGPU::isGFX11Plus(STI) ? 7 : 6;
const unsigned CacheLineSize = 1u << Log2CacheLineSize;
// Extra padding amount in bytes to support prefetch mode 3.
@@ -439,6 +455,8 @@ void AMDGPUTargetAsmStreamer::EmitAmdhsaKernelDescriptor(
PRINT_FIELD(OS, ".amdhsa_forward_progress", KD,
compute_pgm_rsrc1,
amdhsa::COMPUTE_PGM_RSRC1_FWD_PROGRESS);
+ PRINT_FIELD(OS, ".amdhsa_shared_vgpr_count", KD, compute_pgm_rsrc3,
+ amdhsa::COMPUTE_PGM_RSRC3_GFX10_PLUS_SHARED_VGPR_COUNT);
}
PRINT_FIELD(
OS, ".amdhsa_exception_fp_ieee_invalid_op", KD,
@@ -515,8 +533,8 @@ void AMDGPUTargetELFStreamer::EmitNote(
if (STI.getTargetTriple().getOS() == Triple::AMDHSA)
NoteFlags = ELF::SHF_ALLOC;
- S.PushSection();
- S.SwitchSection(
+ S.pushSection();
+ S.switchSection(
Context.getELFSection(ElfNote::SectionName, ELF::SHT_NOTE, NoteFlags));
S.emitInt32(NameSZ); // namesz
S.emitValue(DescSZ, 4); // descz
@@ -525,7 +543,7 @@ void AMDGPUTargetELFStreamer::EmitNote(
S.emitValueToAlignment(4, 0, 1, 0); // padding 0
EmitDesc(S); // desc
S.emitValueToAlignment(4, 0, 1, 0); // padding 0
- S.PopSection();
+ S.popSection();
}
unsigned AMDGPUTargetELFStreamer::getEFlags() {
@@ -691,7 +709,7 @@ AMDGPUTargetELFStreamer::EmitDirectiveHSACodeObjectISAV2(uint32_t Major,
OS.emitBytes(VendorName);
OS.emitInt8(0); // NULL terminate VendorName
OS.emitBytes(ArchName);
- OS.emitInt8(0); // NULL terminte ArchName
+ OS.emitInt8(0); // NULL terminate ArchName
});
}
@@ -699,9 +717,9 @@ void
AMDGPUTargetELFStreamer::EmitAMDKernelCodeT(const amd_kernel_code_t &Header) {
MCStreamer &OS = getStreamer();
- OS.PushSection();
+ OS.pushSection();
OS.emitBytes(StringRef((const char*)&Header, sizeof(Header)));
- OS.PopSection();
+ OS.popSection();
}
void AMDGPUTargetELFStreamer::EmitAMDGPUSymbolType(StringRef SymbolName,
@@ -806,7 +824,7 @@ bool AMDGPUTargetELFStreamer::EmitCodeEnd(const MCSubtargetInfo &STI) {
uint32_t Encoded_pad = Encoded_s_code_end;
// Instruction cache line size in bytes.
- const unsigned Log2CacheLineSize = 6;
+ const unsigned Log2CacheLineSize = AMDGPU::isGFX11Plus(STI) ? 7 : 6;
const unsigned CacheLineSize = 1u << Log2CacheLineSize;
// Extra padding amount in bytes to support prefetch mode 3.
@@ -818,11 +836,11 @@ bool AMDGPUTargetELFStreamer::EmitCodeEnd(const MCSubtargetInfo &STI) {
}
MCStreamer &OS = getStreamer();
- OS.PushSection();
+ OS.pushSection();
OS.emitValueToAlignment(CacheLineSize, Encoded_pad, 4);
for (unsigned I = 0; I < FillSize; I += 4)
OS.emitInt32(Encoded_pad);
- OS.PopSection();
+ OS.popSection();
return true;
}
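Both streamers above get the same change: on gfx11 the s_code_end padding is measured against 128-byte instruction cache lines (Log2CacheLineSize == 7) rather than 64-byte lines. A small sketch of the alignment arithmetic, assuming only the line sizes shown in the hunks (the real code additionally emits extra whole lines for prefetch mode 3):

    #include <cstdint>
    #include <cstdio>

    // Illustrative only: bytes of padding needed to round a section size up to
    // the next instruction cache line boundary (128 bytes on gfx11, 64 before).
    static uint64_t padToCacheLine(uint64_t SectionSize, bool IsGFX11Plus) {
      const uint64_t CacheLineSize = 1u << (IsGFX11Plus ? 7 : 6);
      return (CacheLineSize - (SectionSize % CacheLineSize)) % CacheLineSize;
    }

    int main() {
      std::printf("%llu\n", (unsigned long long)padToCacheLine(300, true)); // 84
      return 0;
    }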
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp
index 6fe192e95e72..78eb304fe84f 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp
@@ -20,6 +20,7 @@
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCInstrInfo.h"
#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/MC/SubtargetFeature.h"
#include "llvm/Support/EndianStream.h"
@@ -84,9 +85,8 @@ enum FCInstr {
};
MCCodeEmitter *llvm::createR600MCCodeEmitter(const MCInstrInfo &MCII,
- const MCRegisterInfo &MRI,
MCContext &Ctx) {
- return new R600MCCodeEmitter(MCII, MRI);
+ return new R600MCCodeEmitter(MCII, *Ctx.getRegisterInfo());
}
void R600MCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS,
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCTargetDesc.h b/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCTargetDesc.h
index fc52cb33824f..605ae851378d 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCTargetDesc.h
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCTargetDesc.h
@@ -24,7 +24,6 @@ class MCInstrInfo;
class MCRegisterInfo;
MCCodeEmitter *createR600MCCodeEmitter(const MCInstrInfo &MCII,
- const MCRegisterInfo &MRI,
MCContext &Ctx);
MCInstrInfo *createR600MCInstrInfo();
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp
index 77f219aaa3ab..5e67fb5ec876 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp
@@ -17,10 +17,15 @@
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIDefines.h"
#include "Utils/AMDGPUBaseInfo.h"
+#include "llvm/ADT/APInt.h"
+#include "llvm/MC/MCCodeEmitter.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCInstrInfo.h"
#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/SubtargetFeature.h"
+#include "llvm/Support/Casting.h"
using namespace llvm;
@@ -34,9 +39,8 @@ class SIMCCodeEmitter : public AMDGPUMCCodeEmitter {
const MCSubtargetInfo &STI) const;
public:
- SIMCCodeEmitter(const MCInstrInfo &mcii, const MCRegisterInfo &mri,
- MCContext &ctx)
- : AMDGPUMCCodeEmitter(mcii), MRI(mri) {}
+ SIMCCodeEmitter(const MCInstrInfo &mcii, MCContext &ctx)
+ : AMDGPUMCCodeEmitter(mcii), MRI(*ctx.getRegisterInfo()) {}
SIMCCodeEmitter(const SIMCCodeEmitter &) = delete;
SIMCCodeEmitter &operator=(const SIMCCodeEmitter &) = delete;
@@ -46,42 +50,45 @@ public:
const MCSubtargetInfo &STI) const override;
/// \returns the encoding for an MCOperand.
- uint64_t getMachineOpValue(const MCInst &MI, const MCOperand &MO,
- SmallVectorImpl<MCFixup> &Fixups,
- const MCSubtargetInfo &STI) const override;
+ void getMachineOpValue(const MCInst &MI, const MCOperand &MO, APInt &Op,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const override;
/// Use a fixup to encode the simm16 field for SOPP branch
/// instructions.
- unsigned getSOPPBrEncoding(const MCInst &MI, unsigned OpNo,
+ void getSOPPBrEncoding(const MCInst &MI, unsigned OpNo, APInt &Op,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const override;
+
+ void getSMEMOffsetEncoding(const MCInst &MI, unsigned OpNo, APInt &Op,
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const override;
- unsigned getSMEMOffsetEncoding(const MCInst &MI, unsigned OpNo,
- SmallVectorImpl<MCFixup> &Fixups,
- const MCSubtargetInfo &STI) const override;
+ void getSDWASrcEncoding(const MCInst &MI, unsigned OpNo, APInt &Op,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const override;
- unsigned getSDWASrcEncoding(const MCInst &MI, unsigned OpNo,
+ void getSDWAVopcDstEncoding(const MCInst &MI, unsigned OpNo, APInt &Op,
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const override;
- unsigned getSDWAVopcDstEncoding(const MCInst &MI, unsigned OpNo,
- SmallVectorImpl<MCFixup> &Fixups,
- const MCSubtargetInfo &STI) const override;
-
- unsigned getAVOperandEncoding(const MCInst &MI, unsigned OpNo,
- SmallVectorImpl<MCFixup> &Fixups,
- const MCSubtargetInfo &STI) const override;
+ void getAVOperandEncoding(const MCInst &MI, unsigned OpNo, APInt &Op,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const override;
private:
uint64_t getImplicitOpSelHiEncoding(int Opcode) const;
+ void getMachineOpValueCommon(const MCInst &MI, const MCOperand &MO,
+ unsigned OpNo, APInt &Op,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
};
} // end anonymous namespace
MCCodeEmitter *llvm::createSIMCCodeEmitter(const MCInstrInfo &MCII,
- const MCRegisterInfo &MRI,
MCContext &Ctx) {
- return new SIMCCodeEmitter(MCII, MRI, Ctx);
+ return new SIMCCodeEmitter(MCII, Ctx);
}
// Returns the encoding value to use if the given integer is an integer inline
@@ -309,8 +316,9 @@ void SIMCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS,
computeAvailableFeatures(STI.getFeatureBits()));
int Opcode = MI.getOpcode();
- uint64_t Encoding = getBinaryCodeForInstr(MI, Fixups, STI);
- const MCInstrDesc &Desc = MCII.get(Opcode);
+ APInt Encoding, Scratch;
+ getBinaryCodeForInstr(MI, Fixups, Encoding, Scratch, STI);
+ const MCInstrDesc &Desc = MCII.get(MI.getOpcode());
unsigned bytes = Desc.getSize();
// Set unused op_sel_hi bits to 1 for VOP3P and MAI instructions.
@@ -322,7 +330,7 @@ void SIMCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS,
}
for (unsigned i = 0; i < bytes; i++) {
- OS.write((uint8_t) ((Encoding >> (8 * i)) & 0xff));
+ OS.write((uint8_t)Encoding.extractBitsAsZExtValue(8, 8 * i));
}
// NSA encoding.
@@ -335,9 +343,11 @@ void SIMCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS,
unsigned NumExtraAddrs = srsrc - vaddr0 - 1;
unsigned NumPadding = (-NumExtraAddrs) & 3;
- for (unsigned i = 0; i < NumExtraAddrs; ++i)
- OS.write((uint8_t)getMachineOpValue(MI, MI.getOperand(vaddr0 + 1 + i),
- Fixups, STI));
+ for (unsigned i = 0; i < NumExtraAddrs; ++i) {
+ getMachineOpValue(MI, MI.getOperand(vaddr0 + 1 + i), Encoding, Fixups,
+ STI);
+ OS.write((uint8_t)Encoding.getLimitedValue());
+ }
for (unsigned i = 0; i < NumPadding; ++i)
OS.write(0);
}
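The emitter now builds the instruction encoding in an APInt instead of a uint64_t, so encodings wider than 64 bits can be represented, and bytes are pulled out with extractBitsAsZExtValue rather than shifts. A short sketch of that byte extraction (illustrative; it only requires the LLVM ADT headers, and the sample value is arbitrary):

    #include "llvm/ADT/APInt.h"
    #include <cstdio>

    // Illustrative only: emitting an encoding byte-by-byte from an APInt, as the
    // updated encodeInstruction does for the first Desc.getSize() bytes.
    int main() {
      llvm::APInt Encoding(96, 0);
      Encoding = 0x0102030405060708ull;    // stand-in for a real encoding
      for (unsigned i = 0; i < 8; ++i)     // little-endian byte order
        std::printf("byte %u = 0x%02x\n", i,
                    (unsigned)Encoding.extractBitsAsZExtValue(8, 8 * i));
      return 0;
    }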
@@ -385,34 +395,36 @@ void SIMCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS,
}
}
-unsigned SIMCCodeEmitter::getSOPPBrEncoding(const MCInst &MI, unsigned OpNo,
- SmallVectorImpl<MCFixup> &Fixups,
- const MCSubtargetInfo &STI) const {
+void SIMCCodeEmitter::getSOPPBrEncoding(const MCInst &MI, unsigned OpNo,
+ APInt &Op,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
const MCOperand &MO = MI.getOperand(OpNo);
if (MO.isExpr()) {
const MCExpr *Expr = MO.getExpr();
MCFixupKind Kind = (MCFixupKind)AMDGPU::fixup_si_sopp_br;
Fixups.push_back(MCFixup::create(0, Expr, Kind, MI.getLoc()));
- return 0;
+ Op = APInt::getNullValue(96);
+ } else {
+ getMachineOpValue(MI, MO, Op, Fixups, STI);
}
-
- return getMachineOpValue(MI, MO, Fixups, STI);
}
-unsigned SIMCCodeEmitter::getSMEMOffsetEncoding(const MCInst &MI, unsigned OpNo,
- SmallVectorImpl<MCFixup> &Fixups,
- const MCSubtargetInfo &STI) const {
+void SIMCCodeEmitter::getSMEMOffsetEncoding(const MCInst &MI, unsigned OpNo,
+ APInt &Op,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
auto Offset = MI.getOperand(OpNo).getImm();
// VI only supports 20-bit unsigned offsets.
assert(!AMDGPU::isVI(STI) || isUInt<20>(Offset));
- return Offset;
+ Op = Offset;
}
-unsigned
-SIMCCodeEmitter::getSDWASrcEncoding(const MCInst &MI, unsigned OpNo,
- SmallVectorImpl<MCFixup> &Fixups,
- const MCSubtargetInfo &STI) const {
+void SIMCCodeEmitter::getSDWASrcEncoding(const MCInst &MI, unsigned OpNo,
+ APInt &Op,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
using namespace AMDGPU::SDWA;
uint64_t RegEnc = 0;
@@ -426,23 +438,24 @@ SIMCCodeEmitter::getSDWASrcEncoding(const MCInst &MI, unsigned OpNo,
if (AMDGPU::isSGPR(AMDGPU::mc2PseudoReg(Reg), &MRI)) {
RegEnc |= SDWA9EncValues::SRC_SGPR_MASK;
}
- return RegEnc;
+ Op = RegEnc;
+ return;
} else {
const MCInstrDesc &Desc = MCII.get(MI.getOpcode());
uint32_t Enc = getLitEncoding(MO, Desc.OpInfo[OpNo], STI);
if (Enc != ~0U && Enc != 255) {
- return Enc | SDWA9EncValues::SRC_SGPR_MASK;
+ Op = Enc | SDWA9EncValues::SRC_SGPR_MASK;
+ return;
}
}
llvm_unreachable("Unsupported operand kind");
- return 0;
}
-unsigned
-SIMCCodeEmitter::getSDWAVopcDstEncoding(const MCInst &MI, unsigned OpNo,
- SmallVectorImpl<MCFixup> &Fixups,
- const MCSubtargetInfo &STI) const {
+void SIMCCodeEmitter::getSDWAVopcDstEncoding(const MCInst &MI, unsigned OpNo,
+ APInt &Op,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
using namespace AMDGPU::SDWA;
uint64_t RegEnc = 0;
@@ -455,13 +468,13 @@ SIMCCodeEmitter::getSDWAVopcDstEncoding(const MCInst &MI, unsigned OpNo,
RegEnc &= SDWA9EncValues::VOPC_DST_SGPR_MASK;
RegEnc |= SDWA9EncValues::VOPC_DST_VCC_MASK;
}
- return RegEnc;
+ Op = RegEnc;
}
-unsigned
-SIMCCodeEmitter::getAVOperandEncoding(const MCInst &MI, unsigned OpNo,
- SmallVectorImpl<MCFixup> &Fixups,
- const MCSubtargetInfo &STI) const {
+void SIMCCodeEmitter::getAVOperandEncoding(const MCInst &MI, unsigned OpNo,
+ APInt &Op,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
unsigned Reg = MI.getOperand(OpNo).getReg();
uint64_t Enc = MRI.getEncodingValue(Reg);
@@ -476,10 +489,11 @@ SIMCCodeEmitter::getAVOperandEncoding(const MCInst &MI, unsigned OpNo,
MRI.getRegClass(AMDGPU::AReg_192RegClassID).contains(Reg) ||
MRI.getRegClass(AMDGPU::AReg_224RegClassID).contains(Reg) ||
MRI.getRegClass(AMDGPU::AReg_256RegClassID).contains(Reg) ||
+ MRI.getRegClass(AMDGPU::AReg_512RegClassID).contains(Reg) ||
MRI.getRegClass(AMDGPU::AGPR_LO16RegClassID).contains(Reg))
Enc |= 512;
- return Enc;
+ Op = Enc;
}
static bool needsPCRel(const MCExpr *Expr) {
@@ -505,12 +519,21 @@ static bool needsPCRel(const MCExpr *Expr) {
llvm_unreachable("invalid kind");
}
-uint64_t SIMCCodeEmitter::getMachineOpValue(const MCInst &MI,
- const MCOperand &MO,
- SmallVectorImpl<MCFixup> &Fixups,
- const MCSubtargetInfo &STI) const {
- if (MO.isReg())
- return MRI.getEncodingValue(MO.getReg());
+void SIMCCodeEmitter::getMachineOpValue(const MCInst &MI,
+ const MCOperand &MO, APInt &Op,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ if (MO.isReg()){
+ Op = MRI.getEncodingValue(MO.getReg());
+ return;
+ }
+ unsigned OpNo = &MO - MI.begin();
+ getMachineOpValueCommon(MI, MO, OpNo, Op, Fixups, STI);
+}
+
+void SIMCCodeEmitter::getMachineOpValueCommon(
+ const MCInst &MI, const MCOperand &MO, unsigned OpNo, APInt &Op,
+ SmallVectorImpl<MCFixup> &Fixups, const MCSubtargetInfo &STI) const {
if (MO.isExpr() && MO.getExpr()->getKind() != MCExpr::Constant) {
// FIXME: If this is expression is PCRel or not should not depend on what
@@ -533,28 +556,22 @@ uint64_t SIMCCodeEmitter::getMachineOpValue(const MCInst &MI,
uint32_t Offset = Desc.getSize();
assert(Offset == 4 || Offset == 8);
- Fixups.push_back(
- MCFixup::create(Offset, MO.getExpr(), Kind, MI.getLoc()));
- }
-
- // Figure out the operand number, needed for isSrcOperand check
- unsigned OpNo = 0;
- for (unsigned e = MI.getNumOperands(); OpNo < e; ++OpNo) {
- if (&MO == &MI.getOperand(OpNo))
- break;
+ Fixups.push_back(MCFixup::create(Offset, MO.getExpr(), Kind, MI.getLoc()));
}
const MCInstrDesc &Desc = MCII.get(MI.getOpcode());
if (AMDGPU::isSISrcOperand(Desc, OpNo)) {
uint32_t Enc = getLitEncoding(MO, Desc.OpInfo[OpNo], STI);
- if (Enc != ~0U)
- return Enc;
-
- } else if (MO.isImm())
- return MO.getImm();
+ if (Enc != ~0U) {
+ Op = Enc;
+ return;
+ }
+ } else if (MO.isImm()) {
+ Op = MO.getImm();
+ return;
+ }
llvm_unreachable("Encoding of this operand type is not supported yet.");
- return 0;
}
#define ENABLE_INSTR_PREDICATE_VERIFIER
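One detail worth noting from the getMachineOpValue hunk above: the operand index is now recovered with pointer arithmetic, `&MO - MI.begin()`, instead of the old linear search over the operand list; this relies on MCInst keeping its operands in contiguous storage (a SmallVector). A tiny standalone illustration of the same idiom, using a plain std::vector rather than LLVM types:

    #include <cstdio>
    #include <vector>

    int main() {
      std::vector<int> Operands = {10, 20, 30, 40};
      const int &MO = Operands[2];
      // Pointer difference between an element and the start of contiguous
      // storage yields its index, mirroring `&MO - MI.begin()` in the patch.
      unsigned OpNo = static_cast<unsigned>(&MO - Operands.data());
      std::printf("OpNo = %u\n", OpNo); // prints OpNo = 2
      return 0;
    }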
diff --git a/llvm/lib/Target/AMDGPU/MIMGInstructions.td b/llvm/lib/Target/AMDGPU/MIMGInstructions.td
index cf03fd682143..be1addf35012 100644
--- a/llvm/lib/Target/AMDGPU/MIMGInstructions.td
+++ b/llvm/lib/Target/AMDGPU/MIMGInstructions.td
@@ -14,6 +14,8 @@
// - MIMGEncGfx90a: encoding for gfx90a for atomics
// - MIMGEncGfx10Default: gfx10 default (non-NSA) encoding
// - MIMGEncGfx10NSA: gfx10 NSA encoding
+// - MIMGEncGfx11Default: gfx11 default (non-NSA) encoding
+// - MIMGEncGfx11NSA: gfx11 NSA encoding
class MIMGEncoding;
def MIMGEncGfx6 : MIMGEncoding;
@@ -21,6 +23,8 @@ def MIMGEncGfx8 : MIMGEncoding;
def MIMGEncGfx90a : MIMGEncoding;
def MIMGEncGfx10Default : MIMGEncoding;
def MIMGEncGfx10NSA : MIMGEncoding;
+def MIMGEncGfx11Default : MIMGEncoding;
+def MIMGEncGfx11NSA : MIMGEncoding;
def MIMGEncoding : GenericEnum {
let FilterClass = "MIMGEncoding";
@@ -90,11 +94,13 @@ def MIMG {
int NOP = -1;
}
-class mimgopc <int base, int vi = base, int si = base> {
- field bits<8> BASE = base; // Opcode for all but atomics
+class mimgopc <int gfx11, int gfx10m, int vi = gfx10m, int si = gfx10m> {
+ field bits<8> GFX11 = gfx11;
+ field bits<8> GFX10M = gfx10m; // GFX10minus for all but atomics
field bits<8> VI = vi; // VI is only used for atomic instructions
field bits<8> SI = si; // SI is only used for atomic instructions
- bit HAS_BASE = !ne(base, MIMG.NOP);
+ bit HAS_GFX11 = !ne(gfx11, MIMG.NOP);
+ bit HAS_GFX10M = !ne(gfx10m, MIMG.NOP);
bit HAS_VI = !ne(vi, MIMG.NOP);
bit HAS_SI = !ne(si, MIMG.NOP);
}
@@ -207,12 +213,16 @@ class MIMG <dag outs, string dns = "">
MIMGEncoding MIMGEncoding;
bits<8> VDataDwords;
bits<8> VAddrDwords;
+
+ // If NSA is used this counts number of operands VAddrDwords is split into.
+ bits<8> VAddrOperands;
}
def MIMGInfoTable : GenericTable {
let FilterClass = "MIMG";
let CppTypeName = "MIMGInfo";
- let Fields = ["Opcode", "BaseOpcode", "MIMGEncoding", "VDataDwords", "VAddrDwords"];
+ let Fields = ["Opcode", "BaseOpcode", "MIMGEncoding", "VDataDwords",
+ "VAddrDwords", "VAddrOperands"];
string TypeOf_BaseOpcode = "MIMGBaseOpcode";
string TypeOf_MIMGEncoding = "MIMGEncoding";
@@ -227,11 +237,12 @@ def getMIMGInfo : SearchIndex {
// This class used to use !foldl to memoize the AddrAsmNames list.
// It turned out that that was much slower than using !filter.
-class MIMGNSAHelper<int num_addrs> {
+class MIMGNSAHelper<int num_addrs,
+ list<RegisterClass> addr_types=!listsplat(VGPR_32, num_addrs)> {
list<string> AddrAsmNames =
!foreach(i, !filter(i, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11],
!lt(i, num_addrs)), "vaddr" # i);
- dag AddrIns = !dag(ins, !foreach(arg, AddrAsmNames, VGPR_32), AddrAsmNames);
+ dag AddrIns = !dag(ins, addr_types, AddrAsmNames);
string AddrAsm = "[$" # !interleave(AddrAsmNames, ", $") # "]";
int NSA = !if(!le(num_addrs, 1), ?,
@@ -247,6 +258,7 @@ class MIMG_gfx6789<bits<8> op, dag outs, string dns = "">
let AssemblerPredicate = isGFX6GFX7GFX8GFX9NotGFX90A;
let MIMGEncoding = MIMGEncGfx6;
+ let VAddrOperands = 1;
let d16 = !if(BaseOpcode.HasD16, ?, 0);
}
@@ -257,6 +269,7 @@ class MIMG_gfx90a<bits<8> op, dag outs, string dns = "">
let AssemblerPredicate = isGFX90APlus;
let MIMGEncoding = MIMGEncGfx90a;
+ let VAddrOperands = 1;
let d16 = !if(BaseOpcode.HasD16, ?, 0);
}
@@ -264,10 +277,11 @@ class MIMG_gfx90a<bits<8> op, dag outs, string dns = "">
// Base class of all non-NSA gfx10 MIMG instructions.
class MIMG_gfx10<int op, dag outs, string dns = "">
: MIMG<outs, dns>, MIMGe_gfx10<op> {
- let SubtargetPredicate = isGFX10Plus;
- let AssemblerPredicate = isGFX10Plus;
+ let SubtargetPredicate = isGFX10Only;
+ let AssemblerPredicate = isGFX10Only;
let MIMGEncoding = MIMGEncGfx10Default;
+ let VAddrOperands = 1;
let d16 = !if(BaseOpcode.HasD16, ?, 0);
let nsa = 0;
@@ -277,10 +291,11 @@ class MIMG_gfx10<int op, dag outs, string dns = "">
// Note that 1-dword addresses always use non-NSA variants.
class MIMG_nsa_gfx10<int op, dag outs, int num_addrs, string dns="">
: MIMG<outs, dns>, MIMGe_gfx10<op> {
- let SubtargetPredicate = isGFX10Plus;
- let AssemblerPredicate = isGFX10Plus;
+ let SubtargetPredicate = isGFX10Only;
+ let AssemblerPredicate = isGFX10Only;
let MIMGEncoding = MIMGEncGfx10NSA;
+ let VAddrOperands = num_addrs;
MIMGNSAHelper nsah = MIMGNSAHelper<num_addrs>;
dag AddrIns = nsah.AddrIns;
@@ -290,11 +305,45 @@ class MIMG_nsa_gfx10<int op, dag outs, int num_addrs, string dns="">
let nsa = nsah.NSA;
}
+// Base class of all non-NSA gfx11 MIMG instructions.
+class MIMG_gfx11<int op, dag outs, string dns = "">
+ : MIMG<outs, dns>, MIMGe_gfx11<op> {
+ let SubtargetPredicate = isGFX11Plus;
+ let AssemblerPredicate = isGFX11Plus;
+
+ let MIMGEncoding = MIMGEncGfx11Default;
+ let VAddrOperands = 1;
+
+ let d16 = !if(BaseOpcode.HasD16, ?, 0);
+ let nsa = 0;
+}
+
+// Base class for all NSA MIMG instructions.
+// Note that 1-dword addresses always use non-NSA variants.
+class MIMG_nsa_gfx11<int op, dag outs, int num_addrs, string dns="",
+ list<RegisterClass> addr_types=[]>
+ : MIMG<outs, dns>, MIMGe_gfx11<op> {
+ let SubtargetPredicate = isGFX11Plus;
+ let AssemblerPredicate = isGFX11Plus;
+
+ let MIMGEncoding = MIMGEncGfx11NSA;
+ let VAddrOperands = num_addrs;
+
+ MIMGNSAHelper nsah = !if(!empty(addr_types),
+ MIMGNSAHelper<num_addrs>,
+ MIMGNSAHelper<num_addrs, addr_types>);
+ dag AddrIns = nsah.AddrIns;
+ string AddrAsm = nsah.AddrAsm;
+
+ let d16 = !if(BaseOpcode.HasD16, ?, 0);
+ let nsa = nsah.NSA;
+}
+
class MIMG_NoSampler_Helper <mimgopc op, string asm,
RegisterClass dst_rc,
RegisterClass addr_rc,
string dns="">
- : MIMG_gfx6789 <op.BASE, (outs dst_rc:$vdata), dns> {
+ : MIMG_gfx6789 <op.GFX10M, (outs dst_rc:$vdata), dns> {
let InOperandList = !con((ins addr_rc:$vaddr, SReg_256:$srsrc,
DMask:$dmask, UNorm:$unorm, CPol:$cpol,
R128A16:$r128, TFE:$tfe, LWE:$lwe, DA:$da),
@@ -307,7 +356,7 @@ class MIMG_NoSampler_Helper_gfx90a <mimgopc op, string asm,
RegisterClass dst_rc,
RegisterClass addr_rc,
string dns="">
- : MIMG_gfx90a <op.BASE, (outs getLdStRegisterOperand<dst_rc>.ret:$vdata), dns> {
+ : MIMG_gfx90a <op.GFX10M, (outs getLdStRegisterOperand<dst_rc>.ret:$vdata), dns> {
let InOperandList = !con((ins addr_rc:$vaddr, SReg_256:$srsrc,
DMask:$dmask, UNorm:$unorm, CPol:$cpol,
R128A16:$r128, LWE:$lwe, DA:$da),
@@ -319,7 +368,7 @@ class MIMG_NoSampler_Helper_gfx90a <mimgopc op, string asm,
class MIMG_NoSampler_gfx10<mimgopc op, string opcode,
RegisterClass DataRC, RegisterClass AddrRC,
string dns="">
- : MIMG_gfx10<op.BASE, (outs DataRC:$vdata), dns> {
+ : MIMG_gfx10<op.GFX10M, (outs DataRC:$vdata), dns> {
let InOperandList = !con((ins AddrRC:$vaddr0, SReg_256:$srsrc, DMask:$dmask,
Dim:$dim, UNorm:$unorm, CPol:$cpol,
R128A16:$r128, GFX10A16:$a16, TFE:$tfe, LWE:$lwe),
@@ -331,7 +380,32 @@ class MIMG_NoSampler_gfx10<mimgopc op, string opcode,
class MIMG_NoSampler_nsa_gfx10<mimgopc op, string opcode,
RegisterClass DataRC, int num_addrs,
string dns="">
- : MIMG_nsa_gfx10<op.BASE, (outs DataRC:$vdata), num_addrs, dns> {
+ : MIMG_nsa_gfx10<op.GFX10M, (outs DataRC:$vdata), num_addrs, dns> {
+ let InOperandList = !con(AddrIns,
+ (ins SReg_256:$srsrc, DMask:$dmask,
+ Dim:$dim, UNorm:$unorm, CPol:$cpol,
+ R128A16:$r128, GFX10A16:$a16, TFE:$tfe, LWE:$lwe),
+ !if(BaseOpcode.HasD16, (ins D16:$d16), (ins)));
+ let AsmString = opcode#" $vdata, "#AddrAsm#", $srsrc$dmask$dim$unorm$cpol$r128$a16$tfe$lwe"
+ #!if(BaseOpcode.HasD16, "$d16", "");
+}
+
+class MIMG_NoSampler_gfx11<mimgopc op, string opcode,
+ RegisterClass DataRC, RegisterClass AddrRC,
+ string dns="">
+ : MIMG_gfx11<op.GFX11, (outs DataRC:$vdata), dns> {
+ let InOperandList = !con((ins AddrRC:$vaddr0, SReg_256:$srsrc, DMask:$dmask,
+ Dim:$dim, UNorm:$unorm, CPol:$cpol,
+ R128A16:$r128, GFX10A16:$a16, TFE:$tfe, LWE:$lwe),
+ !if(BaseOpcode.HasD16, (ins D16:$d16), (ins)));
+ let AsmString = opcode#" $vdata, $vaddr0, $srsrc$dmask$dim$unorm$cpol$r128$a16$tfe$lwe"
+ #!if(BaseOpcode.HasD16, "$d16", "");
+}
+
+class MIMG_NoSampler_nsa_gfx11<mimgopc op, string opcode,
+ RegisterClass DataRC, int num_addrs,
+ string dns="">
+ : MIMG_nsa_gfx11<op.GFX11, (outs DataRC:$vdata), num_addrs, dns> {
let InOperandList = !con(AddrIns,
(ins SReg_256:$srsrc, DMask:$dmask,
Dim:$dim, UNorm:$unorm, CPol:$cpol,
@@ -347,7 +421,7 @@ multiclass MIMG_NoSampler_Src_Helper <mimgopc op, string asm,
bit ExtendedImageInst = 1> {
let ssamp = 0 in {
let VAddrDwords = 1 in {
- if op.HAS_BASE then {
+ if op.HAS_GFX10M then {
def _V1 : MIMG_NoSampler_Helper <op, asm, dst_rc, VGPR_32,
!if(enableDisasm, "AMDGPU", "")>;
if !not(ExtendedImageInst) then
@@ -356,30 +430,42 @@ multiclass MIMG_NoSampler_Src_Helper <mimgopc op, string asm,
def _V1_gfx10 : MIMG_NoSampler_gfx10<op, asm, dst_rc, VGPR_32,
!if(enableDisasm, "AMDGPU", "")>;
}
+ if op.HAS_GFX11 then {
+ def _V1_gfx11 : MIMG_NoSampler_gfx11<op, asm, dst_rc, VGPR_32,
+ !if(enableDisasm, "AMDGPU", "")>;
+ }
}
let VAddrDwords = 2 in {
- if op.HAS_BASE then {
+ if op.HAS_GFX10M then {
def _V2 : MIMG_NoSampler_Helper <op, asm, dst_rc, VReg_64>;
if !not(ExtendedImageInst) then
def _V2_gfx90a : MIMG_NoSampler_Helper_gfx90a <op, asm, dst_rc, VReg_64>;
def _V2_gfx10 : MIMG_NoSampler_gfx10<op, asm, dst_rc, VReg_64>;
def _V2_nsa_gfx10 : MIMG_NoSampler_nsa_gfx10<op, asm, dst_rc, 2>;
}
+ if op.HAS_GFX11 then {
+ def _V2_gfx11 : MIMG_NoSampler_gfx11<op, asm, dst_rc, VReg_64>;
+ def _V2_nsa_gfx11 : MIMG_NoSampler_nsa_gfx11<op, asm, dst_rc, 2>;
+ }
}
let VAddrDwords = 3 in {
- if op.HAS_BASE then {
+ if op.HAS_GFX10M then {
def _V3 : MIMG_NoSampler_Helper <op, asm, dst_rc, VReg_96>;
if !not(ExtendedImageInst) then
def _V3_gfx90a : MIMG_NoSampler_Helper_gfx90a <op, asm, dst_rc, VReg_96>;
def _V3_gfx10 : MIMG_NoSampler_gfx10<op, asm, dst_rc, VReg_96>;
def _V3_nsa_gfx10 : MIMG_NoSampler_nsa_gfx10<op, asm, dst_rc, 3>;
}
+ if op.HAS_GFX11 then {
+ def _V3_gfx11 : MIMG_NoSampler_gfx11<op, asm, dst_rc, VReg_96>;
+ def _V3_nsa_gfx11 : MIMG_NoSampler_nsa_gfx11<op, asm, dst_rc, 3>;
+ }
}
let VAddrDwords = 4 in {
- if op.HAS_BASE then {
+ if op.HAS_GFX10M then {
def _V4 : MIMG_NoSampler_Helper <op, asm, dst_rc, VReg_128>;
if !not(ExtendedImageInst) then
def _V4_gfx90a : MIMG_NoSampler_Helper_gfx90a <op, asm, dst_rc, VReg_128>;
@@ -387,6 +473,11 @@ multiclass MIMG_NoSampler_Src_Helper <mimgopc op, string asm,
def _V4_nsa_gfx10 : MIMG_NoSampler_nsa_gfx10<op, asm, dst_rc, 4,
!if(enableDisasm, "AMDGPU", "")>;
}
+ if op.HAS_GFX11 then {
+ def _V4_gfx11 : MIMG_NoSampler_gfx11<op, asm, dst_rc, VReg_128>;
+ def _V4_nsa_gfx11 : MIMG_NoSampler_nsa_gfx11<op, asm, dst_rc, 4,
+ !if(enableDisasm, "AMDGPU", "")>;
+ }
}
}
}
@@ -420,7 +511,7 @@ class MIMG_Store_Helper <mimgopc op, string asm,
RegisterClass data_rc,
RegisterClass addr_rc,
string dns = "">
- : MIMG_gfx6789<op.BASE, (outs), dns> {
+ : MIMG_gfx6789<op.GFX10M, (outs), dns> {
let InOperandList = !con((ins data_rc:$vdata, addr_rc:$vaddr, SReg_256:$srsrc,
DMask:$dmask, UNorm:$unorm, CPol:$cpol,
R128A16:$r128, TFE:$tfe, LWE:$lwe, DA:$da),
@@ -433,7 +524,7 @@ class MIMG_Store_Helper_gfx90a <mimgopc op, string asm,
RegisterClass data_rc,
RegisterClass addr_rc,
string dns = "">
- : MIMG_gfx90a<op.BASE, (outs), dns> {
+ : MIMG_gfx90a<op.GFX10M, (outs), dns> {
let InOperandList = !con((ins getLdStRegisterOperand<data_rc>.ret:$vdata,
addr_rc:$vaddr, SReg_256:$srsrc,
DMask:$dmask, UNorm:$unorm, CPol:$cpol,
@@ -446,7 +537,7 @@ class MIMG_Store_Helper_gfx90a <mimgopc op, string asm,
class MIMG_Store_gfx10<mimgopc op, string opcode,
RegisterClass DataRC, RegisterClass AddrRC,
string dns="">
- : MIMG_gfx10<op.BASE, (outs), dns> {
+ : MIMG_gfx10<op.GFX10M, (outs), dns> {
let InOperandList = !con((ins DataRC:$vdata, AddrRC:$vaddr0, SReg_256:$srsrc,
DMask:$dmask, Dim:$dim, UNorm:$unorm, CPol:$cpol,
R128A16:$r128, GFX10A16:$a16, TFE:$tfe, LWE:$lwe),
@@ -458,7 +549,33 @@ class MIMG_Store_gfx10<mimgopc op, string opcode,
class MIMG_Store_nsa_gfx10<mimgopc op, string opcode,
RegisterClass DataRC, int num_addrs,
string dns="">
- : MIMG_nsa_gfx10<op.BASE, (outs), num_addrs, dns> {
+ : MIMG_nsa_gfx10<op.GFX10M, (outs), num_addrs, dns> {
+ let InOperandList = !con((ins DataRC:$vdata),
+ AddrIns,
+ (ins SReg_256:$srsrc, DMask:$dmask,
+ Dim:$dim, UNorm:$unorm, CPol:$cpol,
+ R128A16:$r128, GFX10A16:$a16, TFE:$tfe, LWE:$lwe),
+ !if(BaseOpcode.HasD16, (ins D16:$d16), (ins)));
+ let AsmString = opcode#" $vdata, "#AddrAsm#", $srsrc$dmask$dim$unorm$cpol$r128$a16$tfe$lwe"
+ #!if(BaseOpcode.HasD16, "$d16", "");
+}
+
+class MIMG_Store_gfx11<mimgopc op, string opcode,
+ RegisterClass DataRC, RegisterClass AddrRC,
+ string dns="">
+ : MIMG_gfx11<op.GFX11, (outs), dns> {
+ let InOperandList = !con((ins DataRC:$vdata, AddrRC:$vaddr0, SReg_256:$srsrc,
+ DMask:$dmask, Dim:$dim, UNorm:$unorm, CPol:$cpol,
+ R128A16:$r128, GFX10A16:$a16, TFE:$tfe, LWE:$lwe),
+ !if(BaseOpcode.HasD16, (ins D16:$d16), (ins)));
+ let AsmString = opcode#" $vdata, $vaddr0, $srsrc$dmask$dim$unorm$cpol$r128$a16$tfe$lwe"
+ #!if(BaseOpcode.HasD16, "$d16", "");
+}
+
+class MIMG_Store_nsa_gfx11<mimgopc op, string opcode,
+ RegisterClass DataRC, int num_addrs,
+ string dns="">
+ : MIMG_nsa_gfx11<op.GFX11, (outs), num_addrs, dns> {
let InOperandList = !con((ins DataRC:$vdata),
AddrIns,
(ins SReg_256:$srsrc, DMask:$dmask,
@@ -475,39 +592,57 @@ multiclass MIMG_Store_Addr_Helper <mimgopc op, string asm,
let mayLoad = 0, mayStore = 1, hasSideEffects = 0, hasPostISelHook = 0,
DisableWQM = 1, ssamp = 0 in {
let VAddrDwords = 1 in {
- if op.HAS_BASE then {
+ if op.HAS_GFX10M then {
def _V1 : MIMG_Store_Helper <op, asm, data_rc, VGPR_32,
!if(enableDisasm, "AMDGPU", "")>;
+ let hasPostISelHook = 1 in
def _V1_gfx90a : MIMG_Store_Helper_gfx90a <op, asm, data_rc, VGPR_32,
!if(enableDisasm, "GFX90A", "")>;
def _V1_gfx10 : MIMG_Store_gfx10 <op, asm, data_rc, VGPR_32,
!if(enableDisasm, "AMDGPU", "")>;
}
+ if op.HAS_GFX11 then {
+ def _V1_gfx11 : MIMG_Store_gfx11 <op, asm, data_rc, VGPR_32,
+ !if(enableDisasm, "AMDGPU", "")>;
+ }
}
let VAddrDwords = 2 in {
- if op.HAS_BASE then {
+ if op.HAS_GFX10M then {
def _V2 : MIMG_Store_Helper <op, asm, data_rc, VReg_64>;
def _V2_gfx90a : MIMG_Store_Helper_gfx90a <op, asm, data_rc, VReg_64>;
def _V2_gfx10 : MIMG_Store_gfx10 <op, asm, data_rc, VReg_64>;
def _V2_nsa_gfx10 : MIMG_Store_nsa_gfx10 <op, asm, data_rc, 2>;
}
+ if op.HAS_GFX11 then {
+ def _V2_gfx11 : MIMG_Store_gfx11 <op, asm, data_rc, VReg_64>;
+ def _V2_nsa_gfx11 : MIMG_Store_nsa_gfx11 <op, asm, data_rc, 2>;
+ }
}
let VAddrDwords = 3 in {
- if op.HAS_BASE then {
+ if op.HAS_GFX10M then {
def _V3 : MIMG_Store_Helper <op, asm, data_rc, VReg_96>;
def _V3_gfx90a : MIMG_Store_Helper_gfx90a <op, asm, data_rc, VReg_96>;
def _V3_gfx10 : MIMG_Store_gfx10 <op, asm, data_rc, VReg_96>;
def _V3_nsa_gfx10 : MIMG_Store_nsa_gfx10 <op, asm, data_rc, 3>;
}
+ if op.HAS_GFX11 then {
+ def _V3_gfx11 : MIMG_Store_gfx11 <op, asm, data_rc, VReg_96>;
+ def _V3_nsa_gfx11 : MIMG_Store_nsa_gfx11 <op, asm, data_rc, 3>;
+ }
}
let VAddrDwords = 4 in {
- if op.HAS_BASE then {
+ if op.HAS_GFX10M then {
def _V4 : MIMG_Store_Helper <op, asm, data_rc, VReg_128>;
def _V4_gfx90a : MIMG_Store_Helper_gfx90a <op, asm, data_rc, VReg_128>;
def _V4_gfx10 : MIMG_Store_gfx10 <op, asm, data_rc, VReg_128>;
def _V4_nsa_gfx10 : MIMG_Store_nsa_gfx10 <op, asm, data_rc, 4,
!if(enableDisasm, "AMDGPU", "")>;
}
+ if op.HAS_GFX11 then {
+ def _V4_gfx11 : MIMG_Store_gfx11 <op, asm, data_rc, VReg_128>;
+ def _V4_nsa_gfx11 : MIMG_Store_nsa_gfx11 <op, asm, data_rc, 4,
+ !if(enableDisasm, "AMDGPU", "")>;
+ }
}
}
}
@@ -582,7 +717,7 @@ class MIMG_Atomic_gfx90a<mimgopc op, string asm, RegisterClass data_rc,
class MIMG_Atomic_gfx10<mimgopc op, string opcode,
RegisterClass DataRC, RegisterClass AddrRC,
bit enableDisasm = 0>
- : MIMG_gfx10<!cast<int>(op.BASE), (outs DataRC:$vdst),
+ : MIMG_gfx10<!cast<int>(op.GFX10M), (outs DataRC:$vdst),
!if(enableDisasm, "AMDGPU", "")> {
let Constraints = "$vdst = $vdata";
let AsmMatchConverter = "cvtMIMGAtomic";
@@ -596,7 +731,37 @@ class MIMG_Atomic_gfx10<mimgopc op, string opcode,
class MIMG_Atomic_nsa_gfx10<mimgopc op, string opcode,
RegisterClass DataRC, int num_addrs,
bit enableDisasm = 0>
- : MIMG_nsa_gfx10<!cast<int>(op.BASE), (outs DataRC:$vdst), num_addrs,
+ : MIMG_nsa_gfx10<!cast<int>(op.GFX10M), (outs DataRC:$vdst), num_addrs,
+ !if(enableDisasm, "AMDGPU", "")> {
+ let Constraints = "$vdst = $vdata";
+ let AsmMatchConverter = "cvtMIMGAtomic";
+
+ let InOperandList = !con((ins DataRC:$vdata),
+ AddrIns,
+ (ins SReg_256:$srsrc, DMask:$dmask,
+ Dim:$dim, UNorm:$unorm, CPol:$cpol,
+ R128A16:$r128, GFX10A16:$a16, TFE:$tfe, LWE:$lwe));
+ let AsmString = opcode#" $vdata, "#AddrAsm#", $srsrc$dmask$dim$unorm$cpol$r128$a16$tfe$lwe";
+}
+
+class MIMG_Atomic_gfx11<mimgopc op, string opcode,
+ RegisterClass DataRC, RegisterClass AddrRC,
+ bit enableDisasm = 0>
+ : MIMG_gfx11<!cast<int>(op.GFX11), (outs DataRC:$vdst),
+ !if(enableDisasm, "AMDGPU", "")> {
+ let Constraints = "$vdst = $vdata";
+ let AsmMatchConverter = "cvtMIMGAtomic";
+
+ let InOperandList = (ins DataRC:$vdata, AddrRC:$vaddr0, SReg_256:$srsrc,
+ DMask:$dmask, Dim:$dim, UNorm:$unorm, CPol:$cpol,
+ R128A16:$r128, GFX10A16:$a16, TFE:$tfe, LWE:$lwe);
+ let AsmString = opcode#" $vdst, $vaddr0, $srsrc$dmask$dim$unorm$cpol$r128$a16$tfe$lwe";
+}
+
+class MIMG_Atomic_nsa_gfx11<mimgopc op, string opcode,
+ RegisterClass DataRC, int num_addrs,
+ bit enableDisasm = 0>
+ : MIMG_nsa_gfx11<!cast<int>(op.GFX11), (outs DataRC:$vdst), num_addrs,
!if(enableDisasm, "AMDGPU", "")> {
let Constraints = "$vdst = $vdata";
let AsmMatchConverter = "cvtMIMGAtomic";
@@ -622,11 +787,15 @@ multiclass MIMG_Atomic_Addr_Helper_m <mimgopc op, string asm,
}
if op.HAS_VI then {
def _V1_vi : MIMG_Atomic_vi <op, asm, data_rc, VGPR_32, enableDasm>;
+ let hasPostISelHook = 1 in
def _V1_gfx90a : MIMG_Atomic_gfx90a <op, asm, data_rc, VGPR_32, enableDasm>;
}
- if op.HAS_BASE then {
+ if op.HAS_GFX10M then {
def _V1_gfx10 : MIMG_Atomic_gfx10 <op, asm, data_rc, VGPR_32, enableDasm>;
}
+ if op.HAS_GFX11 then {
+ def _V1_gfx11 : MIMG_Atomic_gfx11 <op, asm, data_rc, VGPR_32, enableDasm>;
+ }
}
let VAddrDwords = 2 in {
if op.HAS_SI then {
@@ -636,10 +805,14 @@ multiclass MIMG_Atomic_Addr_Helper_m <mimgopc op, string asm,
def _V2_vi : MIMG_Atomic_vi <op, asm, data_rc, VReg_64, 0>;
def _V2_gfx90a : MIMG_Atomic_gfx90a <op, asm, data_rc, VReg_64, 0>;
}
- if op.HAS_BASE then {
+ if op.HAS_GFX10M then {
def _V2_gfx10 : MIMG_Atomic_gfx10 <op, asm, data_rc, VReg_64, 0>;
def _V2_nsa_gfx10 : MIMG_Atomic_nsa_gfx10 <op, asm, data_rc, 2, 0>;
}
+ if op.HAS_GFX11 then {
+ def _V2_gfx11 : MIMG_Atomic_gfx11 <op, asm, data_rc, VReg_64, 0>;
+ def _V2_nsa_gfx11 : MIMG_Atomic_nsa_gfx11 <op, asm, data_rc, 2, 0>;
+ }
}
let VAddrDwords = 3 in {
if op.HAS_SI then {
@@ -649,10 +822,14 @@ multiclass MIMG_Atomic_Addr_Helper_m <mimgopc op, string asm,
def _V3_vi : MIMG_Atomic_vi <op, asm, data_rc, VReg_96, 0>;
def _V3_gfx90a : MIMG_Atomic_gfx90a <op, asm, data_rc, VReg_96, 0>;
}
- if op.HAS_BASE then {
+ if op.HAS_GFX10M then {
def _V3_gfx10 : MIMG_Atomic_gfx10 <op, asm, data_rc, VReg_96, 0>;
def _V3_nsa_gfx10 : MIMG_Atomic_nsa_gfx10 <op, asm, data_rc, 3, 0>;
}
+ if op.HAS_GFX11 then {
+ def _V3_gfx11 : MIMG_Atomic_gfx11 <op, asm, data_rc, VReg_96, 0>;
+ def _V3_nsa_gfx11 : MIMG_Atomic_nsa_gfx11 <op, asm, data_rc, 3, 0>;
+ }
}
let VAddrDwords = 4 in {
if op.HAS_SI then {
@@ -662,10 +839,14 @@ multiclass MIMG_Atomic_Addr_Helper_m <mimgopc op, string asm,
def _V4_vi : MIMG_Atomic_vi <op, asm, data_rc, VReg_128, 0>;
def _V4_gfx90a : MIMG_Atomic_gfx90a <op, asm, data_rc, VReg_128, 0>;
}
- if op.HAS_BASE then {
+ if op.HAS_GFX10M then {
def _V4_gfx10 : MIMG_Atomic_gfx10 <op, asm, data_rc, VReg_128, 0>;
def _V4_nsa_gfx10 : MIMG_Atomic_nsa_gfx10 <op, asm, data_rc, 4, enableDasm>;
}
+ if op.HAS_GFX11 then {
+ def _V4_gfx11 : MIMG_Atomic_gfx11 <op, asm, data_rc, VReg_128, 0>;
+ def _V4_nsa_gfx11 : MIMG_Atomic_nsa_gfx11 <op, asm, data_rc, 4, enableDasm>;
+ }
}
}
}
@@ -691,7 +872,7 @@ multiclass MIMG_Atomic <mimgopc op, string asm, bit isCmpSwap = 0, bit isFP = 0>
class MIMG_Sampler_Helper <mimgopc op, string asm, RegisterClass dst_rc,
RegisterClass src_rc, string dns="">
- : MIMG_gfx6789 <op.BASE, (outs dst_rc:$vdata), dns> {
+ : MIMG_gfx6789 <op.GFX10M, (outs dst_rc:$vdata), dns> {
let InOperandList = !con((ins src_rc:$vaddr, SReg_256:$srsrc, SReg_128:$ssamp,
DMask:$dmask, UNorm:$unorm, CPol:$cpol,
R128A16:$r128, TFE:$tfe, LWE:$lwe, DA:$da),
@@ -702,7 +883,7 @@ class MIMG_Sampler_Helper <mimgopc op, string asm, RegisterClass dst_rc,
class MIMG_Sampler_gfx90a<mimgopc op, string asm, RegisterClass dst_rc,
RegisterClass src_rc, string dns="">
- : MIMG_gfx90a<op.BASE, (outs getLdStRegisterOperand<dst_rc>.ret:$vdata), dns> {
+ : MIMG_gfx90a<op.GFX10M, (outs getLdStRegisterOperand<dst_rc>.ret:$vdata), dns> {
let InOperandList = !con((ins src_rc:$vaddr, SReg_256:$srsrc, SReg_128:$ssamp,
DMask:$dmask, UNorm:$unorm, CPol:$cpol,
R128A16:$r128, LWE:$lwe, DA:$da),
@@ -714,7 +895,7 @@ class MIMG_Sampler_gfx90a<mimgopc op, string asm, RegisterClass dst_rc,
class MIMG_Sampler_gfx10<mimgopc op, string opcode,
RegisterClass DataRC, RegisterClass AddrRC,
string dns="">
- : MIMG_gfx10<op.BASE, (outs DataRC:$vdata), dns> {
+ : MIMG_gfx10<op.GFX10M, (outs DataRC:$vdata), dns> {
let InOperandList = !con((ins AddrRC:$vaddr0, SReg_256:$srsrc, SReg_128:$ssamp,
DMask:$dmask, Dim:$dim, UNorm:$unorm, CPol:$cpol,
R128A16:$r128, GFX10A16:$a16, TFE:$tfe, LWE:$lwe),
@@ -727,7 +908,34 @@ class MIMG_Sampler_gfx10<mimgopc op, string opcode,
class MIMG_Sampler_nsa_gfx10<mimgopc op, string opcode,
RegisterClass DataRC, int num_addrs,
string dns="">
- : MIMG_nsa_gfx10<op.BASE, (outs DataRC:$vdata), num_addrs, dns> {
+ : MIMG_nsa_gfx10<op.GFX10M, (outs DataRC:$vdata), num_addrs, dns> {
+ let InOperandList = !con(AddrIns,
+ (ins SReg_256:$srsrc, SReg_128:$ssamp, DMask:$dmask,
+ Dim:$dim, UNorm:$unorm, CPol:$cpol,
+ R128A16:$r128, GFX10A16:$a16, TFE:$tfe, LWE:$lwe),
+ !if(BaseOpcode.HasD16, (ins D16:$d16), (ins)));
+ let AsmString = opcode#" $vdata, "#AddrAsm#", $srsrc, $ssamp$dmask$dim$unorm"
+ #"$cpol$r128$a16$tfe$lwe"
+ #!if(BaseOpcode.HasD16, "$d16", "");
+}
+
+class MIMG_Sampler_gfx11<mimgopc op, string opcode,
+ RegisterClass DataRC, RegisterClass AddrRC,
+ string dns="">
+ : MIMG_gfx11<op.GFX11, (outs DataRC:$vdata), dns> {
+ let InOperandList = !con((ins AddrRC:$vaddr0, SReg_256:$srsrc, SReg_128:$ssamp,
+ DMask:$dmask, Dim:$dim, UNorm:$unorm, CPol:$cpol,
+ R128A16:$r128, GFX10A16:$a16, TFE:$tfe, LWE:$lwe),
+ !if(BaseOpcode.HasD16, (ins D16:$d16), (ins)));
+ let AsmString = opcode#" $vdata, $vaddr0, $srsrc, $ssamp$dmask$dim$unorm"
+ #"$cpol$r128$a16$tfe$lwe"
+ #!if(BaseOpcode.HasD16, "$d16", "");
+}
+
+class MIMG_Sampler_nsa_gfx11<mimgopc op, string opcode,
+ RegisterClass DataRC, int num_addrs,
+ string dns="">
+ : MIMG_nsa_gfx11<op.GFX11, (outs DataRC:$vdata), num_addrs, dns> {
let InOperandList = !con(AddrIns,
(ins SReg_256:$srsrc, SReg_128:$ssamp, DMask:$dmask,
Dim:$dim, UNorm:$unorm, CPol:$cpol,
@@ -823,7 +1031,7 @@ multiclass MIMG_Sampler_Src_Helper <mimgopc op, string asm,
bit ExtendedImageInst = 1> {
foreach addr = MIMG_Sampler_AddrSizes<sample>.MachineInstrs in {
let VAddrDwords = addr.NumWords in {
- if op.HAS_BASE then {
+ if op.HAS_GFX10M then {
def _V # addr.NumWords
: MIMG_Sampler_Helper <op, asm, dst_rc, addr.RegClass,
!if(!and(enableDisasm, addr.Disassemble), "AMDGPU", "")>;
@@ -835,16 +1043,26 @@ multiclass MIMG_Sampler_Src_Helper <mimgopc op, string asm,
: MIMG_Sampler_gfx10 <op, asm, dst_rc, addr.RegClass,
!if(!and(enableDisasm, addr.Disassemble), "AMDGPU", "")>;
}
+ if op.HAS_GFX11 then {
+ def _V # addr.NumWords # _gfx11
+ : MIMG_Sampler_gfx11 <op, asm, dst_rc, addr.RegClass,
+ !if(!and(enableDisasm, addr.Disassemble), "AMDGPU", "")>;
+ }
}
}
foreach addr = MIMG_Sampler_AddrSizes<sample>.NSAInstrs in {
let VAddrDwords = addr.NumWords in {
- if op.HAS_BASE then {
+ if op.HAS_GFX10M then {
def _V # addr.NumWords # _nsa_gfx10
: MIMG_Sampler_nsa_gfx10<op, asm, dst_rc, addr.NumWords,
!if(!and(enableDisasm, addr.Disassemble), "AMDGPU", "")>;
}
+ if !and(op.HAS_GFX11, !le(addr.NumWords, 5)) then {
+ def _V # addr.NumWords # _nsa_gfx11
+ : MIMG_Sampler_nsa_gfx11<op, asm, dst_rc, addr.NumWords,
+ !if(!and(enableDisasm, addr.Disassemble), "AMDGPU", "")>;
+ }
}
}
}
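
In the helper multiclass above, HAS_GFX10M gates the pre-GFX11 encodings (including the gfx10 NSA forms) while HAS_GFX11 gates the new gfx11 encodings, and gfx11 NSA variants are only emitted for address sizes of at most five dwords. A minimal C++ sketch of that selection logic follows; the function and names are made up for illustration and are not part of the patch or of the generated TableGen output.

#include <string>
#include <vector>

// Illustrative only: which encoding variants get instantiated for a sampler
// opcode at a given address size, following the HAS_GFX10M / HAS_GFX11
// gating in MIMG_Sampler_Src_Helper above.
static std::vector<std::string> samplerVariants(bool HasGfx10M, bool HasGfx11,
                                                int AddrDwords) {
  std::vector<std::string> Defs;
  std::string V = "_V" + std::to_string(AddrDwords);
  if (HasGfx10M) {
    Defs.push_back(V);                 // gfx6-gfx9 encoding
    Defs.push_back(V + "_gfx10");
    Defs.push_back(V + "_nsa_gfx10");
  }
  if (HasGfx11) {
    Defs.push_back(V + "_gfx11");
    if (AddrDwords <= 5)               // gfx11 NSA caps out at 5 address dwords
      Defs.push_back(V + "_nsa_gfx11");
  }
  return Defs;
}

int main() {
  // At 6 address dwords only the non-NSA gfx11 form is added for gfx11.
  return samplerVariants(true, true, 6).size() == 4 ? 0 : 1;
}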
@@ -911,10 +1129,17 @@ class MIMG_IntersectRay_Helper<bit Is64, bit A16> {
// when we only need 9, 11 or 12 depending on A16 field and ptr size.
RegisterClass RegClass = MIMGAddrSize<num_addrs, 0>.RegClass;
int VAddrDwords = !srl(RegClass.Size, 5);
+
+ int gfx11_nsa_addrs = !if(A16, 4, 5);
+ RegisterClass node_ptr_type = !if(Is64, VReg_64, VGPR_32);
+ list<RegisterClass> gfx11_addr_types =
+ !if(A16,
+ [node_ptr_type, VGPR_32, VReg_96, VReg_96],
+ [node_ptr_type, VGPR_32, VReg_96, VReg_96, VReg_96]);
}
class MIMG_IntersectRay_gfx10<mimgopc op, string opcode, RegisterClass AddrRC, bit A16>
- : MIMG_gfx10<op.BASE, (outs VReg_128:$vdata), "AMDGPU"> {
+ : MIMG_gfx10<op.GFX10M, (outs VReg_128:$vdata), "AMDGPU"> {
let InOperandList = !con((ins AddrRC:$vaddr0, SReg_128:$srsrc),
!if(A16, (ins GFX10A16:$a16), (ins)));
@@ -924,7 +1149,27 @@ class MIMG_IntersectRay_gfx10<mimgopc op, string opcode, RegisterClass AddrRC, b
}
class MIMG_IntersectRay_nsa_gfx10<mimgopc op, string opcode, int num_addrs, bit A16>
- : MIMG_nsa_gfx10<op.BASE, (outs VReg_128:$vdata), num_addrs, "AMDGPU"> {
+ : MIMG_nsa_gfx10<op.GFX10M, (outs VReg_128:$vdata), num_addrs, "AMDGPU"> {
+ let InOperandList = !con(nsah.AddrIns,
+ (ins SReg_128:$srsrc),
+ !if(A16, (ins GFX10A16:$a16), (ins)));
+ let AsmString = opcode#" $vdata, "#nsah.AddrAsm#", $srsrc"#!if(A16, "$a16", "");
+}
+
+class MIMG_IntersectRay_gfx11<mimgopc op, string opcode, RegisterClass AddrRC, bit A16>
+ : MIMG_gfx11<op.GFX11, (outs VReg_128:$vdata), "AMDGPU"> {
+
+ let InOperandList = !con((ins AddrRC:$vaddr0, SReg_128:$srsrc),
+ !if(A16, (ins GFX10A16:$a16), (ins)));
+ let AsmString = opcode#" $vdata, $vaddr0, $srsrc"#!if(A16, "$a16", "");
+
+ let nsa = 0;
+}
+
+class MIMG_IntersectRay_nsa_gfx11<mimgopc op, string opcode, int num_addrs,
+ bit A16, list<RegisterClass> addr_types>
+ : MIMG_nsa_gfx11<op.GFX11, (outs VReg_128:$vdata), num_addrs, "AMDGPU",
+ addr_types> {
let InOperandList = !con(nsah.AddrIns,
(ins SReg_128:$srsrc),
!if(A16, (ins GFX10A16:$a16), (ins)));
@@ -936,9 +1181,7 @@ multiclass MIMG_IntersectRay<mimgopc op, string opcode, bit Is64, bit A16> {
def "" : MIMGBaseOpcode {
let BVH = 1;
}
- let SubtargetPredicate = HasGFX10_AEncoding,
- AssemblerPredicate = HasGFX10_AEncoding,
- AsmMatchConverter = !if(A16, "cvtIntersectRay", ""),
+ let AsmMatchConverter = !if(A16, "cvtIntersectRay", ""),
dmask = 0xf,
unorm = 1,
d16 = 0,
@@ -955,142 +1198,183 @@ multiclass MIMG_IntersectRay<mimgopc op, string opcode, bit Is64, bit A16> {
def _sa_gfx10 : MIMG_IntersectRay_gfx10<op, opcode, info.RegClass, A16> {
let VAddrDwords = info.VAddrDwords;
}
+ def _sa_gfx11 : MIMG_IntersectRay_gfx11<op, opcode, info.RegClass, A16> {
+ let VAddrDwords = info.VAddrDwords;
+ }
def _nsa_gfx10 : MIMG_IntersectRay_nsa_gfx10<op, opcode, info.num_addrs, A16> {
let VAddrDwords = info.num_addrs;
}
+ def _nsa_gfx11 : MIMG_IntersectRay_nsa_gfx11<op, opcode,
+ info.gfx11_nsa_addrs, A16,
+ info.gfx11_addr_types> {
+ let VAddrDwords = info.num_addrs;
+ }
+ }
+}
+
+multiclass MIMG_MSAA_Load <mimgopc op, string asm> {
+ def "" : MIMGBaseOpcode {
+ let HasD16 = 1;
+ let Gather4 = 1; /* for appropriate dmask handling */
+ let MSAA = 1;
+ }
+
+ let BaseOpcode = !cast<MIMGBaseOpcode>(NAME),
+ Gather4 = 1, hasPostISelHook = 0, mayLoad = 1 in {
+ let VDataDwords = 2 in
+ defm _V2 : MIMG_NoSampler_Src_Helper<op, asm, VReg_64, 0>; /* packed D16 */
+ let VDataDwords = 3 in
+ defm _V3 : MIMG_NoSampler_Src_Helper<op, asm, VReg_96, 0>; /* packed D16 + tfe */
+ let VDataDwords = 4 in
+ defm _V4 : MIMG_NoSampler_Src_Helper<op, asm, VReg_128, 1>;
+ let VDataDwords = 5 in
+ defm _V5 : MIMG_NoSampler_Src_Helper<op, asm, VReg_160, 0>;
}
}
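
The _V2.._V5 variants in MIMG_MSAA_Load above reflect how many VDATA dwords the result needs: with packed D16 two half-float channels share a dword, and TFE appends one extra dword, per the inline comments. A small hedged C++ sketch of that arithmetic (illustrative, not part of the TableGen):

#include <cassert>

// Illustrative only: rough count of VDATA dwords an image load result needs,
// matching the _V2.._V5 comments above (packed D16 stores two half-float
// channels per dword; TFE appends one extra dword for the status value).
static int vdataDwords(int numChannels, bool d16Packed, bool tfe) {
  int dwords = d16Packed ? (numChannels + 1) / 2 : numChannels;
  return dwords + (tfe ? 1 : 0);
}

int main() {
  assert(vdataDwords(4, true, false) == 2);  // _V2: packed D16
  assert(vdataDwords(4, true, true) == 3);   // _V3: packed D16 + tfe
  assert(vdataDwords(4, false, false) == 4); // _V4
  assert(vdataDwords(4, false, true) == 5);  // _V5
  return 0;
}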
//===----------------------------------------------------------------------===//
// MIMG Instructions
//===----------------------------------------------------------------------===//
-defm IMAGE_LOAD : MIMG_NoSampler <mimgopc<0x00>, "image_load", 1>;
-defm IMAGE_LOAD_MIP : MIMG_NoSampler <mimgopc<0x01>, "image_load_mip", 1, 1>;
-defm IMAGE_LOAD_PCK : MIMG_NoSampler <mimgopc<0x02>, "image_load_pck", 0>;
-defm IMAGE_LOAD_PCK_SGN : MIMG_NoSampler <mimgopc<0x03>, "image_load_pck_sgn", 0>;
-defm IMAGE_LOAD_MIP_PCK : MIMG_NoSampler <mimgopc<0x04>, "image_load_mip_pck", 0, 1>;
-defm IMAGE_LOAD_MIP_PCK_SGN : MIMG_NoSampler <mimgopc<0x05>, "image_load_mip_pck_sgn", 0, 1>;
-defm IMAGE_STORE : MIMG_Store <mimgopc<0x08>, "image_store", 1>;
-defm IMAGE_STORE_MIP : MIMG_Store <mimgopc<0x09>, "image_store_mip", 1, 1>;
-defm IMAGE_STORE_PCK : MIMG_Store <mimgopc<0x0a>, "image_store_pck", 0>;
-defm IMAGE_STORE_MIP_PCK : MIMG_Store <mimgopc<0x0b>, "image_store_mip_pck", 0, 1>;
+let OtherPredicates = [HasImageInsts] in {
-defm IMAGE_GET_RESINFO : MIMG_NoSampler <mimgopc<0x0e>, "image_get_resinfo", 0, 1, 1>;
+defm IMAGE_LOAD : MIMG_NoSampler <mimgopc<0x00, 0x00>, "image_load", 1>;
+defm IMAGE_LOAD_MIP : MIMG_NoSampler <mimgopc<0x01, 0x01>, "image_load_mip", 1, 1>;
+defm IMAGE_LOAD_PCK : MIMG_NoSampler <mimgopc<0x02, 0x02>, "image_load_pck", 0>;
+defm IMAGE_LOAD_PCK_SGN : MIMG_NoSampler <mimgopc<0x03, 0x03>, "image_load_pck_sgn", 0>;
+defm IMAGE_LOAD_MIP_PCK : MIMG_NoSampler <mimgopc<0x04, 0x04>, "image_load_mip_pck", 0, 1>;
+defm IMAGE_LOAD_MIP_PCK_SGN : MIMG_NoSampler <mimgopc<0x05, 0x05>, "image_load_mip_pck_sgn", 0, 1>;
+defm IMAGE_STORE : MIMG_Store <mimgopc<0x06, 0x08>, "image_store", 1>;
+defm IMAGE_STORE_MIP : MIMG_Store <mimgopc<0x07, 0x09>, "image_store_mip", 1, 1>;
+defm IMAGE_STORE_PCK : MIMG_Store <mimgopc<0x08, 0x0a>, "image_store_pck", 0>;
+defm IMAGE_STORE_MIP_PCK : MIMG_Store <mimgopc<0x09, 0x0b>, "image_store_mip_pck", 0, 1>;
-defm IMAGE_ATOMIC_SWAP : MIMG_Atomic <mimgopc<0x0f, 0x10, 0x0f>, "image_atomic_swap">;
-defm IMAGE_ATOMIC_CMPSWAP : MIMG_Atomic <mimgopc<0x10, 0x11, 0x10>, "image_atomic_cmpswap", 1>;
-defm IMAGE_ATOMIC_ADD : MIMG_Atomic <mimgopc<0x11, 0x12, 0x11>, "image_atomic_add">;
-defm IMAGE_ATOMIC_SUB : MIMG_Atomic <mimgopc<0x12, 0x13, 0x12>, "image_atomic_sub">;
-defm IMAGE_ATOMIC_RSUB : MIMG_Atomic <mimgopc<MIMG.NOP, MIMG.NOP, 0x13>, "image_atomic_rsub">;
-defm IMAGE_ATOMIC_SMIN : MIMG_Atomic <mimgopc<0x14>, "image_atomic_smin">;
-defm IMAGE_ATOMIC_UMIN : MIMG_Atomic <mimgopc<0x15>, "image_atomic_umin">;
-defm IMAGE_ATOMIC_SMAX : MIMG_Atomic <mimgopc<0x16>, "image_atomic_smax">;
-defm IMAGE_ATOMIC_UMAX : MIMG_Atomic <mimgopc<0x17>, "image_atomic_umax">;
-defm IMAGE_ATOMIC_AND : MIMG_Atomic <mimgopc<0x18>, "image_atomic_and">;
-defm IMAGE_ATOMIC_OR : MIMG_Atomic <mimgopc<0x19>, "image_atomic_or">;
-defm IMAGE_ATOMIC_XOR : MIMG_Atomic <mimgopc<0x1a>, "image_atomic_xor">;
-defm IMAGE_ATOMIC_INC : MIMG_Atomic <mimgopc<0x1b>, "image_atomic_inc">;
-defm IMAGE_ATOMIC_DEC : MIMG_Atomic <mimgopc<0x1c>, "image_atomic_dec">;
-defm IMAGE_ATOMIC_FCMPSWAP : MIMG_Atomic <mimgopc<0x1d, MIMG.NOP>, "image_atomic_fcmpswap", 1, 1>;
-defm IMAGE_ATOMIC_FMIN : MIMG_Atomic <mimgopc<0x1e, MIMG.NOP>, "image_atomic_fmin", 0, 1>;
-defm IMAGE_ATOMIC_FMAX : MIMG_Atomic <mimgopc<0x1f, MIMG.NOP>, "image_atomic_fmax", 0, 1>;
+defm IMAGE_GET_RESINFO : MIMG_NoSampler <mimgopc<0x17, 0x0e>, "image_get_resinfo", 0, 1, 1>;
-defm IMAGE_SAMPLE : MIMG_Sampler_WQM <mimgopc<0x20>, AMDGPUSample>;
+defm IMAGE_ATOMIC_SWAP : MIMG_Atomic <mimgopc<0x0a, 0x0f, 0x10, 0x0f>, "image_atomic_swap">;
+defm IMAGE_ATOMIC_CMPSWAP : MIMG_Atomic <mimgopc<0x0b, 0x10, 0x11, 0x10>, "image_atomic_cmpswap", 1>;
+defm IMAGE_ATOMIC_ADD : MIMG_Atomic <mimgopc<0x0c, 0x11, 0x12, 0x11>, "image_atomic_add">;
+defm IMAGE_ATOMIC_SUB : MIMG_Atomic <mimgopc<0x0d, 0x12, 0x13, 0x12>, "image_atomic_sub">;
+defm IMAGE_ATOMIC_RSUB : MIMG_Atomic <mimgopc<MIMG.NOP, MIMG.NOP, MIMG.NOP, 0x13>, "image_atomic_rsub">;
+defm IMAGE_ATOMIC_SMIN : MIMG_Atomic <mimgopc<0x0e, 0x14>, "image_atomic_smin">;
+defm IMAGE_ATOMIC_UMIN : MIMG_Atomic <mimgopc<0x0f, 0x15>, "image_atomic_umin">;
+defm IMAGE_ATOMIC_SMAX : MIMG_Atomic <mimgopc<0x10, 0x16>, "image_atomic_smax">;
+defm IMAGE_ATOMIC_UMAX : MIMG_Atomic <mimgopc<0x11, 0x17>, "image_atomic_umax">;
+defm IMAGE_ATOMIC_AND : MIMG_Atomic <mimgopc<0x12, 0x18>, "image_atomic_and">;
+defm IMAGE_ATOMIC_OR : MIMG_Atomic <mimgopc<0x13, 0x19>, "image_atomic_or">;
+defm IMAGE_ATOMIC_XOR : MIMG_Atomic <mimgopc<0x14, 0x1a>, "image_atomic_xor">;
+defm IMAGE_ATOMIC_INC : MIMG_Atomic <mimgopc<0x15, 0x1b>, "image_atomic_inc">;
+defm IMAGE_ATOMIC_DEC : MIMG_Atomic <mimgopc<0x16, 0x1c>, "image_atomic_dec">;
+defm IMAGE_ATOMIC_FCMPSWAP : MIMG_Atomic <mimgopc<MIMG.NOP, 0x1d, MIMG.NOP>, "image_atomic_fcmpswap", 1, 1>;
+defm IMAGE_ATOMIC_FMIN : MIMG_Atomic <mimgopc<MIMG.NOP, 0x1e, MIMG.NOP>, "image_atomic_fmin", 0, 1>;
+defm IMAGE_ATOMIC_FMAX : MIMG_Atomic <mimgopc<MIMG.NOP, 0x1f, MIMG.NOP>, "image_atomic_fmax", 0, 1>;
+
+defm IMAGE_SAMPLE : MIMG_Sampler_WQM <mimgopc<0x1b, 0x20>, AMDGPUSample>;
let OtherPredicates = [HasExtendedImageInsts] in {
-defm IMAGE_SAMPLE_CL : MIMG_Sampler_WQM <mimgopc<0x21>, AMDGPUSample_cl>;
-defm IMAGE_SAMPLE_D : MIMG_Sampler <mimgopc<0x22>, AMDGPUSample_d>;
-defm IMAGE_SAMPLE_D_CL : MIMG_Sampler <mimgopc<0x23>, AMDGPUSample_d_cl>;
-defm IMAGE_SAMPLE_D_G16 : MIMG_Sampler <mimgopc<0xa2>, AMDGPUSample_d, 0, 1>;
-defm IMAGE_SAMPLE_D_CL_G16 : MIMG_Sampler <mimgopc<0xa3>, AMDGPUSample_d_cl, 0, 1>;
-defm IMAGE_SAMPLE_L : MIMG_Sampler <mimgopc<0x24>, AMDGPUSample_l>;
-defm IMAGE_SAMPLE_B : MIMG_Sampler_WQM <mimgopc<0x25>, AMDGPUSample_b>;
-defm IMAGE_SAMPLE_B_CL : MIMG_Sampler_WQM <mimgopc<0x26>, AMDGPUSample_b_cl>;
-defm IMAGE_SAMPLE_LZ : MIMG_Sampler <mimgopc<0x27>, AMDGPUSample_lz>;
-defm IMAGE_SAMPLE_C : MIMG_Sampler_WQM <mimgopc<0x28>, AMDGPUSample_c>;
-defm IMAGE_SAMPLE_C_CL : MIMG_Sampler_WQM <mimgopc<0x29>, AMDGPUSample_c_cl>;
-defm IMAGE_SAMPLE_C_D : MIMG_Sampler <mimgopc<0x2a>, AMDGPUSample_c_d>;
-defm IMAGE_SAMPLE_C_D_CL : MIMG_Sampler <mimgopc<0x2b>, AMDGPUSample_c_d_cl>;
-defm IMAGE_SAMPLE_C_D_G16 : MIMG_Sampler <mimgopc<0xaa>, AMDGPUSample_c_d, 0, 1>;
-defm IMAGE_SAMPLE_C_D_CL_G16 : MIMG_Sampler <mimgopc<0xab>, AMDGPUSample_c_d_cl, 0, 1>;
-defm IMAGE_SAMPLE_C_L : MIMG_Sampler <mimgopc<0x2c>, AMDGPUSample_c_l>;
-defm IMAGE_SAMPLE_C_B : MIMG_Sampler_WQM <mimgopc<0x2d>, AMDGPUSample_c_b>;
-defm IMAGE_SAMPLE_C_B_CL : MIMG_Sampler_WQM <mimgopc<0x2e>, AMDGPUSample_c_b_cl>;
-defm IMAGE_SAMPLE_C_LZ : MIMG_Sampler <mimgopc<0x2f>, AMDGPUSample_c_lz>;
-defm IMAGE_SAMPLE_O : MIMG_Sampler_WQM <mimgopc<0x30>, AMDGPUSample_o>;
-defm IMAGE_SAMPLE_CL_O : MIMG_Sampler_WQM <mimgopc<0x31>, AMDGPUSample_cl_o>;
-defm IMAGE_SAMPLE_D_O : MIMG_Sampler <mimgopc<0x32>, AMDGPUSample_d_o>;
-defm IMAGE_SAMPLE_D_CL_O : MIMG_Sampler <mimgopc<0x33>, AMDGPUSample_d_cl_o>;
-defm IMAGE_SAMPLE_D_O_G16 : MIMG_Sampler <mimgopc<0xb2>, AMDGPUSample_d_o, 0, 1>;
-defm IMAGE_SAMPLE_D_CL_O_G16 : MIMG_Sampler <mimgopc<0xb3>, AMDGPUSample_d_cl_o, 0, 1>;
-defm IMAGE_SAMPLE_L_O : MIMG_Sampler <mimgopc<0x34>, AMDGPUSample_l_o>;
-defm IMAGE_SAMPLE_B_O : MIMG_Sampler_WQM <mimgopc<0x35>, AMDGPUSample_b_o>;
-defm IMAGE_SAMPLE_B_CL_O : MIMG_Sampler_WQM <mimgopc<0x36>, AMDGPUSample_b_cl_o>;
-defm IMAGE_SAMPLE_LZ_O : MIMG_Sampler <mimgopc<0x37>, AMDGPUSample_lz_o>;
-defm IMAGE_SAMPLE_C_O : MIMG_Sampler_WQM <mimgopc<0x38>, AMDGPUSample_c_o>;
-defm IMAGE_SAMPLE_C_CL_O : MIMG_Sampler_WQM <mimgopc<0x39>, AMDGPUSample_c_cl_o>;
-defm IMAGE_SAMPLE_C_D_O : MIMG_Sampler <mimgopc<0x3a>, AMDGPUSample_c_d_o>;
-defm IMAGE_SAMPLE_C_D_CL_O : MIMG_Sampler <mimgopc<0x3b>, AMDGPUSample_c_d_cl_o>;
-defm IMAGE_SAMPLE_C_D_O_G16 : MIMG_Sampler <mimgopc<0xba>, AMDGPUSample_c_d_o, 0, 1>;
-defm IMAGE_SAMPLE_C_D_CL_O_G16 : MIMG_Sampler <mimgopc<0xbb>, AMDGPUSample_c_d_cl_o, 0, 1>;
-defm IMAGE_SAMPLE_C_L_O : MIMG_Sampler <mimgopc<0x3c>, AMDGPUSample_c_l_o>;
-defm IMAGE_SAMPLE_C_B_CL_O : MIMG_Sampler_WQM <mimgopc<0x3e>, AMDGPUSample_c_b_cl_o>;
-defm IMAGE_SAMPLE_C_B_O : MIMG_Sampler_WQM <mimgopc<0x3d>, AMDGPUSample_c_b_o>;
-defm IMAGE_SAMPLE_C_LZ_O : MIMG_Sampler <mimgopc<0x3f>, AMDGPUSample_c_lz_o>;
-defm IMAGE_GATHER4 : MIMG_Gather_WQM <mimgopc<0x40>, AMDGPUSample>;
-defm IMAGE_GATHER4_CL : MIMG_Gather_WQM <mimgopc<0x41>, AMDGPUSample_cl>;
-defm IMAGE_GATHER4_L : MIMG_Gather <mimgopc<0x44>, AMDGPUSample_l>;
-defm IMAGE_GATHER4_B : MIMG_Gather_WQM <mimgopc<0x45>, AMDGPUSample_b>;
-defm IMAGE_GATHER4_B_CL : MIMG_Gather_WQM <mimgopc<0x46>, AMDGPUSample_b_cl>;
-defm IMAGE_GATHER4_LZ : MIMG_Gather <mimgopc<0x47>, AMDGPUSample_lz>;
-defm IMAGE_GATHER4_C : MIMG_Gather_WQM <mimgopc<0x48>, AMDGPUSample_c>;
-defm IMAGE_GATHER4_C_CL : MIMG_Gather_WQM <mimgopc<0x49>, AMDGPUSample_c_cl>;
-defm IMAGE_GATHER4_C_L : MIMG_Gather <mimgopc<0x4c>, AMDGPUSample_c_l>;
-defm IMAGE_GATHER4_C_B : MIMG_Gather_WQM <mimgopc<0x4d>, AMDGPUSample_c_b>;
-defm IMAGE_GATHER4_C_B_CL : MIMG_Gather_WQM <mimgopc<0x4e>, AMDGPUSample_c_b_cl>;
-defm IMAGE_GATHER4_C_LZ : MIMG_Gather <mimgopc<0x4f>, AMDGPUSample_c_lz>;
-defm IMAGE_GATHER4_O : MIMG_Gather_WQM <mimgopc<0x50>, AMDGPUSample_o>;
-defm IMAGE_GATHER4_CL_O : MIMG_Gather_WQM <mimgopc<0x51>, AMDGPUSample_cl_o>;
-defm IMAGE_GATHER4_L_O : MIMG_Gather <mimgopc<0x54>, AMDGPUSample_l_o>;
-defm IMAGE_GATHER4_B_O : MIMG_Gather_WQM <mimgopc<0x55>, AMDGPUSample_b_o>;
-defm IMAGE_GATHER4_B_CL_O : MIMG_Gather <mimgopc<0x56>, AMDGPUSample_b_cl_o>;
-defm IMAGE_GATHER4_LZ_O : MIMG_Gather <mimgopc<0x57>, AMDGPUSample_lz_o>;
-defm IMAGE_GATHER4_C_O : MIMG_Gather_WQM <mimgopc<0x58>, AMDGPUSample_c_o>;
-defm IMAGE_GATHER4_C_CL_O : MIMG_Gather_WQM <mimgopc<0x59>, AMDGPUSample_c_cl_o>;
-defm IMAGE_GATHER4_C_L_O : MIMG_Gather <mimgopc<0x5c>, AMDGPUSample_c_l_o>;
-defm IMAGE_GATHER4_C_B_O : MIMG_Gather_WQM <mimgopc<0x5d>, AMDGPUSample_c_b_o>;
-defm IMAGE_GATHER4_C_B_CL_O : MIMG_Gather_WQM <mimgopc<0x5e>, AMDGPUSample_c_b_cl_o>;
-defm IMAGE_GATHER4_C_LZ_O : MIMG_Gather <mimgopc<0x5f>, AMDGPUSample_c_lz_o>;
-//defm IMAGE_GATHER4H : MIMG_Gather_WQM <mimgopc<0x61>, ?>;
+defm IMAGE_SAMPLE_CL : MIMG_Sampler_WQM <mimgopc<0x40, 0x21>, AMDGPUSample_cl>;
+defm IMAGE_SAMPLE_D : MIMG_Sampler <mimgopc<0x1c, 0x22>, AMDGPUSample_d>;
+defm IMAGE_SAMPLE_D_CL : MIMG_Sampler <mimgopc<0x41, 0x23>, AMDGPUSample_d_cl>;
+defm IMAGE_SAMPLE_L : MIMG_Sampler <mimgopc<0x1d, 0x24>, AMDGPUSample_l>;
+defm IMAGE_SAMPLE_B : MIMG_Sampler_WQM <mimgopc<0x1e, 0x25>, AMDGPUSample_b>;
+defm IMAGE_SAMPLE_B_CL : MIMG_Sampler_WQM <mimgopc<0x42, 0x26>, AMDGPUSample_b_cl>;
+defm IMAGE_SAMPLE_LZ : MIMG_Sampler <mimgopc<0x1f, 0x27>, AMDGPUSample_lz>;
+defm IMAGE_SAMPLE_C : MIMG_Sampler_WQM <mimgopc<0x20, 0x28>, AMDGPUSample_c>;
+defm IMAGE_SAMPLE_C_CL : MIMG_Sampler_WQM <mimgopc<0x43, 0x29>, AMDGPUSample_c_cl>;
+defm IMAGE_SAMPLE_C_D : MIMG_Sampler <mimgopc<0x21, 0x2a>, AMDGPUSample_c_d>;
+defm IMAGE_SAMPLE_C_D_CL : MIMG_Sampler <mimgopc<0x44, 0x2b>, AMDGPUSample_c_d_cl>;
+defm IMAGE_SAMPLE_C_L : MIMG_Sampler <mimgopc<0x22, 0x2c>, AMDGPUSample_c_l>;
+defm IMAGE_SAMPLE_C_B : MIMG_Sampler_WQM <mimgopc<0x23, 0x2d>, AMDGPUSample_c_b>;
+defm IMAGE_SAMPLE_C_B_CL : MIMG_Sampler_WQM <mimgopc<0x45, 0x2e>, AMDGPUSample_c_b_cl>;
+defm IMAGE_SAMPLE_C_LZ : MIMG_Sampler <mimgopc<0x24, 0x2f>, AMDGPUSample_c_lz>;
+defm IMAGE_SAMPLE_O : MIMG_Sampler_WQM <mimgopc<0x25, 0x30>, AMDGPUSample_o>;
+defm IMAGE_SAMPLE_CL_O : MIMG_Sampler_WQM <mimgopc<0x46, 0x31>, AMDGPUSample_cl_o>;
+defm IMAGE_SAMPLE_D_O : MIMG_Sampler <mimgopc<0x26, 0x32>, AMDGPUSample_d_o>;
+defm IMAGE_SAMPLE_D_CL_O : MIMG_Sampler <mimgopc<0x47, 0x33>, AMDGPUSample_d_cl_o>;
+defm IMAGE_SAMPLE_L_O : MIMG_Sampler <mimgopc<0x27, 0x34>, AMDGPUSample_l_o>;
+defm IMAGE_SAMPLE_B_O : MIMG_Sampler_WQM <mimgopc<0x28, 0x35>, AMDGPUSample_b_o>;
+defm IMAGE_SAMPLE_B_CL_O : MIMG_Sampler_WQM <mimgopc<0x48, 0x36>, AMDGPUSample_b_cl_o>;
+defm IMAGE_SAMPLE_LZ_O : MIMG_Sampler <mimgopc<0x29, 0x37>, AMDGPUSample_lz_o>;
+defm IMAGE_SAMPLE_C_O : MIMG_Sampler_WQM <mimgopc<0x2a, 0x38>, AMDGPUSample_c_o>;
+defm IMAGE_SAMPLE_C_CL_O : MIMG_Sampler_WQM <mimgopc<0x49, 0x39>, AMDGPUSample_c_cl_o>;
+defm IMAGE_SAMPLE_C_D_O : MIMG_Sampler <mimgopc<0x2b, 0x3a>, AMDGPUSample_c_d_o>;
+defm IMAGE_SAMPLE_C_D_CL_O : MIMG_Sampler <mimgopc<0x4a, 0x3b>, AMDGPUSample_c_d_cl_o>;
+defm IMAGE_SAMPLE_C_L_O : MIMG_Sampler <mimgopc<0x2c, 0x3c>, AMDGPUSample_c_l_o>;
+defm IMAGE_SAMPLE_C_B_CL_O : MIMG_Sampler_WQM <mimgopc<0x4b, 0x3e>, AMDGPUSample_c_b_cl_o>;
+defm IMAGE_SAMPLE_C_B_O : MIMG_Sampler_WQM <mimgopc<0x2d, 0x3d>, AMDGPUSample_c_b_o>;
+defm IMAGE_SAMPLE_C_LZ_O : MIMG_Sampler <mimgopc<0x2e, 0x3f>, AMDGPUSample_c_lz_o>;
+defm IMAGE_GATHER4 : MIMG_Gather_WQM <mimgopc<0x2f, 0x40>, AMDGPUSample>;
+defm IMAGE_GATHER4_CL : MIMG_Gather_WQM <mimgopc<0x60, 0x41>, AMDGPUSample_cl>;
+defm IMAGE_GATHER4_L : MIMG_Gather <mimgopc<0x30, 0x44>, AMDGPUSample_l>;
+defm IMAGE_GATHER4_B : MIMG_Gather_WQM <mimgopc<0x31, 0x45>, AMDGPUSample_b>;
+defm IMAGE_GATHER4_B_CL : MIMG_Gather_WQM <mimgopc<0x61, 0x46>, AMDGPUSample_b_cl>;
+defm IMAGE_GATHER4_LZ : MIMG_Gather <mimgopc<0x32, 0x47>, AMDGPUSample_lz>;
+defm IMAGE_GATHER4_C : MIMG_Gather_WQM <mimgopc<0x33, 0x48>, AMDGPUSample_c>;
+defm IMAGE_GATHER4_C_CL : MIMG_Gather_WQM <mimgopc<0x62, 0x49>, AMDGPUSample_c_cl>;
+defm IMAGE_GATHER4_C_L : MIMG_Gather <mimgopc<0x63, 0x4c>, AMDGPUSample_c_l>;
+defm IMAGE_GATHER4_C_B : MIMG_Gather_WQM <mimgopc<0x64, 0x4d>, AMDGPUSample_c_b>;
+defm IMAGE_GATHER4_C_B_CL : MIMG_Gather_WQM <mimgopc<0x65, 0x4e>, AMDGPUSample_c_b_cl>;
+defm IMAGE_GATHER4_C_LZ : MIMG_Gather <mimgopc<0x34, 0x4f>, AMDGPUSample_c_lz>;
+defm IMAGE_GATHER4_O : MIMG_Gather_WQM <mimgopc<0x35, 0x50>, AMDGPUSample_o>;
+defm IMAGE_GATHER4_CL_O : MIMG_Gather_WQM <mimgopc<MIMG.NOP, 0x51>, AMDGPUSample_cl_o>;
+defm IMAGE_GATHER4_L_O : MIMG_Gather <mimgopc<MIMG.NOP, 0x54>, AMDGPUSample_l_o>;
+defm IMAGE_GATHER4_B_O : MIMG_Gather_WQM <mimgopc<MIMG.NOP, 0x55>, AMDGPUSample_b_o>;
+defm IMAGE_GATHER4_B_CL_O : MIMG_Gather <mimgopc<MIMG.NOP, 0x56>, AMDGPUSample_b_cl_o>;
+defm IMAGE_GATHER4_LZ_O : MIMG_Gather <mimgopc<0x36, 0x57>, AMDGPUSample_lz_o>;
+defm IMAGE_GATHER4_C_O : MIMG_Gather_WQM <mimgopc<MIMG.NOP, 0x58>, AMDGPUSample_c_o>;
+defm IMAGE_GATHER4_C_CL_O : MIMG_Gather_WQM <mimgopc<MIMG.NOP, 0x59>, AMDGPUSample_c_cl_o>;
+defm IMAGE_GATHER4_C_L_O : MIMG_Gather <mimgopc<MIMG.NOP, 0x5c>, AMDGPUSample_c_l_o>;
+defm IMAGE_GATHER4_C_B_O : MIMG_Gather_WQM <mimgopc<MIMG.NOP, 0x5d>, AMDGPUSample_c_b_o>;
+defm IMAGE_GATHER4_C_B_CL_O : MIMG_Gather_WQM <mimgopc<MIMG.NOP, 0x5e>, AMDGPUSample_c_b_cl_o>;
+defm IMAGE_GATHER4_C_LZ_O : MIMG_Gather <mimgopc<0x37, 0x5f>, AMDGPUSample_c_lz_o>;
+//defm IMAGE_GATHER4H : MIMG_Gather_WQM <mimgopc<0x90, 0x61>, ?>;
-defm IMAGE_GET_LOD : MIMG_Sampler <mimgopc<0x60>, AMDGPUSample, 1, 0, 1, "image_get_lod">;
+defm IMAGE_GET_LOD : MIMG_Sampler <mimgopc<0x38, 0x60>, AMDGPUSample, 1, 0, 1, "image_get_lod">;
-defm IMAGE_SAMPLE_CD : MIMG_Sampler <mimgopc<0x68>, AMDGPUSample_cd>;
-defm IMAGE_SAMPLE_CD_CL : MIMG_Sampler <mimgopc<0x69>, AMDGPUSample_cd_cl>;
-defm IMAGE_SAMPLE_C_CD : MIMG_Sampler <mimgopc<0x6a>, AMDGPUSample_c_cd>;
-defm IMAGE_SAMPLE_C_CD_CL : MIMG_Sampler <mimgopc<0x6b>, AMDGPUSample_c_cd_cl>;
-defm IMAGE_SAMPLE_CD_O : MIMG_Sampler <mimgopc<0x6c>, AMDGPUSample_cd_o>;
-defm IMAGE_SAMPLE_CD_CL_O : MIMG_Sampler <mimgopc<0x6d>, AMDGPUSample_cd_cl_o>;
-defm IMAGE_SAMPLE_C_CD_O : MIMG_Sampler <mimgopc<0x6e>, AMDGPUSample_c_cd_o>;
-defm IMAGE_SAMPLE_C_CD_CL_O : MIMG_Sampler <mimgopc<0x6f>, AMDGPUSample_c_cd_cl_o>;
-defm IMAGE_SAMPLE_CD_G16 : MIMG_Sampler <mimgopc<0xe8>, AMDGPUSample_cd, 0, 1>;
-defm IMAGE_SAMPLE_CD_CL_G16 : MIMG_Sampler <mimgopc<0xe9>, AMDGPUSample_cd_cl, 0, 1>;
-defm IMAGE_SAMPLE_C_CD_G16 : MIMG_Sampler <mimgopc<0xea>, AMDGPUSample_c_cd, 0, 1>;
-defm IMAGE_SAMPLE_C_CD_CL_G16 : MIMG_Sampler <mimgopc<0xeb>, AMDGPUSample_c_cd_cl, 0, 1>;
-defm IMAGE_SAMPLE_CD_O_G16 : MIMG_Sampler <mimgopc<0xec>, AMDGPUSample_cd_o, 0, 1>;
-defm IMAGE_SAMPLE_CD_CL_O_G16 : MIMG_Sampler <mimgopc<0xed>, AMDGPUSample_cd_cl_o, 0, 1>;
-defm IMAGE_SAMPLE_C_CD_O_G16 : MIMG_Sampler <mimgopc<0xee>, AMDGPUSample_c_cd_o, 0, 1>;
-defm IMAGE_SAMPLE_C_CD_CL_O_G16 : MIMG_Sampler <mimgopc<0xef>, AMDGPUSample_c_cd_cl_o, 0, 1>;
+defm IMAGE_SAMPLE_CD : MIMG_Sampler <mimgopc<MIMG.NOP, 0x68>, AMDGPUSample_cd>;
+defm IMAGE_SAMPLE_CD_CL : MIMG_Sampler <mimgopc<MIMG.NOP, 0x69>, AMDGPUSample_cd_cl>;
+defm IMAGE_SAMPLE_C_CD : MIMG_Sampler <mimgopc<MIMG.NOP, 0x6a>, AMDGPUSample_c_cd>;
+defm IMAGE_SAMPLE_C_CD_CL : MIMG_Sampler <mimgopc<MIMG.NOP, 0x6b>, AMDGPUSample_c_cd_cl>;
+defm IMAGE_SAMPLE_CD_O : MIMG_Sampler <mimgopc<MIMG.NOP, 0x6c>, AMDGPUSample_cd_o>;
+defm IMAGE_SAMPLE_CD_CL_O : MIMG_Sampler <mimgopc<MIMG.NOP, 0x6d>, AMDGPUSample_cd_cl_o>;
+defm IMAGE_SAMPLE_C_CD_O : MIMG_Sampler <mimgopc<MIMG.NOP, 0x6e>, AMDGPUSample_c_cd_o>;
+defm IMAGE_SAMPLE_C_CD_CL_O : MIMG_Sampler <mimgopc<MIMG.NOP, 0x6f>, AMDGPUSample_c_cd_cl_o>;
} // End OtherPredicates = [HasExtendedImageInsts]
-//def IMAGE_RSRC256 : MIMG_NoPattern_RSRC256 <"image_rsrc256", 0x0000007e>;
-//def IMAGE_SAMPLER : MIMG_NoPattern_ <"image_sampler", 0x0000007f>;
-let SubtargetPredicate = HasGFX10_AEncoding in
-defm IMAGE_MSAA_LOAD_X : MIMG_NoSampler <mimgopc<0x80>, "image_msaa_load", 1, 0, 0, 1>;
+let OtherPredicates = [HasExtendedImageInsts,HasG16] in {
+defm IMAGE_SAMPLE_D_G16 : MIMG_Sampler <mimgopc<0x39, 0xa2>, AMDGPUSample_d, 0, 1>;
+defm IMAGE_SAMPLE_D_CL_G16 : MIMG_Sampler <mimgopc<0x5f, 0xa3>, AMDGPUSample_d_cl, 0, 1>;
+defm IMAGE_SAMPLE_C_D_G16 : MIMG_Sampler <mimgopc<0x3a, 0xaa>, AMDGPUSample_c_d, 0, 1>;
+defm IMAGE_SAMPLE_C_D_CL_G16 : MIMG_Sampler <mimgopc<0x54, 0xab>, AMDGPUSample_c_d_cl, 0, 1>;
+defm IMAGE_SAMPLE_D_O_G16 : MIMG_Sampler <mimgopc<0x3b, 0xb2>, AMDGPUSample_d_o, 0, 1>;
+defm IMAGE_SAMPLE_D_CL_O_G16 : MIMG_Sampler <mimgopc<0x55, 0xb3>, AMDGPUSample_d_cl_o, 0, 1>;
+defm IMAGE_SAMPLE_C_D_O_G16 : MIMG_Sampler <mimgopc<0x3c, 0xba>, AMDGPUSample_c_d_o, 0, 1>;
+defm IMAGE_SAMPLE_C_D_CL_O_G16 : MIMG_Sampler <mimgopc<0x56, 0xbb>, AMDGPUSample_c_d_cl_o, 0, 1>;
+defm IMAGE_SAMPLE_CD_G16 : MIMG_Sampler <mimgopc<MIMG.NOP, 0xe8>, AMDGPUSample_cd, 0, 1>;
+defm IMAGE_SAMPLE_CD_CL_G16 : MIMG_Sampler <mimgopc<MIMG.NOP, 0xe9>, AMDGPUSample_cd_cl, 0, 1>;
+defm IMAGE_SAMPLE_C_CD_G16 : MIMG_Sampler <mimgopc<MIMG.NOP, 0xea>, AMDGPUSample_c_cd, 0, 1>;
+defm IMAGE_SAMPLE_C_CD_CL_G16 : MIMG_Sampler <mimgopc<MIMG.NOP, 0xeb>, AMDGPUSample_c_cd_cl, 0, 1>;
+defm IMAGE_SAMPLE_CD_O_G16 : MIMG_Sampler <mimgopc<MIMG.NOP, 0xec>, AMDGPUSample_cd_o, 0, 1>;
+defm IMAGE_SAMPLE_CD_CL_O_G16 : MIMG_Sampler <mimgopc<MIMG.NOP, 0xed>, AMDGPUSample_cd_cl_o, 0, 1>;
+defm IMAGE_SAMPLE_C_CD_O_G16 : MIMG_Sampler <mimgopc<MIMG.NOP, 0xee>, AMDGPUSample_c_cd_o, 0, 1>;
+defm IMAGE_SAMPLE_C_CD_CL_O_G16 : MIMG_Sampler <mimgopc<MIMG.NOP, 0xef>, AMDGPUSample_c_cd_cl_o, 0, 1>;
+} // End OtherPredicates = [HasExtendedImageInsts,HasG16]
+
+//def IMAGE_RSRC256 : MIMG_NoPattern_RSRC256 <"image_rsrc256", mimgopc<0x7e>>;
+//def IMAGE_SAMPLER : MIMG_NoPattern_ <"image_sampler", mimgopc<0x7f>>;
+
+let SubtargetPredicate = isGFX10Only, OtherPredicates = [HasGFX10_AEncoding] in
+defm IMAGE_MSAA_LOAD_X : MIMG_NoSampler <mimgopc<MIMG.NOP, 0x80>, "image_msaa_load", 1, 0, 0, 1>;
+
+let OtherPredicates = [HasGFX10_AEncoding] in
+defm IMAGE_MSAA_LOAD : MIMG_MSAA_Load <mimgopc<0x18, MIMG.NOP>, "image_msaa_load">;
+
+let OtherPredicates = [HasGFX10_AEncoding] in {
+defm IMAGE_BVH_INTERSECT_RAY : MIMG_IntersectRay<mimgopc<0x19, 0xe6>, "image_bvh_intersect_ray", 0, 0>;
+defm IMAGE_BVH_INTERSECT_RAY_a16 : MIMG_IntersectRay<mimgopc<0x19, 0xe6>, "image_bvh_intersect_ray", 0, 1>;
+defm IMAGE_BVH64_INTERSECT_RAY : MIMG_IntersectRay<mimgopc<0x1a, 0xe7>, "image_bvh64_intersect_ray", 1, 0>;
+defm IMAGE_BVH64_INTERSECT_RAY_a16 : MIMG_IntersectRay<mimgopc<0x1a, 0xe7>, "image_bvh64_intersect_ray", 1, 1>;
+} // End OtherPredicates = [HasGFX10_AEncoding]
-defm IMAGE_BVH_INTERSECT_RAY : MIMG_IntersectRay<mimgopc<0xe6>, "image_bvh_intersect_ray", 0, 0>;
-defm IMAGE_BVH_INTERSECT_RAY_a16 : MIMG_IntersectRay<mimgopc<0xe6>, "image_bvh_intersect_ray", 0, 1>;
-defm IMAGE_BVH64_INTERSECT_RAY : MIMG_IntersectRay<mimgopc<0xe7>, "image_bvh64_intersect_ray", 1, 0>;
-defm IMAGE_BVH64_INTERSECT_RAY_a16 : MIMG_IntersectRay<mimgopc<0xe7>, "image_bvh64_intersect_ray", 1, 1>;
+} // End let OtherPredicates = [HasImageInsts]
/********** ========================================= **********/
/********** Table of dimension-aware image intrinsics **********/
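
Throughout the instruction table above, mimgopc now carries one opcode per encoding family, with the GFX11 value first and the pre-GFX11 ("GFX10M") value second, and MIMG.NOP marking opcodes that do not exist on a given generation; the HAS_GFX11 / HAS_GFX10M checks in the helper multiclasses key off those fields. The C++ below is a hypothetical two-field model of that scheme, not the actual TableGen class (the extra VI/SI fields used by the atomics are omitted).

#include <cstdint>
#include <optional>

// Hypothetical model of the dual-encoding opcode: one value per encoding
// family, with std::nullopt standing in for MIMG.NOP.
struct MimgOpcode {
  std::optional<uint8_t> Gfx11;  // first mimgopc operand
  std::optional<uint8_t> Gfx10M; // second operand: the gfx6..gfx10 opcode

  bool hasGfx11() const { return Gfx11.has_value(); }
  bool hasGfx10M() const { return Gfx10M.has_value(); }
};

int main() {
  MimgOpcode GetResinfo{0x17, 0x0e};        // mimgopc<0x17, 0x0e>
  MimgOpcode MsaaLoad{0x18, std::nullopt};  // mimgopc<0x18, MIMG.NOP>
  // GetResinfo is emitted for both families, MsaaLoad only for gfx11.
  return (GetResinfo.hasGfx10M() && MsaaLoad.hasGfx11()) ? 0 : 1;
}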
diff --git a/llvm/lib/Target/AMDGPU/R600.h b/llvm/lib/Target/AMDGPU/R600.h
index 2b483ae63da9..5dfbf8f1ef95 100644
--- a/llvm/lib/Target/AMDGPU/R600.h
+++ b/llvm/lib/Target/AMDGPU/R600.h
@@ -26,7 +26,7 @@ FunctionPass *createR600EmitClauseMarkers();
FunctionPass *createR600ClauseMergePass();
FunctionPass *createR600Packetizer();
FunctionPass *createR600ControlFlowFinalizer();
-FunctionPass *createAMDGPUCFGStructurizerPass();
+FunctionPass *createR600MachineCFGStructurizerPass();
FunctionPass *createR600ISelDag(TargetMachine *TM, CodeGenOpt::Level OptLevel);
ModulePass *createR600OpenCLImageTypeLoweringPass();
diff --git a/llvm/lib/Target/AMDGPU/R600AsmPrinter.cpp b/llvm/lib/Target/AMDGPU/R600AsmPrinter.cpp
index c19e3c41485e..afcb6b4d65f8 100644
--- a/llvm/lib/Target/AMDGPU/R600AsmPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/R600AsmPrinter.cpp
@@ -111,7 +111,7 @@ bool R600AsmPrinter::runOnMachineFunction(MachineFunction &MF) {
MCContext &Context = getObjFileLowering().getContext();
MCSectionELF *ConfigSection =
Context.getELFSection(".AMDGPU.config", ELF::SHT_PROGBITS, 0);
- OutStreamer->SwitchSection(ConfigSection);
+ OutStreamer->switchSection(ConfigSection);
EmitProgramInfoR600(MF);
@@ -120,7 +120,7 @@ bool R600AsmPrinter::runOnMachineFunction(MachineFunction &MF) {
if (isVerbose()) {
MCSectionELF *CommentSection =
Context.getELFSection(".AMDGPU.csdata", ELF::SHT_PROGBITS, 0);
- OutStreamer->SwitchSection(CommentSection);
+ OutStreamer->switchSection(CommentSection);
R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
OutStreamer->emitRawComment(
diff --git a/llvm/lib/Target/AMDGPU/R600ClauseMergePass.cpp b/llvm/lib/Target/AMDGPU/R600ClauseMergePass.cpp
index 715fd69fc7ae..2b85df8ac6cf 100644
--- a/llvm/lib/Target/AMDGPU/R600ClauseMergePass.cpp
+++ b/llvm/lib/Target/AMDGPU/R600ClauseMergePass.cpp
@@ -7,7 +7,7 @@
//===----------------------------------------------------------------------===//
//
/// \file
-/// R600EmitClauseMarker pass emits CFAlu instruction in a conservative maneer.
+/// R600EmitClauseMarker pass emits CFAlu instruction in a conservative manner.
/// This pass is merging consecutive CFAlus where applicable.
/// It needs to be called after IfCvt for best results.
//===----------------------------------------------------------------------===//
@@ -15,6 +15,7 @@
#include "MCTargetDesc/R600MCTargetDesc.h"
#include "R600.h"
#include "R600Subtarget.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
using namespace llvm;
diff --git a/llvm/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp b/llvm/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp
index 8a48a67b829c..4bf38a3c6ceb 100644
--- a/llvm/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp
+++ b/llvm/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp
@@ -16,6 +16,7 @@
#include "R600.h"
#include "R600MachineFunctionInfo.h"
#include "R600Subtarget.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
#include <set>
using namespace llvm;
diff --git a/llvm/lib/Target/AMDGPU/R600EmitClauseMarkers.cpp b/llvm/lib/Target/AMDGPU/R600EmitClauseMarkers.cpp
index b9ca7f928d56..ef67e5c937dc 100644
--- a/llvm/lib/Target/AMDGPU/R600EmitClauseMarkers.cpp
+++ b/llvm/lib/Target/AMDGPU/R600EmitClauseMarkers.cpp
@@ -17,6 +17,7 @@
#include "R600.h"
#include "R600Defines.h"
#include "R600Subtarget.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
using namespace llvm;
@@ -327,9 +328,9 @@ char R600EmitClauseMarkers::ID = 0;
} // end anonymous namespace
INITIALIZE_PASS_BEGIN(R600EmitClauseMarkers, "emitclausemarkers",
- "R600 Emit Clause Markters", false, false)
+ "R600 Emit Clause Markers", false, false)
INITIALIZE_PASS_END(R600EmitClauseMarkers, "emitclausemarkers",
- "R600 Emit Clause Markters", false, false)
+ "R600 Emit Clause Markers", false, false)
FunctionPass *llvm::createR600EmitClauseMarkers() {
return new R600EmitClauseMarkers();
diff --git a/llvm/lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp b/llvm/lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp
index 194879fef53c..ef2d049f9175 100644
--- a/llvm/lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp
+++ b/llvm/lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp
@@ -17,6 +17,8 @@
#include "R600.h"
#include "R600Defines.h"
#include "R600Subtarget.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
using namespace llvm;
diff --git a/llvm/lib/Target/AMDGPU/R600FrameLowering.cpp b/llvm/lib/Target/AMDGPU/R600FrameLowering.cpp
index abd4086db62c..fd8cecab90da 100644
--- a/llvm/lib/Target/AMDGPU/R600FrameLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/R600FrameLowering.cpp
@@ -8,6 +8,7 @@
#include "R600FrameLowering.h"
#include "R600Subtarget.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
using namespace llvm;
diff --git a/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp b/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp
index bd757e9e3d70..bf52f7830ad7 100644
--- a/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp
@@ -42,39 +42,26 @@ R600TargetLowering::R600TargetLowering(const TargetMachine &TM,
computeRegisterProperties(Subtarget->getRegisterInfo());
// Legalize loads and stores to the private address space.
- setOperationAction(ISD::LOAD, MVT::i32, Custom);
- setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
- setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
+ setOperationAction(ISD::LOAD, {MVT::i32, MVT::v2i32, MVT::v4i32}, Custom);
// EXTLOAD should be the same as ZEXTLOAD. It is legal for some address
// spaces, so it is custom lowered to handle those where it isn't.
- for (MVT VT : MVT::integer_valuetypes()) {
- setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
- setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i8, Custom);
- setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i16, Custom);
-
- setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote);
- setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i8, Custom);
- setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i16, Custom);
-
- setLoadExtAction(ISD::EXTLOAD, VT, MVT::i1, Promote);
- setLoadExtAction(ISD::EXTLOAD, VT, MVT::i8, Custom);
- setLoadExtAction(ISD::EXTLOAD, VT, MVT::i16, Custom);
- }
+ for (auto Op : {ISD::SEXTLOAD, ISD::ZEXTLOAD, ISD::EXTLOAD})
+ for (MVT VT : MVT::integer_valuetypes()) {
+ setLoadExtAction(Op, VT, MVT::i1, Promote);
+ setLoadExtAction(Op, VT, MVT::i8, Custom);
+ setLoadExtAction(Op, VT, MVT::i16, Custom);
+ }
// Workaround for LegalizeDAG asserting on expansion of i1 vector loads.
- setLoadExtAction(ISD::EXTLOAD, MVT::v2i32, MVT::v2i1, Expand);
- setLoadExtAction(ISD::SEXTLOAD, MVT::v2i32, MVT::v2i1, Expand);
- setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i32, MVT::v2i1, Expand);
+ setLoadExtAction({ISD::EXTLOAD, ISD::SEXTLOAD, ISD::ZEXTLOAD}, MVT::v2i32,
+ MVT::v2i1, Expand);
- setLoadExtAction(ISD::EXTLOAD, MVT::v4i32, MVT::v4i1, Expand);
- setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i1, Expand);
- setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i1, Expand);
+ setLoadExtAction({ISD::EXTLOAD, ISD::SEXTLOAD, ISD::ZEXTLOAD}, MVT::v4i32,
+ MVT::v4i1, Expand);
- setOperationAction(ISD::STORE, MVT::i8, Custom);
- setOperationAction(ISD::STORE, MVT::i32, Custom);
- setOperationAction(ISD::STORE, MVT::v2i32, Custom);
- setOperationAction(ISD::STORE, MVT::v4i32, Custom);
+ setOperationAction(ISD::STORE, {MVT::i8, MVT::i32, MVT::v2i32, MVT::v4i32},
+ Custom);
setTruncStoreAction(MVT::i32, MVT::i8, Custom);
setTruncStoreAction(MVT::i32, MVT::i16, Custom);
@@ -96,55 +83,34 @@ R600TargetLowering::R600TargetLowering(const TargetMachine &TM,
setTruncStoreAction(MVT::v4i32, MVT::v4i1, Expand);
// Set condition code actions
- setCondCodeAction(ISD::SETO, MVT::f32, Expand);
- setCondCodeAction(ISD::SETUO, MVT::f32, Expand);
- setCondCodeAction(ISD::SETLT, MVT::f32, Expand);
- setCondCodeAction(ISD::SETLE, MVT::f32, Expand);
- setCondCodeAction(ISD::SETOLT, MVT::f32, Expand);
- setCondCodeAction(ISD::SETOLE, MVT::f32, Expand);
- setCondCodeAction(ISD::SETONE, MVT::f32, Expand);
- setCondCodeAction(ISD::SETUEQ, MVT::f32, Expand);
- setCondCodeAction(ISD::SETUGE, MVT::f32, Expand);
- setCondCodeAction(ISD::SETUGT, MVT::f32, Expand);
- setCondCodeAction(ISD::SETULT, MVT::f32, Expand);
- setCondCodeAction(ISD::SETULE, MVT::f32, Expand);
+ setCondCodeAction({ISD::SETO, ISD::SETUO, ISD::SETLT, ISD::SETLE, ISD::SETOLT,
+ ISD::SETOLE, ISD::SETONE, ISD::SETUEQ, ISD::SETUGE,
+ ISD::SETUGT, ISD::SETULT, ISD::SETULE},
+ MVT::f32, Expand);
- setCondCodeAction(ISD::SETLE, MVT::i32, Expand);
- setCondCodeAction(ISD::SETLT, MVT::i32, Expand);
- setCondCodeAction(ISD::SETULE, MVT::i32, Expand);
- setCondCodeAction(ISD::SETULT, MVT::i32, Expand);
+ setCondCodeAction({ISD::SETLE, ISD::SETLT, ISD::SETULE, ISD::SETULT},
+ MVT::i32, Expand);
- setOperationAction(ISD::FCOS, MVT::f32, Custom);
- setOperationAction(ISD::FSIN, MVT::f32, Custom);
+ setOperationAction({ISD::FCOS, ISD::FSIN}, MVT::f32, Custom);
- setOperationAction(ISD::SETCC, MVT::v4i32, Expand);
- setOperationAction(ISD::SETCC, MVT::v2i32, Expand);
+ setOperationAction(ISD::SETCC, {MVT::v4i32, MVT::v2i32}, Expand);
- setOperationAction(ISD::BR_CC, MVT::i32, Expand);
- setOperationAction(ISD::BR_CC, MVT::f32, Expand);
+ setOperationAction(ISD::BR_CC, {MVT::i32, MVT::f32}, Expand);
setOperationAction(ISD::BRCOND, MVT::Other, Custom);
setOperationAction(ISD::FSUB, MVT::f32, Expand);
- setOperationAction(ISD::FCEIL, MVT::f64, Custom);
- setOperationAction(ISD::FTRUNC, MVT::f64, Custom);
- setOperationAction(ISD::FRINT, MVT::f64, Custom);
- setOperationAction(ISD::FFLOOR, MVT::f64, Custom);
+ setOperationAction({ISD::FCEIL, ISD::FTRUNC, ISD::FRINT, ISD::FFLOOR},
+ MVT::f64, Custom);
- setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
- setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
+ setOperationAction(ISD::SELECT_CC, {MVT::f32, MVT::i32}, Custom);
- setOperationAction(ISD::SETCC, MVT::i32, Expand);
- setOperationAction(ISD::SETCC, MVT::f32, Expand);
- setOperationAction(ISD::FP_TO_UINT, MVT::i1, Custom);
- setOperationAction(ISD::FP_TO_SINT, MVT::i1, Custom);
- setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
- setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
+ setOperationAction(ISD::SETCC, {MVT::i32, MVT::f32}, Expand);
+ setOperationAction({ISD::FP_TO_UINT, ISD::FP_TO_SINT}, {MVT::i1, MVT::i64},
+ Custom);
- setOperationAction(ISD::SELECT, MVT::i32, Expand);
- setOperationAction(ISD::SELECT, MVT::f32, Expand);
- setOperationAction(ISD::SELECT, MVT::v2i32, Expand);
- setOperationAction(ISD::SELECT, MVT::v4i32, Expand);
+ setOperationAction(ISD::SELECT, {MVT::i32, MVT::f32, MVT::v2i32, MVT::v4i32},
+ Expand);
// ADD, SUB overflow.
// TODO: turn these into Legal?
@@ -158,56 +124,43 @@ R600TargetLowering::R600TargetLowering(const TargetMachine &TM,
if (!Subtarget->hasBFE())
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
- setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i1, Expand);
- setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i1, Expand);
+ setOperationAction(ISD::SIGN_EXTEND_INREG, {MVT::v2i1, MVT::v4i1}, Expand);
if (!Subtarget->hasBFE())
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Expand);
- setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i8, Expand);
- setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i8, Expand);
+ setOperationAction(ISD::SIGN_EXTEND_INREG, {MVT::v2i8, MVT::v4i8}, Expand);
if (!Subtarget->hasBFE())
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Expand);
- setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i16, Expand);
- setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i16, Expand);
+ setOperationAction(ISD::SIGN_EXTEND_INREG, {MVT::v2i16, MVT::v4i16}, Expand);
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
- setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i32, Expand);
- setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i32, Expand);
+ setOperationAction(ISD::SIGN_EXTEND_INREG, {MVT::v2i32, MVT::v4i32}, Expand);
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::Other, Expand);
setOperationAction(ISD::FrameIndex, MVT::i32, Custom);
- setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i32, Custom);
- setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f32, Custom);
- setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Custom);
- setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT,
+ {MVT::v2i32, MVT::v2f32, MVT::v4i32, MVT::v4f32}, Custom);
- setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i32, Custom);
- setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f32, Custom);
- setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
- setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);
+ setOperationAction(ISD::INSERT_VECTOR_ELT,
+ {MVT::v2i32, MVT::v2f32, MVT::v4i32, MVT::v4f32}, Custom);
// We don't have 64-bit shifts. Thus we need either SHX i64 or SHX_PARTS i32
// to be Legal/Custom in order to avoid library calls.
- setOperationAction(ISD::SHL_PARTS, MVT::i32, Custom);
- setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom);
- setOperationAction(ISD::SRA_PARTS, MVT::i32, Custom);
+ setOperationAction({ISD::SHL_PARTS, ISD::SRL_PARTS, ISD::SRA_PARTS}, MVT::i32,
+ Custom);
- if (!Subtarget->hasFMA()) {
- setOperationAction(ISD::FMA, MVT::f32, Expand);
- setOperationAction(ISD::FMA, MVT::f64, Expand);
- }
+ if (!Subtarget->hasFMA())
+ setOperationAction(ISD::FMA, {MVT::f32, MVT::f64}, Expand);
// FIXME: May need no denormals check
setOperationAction(ISD::FMAD, MVT::f32, Legal);
- if (!Subtarget->hasBFI()) {
+ if (!Subtarget->hasBFI())
// fcopysign can be done in a single instruction with BFI.
- setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
- setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
- }
+ setOperationAction(ISD::FCOPYSIGN, {MVT::f32, MVT::f64}, Expand);
if (!Subtarget->hasBCNT(32))
setOperationAction(ISD::CTPOP, MVT::i32, Expand);
@@ -229,30 +182,22 @@ R600TargetLowering::R600TargetLowering(const TargetMachine &TM,
setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 };
- for (MVT VT : ScalarIntVTs) {
- setOperationAction(ISD::ADDC, VT, Expand);
- setOperationAction(ISD::SUBC, VT, Expand);
- setOperationAction(ISD::ADDE, VT, Expand);
- setOperationAction(ISD::SUBE, VT, Expand);
- }
+ for (MVT VT : ScalarIntVTs)
+ setOperationAction({ISD::ADDC, ISD::SUBC, ISD::ADDE, ISD::SUBE}, VT,
+ Expand);
// LLVM will expand these to atomic_cmp_swap(0)
// and atomic_swap, respectively.
- setOperationAction(ISD::ATOMIC_LOAD, MVT::i32, Expand);
- setOperationAction(ISD::ATOMIC_STORE, MVT::i32, Expand);
+ setOperationAction({ISD::ATOMIC_LOAD, ISD::ATOMIC_STORE}, MVT::i32, Expand);
// We need to custom lower some of the intrinsics
- setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
- setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
+ setOperationAction({ISD::INTRINSIC_VOID, ISD::INTRINSIC_WO_CHAIN}, MVT::Other,
+ Custom);
setSchedulingPreference(Sched::Source);
- setTargetDAGCombine(ISD::FP_ROUND);
- setTargetDAGCombine(ISD::FP_TO_SINT);
- setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
- setTargetDAGCombine(ISD::SELECT_CC);
- setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);
- setTargetDAGCombine(ISD::LOAD);
+ setTargetDAGCombine({ISD::FP_ROUND, ISD::FP_TO_SINT, ISD::EXTRACT_VECTOR_ELT,
+ ISD::SELECT_CC, ISD::INSERT_VECTOR_ELT, ISD::LOAD});
}
static inline bool isEOP(MachineBasicBlock::iterator I) {
@@ -995,7 +940,7 @@ SDValue R600TargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const
/// LLVM generates byte-addressed pointers. For indirect addressing, we need to
/// convert these pointers to a register index. Each register holds
/// 16 bytes, (4 x 32bit sub-register), but we need to take into account the
-/// \p StackWidth, which tells us how many of the 4 sub-registrers will be used
+/// \p StackWidth, which tells us how many of the 4 sub-registers will be used
/// for indirect addressing.
SDValue R600TargetLowering::stackPtrToRegIndex(SDValue Ptr,
unsigned StackWidth,
@@ -1100,7 +1045,7 @@ SDValue R600TargetLowering::lowerPrivateTruncStore(StoreSDNode *Store,
SDValue ShiftAmt = DAG.getNode(ISD::SHL, DL, MVT::i32, ByteIdx,
DAG.getConstant(3, DL, MVT::i32));
- // TODO: Contrary to the name of the functiom,
+ // TODO: Contrary to the name of the function,
// it also handles sub i32 non-truncating stores (like i1)
SDValue SExtValue = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i32,
Store->getValue());
@@ -1163,9 +1108,9 @@ SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
SDValue NewChain = DAG.getNode(AMDGPUISD::DUMMY_CHAIN, DL, MVT::Other, Chain);
// TODO: can the chain be replaced without creating a new store?
SDValue NewStore = DAG.getTruncStore(
- NewChain, DL, Value, Ptr, StoreNode->getPointerInfo(),
- MemVT, StoreNode->getAlignment(),
- StoreNode->getMemOperand()->getFlags(), StoreNode->getAAInfo());
+ NewChain, DL, Value, Ptr, StoreNode->getPointerInfo(), MemVT,
+ StoreNode->getAlign(), StoreNode->getMemOperand()->getFlags(),
+ StoreNode->getAAInfo());
StoreNode = cast<StoreSDNode>(NewStore);
}
@@ -1417,7 +1362,7 @@ SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
assert(!MemVT.isVector() && (MemVT == MVT::i16 || MemVT == MVT::i8));
SDValue NewLoad = DAG.getExtLoad(
ISD::EXTLOAD, DL, VT, Chain, Ptr, LoadNode->getPointerInfo(), MemVT,
- LoadNode->getAlignment(), LoadNode->getMemOperand()->getFlags());
+ LoadNode->getAlign(), LoadNode->getMemOperand()->getFlags());
SDValue Res = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, NewLoad,
DAG.getValueType(MemVT));
@@ -1610,7 +1555,7 @@ static SDValue CompactSwizzlableVector(
if (NewBldVec[i].isUndef())
// We mask write here to teach later passes that the ith element of this
// vector is undef. Thus we can use it to reduce 128 bits reg usage,
- // break false dependencies and additionnaly make assembly easier to read.
+ // break false dependencies and additionally make assembly easier to read.
RemapSwizzle[i] = 7; // SEL_MASK_WRITE
if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(NewBldVec[i])) {
if (C->isZero()) {
@@ -1714,7 +1659,7 @@ SDValue R600TargetLowering::constBufferLoad(LoadSDNode *LoadNode, int Block,
if (LoadNode->getMemoryVT().getScalarType() != MVT::i32 || !ISD::isNON_EXTLoad(LoadNode))
return SDValue();
- if (LoadNode->getAlignment() < 4)
+ if (LoadNode->getAlign() < Align(4))
return SDValue();
int ConstantBlock = ConstantAddressBlock(Block);
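
The R600ISelLowering changes above lean on setOperationAction / setLoadExtAction / setCondCodeAction / setTargetDAGCombine overloads that accept lists of opcodes and value types, so long runs of per-type calls collapse into one statement; getAlignment() also gives way to getAlign(), which returns an Align. The standalone C++ sketch below shows only the list-overload pattern itself, using made-up names rather than the LLVM TargetLoweringBase signatures.

#include <cstdio>
#include <initializer_list>
#include <map>
#include <utility>

// Generic illustration of the pattern the patch adopts: an overload taking
// lists of keys so N x M single-entry calls collapse into one call.
class ActionTable {
  std::map<std::pair<int, int>, int> Actions; // (opcode, type) -> action
public:
  void setAction(int Op, int VT, int Action) { Actions[{Op, VT}] = Action; }
  // List overload: apply the same action to every (opcode, type) pair.
  void setAction(std::initializer_list<int> Ops,
                 std::initializer_list<int> VTs, int Action) {
    for (int Op : Ops)
      for (int VT : VTs)
        setAction(Op, VT, Action);
  }
};

int main() {
  enum { FP_TO_UINT, FP_TO_SINT, I1, I64, CUSTOM };
  ActionTable T;
  // One call instead of four, mirroring e.g.
  //   setOperationAction({ISD::FP_TO_UINT, ISD::FP_TO_SINT},
  //                      {MVT::i1, MVT::i64}, Custom);
  T.setAction({FP_TO_UINT, FP_TO_SINT}, {I1, I64}, CUSTOM);
  std::printf("ok\n");
  return 0;
}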
diff --git a/llvm/lib/Target/AMDGPU/R600InstrInfo.cpp b/llvm/lib/Target/AMDGPU/R600InstrInfo.cpp
index aec8b1ae4837..d04ec6490aae 100644
--- a/llvm/lib/Target/AMDGPU/R600InstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/R600InstrInfo.cpp
@@ -18,6 +18,7 @@
#include "R600Defines.h"
#include "R600Subtarget.h"
#include "llvm/ADT/SmallSet.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
using namespace llvm;
@@ -1469,21 +1470,3 @@ void R600InstrInfo::clearFlag(MachineInstr &MI, unsigned Operand,
FlagOp.setImm(InstFlags);
}
}
-
-unsigned R600InstrInfo::getAddressSpaceForPseudoSourceKind(
- unsigned Kind) const {
- switch (Kind) {
- case PseudoSourceValue::Stack:
- case PseudoSourceValue::FixedStack:
- return AMDGPUAS::PRIVATE_ADDRESS;
- case PseudoSourceValue::ConstantPool:
- case PseudoSourceValue::GOT:
- case PseudoSourceValue::JumpTable:
- case PseudoSourceValue::GlobalValueCallEntry:
- case PseudoSourceValue::ExternalSymbolCallEntry:
- case PseudoSourceValue::TargetCustom:
- return AMDGPUAS::CONSTANT_ADDRESS;
- }
-
- llvm_unreachable("Invalid pseudo source kind");
-}
diff --git a/llvm/lib/Target/AMDGPU/R600InstrInfo.h b/llvm/lib/Target/AMDGPU/R600InstrInfo.h
index bc8a4786df77..f720e4656348 100644
--- a/llvm/lib/Target/AMDGPU/R600InstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/R600InstrInfo.h
@@ -320,9 +320,6 @@ public:
bool isRegisterLoad(const MachineInstr &MI) const {
return get(MI.getOpcode()).TSFlags & R600InstrFlags::REGISTER_LOAD;
}
-
- unsigned getAddressSpaceForPseudoSourceKind(
- unsigned Kind) const override;
};
namespace R600 {
diff --git a/llvm/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp b/llvm/lib/Target/AMDGPU/R600MachineCFGStructurizer.cpp
index 1736c078eb83..0a96c643d9bd 100644
--- a/llvm/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp
+++ b/llvm/lib/Target/AMDGPU/R600MachineCFGStructurizer.cpp
@@ -1,4 +1,4 @@
-//===- AMDILCFGStructurizer.cpp - CFG Structurizer ------------------------===//
+//===- R600MachineCFGStructurizer.cpp - CFG Structurizer ------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
@@ -10,6 +10,7 @@
#include "R600.h"
#include "R600RegisterInfo.h"
#include "R600Subtarget.h"
+#include "llvm/ADT/DepthFirstIterator.h"
#include "llvm/ADT/SCCIterator.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineFunction.h"
@@ -42,7 +43,7 @@ STATISTIC(numClonedInstr, "CFGStructurizer cloned instructions");
namespace llvm {
-void initializeAMDGPUCFGStructurizerPass(PassRegistry &);
+void initializeR600MachineCFGStructurizerPass(PassRegistry &);
} // end namespace llvm
@@ -89,7 +90,7 @@ public:
//
//===----------------------------------------------------------------------===//
-class AMDGPUCFGStructurizer : public MachineFunctionPass {
+class R600MachineCFGStructurizer : public MachineFunctionPass {
public:
using MBBVector = SmallVector<MachineBasicBlock *, 32>;
using MBBInfoMap = std::map<MachineBasicBlock *, BlockInformation *>;
@@ -103,8 +104,8 @@ public:
static char ID;
- AMDGPUCFGStructurizer() : MachineFunctionPass(ID) {
- initializeAMDGPUCFGStructurizerPass(*PassRegistry::getPassRegistry());
+ R600MachineCFGStructurizer() : MachineFunctionPass(ID) {
+ initializeR600MachineCFGStructurizerPass(*PassRegistry::getPassRegistry());
}
StringRef getPassName() const override {
@@ -317,16 +318,16 @@ private:
} // end anonymous namespace
-char AMDGPUCFGStructurizer::ID = 0;
+char R600MachineCFGStructurizer::ID = 0;
-int AMDGPUCFGStructurizer::getSCCNum(MachineBasicBlock *MBB) const {
+int R600MachineCFGStructurizer::getSCCNum(MachineBasicBlock *MBB) const {
MBBInfoMap::const_iterator It = BlockInfoMap.find(MBB);
if (It == BlockInfoMap.end())
return INVALIDSCCNUM;
return (*It).second->SccNum;
}
-MachineBasicBlock *AMDGPUCFGStructurizer::getLoopLandInfo(MachineLoop *LoopRep)
+MachineBasicBlock *R600MachineCFGStructurizer::getLoopLandInfo(MachineLoop *LoopRep)
const {
LoopLandInfoMap::const_iterator It = LLInfoMap.find(LoopRep);
if (It == LLInfoMap.end())
@@ -334,7 +335,7 @@ MachineBasicBlock *AMDGPUCFGStructurizer::getLoopLandInfo(MachineLoop *LoopRep)
return (*It).second;
}
-bool AMDGPUCFGStructurizer::hasBackEdge(MachineBasicBlock *MBB) const {
+bool R600MachineCFGStructurizer::hasBackEdge(MachineBasicBlock *MBB) const {
MachineLoop *LoopRep = MLI->getLoopFor(MBB);
if (!LoopRep)
return false;
@@ -342,14 +343,14 @@ bool AMDGPUCFGStructurizer::hasBackEdge(MachineBasicBlock *MBB) const {
return MBB->isSuccessor(LoopHeader);
}
-bool AMDGPUCFGStructurizer::isRetiredBlock(MachineBasicBlock *MBB) const {
+bool R600MachineCFGStructurizer::isRetiredBlock(MachineBasicBlock *MBB) const {
MBBInfoMap::const_iterator It = BlockInfoMap.find(MBB);
if (It == BlockInfoMap.end())
return false;
return (*It).second->IsRetired;
}
-bool AMDGPUCFGStructurizer::isActiveLoophead(MachineBasicBlock *MBB) const {
+bool R600MachineCFGStructurizer::isActiveLoophead(MachineBasicBlock *MBB) const {
MachineLoop *LoopRep = MLI->getLoopFor(MBB);
while (LoopRep && LoopRep->getHeader() == MBB) {
MachineBasicBlock *LoopLand = getLoopLandInfo(LoopRep);
@@ -362,7 +363,7 @@ bool AMDGPUCFGStructurizer::isActiveLoophead(MachineBasicBlock *MBB) const {
return false;
}
-AMDGPUCFGStructurizer::PathToKind AMDGPUCFGStructurizer::singlePathTo(
+R600MachineCFGStructurizer::PathToKind R600MachineCFGStructurizer::singlePathTo(
MachineBasicBlock *SrcMBB, MachineBasicBlock *DstMBB,
bool AllowSideEntry) const {
assert(DstMBB);
@@ -380,7 +381,7 @@ AMDGPUCFGStructurizer::PathToKind AMDGPUCFGStructurizer::singlePathTo(
return Not_SinglePath;
}
-int AMDGPUCFGStructurizer::countActiveBlock(MBBVector::const_iterator It,
+int R600MachineCFGStructurizer::countActiveBlock(MBBVector::const_iterator It,
MBBVector::const_iterator E) const {
int Count = 0;
while (It != E) {
@@ -391,7 +392,7 @@ int AMDGPUCFGStructurizer::countActiveBlock(MBBVector::const_iterator It,
return Count;
}
-bool AMDGPUCFGStructurizer::needMigrateBlock(MachineBasicBlock *MBB) const {
+bool R600MachineCFGStructurizer::needMigrateBlock(MachineBasicBlock *MBB) const {
unsigned BlockSizeThreshold = 30;
unsigned CloneInstrThreshold = 100;
bool MultiplePreds = MBB && (MBB->pred_size() > 1);
@@ -403,7 +404,7 @@ bool AMDGPUCFGStructurizer::needMigrateBlock(MachineBasicBlock *MBB) const {
(BlkSize * (MBB->pred_size() - 1) > CloneInstrThreshold));
}
-void AMDGPUCFGStructurizer::reversePredicateSetter(
+void R600MachineCFGStructurizer::reversePredicateSetter(
MachineBasicBlock::iterator I, MachineBasicBlock &MBB) {
assert(I.isValid() && "Expected valid iterator");
for (;; --I) {
@@ -430,7 +431,7 @@ void AMDGPUCFGStructurizer::reversePredicateSetter(
}
}
-void AMDGPUCFGStructurizer::insertInstrEnd(MachineBasicBlock *MBB,
+void R600MachineCFGStructurizer::insertInstrEnd(MachineBasicBlock *MBB,
int NewOpcode, const DebugLoc &DL) {
MachineInstr *MI =
MBB->getParent()->CreateMachineInstr(TII->get(NewOpcode), DL);
@@ -439,7 +440,7 @@ void AMDGPUCFGStructurizer::insertInstrEnd(MachineBasicBlock *MBB,
SHOWNEWINSTR(MI);
}
-MachineInstr *AMDGPUCFGStructurizer::insertInstrBefore(MachineBasicBlock *MBB,
+MachineInstr *R600MachineCFGStructurizer::insertInstrBefore(MachineBasicBlock *MBB,
int NewOpcode,
const DebugLoc &DL) {
MachineInstr *MI =
@@ -452,7 +453,7 @@ MachineInstr *AMDGPUCFGStructurizer::insertInstrBefore(MachineBasicBlock *MBB,
return MI;
}
-MachineInstr *AMDGPUCFGStructurizer::insertInstrBefore(
+MachineInstr *R600MachineCFGStructurizer::insertInstrBefore(
MachineBasicBlock::iterator I, int NewOpcode) {
MachineInstr *OldMI = &(*I);
MachineBasicBlock *MBB = OldMI->getParent();
@@ -464,7 +465,7 @@ MachineInstr *AMDGPUCFGStructurizer::insertInstrBefore(
return NewMBB;
}
-void AMDGPUCFGStructurizer::insertCondBranchBefore(
+void R600MachineCFGStructurizer::insertCondBranchBefore(
MachineBasicBlock::iterator I, int NewOpcode, const DebugLoc &DL) {
MachineInstr *OldMI = &(*I);
MachineBasicBlock *MBB = OldMI->getParent();
@@ -477,7 +478,7 @@ void AMDGPUCFGStructurizer::insertCondBranchBefore(
//erase later oldInstr->eraseFromParent();
}
-void AMDGPUCFGStructurizer::insertCondBranchBefore(
+void R600MachineCFGStructurizer::insertCondBranchBefore(
MachineBasicBlock *blk, MachineBasicBlock::iterator I, int NewOpcode,
int RegNum, const DebugLoc &DL) {
MachineFunction *MF = blk->getParent();
@@ -488,7 +489,7 @@ void AMDGPUCFGStructurizer::insertCondBranchBefore(
SHOWNEWINSTR(NewInstr);
}
-int AMDGPUCFGStructurizer::getBranchNzeroOpcode(int OldOpcode) {
+int R600MachineCFGStructurizer::getBranchNzeroOpcode(int OldOpcode) {
switch(OldOpcode) {
case R600::JUMP_COND:
case R600::JUMP: return R600::IF_PREDICATE_SET;
@@ -499,7 +500,7 @@ int AMDGPUCFGStructurizer::getBranchNzeroOpcode(int OldOpcode) {
return -1;
}
-int AMDGPUCFGStructurizer::getBranchZeroOpcode(int OldOpcode) {
+int R600MachineCFGStructurizer::getBranchZeroOpcode(int OldOpcode) {
switch(OldOpcode) {
case R600::JUMP_COND:
case R600::JUMP: return R600::IF_PREDICATE_SET;
@@ -510,7 +511,7 @@ int AMDGPUCFGStructurizer::getBranchZeroOpcode(int OldOpcode) {
return -1;
}
-int AMDGPUCFGStructurizer::getContinueNzeroOpcode(int OldOpcode) {
+int R600MachineCFGStructurizer::getContinueNzeroOpcode(int OldOpcode) {
switch(OldOpcode) {
case R600::JUMP_COND:
case R600::JUMP: return R600::CONTINUE_LOGICALNZ_i32;
@@ -519,7 +520,7 @@ int AMDGPUCFGStructurizer::getContinueNzeroOpcode(int OldOpcode) {
return -1;
}
-int AMDGPUCFGStructurizer::getContinueZeroOpcode(int OldOpcode) {
+int R600MachineCFGStructurizer::getContinueZeroOpcode(int OldOpcode) {
switch(OldOpcode) {
case R600::JUMP_COND:
case R600::JUMP: return R600::CONTINUE_LOGICALZ_i32;
@@ -528,17 +529,17 @@ int AMDGPUCFGStructurizer::getContinueZeroOpcode(int OldOpcode) {
return -1;
}
-MachineBasicBlock *AMDGPUCFGStructurizer::getTrueBranch(MachineInstr *MI) {
+MachineBasicBlock *R600MachineCFGStructurizer::getTrueBranch(MachineInstr *MI) {
return MI->getOperand(0).getMBB();
}
-void AMDGPUCFGStructurizer::setTrueBranch(MachineInstr *MI,
+void R600MachineCFGStructurizer::setTrueBranch(MachineInstr *MI,
MachineBasicBlock *MBB) {
MI->getOperand(0).setMBB(MBB);
}
MachineBasicBlock *
-AMDGPUCFGStructurizer::getFalseBranch(MachineBasicBlock *MBB,
+R600MachineCFGStructurizer::getFalseBranch(MachineBasicBlock *MBB,
MachineInstr *MI) {
assert(MBB->succ_size() == 2);
MachineBasicBlock *TrueBranch = getTrueBranch(MI);
@@ -548,7 +549,7 @@ AMDGPUCFGStructurizer::getFalseBranch(MachineBasicBlock *MBB,
return (*It == TrueBranch) ? *Next : *It;
}
-bool AMDGPUCFGStructurizer::isCondBranch(MachineInstr *MI) {
+bool R600MachineCFGStructurizer::isCondBranch(MachineInstr *MI) {
switch (MI->getOpcode()) {
case R600::JUMP_COND:
case R600::BRANCH_COND_i32:
@@ -559,7 +560,7 @@ bool AMDGPUCFGStructurizer::isCondBranch(MachineInstr *MI) {
return false;
}
-bool AMDGPUCFGStructurizer::isUncondBranch(MachineInstr *MI) {
+bool R600MachineCFGStructurizer::isUncondBranch(MachineInstr *MI) {
switch (MI->getOpcode()) {
case R600::JUMP:
case R600::BRANCH:
@@ -570,7 +571,7 @@ bool AMDGPUCFGStructurizer::isUncondBranch(MachineInstr *MI) {
return false;
}
-DebugLoc AMDGPUCFGStructurizer::getLastDebugLocInBB(MachineBasicBlock *MBB) {
+DebugLoc R600MachineCFGStructurizer::getLastDebugLocInBB(MachineBasicBlock *MBB) {
//get DebugLoc from the first MachineBasicBlock instruction with debug info
DebugLoc DL;
for (MachineInstr &MI : *MBB)
@@ -579,7 +580,7 @@ DebugLoc AMDGPUCFGStructurizer::getLastDebugLocInBB(MachineBasicBlock *MBB) {
return DL;
}
-MachineInstr *AMDGPUCFGStructurizer::getNormalBlockBranchInstr(
+MachineInstr *R600MachineCFGStructurizer::getNormalBlockBranchInstr(
MachineBasicBlock *MBB) {
MachineBasicBlock::reverse_iterator It = MBB->rbegin();
MachineInstr *MI = &*It;
@@ -588,7 +589,7 @@ MachineInstr *AMDGPUCFGStructurizer::getNormalBlockBranchInstr(
return nullptr;
}
-MachineInstr *AMDGPUCFGStructurizer::getLoopendBlockBranchInstr(
+MachineInstr *R600MachineCFGStructurizer::getLoopendBlockBranchInstr(
MachineBasicBlock *MBB) {
for (MachineBasicBlock::reverse_iterator It = MBB->rbegin(), E = MBB->rend();
It != E; ++It) {
@@ -604,7 +605,7 @@ MachineInstr *AMDGPUCFGStructurizer::getLoopendBlockBranchInstr(
return nullptr;
}
-MachineInstr *AMDGPUCFGStructurizer::getReturnInstr(MachineBasicBlock *MBB) {
+MachineInstr *R600MachineCFGStructurizer::getReturnInstr(MachineBasicBlock *MBB) {
MachineBasicBlock::reverse_iterator It = MBB->rbegin();
if (It != MBB->rend()) {
MachineInstr *instr = &(*It);
@@ -614,7 +615,7 @@ MachineInstr *AMDGPUCFGStructurizer::getReturnInstr(MachineBasicBlock *MBB) {
return nullptr;
}
-bool AMDGPUCFGStructurizer::isReturnBlock(MachineBasicBlock *MBB) {
+bool R600MachineCFGStructurizer::isReturnBlock(MachineBasicBlock *MBB) {
MachineInstr *MI = getReturnInstr(MBB);
bool IsReturn = MBB->succ_empty();
if (MI)
@@ -625,13 +626,13 @@ bool AMDGPUCFGStructurizer::isReturnBlock(MachineBasicBlock *MBB) {
return IsReturn;
}
-void AMDGPUCFGStructurizer::cloneSuccessorList(MachineBasicBlock *DstMBB,
+void R600MachineCFGStructurizer::cloneSuccessorList(MachineBasicBlock *DstMBB,
MachineBasicBlock *SrcMBB) {
for (MachineBasicBlock *Succ : SrcMBB->successors())
DstMBB->addSuccessor(Succ); // *iter's predecessor is also taken care of
}
-MachineBasicBlock *AMDGPUCFGStructurizer::clone(MachineBasicBlock *MBB) {
+MachineBasicBlock *R600MachineCFGStructurizer::clone(MachineBasicBlock *MBB) {
MachineFunction *Func = MBB->getParent();
MachineBasicBlock *NewMBB = Func->CreateMachineBasicBlock();
Func->push_back(NewMBB); //insert to function
@@ -640,7 +641,7 @@ MachineBasicBlock *AMDGPUCFGStructurizer::clone(MachineBasicBlock *MBB) {
return NewMBB;
}
-void AMDGPUCFGStructurizer::replaceInstrUseOfBlockWith(
+void R600MachineCFGStructurizer::replaceInstrUseOfBlockWith(
MachineBasicBlock *SrcMBB, MachineBasicBlock *OldMBB,
MachineBasicBlock *NewBlk) {
MachineInstr *BranchMI = getLoopendBlockBranchInstr(SrcMBB);
@@ -649,7 +650,7 @@ void AMDGPUCFGStructurizer::replaceInstrUseOfBlockWith(
setTrueBranch(BranchMI, NewBlk);
}
-void AMDGPUCFGStructurizer::wrapup(MachineBasicBlock *MBB) {
+void R600MachineCFGStructurizer::wrapup(MachineBasicBlock *MBB) {
assert((!MBB->getParent()->getJumpTableInfo()
|| MBB->getParent()->getJumpTableInfo()->isEmpty())
&& "found a jump table");
@@ -677,12 +678,12 @@ void AMDGPUCFGStructurizer::wrapup(MachineBasicBlock *MBB) {
// blocks in the jump table with the entryBlk //}
}
-bool AMDGPUCFGStructurizer::prepare() {
+bool R600MachineCFGStructurizer::prepare() {
bool Changed = false;
//FIXME: if not reducible flow graph, make it so ???
- LLVM_DEBUG(dbgs() << "AMDGPUCFGStructurizer::prepare\n";);
+ LLVM_DEBUG(dbgs() << "R600MachineCFGStructurizer::prepare\n";);
orderBlocks(FuncRep);
@@ -719,9 +720,9 @@ bool AMDGPUCFGStructurizer::prepare() {
return Changed;
}
-bool AMDGPUCFGStructurizer::run() {
+bool R600MachineCFGStructurizer::run() {
//Assume reducible CFG...
- LLVM_DEBUG(dbgs() << "AMDGPUCFGStructurizer::run\n");
+ LLVM_DEBUG(dbgs() << "R600MachineCFGStructurizer::run\n");
#ifdef STRESSTEST
//Use the worse block ordering to test the algorithm.
@@ -740,6 +741,7 @@ bool AMDGPUCFGStructurizer::run() {
++NumIter;
LLVM_DEBUG(dbgs() << "numIter = " << NumIter
<< ", numRemaintedBlk = " << NumRemainedBlk << "\n";);
+ (void)NumIter;
SmallVectorImpl<MachineBasicBlock *>::const_iterator It =
OrderedBlks.begin();
@@ -780,6 +782,7 @@ bool AMDGPUCFGStructurizer::run() {
LLVM_DEBUG(dbgs() << "Can't reduce SCC " << getSCCNum(MBB)
<< ", sccNumIter = " << SccNumIter;
dbgs() << "doesn't make any progress\n";);
+ (void)SccNumIter;
ContNextScc = true;
} else if (sccRemainedNumBlk != 1 && sccRemainedNumBlk < SccNumBlk) {
SccNumBlk = sccRemainedNumBlk;
@@ -842,7 +845,7 @@ bool AMDGPUCFGStructurizer::run() {
return true;
}
-void AMDGPUCFGStructurizer::orderBlocks(MachineFunction *MF) {
+void R600MachineCFGStructurizer::orderBlocks(MachineFunction *MF) {
int SccNum = 0;
for (scc_iterator<MachineFunction *> It = scc_begin(MF); !It.isAtEnd();
++It, ++SccNum) {
@@ -861,7 +864,7 @@ void AMDGPUCFGStructurizer::orderBlocks(MachineFunction *MF) {
}
}
-int AMDGPUCFGStructurizer::patternMatch(MachineBasicBlock *MBB) {
+int R600MachineCFGStructurizer::patternMatch(MachineBasicBlock *MBB) {
int NumMatch = 0;
int CurMatch;
@@ -876,7 +879,7 @@ int AMDGPUCFGStructurizer::patternMatch(MachineBasicBlock *MBB) {
return NumMatch;
}
-int AMDGPUCFGStructurizer::patternMatchGroup(MachineBasicBlock *MBB) {
+int R600MachineCFGStructurizer::patternMatchGroup(MachineBasicBlock *MBB) {
int NumMatch = 0;
NumMatch += loopendPatternMatch();
NumMatch += serialPatternMatch(MBB);
@@ -884,7 +887,7 @@ int AMDGPUCFGStructurizer::patternMatchGroup(MachineBasicBlock *MBB) {
return NumMatch;
}
-int AMDGPUCFGStructurizer::serialPatternMatch(MachineBasicBlock *MBB) {
+int R600MachineCFGStructurizer::serialPatternMatch(MachineBasicBlock *MBB) {
if (MBB->succ_size() != 1)
return 0;
@@ -897,7 +900,7 @@ int AMDGPUCFGStructurizer::serialPatternMatch(MachineBasicBlock *MBB) {
return 1;
}
-int AMDGPUCFGStructurizer::ifPatternMatch(MachineBasicBlock *MBB) {
+int R600MachineCFGStructurizer::ifPatternMatch(MachineBasicBlock *MBB) {
//two edges
if (MBB->succ_size() != 2)
return 0;
@@ -975,7 +978,7 @@ int AMDGPUCFGStructurizer::ifPatternMatch(MachineBasicBlock *MBB) {
return 1 + Cloned + NumMatch;
}
-int AMDGPUCFGStructurizer::loopendPatternMatch() {
+int R600MachineCFGStructurizer::loopendPatternMatch() {
std::deque<MachineLoop *> NestedLoops;
for (auto &It: *MLI)
for (MachineLoop *ML : depth_first(It))
@@ -1000,7 +1003,7 @@ int AMDGPUCFGStructurizer::loopendPatternMatch() {
return Num;
}
-int AMDGPUCFGStructurizer::mergeLoop(MachineLoop *LoopRep) {
+int R600MachineCFGStructurizer::mergeLoop(MachineLoop *LoopRep) {
MachineBasicBlock *LoopHeader = LoopRep->getHeader();
MBBVector ExitingMBBs;
LoopRep->getExitingBlocks(ExitingMBBs);
@@ -1041,7 +1044,7 @@ int AMDGPUCFGStructurizer::mergeLoop(MachineLoop *LoopRep) {
return 1;
}
-bool AMDGPUCFGStructurizer::isSameloopDetachedContbreak(
+bool R600MachineCFGStructurizer::isSameloopDetachedContbreak(
MachineBasicBlock *Src1MBB, MachineBasicBlock *Src2MBB) {
if (Src1MBB->succ_empty()) {
MachineLoop *LoopRep = MLI->getLoopFor(Src1MBB);
@@ -1058,7 +1061,7 @@ bool AMDGPUCFGStructurizer::isSameloopDetachedContbreak(
return false;
}
-int AMDGPUCFGStructurizer::handleJumpintoIf(MachineBasicBlock *HeadMBB,
+int R600MachineCFGStructurizer::handleJumpintoIf(MachineBasicBlock *HeadMBB,
MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB) {
int Num = handleJumpintoIfImp(HeadMBB, TrueMBB, FalseMBB);
if (Num == 0) {
@@ -1069,7 +1072,7 @@ int AMDGPUCFGStructurizer::handleJumpintoIf(MachineBasicBlock *HeadMBB,
return Num;
}
-int AMDGPUCFGStructurizer::handleJumpintoIfImp(MachineBasicBlock *HeadMBB,
+int R600MachineCFGStructurizer::handleJumpintoIfImp(MachineBasicBlock *HeadMBB,
MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB) {
int Num = 0;
MachineBasicBlock *DownBlk;
@@ -1107,7 +1110,7 @@ int AMDGPUCFGStructurizer::handleJumpintoIfImp(MachineBasicBlock *HeadMBB,
}
#ifndef NDEBUG
-void AMDGPUCFGStructurizer::showImproveSimpleJumpintoIf(
+void R600MachineCFGStructurizer::showImproveSimpleJumpintoIf(
MachineBasicBlock *HeadMBB, MachineBasicBlock *TrueMBB,
MachineBasicBlock *FalseMBB, MachineBasicBlock *LandMBB, bool Detail) {
dbgs() << "head = BB" << HeadMBB->getNumber()
@@ -1150,7 +1153,7 @@ void AMDGPUCFGStructurizer::showImproveSimpleJumpintoIf(
}
#endif
-int AMDGPUCFGStructurizer::improveSimpleJumpintoIf(MachineBasicBlock *HeadMBB,
+int R600MachineCFGStructurizer::improveSimpleJumpintoIf(MachineBasicBlock *HeadMBB,
MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB,
MachineBasicBlock **LandMBBPtr) {
bool MigrateTrue = false;
@@ -1322,7 +1325,7 @@ int AMDGPUCFGStructurizer::improveSimpleJumpintoIf(MachineBasicBlock *HeadMBB,
return NumNewBlk;
}
-void AMDGPUCFGStructurizer::mergeSerialBlock(MachineBasicBlock *DstMBB,
+void R600MachineCFGStructurizer::mergeSerialBlock(MachineBasicBlock *DstMBB,
MachineBasicBlock *SrcMBB) {
LLVM_DEBUG(dbgs() << "serialPattern BB" << DstMBB->getNumber() << " <= BB"
<< SrcMBB->getNumber() << "\n";);
@@ -1336,7 +1339,7 @@ void AMDGPUCFGStructurizer::mergeSerialBlock(MachineBasicBlock *DstMBB,
retireBlock(SrcMBB);
}
-void AMDGPUCFGStructurizer::mergeIfthenelseBlock(MachineInstr *BranchMI,
+void R600MachineCFGStructurizer::mergeIfthenelseBlock(MachineInstr *BranchMI,
MachineBasicBlock *MBB, MachineBasicBlock *TrueMBB,
MachineBasicBlock *FalseMBB, MachineBasicBlock *LandMBB) {
assert (TrueMBB);
@@ -1392,7 +1395,7 @@ void AMDGPUCFGStructurizer::mergeIfthenelseBlock(MachineInstr *BranchMI,
MBB->addSuccessor(LandMBB);
}
-void AMDGPUCFGStructurizer::mergeLooplandBlock(MachineBasicBlock *DstBlk,
+void R600MachineCFGStructurizer::mergeLooplandBlock(MachineBasicBlock *DstBlk,
MachineBasicBlock *LandMBB) {
LLVM_DEBUG(dbgs() << "loopPattern header = BB" << DstBlk->getNumber()
<< " land = BB" << LandMBB->getNumber() << "\n";);
@@ -1402,7 +1405,7 @@ void AMDGPUCFGStructurizer::mergeLooplandBlock(MachineBasicBlock *DstBlk,
DstBlk->replaceSuccessor(DstBlk, LandMBB);
}
-void AMDGPUCFGStructurizer::mergeLoopbreakBlock(MachineBasicBlock *ExitingMBB,
+void R600MachineCFGStructurizer::mergeLoopbreakBlock(MachineBasicBlock *ExitingMBB,
MachineBasicBlock *LandMBB) {
LLVM_DEBUG(dbgs() << "loopbreakPattern exiting = BB"
<< ExitingMBB->getNumber() << " land = BB"
@@ -1423,7 +1426,7 @@ void AMDGPUCFGStructurizer::mergeLoopbreakBlock(MachineBasicBlock *ExitingMBB,
ExitingMBB->removeSuccessor(LandMBB, true);
}
-void AMDGPUCFGStructurizer::settleLoopcontBlock(MachineBasicBlock *ContingMBB,
+void R600MachineCFGStructurizer::settleLoopcontBlock(MachineBasicBlock *ContingMBB,
MachineBasicBlock *ContMBB) {
LLVM_DEBUG(dbgs() << "settleLoopcontBlock conting = BB"
<< ContingMBB->getNumber() << ", cont = BB"
@@ -1466,7 +1469,7 @@ void AMDGPUCFGStructurizer::settleLoopcontBlock(MachineBasicBlock *ContingMBB,
}
}
-int AMDGPUCFGStructurizer::cloneOnSideEntryTo(MachineBasicBlock *PreMBB,
+int R600MachineCFGStructurizer::cloneOnSideEntryTo(MachineBasicBlock *PreMBB,
MachineBasicBlock *SrcMBB, MachineBasicBlock *DstMBB) {
int Cloned = 0;
assert(PreMBB->isSuccessor(SrcMBB));
@@ -1485,10 +1488,9 @@ int AMDGPUCFGStructurizer::cloneOnSideEntryTo(MachineBasicBlock *PreMBB,
}
MachineBasicBlock *
-AMDGPUCFGStructurizer::cloneBlockForPredecessor(MachineBasicBlock *MBB,
+R600MachineCFGStructurizer::cloneBlockForPredecessor(MachineBasicBlock *MBB,
MachineBasicBlock *PredMBB) {
- assert(PredMBB->isSuccessor(MBB) &&
- "succBlk is not a prececessor of curBlk");
+ assert(PredMBB->isSuccessor(MBB) && "succBlk is not a predecessor of curBlk");
MachineBasicBlock *CloneMBB = clone(MBB); //clone instructions
replaceInstrUseOfBlockWith(PredMBB, MBB, CloneMBB);
@@ -1510,7 +1512,7 @@ AMDGPUCFGStructurizer::cloneBlockForPredecessor(MachineBasicBlock *MBB,
return CloneMBB;
}
-void AMDGPUCFGStructurizer::migrateInstruction(MachineBasicBlock *SrcMBB,
+void R600MachineCFGStructurizer::migrateInstruction(MachineBasicBlock *SrcMBB,
MachineBasicBlock *DstMBB, MachineBasicBlock::iterator I) {
MachineBasicBlock::iterator SpliceEnd;
//look for the input branchinstr, not the AMDGPU branchinstr
@@ -1535,7 +1537,7 @@ void AMDGPUCFGStructurizer::migrateInstruction(MachineBasicBlock *SrcMBB,
}
MachineBasicBlock *
-AMDGPUCFGStructurizer::normalizeInfiniteLoopExit(MachineLoop* LoopRep) {
+R600MachineCFGStructurizer::normalizeInfiniteLoopExit(MachineLoop* LoopRep) {
MachineBasicBlock *LoopHeader = LoopRep->getHeader();
MachineBasicBlock *LoopLatch = LoopRep->getLoopLatch();
@@ -1555,7 +1557,7 @@ AMDGPUCFGStructurizer::normalizeInfiniteLoopExit(MachineLoop* LoopRep) {
return nullptr;
}
-void AMDGPUCFGStructurizer::removeUnconditionalBranch(MachineBasicBlock *MBB) {
+void R600MachineCFGStructurizer::removeUnconditionalBranch(MachineBasicBlock *MBB) {
MachineInstr *BranchMI;
// I saw two unconditional branch in one basic block in example
@@ -1567,7 +1569,7 @@ void AMDGPUCFGStructurizer::removeUnconditionalBranch(MachineBasicBlock *MBB) {
}
}
-void AMDGPUCFGStructurizer::removeRedundantConditionalBranch(
+void R600MachineCFGStructurizer::removeRedundantConditionalBranch(
MachineBasicBlock *MBB) {
if (MBB->succ_size() != 2)
return;
@@ -1584,7 +1586,7 @@ void AMDGPUCFGStructurizer::removeRedundantConditionalBranch(
MBB->removeSuccessor(MBB1, true);
}
-void AMDGPUCFGStructurizer::addDummyExitBlock(
+void R600MachineCFGStructurizer::addDummyExitBlock(
SmallVectorImpl<MachineBasicBlock*> &RetMBB) {
MachineBasicBlock *DummyExitBlk = FuncRep->CreateMachineBasicBlock();
FuncRep->push_back(DummyExitBlk); //insert to function
@@ -1600,12 +1602,12 @@ void AMDGPUCFGStructurizer::addDummyExitBlock(
SHOWNEWBLK(DummyExitBlk, "DummyExitBlock: ");
}
-void AMDGPUCFGStructurizer::removeSuccessor(MachineBasicBlock *MBB) {
+void R600MachineCFGStructurizer::removeSuccessor(MachineBasicBlock *MBB) {
while (MBB->succ_size())
MBB->removeSuccessor(*MBB->succ_begin());
}
-void AMDGPUCFGStructurizer::recordSccnum(MachineBasicBlock *MBB,
+void R600MachineCFGStructurizer::recordSccnum(MachineBasicBlock *MBB,
int SccNum) {
BlockInformation *&srcBlkInfo = BlockInfoMap[MBB];
if (!srcBlkInfo)
@@ -1613,7 +1615,7 @@ void AMDGPUCFGStructurizer::recordSccnum(MachineBasicBlock *MBB,
srcBlkInfo->SccNum = SccNum;
}
-void AMDGPUCFGStructurizer::retireBlock(MachineBasicBlock *MBB) {
+void R600MachineCFGStructurizer::retireBlock(MachineBasicBlock *MBB) {
LLVM_DEBUG(dbgs() << "Retiring BB" << MBB->getNumber() << "\n";);
BlockInformation *&SrcBlkInfo = BlockInfoMap[MBB];
@@ -1625,14 +1627,14 @@ void AMDGPUCFGStructurizer::retireBlock(MachineBasicBlock *MBB) {
assert(MBB->succ_empty() && MBB->pred_empty() && "can't retire block yet");
}
-INITIALIZE_PASS_BEGIN(AMDGPUCFGStructurizer, "amdgpustructurizer",
+INITIALIZE_PASS_BEGIN(R600MachineCFGStructurizer, "amdgpustructurizer",
"AMDGPU CFG Structurizer", false, false)
INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree)
INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
-INITIALIZE_PASS_END(AMDGPUCFGStructurizer, "amdgpustructurizer",
+INITIALIZE_PASS_END(R600MachineCFGStructurizer, "amdgpustructurizer",
"AMDGPU CFG Structurizer", false, false)
-FunctionPass *llvm::createAMDGPUCFGStructurizerPass() {
- return new AMDGPUCFGStructurizer();
+FunctionPass *llvm::createR600MachineCFGStructurizerPass() {
+ return new R600MachineCFGStructurizer();
}
diff --git a/llvm/lib/Target/AMDGPU/R600Packetizer.cpp b/llvm/lib/Target/AMDGPU/R600Packetizer.cpp
index fbe2a1cd9fba..59e274787590 100644
--- a/llvm/lib/Target/AMDGPU/R600Packetizer.cpp
+++ b/llvm/lib/Target/AMDGPU/R600Packetizer.cpp
@@ -207,7 +207,7 @@ public:
return !ARDef || !ARUse;
}
- // isLegalToPruneDependencies - Is it legal to prune dependece between SUI
+ // isLegalToPruneDependencies - Is it legal to prune dependency between SUI
// and SUJ.
bool isLegalToPruneDependencies(SUnit *SUI, SUnit *SUJ) override {
return false;
diff --git a/llvm/lib/Target/AMDGPU/R600Subtarget.cpp b/llvm/lib/Target/AMDGPU/R600Subtarget.cpp
index 20c1ce7266dd..d8f061054904 100644
--- a/llvm/lib/Target/AMDGPU/R600Subtarget.cpp
+++ b/llvm/lib/Target/AMDGPU/R600Subtarget.cpp
@@ -27,8 +27,6 @@ R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
: R600GenSubtargetInfo(TT, GPU, /*TuneCPU*/ GPU, FS), AMDGPUSubtarget(TT),
InstrInfo(*this),
FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
- FMA(false), CaymanISA(false), CFALUBug(false), HasVertexCache(false),
- R600ALUInst(false), FP64(false), TexVTXClauseSize(0), Gen(R600),
TLInfo(TM, initializeSubtargetDependencies(TT, GPU, FS)),
InstrItins(getInstrItineraryForCPU(GPU)) {}
diff --git a/llvm/lib/Target/AMDGPU/R600Subtarget.h b/llvm/lib/Target/AMDGPU/R600Subtarget.h
index 92d559b1f8e6..c3d002f29272 100644
--- a/llvm/lib/Target/AMDGPU/R600Subtarget.h
+++ b/llvm/lib/Target/AMDGPU/R600Subtarget.h
@@ -31,14 +31,14 @@ class R600Subtarget final : public R600GenSubtargetInfo,
private:
R600InstrInfo InstrInfo;
R600FrameLowering FrameLowering;
- bool FMA;
- bool CaymanISA;
- bool CFALUBug;
- bool HasVertexCache;
- bool R600ALUInst;
- bool FP64;
- short TexVTXClauseSize;
- Generation Gen;
+ bool FMA = false;
+ bool CaymanISA = false;
+ bool CFALUBug = false;
+ bool HasVertexCache = false;
+ bool R600ALUInst = false;
+ bool FP64 = false;
+ short TexVTXClauseSize = 0;
+ Generation Gen = R600;
R600TargetLowering TLInfo;
InstrItineraryData InstrItins;
SelectionDAGTargetInfo TSInfo;
diff --git a/llvm/lib/Target/AMDGPU/R600TargetMachine.cpp b/llvm/lib/Target/AMDGPU/R600TargetMachine.cpp
index 39dad45425fc..76bb0f65ef69 100644
--- a/llvm/lib/Target/AMDGPU/R600TargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/R600TargetMachine.cpp
@@ -83,7 +83,7 @@ R600TargetMachine::getSubtargetImpl(const Function &F) const {
}
TargetTransformInfo
-R600TargetMachine::getTargetTransformInfo(const Function &F) {
+R600TargetMachine::getTargetTransformInfo(const Function &F) const {
return TargetTransformInfo(R600TTIImpl(this, F));
}
@@ -131,7 +131,7 @@ void R600PassConfig::addPreSched2() {
}
void R600PassConfig::addPreEmitPass() {
- addPass(createAMDGPUCFGStructurizerPass());
+ addPass(createR600MachineCFGStructurizerPass());
addPass(createR600ExpandSpecialInstrsPass());
addPass(&FinalizeMachineBundlesID);
addPass(createR600Packetizer());
diff --git a/llvm/lib/Target/AMDGPU/R600TargetMachine.h b/llvm/lib/Target/AMDGPU/R600TargetMachine.h
index 0ccbca3c68b1..8d20841292b9 100644
--- a/llvm/lib/Target/AMDGPU/R600TargetMachine.h
+++ b/llvm/lib/Target/AMDGPU/R600TargetMachine.h
@@ -7,7 +7,7 @@
//===----------------------------------------------------------------------===//
//
/// \file
-/// The AMDGPU TargetMachine interface definition for hw codgen targets.
+/// The AMDGPU TargetMachine interface definition for hw codegen targets.
//
//===----------------------------------------------------------------------===//
@@ -38,7 +38,7 @@ public:
const TargetSubtargetInfo *getSubtargetImpl(const Function &) const override;
- TargetTransformInfo getTargetTransformInfo(const Function &F) override;
+ TargetTransformInfo getTargetTransformInfo(const Function &F) const override;
bool isMachineVerifierClean() const override { return false; }
};
diff --git a/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp b/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp
index b81fac36fc95..afd2a38b11ec 100644
--- a/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp
+++ b/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp
@@ -73,19 +73,19 @@ class SIAnnotateControlFlow : public FunctionPass {
bool hasKill(const BasicBlock *BB);
- void eraseIfUnused(PHINode *Phi);
+ bool eraseIfUnused(PHINode *Phi);
- void openIf(BranchInst *Term);
+ bool openIf(BranchInst *Term);
- void insertElse(BranchInst *Term);
+ bool insertElse(BranchInst *Term);
Value *
handleLoopCondition(Value *Cond, PHINode *Broken, llvm::Loop *L,
BranchInst *Term);
- void handleLoop(BranchInst *Term);
+ bool handleLoop(BranchInst *Term);
- void closeControlFlow(BasicBlock *BB);
+ bool closeControlFlow(BasicBlock *BB);
public:
static char ID;
@@ -193,31 +193,34 @@ bool SIAnnotateControlFlow::hasKill(const BasicBlock *BB) {
return false;
}
-// Erase "Phi" if it is not used any more
-void SIAnnotateControlFlow::eraseIfUnused(PHINode *Phi) {
- if (RecursivelyDeleteDeadPHINode(Phi)) {
+// Erase "Phi" if it is not used any more. Return true if any change was made.
+bool SIAnnotateControlFlow::eraseIfUnused(PHINode *Phi) {
+ bool Changed = RecursivelyDeleteDeadPHINode(Phi);
+ if (Changed)
LLVM_DEBUG(dbgs() << "Erased unused condition phi\n");
- }
+ return Changed;
}
/// Open a new "If" block
-void SIAnnotateControlFlow::openIf(BranchInst *Term) {
+bool SIAnnotateControlFlow::openIf(BranchInst *Term) {
if (isUniform(Term))
- return;
+ return false;
Value *Ret = CallInst::Create(If, Term->getCondition(), "", Term);
Term->setCondition(ExtractValueInst::Create(Ret, 0, "", Term));
push(Term->getSuccessor(1), ExtractValueInst::Create(Ret, 1, "", Term));
+ return true;
}
/// Close the last "If" block and open a new "Else" block
-void SIAnnotateControlFlow::insertElse(BranchInst *Term) {
+bool SIAnnotateControlFlow::insertElse(BranchInst *Term) {
if (isUniform(Term)) {
- return;
+ return false;
}
Value *Ret = CallInst::Create(Else, popSaved(), "", Term);
Term->setCondition(ExtractValueInst::Create(Ret, 0, "", Term));
push(Term->getSuccessor(1), ExtractValueInst::Create(Ret, 1, "", Term));
+ return true;
}
/// Recursively handle the condition leading to a loop
@@ -255,14 +258,14 @@ Value *SIAnnotateControlFlow::handleLoopCondition(
}
/// Handle a back edge (loop)
-void SIAnnotateControlFlow::handleLoop(BranchInst *Term) {
+bool SIAnnotateControlFlow::handleLoop(BranchInst *Term) {
if (isUniform(Term))
- return;
+ return false;
BasicBlock *BB = Term->getParent();
llvm::Loop *L = LI->getLoopFor(BB);
if (!L)
- return;
+ return false;
BasicBlock *Target = Term->getSuccessor(1);
PHINode *Broken = PHINode::Create(IntMask, 0, "phi.broken", &Target->front());
@@ -286,10 +289,12 @@ void SIAnnotateControlFlow::handleLoop(BranchInst *Term) {
Term->setCondition(CallInst::Create(Loop, Arg, "", Term));
push(Term->getSuccessor(0), Arg);
+
+ return true;
}
/// Close the last opened control flow
-void SIAnnotateControlFlow::closeControlFlow(BasicBlock *BB) {
+bool SIAnnotateControlFlow::closeControlFlow(BasicBlock *BB) {
llvm::Loop *L = LI->getLoopFor(BB);
assert(Stack.back().first == BB);
@@ -322,6 +327,8 @@ void SIAnnotateControlFlow::closeControlFlow(BasicBlock *BB) {
}
CallInst::Create(EndCf, Exec, "", FirstInsertionPt);
}
+
+ return true;
}
/// Annotate the control flow with intrinsics so the backend can
@@ -333,6 +340,7 @@ bool SIAnnotateControlFlow::runOnFunction(Function &F) {
TargetPassConfig &TPC = getAnalysis<TargetPassConfig>();
const TargetMachine &TM = TPC.getTM<TargetMachine>();
+ bool Changed = false;
initialize(*F.getParent(), TM.getSubtarget<GCNSubtarget>(F));
for (df_iterator<BasicBlock *> I = df_begin(&F.getEntryBlock()),
E = df_end(&F.getEntryBlock()); I != E; ++I) {
@@ -341,32 +349,32 @@ bool SIAnnotateControlFlow::runOnFunction(Function &F) {
if (!Term || Term->isUnconditional()) {
if (isTopOfStack(BB))
- closeControlFlow(BB);
+ Changed |= closeControlFlow(BB);
continue;
}
if (I.nodeVisited(Term->getSuccessor(1))) {
if (isTopOfStack(BB))
- closeControlFlow(BB);
+ Changed |= closeControlFlow(BB);
if (DT->dominates(Term->getSuccessor(1), BB))
- handleLoop(Term);
+ Changed |= handleLoop(Term);
continue;
}
if (isTopOfStack(BB)) {
PHINode *Phi = dyn_cast<PHINode>(Term->getCondition());
if (Phi && Phi->getParent() == BB && isElse(Phi) && !hasKill(BB)) {
- insertElse(Term);
- eraseIfUnused(Phi);
+ Changed |= insertElse(Term);
+ Changed |= eraseIfUnused(Phi);
continue;
}
- closeControlFlow(BB);
+ Changed |= closeControlFlow(BB);
}
- openIf(Term);
+ Changed |= openIf(Term);
}
if (!Stack.empty()) {
@@ -374,7 +382,7 @@ bool SIAnnotateControlFlow::runOnFunction(Function &F) {
report_fatal_error("failed to annotate CFG");
}
- return true;
+ return Changed;
}
/// Create the annotation pass
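The SIAnnotateControlFlow hunks above convert the helper methods from void to bool and accumulate their results, so runOnFunction reports modification only when an intrinsic was actually inserted. A minimal standalone sketch of that pattern (illustrative only, hypothetical names, not code from the patch):

    // Each helper returns whether it changed anything; the pass OR-s the
    // results together instead of unconditionally returning true.
    static bool openIfSketch(bool IsUniform) {
      if (IsUniform)
        return false; // uniform branches are left untouched
      // ... the real pass inserts the if-intrinsic here ...
      return true;
    }

    static bool runSketch(bool SawUniformBranchOnly) {
      bool Changed = false;
      Changed |= openIfSketch(SawUniformBranchOnly);
      return Changed; // accurate "did this pass modify the function" answer
    }
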
diff --git a/llvm/lib/Target/AMDGPU/SIDefines.h b/llvm/lib/Target/AMDGPU/SIDefines.h
index 107ee5ed5532..85930312352b 100644
--- a/llvm/lib/Target/AMDGPU/SIDefines.h
+++ b/llvm/lib/Target/AMDGPU/SIDefines.h
@@ -63,6 +63,12 @@ enum : uint64_t {
VGPRSpill = 1 << 24,
SGPRSpill = 1 << 25,
+ // LDSDIR instruction format.
+ LDSDIR = 1 << 26,
+
+ // VINTERP instruction format.
+ VINTERP = 1 << 27,
+
// High bits - other information.
VM_CNT = UINT64_C(1) << 32,
EXP_CNT = UINT64_C(1) << 33,
@@ -120,7 +126,10 @@ enum : uint64_t {
IsAtomicNoRet = UINT64_C(1) << 57,
// Atomic with return.
- IsAtomicRet = UINT64_C(1) << 58
+ IsAtomicRet = UINT64_C(1) << 58,
+
+ // Is a WMMA instruction.
+ IsWMMA = UINT64_C(1) << 59,
};
// v_cmp_class_* etc. use a 10-bit mask for what operation is checked.
@@ -258,9 +267,10 @@ namespace AMDGPUAsmVariants {
VOP3 = 1,
SDWA = 2,
SDWA9 = 3,
- DPP = 4
+ DPP = 4,
+ VOP3_DPP = 5
};
-}
+} // namespace AMDGPUAsmVariants
namespace AMDGPU {
namespace EncValues { // Encoding values of enum9/8/7 operands
@@ -280,7 +290,8 @@ enum : unsigned {
INLINE_FLOATING_C_MAX = 248,
LITERAL_CONST = 255,
VGPR_MIN = 256,
- VGPR_MAX = 511
+ VGPR_MAX = 511,
+ IS_VGPR = 256 // Indicates VGPR or AGPR
};
} // namespace EncValues
@@ -294,6 +305,9 @@ enum CPol {
SLC = 2,
DLC = 4,
SCC = 16,
+ SC0 = GLC,
+ SC1 = SCC,
+ NT = SLC,
ALL = GLC | SLC | DLC | SCC
};
@@ -302,24 +316,33 @@ enum CPol {
namespace SendMsg { // Encoding of SIMM16 used in s_sendmsg* insns.
enum Id { // Message ID, width(4) [3:0].
- ID_UNKNOWN_ = -1,
ID_INTERRUPT = 1,
- ID_GS = 2,
- ID_GS_DONE = 3,
- ID_SAVEWAVE = 4, // added in GFX8
+
+ ID_GS_PreGFX11 = 2, // replaced in GFX11
+ ID_GS_DONE_PreGFX11 = 3, // replaced in GFX11
+
+ ID_HS_TESSFACTOR_GFX11Plus = 2, // reused in GFX11
+ ID_DEALLOC_VGPRS_GFX11Plus = 3, // reused in GFX11
+
+ ID_SAVEWAVE = 4, // added in GFX8, removed in GFX11
ID_STALL_WAVE_GEN = 5, // added in GFX9
ID_HALT_WAVES = 6, // added in GFX9
ID_ORDERED_PS_DONE = 7, // added in GFX9
ID_EARLY_PRIM_DEALLOC = 8, // added in GFX9, removed in GFX10
ID_GS_ALLOC_REQ = 9, // added in GFX9
- ID_GET_DOORBELL = 10, // added in GFX9
- ID_GET_DDID = 11, // added in GFX10
+ ID_GET_DOORBELL = 10, // added in GFX9, removed in GFX11
+ ID_GET_DDID = 11, // added in GFX10, removed in GFX11
ID_SYSMSG = 15,
- ID_GAPS_LAST_, // Indicate that sequence has gaps.
- ID_GAPS_FIRST_ = ID_INTERRUPT,
- ID_SHIFT_ = 0,
- ID_WIDTH_ = 4,
- ID_MASK_ = (((1 << ID_WIDTH_) - 1) << ID_SHIFT_)
+
+ ID_RTN_GET_DOORBELL = 128,
+ ID_RTN_GET_DDID = 129,
+ ID_RTN_GET_TMA = 130,
+ ID_RTN_GET_REALTIME = 131,
+ ID_RTN_SAVE_WAVE = 132,
+ ID_RTN_GET_TBA = 133,
+
+ ID_MASK_PreGFX11_ = 0xF,
+ ID_MASK_GFX11Plus_ = 0xFF
};
enum Op { // Both GS and SYS operation IDs.
@@ -360,8 +383,6 @@ enum StreamId : unsigned { // Stream ID, (2) [9:8].
namespace Hwreg { // Encoding of SIMM16 used in s_setreg/getreg* insns.
enum Id { // HwRegCode, (6) [5:0]
- ID_UNKNOWN_ = -1,
- ID_SYMBOLIC_FIRST_ = 1, // There are corresponding symbolic names defined.
ID_MODE = 1,
ID_STATUS = 2,
ID_TRAPSTS = 3,
@@ -370,12 +391,15 @@ enum Id { // HwRegCode, (6) [5:0]
ID_LDS_ALLOC = 6,
ID_IB_STS = 7,
ID_MEM_BASES = 15,
- ID_SYMBOLIC_FIRST_GFX9_ = ID_MEM_BASES,
ID_TBA_LO = 16,
- ID_SYMBOLIC_FIRST_GFX10_ = ID_TBA_LO,
ID_TBA_HI = 17,
ID_TMA_LO = 18,
ID_TMA_HI = 19,
+ ID_XCC_ID = 20,
+ ID_SQ_PERF_SNAPSHOT_DATA = 21,
+ ID_SQ_PERF_SNAPSHOT_DATA1 = 22,
+ ID_SQ_PERF_SNAPSHOT_PC_LO = 23,
+ ID_SQ_PERF_SNAPSHOT_PC_HI = 24,
ID_FLAT_SCR_LO = 20,
ID_FLAT_SCR_HI = 21,
ID_XNACK_MASK = 22,
@@ -383,8 +407,7 @@ enum Id { // HwRegCode, (6) [5:0]
ID_HW_ID2 = 24,
ID_POPS_PACKER = 25,
ID_SHADER_CYCLES = 29,
- ID_SYMBOLIC_FIRST_GFX1030_ = ID_SHADER_CYCLES,
- ID_SYMBOLIC_LAST_ = 30,
+
ID_SHIFT_ = 0,
ID_WIDTH_ = 6,
ID_MASK_ = (((1 << ID_WIDTH_) - 1) << ID_SHIFT_)
@@ -503,6 +526,15 @@ enum MergedFormat : int64_t {
DFMT_NFMT_MAX = DFMT_NFMT_MASK
};
+enum UnifiedFormatCommon : int64_t {
+ UFMT_MAX = 127,
+ UFMT_UNDEF = -1,
+ UFMT_DEFAULT = 1
+};
+
+} // namespace MTBUFFormat
+
+namespace UfmtGFX10 {
enum UnifiedFormat : int64_t {
UFMT_INVALID = 0,
@@ -598,14 +630,95 @@ enum UnifiedFormat : int64_t {
UFMT_FIRST = UFMT_INVALID,
UFMT_LAST = UFMT_32_32_32_32_FLOAT,
+};
- UFMT_MAX = 127,
+} // namespace UfmtGFX10
- UFMT_UNDEF = -1,
- UFMT_DEFAULT = UFMT_8_UNORM
+namespace UfmtGFX11 {
+enum UnifiedFormat : int64_t {
+ UFMT_INVALID = 0,
+
+ UFMT_8_UNORM,
+ UFMT_8_SNORM,
+ UFMT_8_USCALED,
+ UFMT_8_SSCALED,
+ UFMT_8_UINT,
+ UFMT_8_SINT,
+
+ UFMT_16_UNORM,
+ UFMT_16_SNORM,
+ UFMT_16_USCALED,
+ UFMT_16_SSCALED,
+ UFMT_16_UINT,
+ UFMT_16_SINT,
+ UFMT_16_FLOAT,
+
+ UFMT_8_8_UNORM,
+ UFMT_8_8_SNORM,
+ UFMT_8_8_USCALED,
+ UFMT_8_8_SSCALED,
+ UFMT_8_8_UINT,
+ UFMT_8_8_SINT,
+
+ UFMT_32_UINT,
+ UFMT_32_SINT,
+ UFMT_32_FLOAT,
+
+ UFMT_16_16_UNORM,
+ UFMT_16_16_SNORM,
+ UFMT_16_16_USCALED,
+ UFMT_16_16_SSCALED,
+ UFMT_16_16_UINT,
+ UFMT_16_16_SINT,
+ UFMT_16_16_FLOAT,
+
+ UFMT_10_11_11_FLOAT,
+
+ UFMT_11_11_10_FLOAT,
+
+ UFMT_10_10_10_2_UNORM,
+ UFMT_10_10_10_2_SNORM,
+ UFMT_10_10_10_2_UINT,
+ UFMT_10_10_10_2_SINT,
+
+ UFMT_2_10_10_10_UNORM,
+ UFMT_2_10_10_10_SNORM,
+ UFMT_2_10_10_10_USCALED,
+ UFMT_2_10_10_10_SSCALED,
+ UFMT_2_10_10_10_UINT,
+ UFMT_2_10_10_10_SINT,
+
+ UFMT_8_8_8_8_UNORM,
+ UFMT_8_8_8_8_SNORM,
+ UFMT_8_8_8_8_USCALED,
+ UFMT_8_8_8_8_SSCALED,
+ UFMT_8_8_8_8_UINT,
+ UFMT_8_8_8_8_SINT,
+
+ UFMT_32_32_UINT,
+ UFMT_32_32_SINT,
+ UFMT_32_32_FLOAT,
+
+ UFMT_16_16_16_16_UNORM,
+ UFMT_16_16_16_16_SNORM,
+ UFMT_16_16_16_16_USCALED,
+ UFMT_16_16_16_16_SSCALED,
+ UFMT_16_16_16_16_UINT,
+ UFMT_16_16_16_16_SINT,
+ UFMT_16_16_16_16_FLOAT,
+
+ UFMT_32_32_32_UINT,
+ UFMT_32_32_32_SINT,
+ UFMT_32_32_32_FLOAT,
+ UFMT_32_32_32_32_UINT,
+ UFMT_32_32_32_32_SINT,
+ UFMT_32_32_32_32_FLOAT,
+
+ UFMT_FIRST = UFMT_INVALID,
+ UFMT_LAST = UFMT_32_32_32_32_FLOAT,
};
-} // namespace MTBUFFormat
+} // namespace UfmtGFX11
namespace Swizzle { // Encoding of swizzle macro used in ds_swizzle_b32.
@@ -746,20 +859,23 @@ enum Target : unsigned {
ET_MRT0 = 0,
ET_MRT7 = 7,
ET_MRTZ = 8,
- ET_NULL = 9,
+ ET_NULL = 9, // Pre-GFX11
ET_POS0 = 12,
ET_POS3 = 15,
- ET_POS4 = 16, // GFX10+
- ET_POS_LAST = ET_POS4, // Highest pos used on any subtarget
- ET_PRIM = 20, // GFX10+
- ET_PARAM0 = 32,
- ET_PARAM31 = 63,
+ ET_POS4 = 16, // GFX10+
+ ET_POS_LAST = ET_POS4, // Highest pos used on any subtarget
+ ET_PRIM = 20, // GFX10+
+ ET_DUAL_SRC_BLEND0 = 21, // GFX11+
+ ET_DUAL_SRC_BLEND1 = 22, // GFX11+
+ ET_PARAM0 = 32, // Pre-GFX11
+ ET_PARAM31 = 63, // Pre-GFX11
ET_NULL_MAX_IDX = 0,
ET_MRTZ_MAX_IDX = 0,
ET_PRIM_MAX_IDX = 0,
ET_MRT_MAX_IDX = 7,
ET_POS_MAX_IDX = 4,
+ ET_DUAL_SRC_BLEND_MAX_IDX = 1,
ET_PARAM_MAX_IDX = 31,
ET_INVALID = 255,
@@ -777,6 +893,18 @@ enum OpSel : uint64_t {
} // namespace VOP3PEncoding
+namespace ImplicitArg {
+// Implicit kernel argument offset for code object version 5.
+enum Offset_COV5 : unsigned {
+ HOSTCALL_PTR_OFFSET = 80,
+ MULTIGRID_SYNC_ARG_OFFSET = 88,
+ HEAP_PTR_OFFSET = 96,
+ PRIVATE_BASE_OFFSET = 192,
+ SHARED_BASE_OFFSET = 196,
+ QUEUE_PTR_OFFSET = 200,
+};
+
+} // namespace ImplicitArg
} // namespace AMDGPU
#define R_00B028_SPI_SHADER_PGM_RSRC1_PS 0x00B028
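The new ImplicitArg::Offset_COV5 values above are byte offsets into the implicit kernel-argument block for code object v5. As a rough illustration (a sketch under that assumption, not an API from this patch), code that already has the base address of that block could locate the hostcall buffer pointer like this:

    #include <cstdint>
    #include <cstring>

    // Sketch: read the 64-bit hostcall buffer pointer 80 bytes into the
    // implicit kernarg block (HOSTCALL_PTR_OFFSET for code object v5).
    uint64_t readHostcallPtr(const char *ImplicitArgBase) {
      uint64_t Ptr;
      std::memcpy(&Ptr, ImplicitArgBase + 80, sizeof(Ptr));
      return Ptr;
    }
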
@@ -911,10 +1039,12 @@ enum OpSel : uint64_t {
#define FP_DENORM_MODE_DP(x) (((x) & 0x3) << 6)
#define R_00B860_COMPUTE_TMPRING_SIZE 0x00B860
-#define S_00B860_WAVESIZE(x) (((x) & 0x1FFF) << 12)
+#define S_00B860_WAVESIZE_PreGFX11(x) (((x) & 0x1FFF) << 12)
+#define S_00B860_WAVESIZE_GFX11Plus(x) (((x) & 0x7FFF) << 12)
#define R_0286E8_SPI_TMPRING_SIZE 0x0286E8
-#define S_0286E8_WAVESIZE(x) (((x) & 0x1FFF) << 12)
+#define S_0286E8_WAVESIZE_PreGFX11(x) (((x) & 0x1FFF) << 12)
+#define S_0286E8_WAVESIZE_GFX11Plus(x) (((x) & 0x7FFF) << 12)
#define R_028B54_VGT_SHADER_STAGES_EN 0x028B54
#define S_028B54_HS_W32_EN(x) (((x) & 0x1) << 21)
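The split of the WAVESIZE macros reflects a field-width change: the mask grows from 0x1FFF (13 bits) pre-GFX11 to 0x7FFF (15 bits) on GFX11+, while the shift stays at 12. A small self-contained check of the two encodings (sketch only, mirroring the macros above):

    #include <cstdint>

    constexpr uint32_t WavesizePreGFX11(uint32_t X) { return (X & 0x1FFF) << 12; }
    constexpr uint32_t WavesizeGFX11Plus(uint32_t X) { return (X & 0x7FFF) << 12; }

    // The wider GFX11+ field accepts values the pre-GFX11 mask would truncate.
    static_assert(WavesizePreGFX11(0x1FFF) == 0x1FFF000u, "13-bit field");
    static_assert(WavesizeGFX11Plus(0x7FFF) == 0x7FFF000u, "15-bit field");
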
diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index 33954e11d6c6..99aa8a60b04f 100644
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -92,7 +92,7 @@ public:
bool tryFoldCndMask(MachineInstr &MI) const;
bool tryFoldZeroHighBits(MachineInstr &MI) const;
- void foldInstOperand(MachineInstr &MI, MachineOperand &OpToFold) const;
+ bool foldInstOperand(MachineInstr &MI, MachineOperand &OpToFold) const;
const MachineOperand *isClamp(const MachineInstr &MI) const;
bool tryFoldClamp(MachineInstr &MI);
@@ -146,30 +146,6 @@ static unsigned macToMad(unsigned Opc) {
return AMDGPU::INSTRUCTION_LIST_END;
}
-// Wrapper around isInlineConstant that understands special cases when
-// instruction types are replaced during operand folding.
-static bool isInlineConstantIfFolded(const SIInstrInfo *TII,
- const MachineInstr &UseMI,
- unsigned OpNo,
- const MachineOperand &OpToFold) {
- if (TII->isInlineConstant(UseMI, OpNo, OpToFold))
- return true;
-
- unsigned Opc = UseMI.getOpcode();
- unsigned NewOpc = macToMad(Opc);
- if (NewOpc != AMDGPU::INSTRUCTION_LIST_END) {
- // Special case for mac. Since this is replaced with mad when folded into
- // src2, we need to check the legality for the final instruction.
- int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
- if (static_cast<int>(OpNo) == Src2Idx) {
- const MCInstrDesc &MadDesc = TII->get(NewOpc);
- return TII->isInlineConstant(OpToFold, MadDesc.OpInfo[OpNo].OperandType);
- }
- }
-
- return false;
-}
-
// TODO: Add heuristic that the frame index might not fit in the addressing mode
// immediate offset to avoid materializing in loops.
static bool frameIndexMayFold(const SIInstrInfo *TII,
@@ -210,6 +186,8 @@ static bool updateOperand(FoldCandidate &Fold,
if (Fold.isImm()) {
if (MI->getDesc().TSFlags & SIInstrFlags::IsPacked &&
!(MI->getDesc().TSFlags & SIInstrFlags::IsMAI) &&
+ (!ST.hasDOTOpSelHazard() ||
+ !(MI->getDesc().TSFlags & SIInstrFlags::IsDOT)) &&
AMDGPU::isFoldableLiteralV216(Fold.ImmToFold,
ST.hasInv2PiInlineImm())) {
// Set op_sel/op_sel_hi on this operand or bail out if op_sel is
@@ -289,7 +267,7 @@ static bool updateOperand(FoldCandidate &Fold,
// when looking at a use.
Dst0.setReg(NewReg0);
for (unsigned I = MI->getNumOperands() - 1; I > 0; --I)
- MI->RemoveOperand(I);
+ MI->removeOperand(I);
MI->setDesc(TII.get(AMDGPU::IMPLICIT_DEF));
if (Fold.isCommuted())
@@ -490,6 +468,8 @@ static bool isUseSafeToFold(const SIInstrInfo *TII,
case AMDGPU::V_MOV_B32_e32:
case AMDGPU::V_MOV_B32_e64:
case AMDGPU::V_MOV_B64_PSEUDO:
+ case AMDGPU::V_MOV_B64_e32:
+ case AMDGPU::V_MOV_B64_e64:
// Do not fold into an indirect mov.
return !MI.hasRegisterImplicitUseOperand(AMDGPU::M0);
}
@@ -675,7 +655,9 @@ void SIFoldOperands::foldOperand(
if (TII->isFLATScratch(*UseMI) &&
AMDGPU::getNamedOperandIdx(UseMI->getOpcode(),
- AMDGPU::OpName::vaddr) != -1) {
+ AMDGPU::OpName::vaddr) != -1 &&
+ AMDGPU::getNamedOperandIdx(UseMI->getOpcode(),
+ AMDGPU::OpName::saddr) == -1) {
unsigned NewOpc = AMDGPU::getFlatScratchInstSSfromSV(UseMI->getOpcode());
UseMI->setDesc(TII->get(NewOpc));
}
@@ -739,7 +721,7 @@ void SIFoldOperands::foldOperand(
while (ImpOpI != ImpOpE) {
MachineInstr::mop_iterator Tmp = ImpOpI;
ImpOpI++;
- UseMI->RemoveOperand(UseMI->getOperandNo(Tmp));
+ UseMI->removeOperand(UseMI->getOperandNo(Tmp));
}
CopiesToReplace.push_back(UseMI);
} else {
@@ -768,7 +750,7 @@ void SIFoldOperands::foldOperand(
UseMI->setDesc(TII->get(AMDGPU::REG_SEQUENCE));
for (unsigned I = UseMI->getNumOperands() - 1; I > 0; --I)
- UseMI->RemoveOperand(I);
+ UseMI->removeOperand(I);
MachineInstrBuilder B(*MBB.getParent(), UseMI);
DenseMap<TargetInstrInfo::RegSubRegPair, Register> VGPRCopies;
@@ -871,7 +853,7 @@ void SIFoldOperands::foldOperand(
UseMI->getOperand(1).ChangeToImmediate(OpToFold.getImm());
else
UseMI->getOperand(1).ChangeToFrameIndex(OpToFold.getIndex());
- UseMI->RemoveOperand(2); // Remove exec read (or src1 for readlane)
+ UseMI->removeOperand(2); // Remove exec read (or src1 for readlane)
return;
}
@@ -890,7 +872,7 @@ void SIFoldOperands::foldOperand(
UseMI->getOperand(1).setReg(OpToFold.getReg());
UseMI->getOperand(1).setSubReg(OpToFold.getSubReg());
UseMI->getOperand(1).setIsKill(false);
- UseMI->RemoveOperand(2); // Remove exec read (or src1 for readlane)
+ UseMI->removeOperand(2); // Remove exec read (or src1 for readlane)
return;
}
}
@@ -906,6 +888,22 @@ void SIFoldOperands::foldOperand(
}
if (!FoldingImmLike) {
+ if (OpToFold.isReg() && ST->needsAlignedVGPRs()) {
+ // Don't fold if OpToFold doesn't hold an aligned register.
+ const TargetRegisterClass *RC =
+ TRI->getRegClassForReg(*MRI, OpToFold.getReg());
+ if (TRI->hasVectorRegisters(RC) && OpToFold.getSubReg()) {
+ unsigned SubReg = OpToFold.getSubReg();
+ const TargetRegisterClass *SubRC = TRI->getSubRegClass(RC, SubReg);
+ RC = TRI->getCompatibleSubRegClass(RC, SubRC, SubReg);
+ if (RC)
+ RC = SubRC;
+ }
+
+ if (!RC || !TRI->isProperlyAlignedRC(*RC))
+ return;
+ }
+
tryAddToFoldList(FoldList, UseMI, UseOpIdx, &OpToFold, TII);
// FIXME: We could try to change the instruction from 64-bit to 32-bit
@@ -1025,7 +1023,7 @@ static void stripExtraCopyOperands(MachineInstr &MI) {
Desc.getNumImplicitDefs();
for (unsigned I = MI.getNumOperands() - 1; I >= NumOps; --I)
- MI.RemoveOperand(I);
+ MI.removeOperand(I);
}
static void mutateCopyOp(MachineInstr &MI, const MCInstrDesc &NewDesc) {
@@ -1093,7 +1091,7 @@ static bool tryConstantFoldOp(MachineRegisterInfo &MRI, const SIInstrInfo *TII,
// Be careful to change the right operand, src0 may belong to a different
// instruction.
MI->getOperand(Src0Idx).ChangeToImmediate(NewImm);
- MI->RemoveOperand(Src1Idx);
+ MI->removeOperand(Src1Idx);
mutateCopyOp(*MI, TII->get(getMovOpc(IsSGPR)));
return true;
}
@@ -1112,11 +1110,11 @@ static bool tryConstantFoldOp(MachineRegisterInfo &MRI, const SIInstrInfo *TII,
Opc == AMDGPU::S_OR_B32) {
if (Src1Val == 0) {
// y = or x, 0 => y = copy x
- MI->RemoveOperand(Src1Idx);
+ MI->removeOperand(Src1Idx);
mutateCopyOp(*MI, TII->get(AMDGPU::COPY));
} else if (Src1Val == -1) {
// y = or x, -1 => y = v_mov_b32 -1
- MI->RemoveOperand(Src1Idx);
+ MI->removeOperand(Src1Idx);
mutateCopyOp(*MI, TII->get(getMovOpc(Opc == AMDGPU::S_OR_B32)));
} else
return false;
@@ -1129,11 +1127,11 @@ static bool tryConstantFoldOp(MachineRegisterInfo &MRI, const SIInstrInfo *TII,
MI->getOpcode() == AMDGPU::S_AND_B32) {
if (Src1Val == 0) {
// y = and x, 0 => y = v_mov_b32 0
- MI->RemoveOperand(Src0Idx);
+ MI->removeOperand(Src0Idx);
mutateCopyOp(*MI, TII->get(getMovOpc(Opc == AMDGPU::S_AND_B32)));
} else if (Src1Val == -1) {
// y = and x, -1 => y = copy x
- MI->RemoveOperand(Src1Idx);
+ MI->removeOperand(Src1Idx);
mutateCopyOp(*MI, TII->get(AMDGPU::COPY));
stripExtraCopyOperands(*MI);
} else
@@ -1147,7 +1145,7 @@ static bool tryConstantFoldOp(MachineRegisterInfo &MRI, const SIInstrInfo *TII,
MI->getOpcode() == AMDGPU::S_XOR_B32) {
if (Src1Val == 0) {
// y = xor x, 0 => y = copy x
- MI->RemoveOperand(Src1Idx);
+ MI->removeOperand(Src1Idx);
mutateCopyOp(*MI, TII->get(AMDGPU::COPY));
return true;
}
@@ -1185,12 +1183,12 @@ bool SIFoldOperands::tryFoldCndMask(MachineInstr &MI) const {
TII->get(Src0->isReg() ? (unsigned)AMDGPU::COPY : getMovOpc(false));
int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
if (Src2Idx != -1)
- MI.RemoveOperand(Src2Idx);
- MI.RemoveOperand(AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1));
+ MI.removeOperand(Src2Idx);
+ MI.removeOperand(AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1));
if (Src1ModIdx != -1)
- MI.RemoveOperand(Src1ModIdx);
+ MI.removeOperand(Src1ModIdx);
if (Src0ModIdx != -1)
- MI.RemoveOperand(Src0ModIdx);
+ MI.removeOperand(Src0ModIdx);
mutateCopyOp(MI, NewDesc);
LLVM_DEBUG(dbgs() << MI);
return true;
@@ -1217,7 +1215,7 @@ bool SIFoldOperands::tryFoldZeroHighBits(MachineInstr &MI) const {
return false;
}
-void SIFoldOperands::foldInstOperand(MachineInstr &MI,
+bool SIFoldOperands::foldInstOperand(MachineInstr &MI,
MachineOperand &OpToFold) const {
// We need mutate the operands of new mov instructions to add implicit
// uses of EXEC, but adding them invalidates the use_iterator, so defer
@@ -1225,6 +1223,7 @@ void SIFoldOperands::foldInstOperand(MachineInstr &MI,
SmallVector<MachineInstr *, 4> CopiesToReplace;
SmallVector<FoldCandidate, 4> FoldList;
MachineOperand &Dst = MI.getOperand(0);
+ bool Changed = false;
if (OpToFold.isImm()) {
for (auto &UseMI :
@@ -1237,66 +1236,25 @@ void SIFoldOperands::foldInstOperand(MachineInstr &MI,
// We may also encounter cases where one or both operands are
// immediates materialized into a register, which would ordinarily not
// be folded due to multiple uses or operand constraints.
- if (tryConstantFoldOp(*MRI, TII, &UseMI))
+ if (tryConstantFoldOp(*MRI, TII, &UseMI)) {
LLVM_DEBUG(dbgs() << "Constant folded " << UseMI);
- }
- }
-
- bool FoldingImm = OpToFold.isImm() || OpToFold.isFI() || OpToFold.isGlobal();
- if (FoldingImm) {
- unsigned NumLiteralUses = 0;
- MachineOperand *NonInlineUse = nullptr;
- int NonInlineUseOpNo = -1;
-
- for (auto &Use :
- make_early_inc_range(MRI->use_nodbg_operands(Dst.getReg()))) {
- MachineInstr *UseMI = Use.getParent();
- unsigned OpNo = UseMI->getOperandNo(&Use);
-
- // Try to fold any inline immediate uses, and then only fold other
- // constants if they have one use.
- //
- // The legality of the inline immediate must be checked based on the use
- // operand, not the defining instruction, because 32-bit instructions
- // with 32-bit inline immediate sources may be used to materialize
- // constants used in 16-bit operands.
- //
- // e.g. it is unsafe to fold:
- // s_mov_b32 s0, 1.0 // materializes 0x3f800000
- // v_add_f16 v0, v1, s0 // 1.0 f16 inline immediate sees 0x00003c00
-
- // Folding immediates with more than one use will increase program size.
- // FIXME: This will also reduce register usage, which may be better
- // in some cases. A better heuristic is needed.
- if (isInlineConstantIfFolded(TII, *UseMI, OpNo, OpToFold)) {
- foldOperand(OpToFold, UseMI, OpNo, FoldList, CopiesToReplace);
- } else if (frameIndexMayFold(TII, *UseMI, OpNo, OpToFold)) {
- foldOperand(OpToFold, UseMI, OpNo, FoldList, CopiesToReplace);
- } else {
- if (++NumLiteralUses == 1) {
- NonInlineUse = &Use;
- NonInlineUseOpNo = OpNo;
- }
+ Changed = true;
}
}
+ }
- if (NumLiteralUses == 1) {
- MachineInstr *UseMI = NonInlineUse->getParent();
- foldOperand(OpToFold, UseMI, NonInlineUseOpNo, FoldList, CopiesToReplace);
- }
- } else {
- // Folding register.
- SmallVector <MachineOperand *, 4> UsesToProcess;
- for (auto &Use : MRI->use_nodbg_operands(Dst.getReg()))
- UsesToProcess.push_back(&Use);
- for (auto U : UsesToProcess) {
- MachineInstr *UseMI = U->getParent();
-
- foldOperand(OpToFold, UseMI, UseMI->getOperandNo(U),
- FoldList, CopiesToReplace);
- }
+ SmallVector<MachineOperand *, 4> UsesToProcess;
+ for (auto &Use : MRI->use_nodbg_operands(Dst.getReg()))
+ UsesToProcess.push_back(&Use);
+ for (auto U : UsesToProcess) {
+ MachineInstr *UseMI = U->getParent();
+ foldOperand(OpToFold, UseMI, UseMI->getOperandNo(U), FoldList,
+ CopiesToReplace);
}
+ if (CopiesToReplace.empty() && FoldList.empty())
+ return Changed;
+
MachineFunction *MF = MI.getParent()->getParent();
// Make sure we add EXEC uses to any new v_mov instructions created.
for (MachineInstr *Copy : CopiesToReplace)
@@ -1328,6 +1286,7 @@ void SIFoldOperands::foldInstOperand(MachineInstr &MI,
TII->commuteInstruction(*Fold.UseMI, false);
}
}
+ return true;
}
// Clamp patterns are canonically selected to v_max_* instructions, so only
@@ -1593,8 +1552,9 @@ bool SIFoldOperands::tryFoldRegSequence(MachineInstr &MI) {
unsigned OpIdx = Op - &UseMI->getOperand(0);
const MCInstrDesc &InstDesc = UseMI->getDesc();
- if (!TRI->isVectorSuperClass(
- TRI->getRegClass(InstDesc.OpInfo[OpIdx].RegClass)))
+ const TargetRegisterClass *OpRC =
+ TII->getRegClass(InstDesc, OpIdx, TRI, *MI.getMF());
+ if (!OpRC || !TRI->isVectorSuperClass(OpRC))
return false;
const auto *NewDstRC = TRI->getEquivalentAGPRClass(MRI->getRegClass(Reg));
@@ -1751,22 +1711,31 @@ bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) {
bool IsIEEEMode = MFI->getMode().IEEE;
bool HasNSZ = MFI->hasNoSignedZerosFPMath();
+ bool Changed = false;
for (MachineBasicBlock *MBB : depth_first(&MF)) {
MachineOperand *CurrentKnownM0Val = nullptr;
for (auto &MI : make_early_inc_range(*MBB)) {
- tryFoldCndMask(MI);
+ Changed |= tryFoldCndMask(MI);
- if (tryFoldZeroHighBits(MI))
+ if (tryFoldZeroHighBits(MI)) {
+ Changed = true;
continue;
+ }
- if (MI.isRegSequence() && tryFoldRegSequence(MI))
+ if (MI.isRegSequence() && tryFoldRegSequence(MI)) {
+ Changed = true;
continue;
+ }
- if (MI.isPHI() && tryFoldLCSSAPhi(MI))
+ if (MI.isPHI() && tryFoldLCSSAPhi(MI)) {
+ Changed = true;
continue;
+ }
- if (MI.mayLoad() && tryFoldLoad(MI))
+ if (MI.mayLoad() && tryFoldLoad(MI)) {
+ Changed = true;
continue;
+ }
if (!TII->isFoldableCopy(MI)) {
// Saw an unknown clobber of m0, so we no longer know what it is.
@@ -1777,7 +1746,7 @@ bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) {
// instruction, and not the omod multiply.
if (IsIEEEMode || (!HasNSZ && !MI.getFlag(MachineInstr::FmNsz)) ||
!tryFoldOMod(MI))
- tryFoldClamp(MI);
+ Changed |= tryFoldClamp(MI);
continue;
}
@@ -1788,6 +1757,7 @@ bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) {
MachineOperand &NewM0Val = MI.getOperand(1);
if (CurrentKnownM0Val && CurrentKnownM0Val->isIdenticalTo(NewM0Val)) {
MI.eraseFromParent();
+ Changed = true;
continue;
}
@@ -1817,7 +1787,7 @@ bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) {
if (!MI.getOperand(0).getReg().isVirtual())
continue;
- foldInstOperand(MI, OpToFold);
+ Changed |= foldInstOperand(MI, OpToFold);
// If we managed to fold all uses of this copy then we might as well
// delete it now.
@@ -1829,6 +1799,7 @@ bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) {
auto &SrcOp = InstToErase->getOperand(1);
auto SrcReg = SrcOp.isReg() ? SrcOp.getReg() : Register();
InstToErase->eraseFromParent();
+ Changed = true;
InstToErase = nullptr;
if (!SrcReg || SrcReg.isPhysical())
break;
@@ -1837,9 +1808,11 @@ bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) {
break;
}
if (InstToErase && InstToErase->isRegSequence() &&
- MRI->use_nodbg_empty(InstToErase->getOperand(0).getReg()))
+ MRI->use_nodbg_empty(InstToErase->getOperand(0).getReg())) {
InstToErase->eraseFromParent();
+ Changed = true;
+ }
}
}
- return true;
+ return Changed;
}
diff --git a/llvm/lib/Target/AMDGPU/SIFormMemoryClauses.cpp b/llvm/lib/Target/AMDGPU/SIFormMemoryClauses.cpp
index 80ee7a00252a..d7ca7f36284b 100644
--- a/llvm/lib/Target/AMDGPU/SIFormMemoryClauses.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFormMemoryClauses.cpp
@@ -241,7 +241,7 @@ void SIFormMemoryClauses::collectRegUses(const MachineInstr &MI,
}
// Check register def/use conflicts, occupancy limits and collect def/use maps.
-// Return true if instruction can be bundled with previous. It it cannot
+// Return true if instruction can be bundled with previous. If it cannot
// def/use maps are not updated.
bool SIFormMemoryClauses::processRegUses(const MachineInstr &MI,
RegUse &Defs, RegUse &Uses,
diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
index 6078f4a0577a..a57e81eb4e4a 100644
--- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
@@ -749,7 +749,7 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF,
return;
}
- const MachineFrameInfo &MFI = MF.getFrameInfo();
+ MachineFrameInfo &MFI = MF.getFrameInfo();
MachineRegisterInfo &MRI = MF.getRegInfo();
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
const SIInstrInfo *TII = ST.getInstrInfo();
@@ -789,19 +789,13 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF,
*Reg.FI);
}
- // VGPRs used for Whole Wave Mode
- for (const auto &Reg : FuncInfo->WWMReservedRegs) {
- auto VGPR = Reg.first;
- auto FI = Reg.second;
- if (!FI)
- continue;
-
+ for (auto ReservedWWM : FuncInfo->wwmAllocation()) {
if (!ScratchExecCopy)
ScratchExecCopy =
buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, /*IsProlog*/ true);
- buildPrologSpill(ST, TRI, *FuncInfo, LiveRegs, MF, MBB, MBBI, DL, VGPR,
- *FI);
+ buildPrologSpill(ST, TRI, *FuncInfo, LiveRegs, MF, MBB, MBBI, DL,
+ std::get<0>(ReservedWWM), std::get<1>(ReservedWWM));
}
if (ScratchExecCopy) {
@@ -813,9 +807,8 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF,
LiveRegs.addReg(ScratchExecCopy);
}
- if (FPSaveIndex && spilledToMemory(MF, *FPSaveIndex)) {
- const int FramePtrFI = *FPSaveIndex;
- assert(!MFI.isDeadObjectIndex(FramePtrFI));
+ auto SaveSGPRToMemory = [&](Register Reg, const int FI) {
+ assert(!MFI.isDeadObjectIndex(FI));
initLiveRegs(LiveRegs, TRI, FuncInfo, MF, MBB, MBBI, /*IsProlog*/ true);
@@ -825,62 +818,31 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF,
report_fatal_error("failed to find free scratch register");
BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpVGPR)
- .addReg(FramePtrReg);
+ .addReg(Reg);
buildPrologSpill(ST, TRI, *FuncInfo, LiveRegs, MF, MBB, MBBI, DL, TmpVGPR,
- FramePtrFI);
- }
-
- if (BPSaveIndex && spilledToMemory(MF, *BPSaveIndex)) {
- const int BasePtrFI = *BPSaveIndex;
- assert(!MFI.isDeadObjectIndex(BasePtrFI));
-
- initLiveRegs(LiveRegs, TRI, FuncInfo, MF, MBB, MBBI, /*IsProlog*/ true);
+ FI);
+ };
- MCPhysReg TmpVGPR = findScratchNonCalleeSaveRegister(
- MRI, LiveRegs, AMDGPU::VGPR_32RegClass);
- if (!TmpVGPR)
- report_fatal_error("failed to find free scratch register");
-
- BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpVGPR)
- .addReg(BasePtrReg);
+ auto SaveSGPRToVGPRLane = [&](Register Reg, const int FI) {
+ assert(!MFI.isDeadObjectIndex(FI));
- buildPrologSpill(ST, TRI, *FuncInfo, LiveRegs, MF, MBB, MBBI, DL, TmpVGPR,
- BasePtrFI);
- }
-
- // In this case, spill the FP to a reserved VGPR.
- if (FPSaveIndex && !spilledToMemory(MF, *FPSaveIndex)) {
- const int FramePtrFI = *FPSaveIndex;
- assert(!MFI.isDeadObjectIndex(FramePtrFI));
-
- assert(MFI.getStackID(FramePtrFI) == TargetStackID::SGPRSpill);
- ArrayRef<SIMachineFunctionInfo::SpilledReg> Spill =
- FuncInfo->getSGPRToVGPRSpills(FramePtrFI);
+ assert(MFI.getStackID(FI) == TargetStackID::SGPRSpill);
+ ArrayRef<SIRegisterInfo::SpilledReg> Spill =
+ FuncInfo->getSGPRToVGPRSpills(FI);
assert(Spill.size() == 1);
- // Save FP before setting it up.
BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_WRITELANE_B32), Spill[0].VGPR)
- .addReg(FramePtrReg)
+ .addReg(Reg)
.addImm(Spill[0].Lane)
.addReg(Spill[0].VGPR, RegState::Undef);
- }
+ };
- // In this case, spill the BP to a reserved VGPR.
- if (BPSaveIndex && !spilledToMemory(MF, *BPSaveIndex)) {
- const int BasePtrFI = *BPSaveIndex;
- assert(!MFI.isDeadObjectIndex(BasePtrFI));
-
- assert(MFI.getStackID(BasePtrFI) == TargetStackID::SGPRSpill);
- ArrayRef<SIMachineFunctionInfo::SpilledReg> Spill =
- FuncInfo->getSGPRToVGPRSpills(BasePtrFI);
- assert(Spill.size() == 1);
-
- // Save BP before setting it up.
- BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_WRITELANE_B32), Spill[0].VGPR)
- .addReg(BasePtrReg)
- .addImm(Spill[0].Lane)
- .addReg(Spill[0].VGPR, RegState::Undef);
+ if (FPSaveIndex) {
+ if (spilledToMemory(MF, *FPSaveIndex))
+ SaveSGPRToMemory(FramePtrReg, *FPSaveIndex);
+ else
+ SaveSGPRToVGPRLane(FramePtrReg, *FPSaveIndex);
}
// Emit the copy if we need an FP, and are using a free SGPR to save it.
@@ -891,6 +853,13 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF,
.setMIFlag(MachineInstr::FrameSetup);
}
+ if (BPSaveIndex) {
+ if (spilledToMemory(MF, *BPSaveIndex))
+ SaveSGPRToMemory(BasePtrReg, *BPSaveIndex);
+ else
+ SaveSGPRToVGPRLane(BasePtrReg, *BPSaveIndex);
+ }
+
// Emit the copy if we need a BP, and are using a free SGPR to save it.
if (FuncInfo->SGPRForBPSaveRestoreCopy) {
BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY),
@@ -1034,56 +1003,44 @@ void SIFrameLowering::emitEpilogue(MachineFunction &MF,
.setMIFlag(MachineInstr::FrameDestroy);
}
+ auto RestoreSGPRFromMemory = [&](Register Reg, const int FI) {
+ initLiveRegs(LiveRegs, TRI, FuncInfo, MF, MBB, MBBI, /*IsProlog*/ false);
+ MCPhysReg TmpVGPR = findScratchNonCalleeSaveRegister(
+ MRI, LiveRegs, AMDGPU::VGPR_32RegClass);
+ if (!TmpVGPR)
+ report_fatal_error("failed to find free scratch register");
+ buildEpilogRestore(ST, TRI, *FuncInfo, LiveRegs, MF, MBB, MBBI, DL, TmpVGPR,
+ FI);
+ BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), Reg)
+ .addReg(TmpVGPR, RegState::Kill);
+ };
+
+ auto RestoreSGPRFromVGPRLane = [&](Register Reg, const int FI) {
+ assert(MFI.getStackID(FI) == TargetStackID::SGPRSpill);
+ ArrayRef<SIRegisterInfo::SpilledReg> Spill =
+ FuncInfo->getSGPRToVGPRSpills(FI);
+ assert(Spill.size() == 1);
+ BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_READLANE_B32), Reg)
+ .addReg(Spill[0].VGPR)
+ .addImm(Spill[0].Lane);
+ };
+
if (FPSaveIndex) {
const int FramePtrFI = *FPSaveIndex;
assert(!MFI.isDeadObjectIndex(FramePtrFI));
- if (spilledToMemory(MF, FramePtrFI)) {
- initLiveRegs(LiveRegs, TRI, FuncInfo, MF, MBB, MBBI, /*IsProlog*/ false);
-
- MCPhysReg TmpVGPR = findScratchNonCalleeSaveRegister(
- MRI, LiveRegs, AMDGPU::VGPR_32RegClass);
- if (!TmpVGPR)
- report_fatal_error("failed to find free scratch register");
- buildEpilogRestore(ST, TRI, *FuncInfo, LiveRegs, MF, MBB, MBBI, DL,
- TmpVGPR, FramePtrFI);
- BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), FramePtrReg)
- .addReg(TmpVGPR, RegState::Kill);
- } else {
- // Reload from VGPR spill.
- assert(MFI.getStackID(FramePtrFI) == TargetStackID::SGPRSpill);
- ArrayRef<SIMachineFunctionInfo::SpilledReg> Spill =
- FuncInfo->getSGPRToVGPRSpills(FramePtrFI);
- assert(Spill.size() == 1);
- BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_READLANE_B32), FramePtrReg)
- .addReg(Spill[0].VGPR)
- .addImm(Spill[0].Lane);
- }
+ if (spilledToMemory(MF, FramePtrFI))
+ RestoreSGPRFromMemory(FramePtrReg, FramePtrFI);
+ else
+ RestoreSGPRFromVGPRLane(FramePtrReg, FramePtrFI);
}
if (BPSaveIndex) {
const int BasePtrFI = *BPSaveIndex;
assert(!MFI.isDeadObjectIndex(BasePtrFI));
- if (spilledToMemory(MF, BasePtrFI)) {
- initLiveRegs(LiveRegs, TRI, FuncInfo, MF, MBB, MBBI, /*IsProlog*/ false);
-
- MCPhysReg TmpVGPR = findScratchNonCalleeSaveRegister(
- MRI, LiveRegs, AMDGPU::VGPR_32RegClass);
- if (!TmpVGPR)
- report_fatal_error("failed to find free scratch register");
- buildEpilogRestore(ST, TRI, *FuncInfo, LiveRegs, MF, MBB, MBBI, DL,
- TmpVGPR, BasePtrFI);
- BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), BasePtrReg)
- .addReg(TmpVGPR, RegState::Kill);
- } else {
- // Reload from VGPR spill.
- assert(MFI.getStackID(BasePtrFI) == TargetStackID::SGPRSpill);
- ArrayRef<SIMachineFunctionInfo::SpilledReg> Spill =
- FuncInfo->getSGPRToVGPRSpills(BasePtrFI);
- assert(Spill.size() == 1);
- BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_READLANE_B32), BasePtrReg)
- .addReg(Spill[0].VGPR)
- .addImm(Spill[0].Lane);
- }
+ if (spilledToMemory(MF, BasePtrFI))
+ RestoreSGPRFromMemory(BasePtrReg, BasePtrFI);
+ else
+ RestoreSGPRFromVGPRLane(BasePtrReg, BasePtrFI);
}
Register ScratchExecCopy;
@@ -1100,18 +1057,13 @@ void SIFrameLowering::emitEpilogue(MachineFunction &MF,
Reg.VGPR, *Reg.FI);
}
- for (const auto &Reg : FuncInfo->WWMReservedRegs) {
- auto VGPR = Reg.first;
- auto FI = Reg.second;
- if (!FI)
- continue;
-
+ for (auto ReservedWWM : FuncInfo->wwmAllocation()) {
if (!ScratchExecCopy)
ScratchExecCopy =
buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, /*IsProlog*/ false);
- buildEpilogRestore(ST, TRI, *FuncInfo, LiveRegs, MF, MBB, MBBI, DL, VGPR,
- *FI);
+ buildEpilogRestore(ST, TRI, *FuncInfo, LiveRegs, MF, MBB, MBBI, DL,
+ std::get<0>(ReservedWWM), std::get<1>(ReservedWWM));
}
if (ScratchExecCopy) {
@@ -1161,6 +1113,11 @@ void SIFrameLowering::processFunctionBeforeFrameFinalized(
MachineRegisterInfo &MRI = MF.getRegInfo();
SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
+ if (!FuncInfo->isEntryFunction()) {
+ // Spill VGPRs used for Whole Wave Mode
+ FuncInfo->allocateWWMReservedSpillSlots(MFI, *TRI);
+ }
+
const bool SpillVGPRToAGPR = ST.hasMAIInsts() && FuncInfo->hasSpilledVGPRs()
&& EnableSpillVGPRToAGPR;
@@ -1200,7 +1157,7 @@ void SIFrameLowering::processFunctionBeforeFrameFinalized(
}
}
- // Stack slot coloring may assign different objets to the same stack slot.
+ // Stack slot coloring may assign different objects to the same stack slot.
// If not, then the VGPR to AGPR spill slot is dead.
for (unsigned FI : SpillFIs.set_bits())
if (!NonVGPRSpillFIs.test(FI))
@@ -1229,7 +1186,11 @@ void SIFrameLowering::processFunctionBeforeFrameFinalized(
}
}
- FuncInfo->removeDeadFrameIndices(MFI);
+ // At this point we've already allocated all spilled SGPRs to VGPRs if we
+ // can. Any remaining SGPR spills will go to memory, so move them back to the
+ // default stack.
+ bool HaveSGPRToVMemSpill =
+ FuncInfo->removeDeadFrameIndices(MFI, /*ResetSGPRSpillStackIDs*/ true);
assert(allSGPRSpillsAreDead(MF) &&
"SGPR spill should have been removed in SILowerSGPRSpills");
@@ -1241,6 +1202,39 @@ void SIFrameLowering::processFunctionBeforeFrameFinalized(
// Add an emergency spill slot
RS->addScavengingFrameIndex(FuncInfo->getScavengeFI(MFI, *TRI));
+
+ // If we are spilling SGPRs to memory with a large frame, we may need a
+ // second VGPR emergency frame index.
+ if (HaveSGPRToVMemSpill &&
+ allocateScavengingFrameIndexesNearIncomingSP(MF)) {
+ RS->addScavengingFrameIndex(MFI.CreateStackObject(4, Align(4), false));
+ }
+ }
+}
+
+void SIFrameLowering::processFunctionBeforeFrameIndicesReplaced(
+ MachineFunction &MF, RegScavenger *RS) const {
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+ const SIRegisterInfo *TRI = ST.getRegisterInfo();
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
+
+ if (ST.hasMAIInsts() && !ST.hasGFX90AInsts()) {
+ // On gfx908, we initially reserved the highest available VGPR for AGPR
+ // copy. Now that RA is done, check whether there exists an unused VGPR that
+ // is lower than the one reserved before RA. If one exists, use it for the
+ // AGPR copy instead of the one reserved before RA.
+ Register VGPRForAGPRCopy = FuncInfo->getVGPRForAGPRCopy();
+ Register UnusedLowVGPR =
+ TRI->findUnusedRegister(MRI, &AMDGPU::VGPR_32RegClass, MF);
+ if (UnusedLowVGPR && (TRI->getHWRegIndex(UnusedLowVGPR) <
+ TRI->getHWRegIndex(VGPRForAGPRCopy))) {
+ // The call to setVGPRForAGPRCopy() should happen before calling
+ // freezeReservedRegs() so that getReservedRegs() can reserve this newly
+ // identified VGPR (for AGPR copy).
+ FuncInfo->setVGPRForAGPRCopy(UnusedLowVGPR);
+ MRI.freezeReservedRegs(MF);
+ }
}
}
@@ -1333,6 +1327,20 @@ void SIFrameLowering::determineCalleeSavesSGPR(MachineFunction &MF,
// FP will be specially managed like SP.
if (WillHaveFP || hasFP(MF))
SavedRegs.reset(MFI->getFrameOffsetReg());
+
+ // Return address use with return instruction is hidden through the SI_RETURN
+ // pseudo. Given that and since the IPRA computes actual register usage and
+ // does not use CSR list, the clobbering of return address by function calls
+ // (D117243) or otherwise (D120922) is ignored/not seen by the IPRA's register
+ // usage collection. This will ensure save/restore of return address happens
+ // in those scenarios.
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
+ Register RetAddrReg = TRI->getReturnAddressReg(MF);
+ if (!MFI->isEntryFunction() &&
+ (FrameInfo.hasCalls() || MRI.isPhysRegModified(RetAddrReg))) {
+ SavedRegs.set(TRI->getSubReg(RetAddrReg, AMDGPU::sub0));
+ SavedRegs.set(TRI->getSubReg(RetAddrReg, AMDGPU::sub1));
+ }
}
bool SIFrameLowering::assignCalleeSavedSpillSlots(
diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.h b/llvm/lib/Target/AMDGPU/SIFrameLowering.h
index 7949dcfa6632..79154d494e91 100644
--- a/llvm/lib/Target/AMDGPU/SIFrameLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.h
@@ -47,6 +47,9 @@ public:
MachineFunction &MF,
RegScavenger *RS = nullptr) const override;
+ void processFunctionBeforeFrameIndicesReplaced(
+ MachineFunction &MF, RegScavenger *RS = nullptr) const override;
+
MachineBasicBlock::iterator
eliminateCallFramePseudoInstr(MachineFunction &MF,
MachineBasicBlock &MBB,
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index e2f4a0896bc3..094d5cd58673 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -17,6 +17,7 @@
#include "AMDGPUTargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
+#include "llvm/ADT/FloatingPointMode.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/LegacyDivergenceAnalysis.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
@@ -25,6 +26,7 @@
#include "llvm/CodeGen/FunctionLoweringInfo.h"
#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/IR/DiagnosticInfo.h"
@@ -136,6 +138,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
addRegisterClass(MVT::v4f16, &AMDGPU::SReg_64RegClass);
addRegisterClass(MVT::v8i16, &AMDGPU::SGPR_128RegClass);
addRegisterClass(MVT::v8f16, &AMDGPU::SGPR_128RegClass);
+ addRegisterClass(MVT::v16i16, &AMDGPU::SGPR_256RegClass);
+ addRegisterClass(MVT::v16f16, &AMDGPU::SGPR_256RegClass);
}
addRegisterClass(MVT::v32i32, &AMDGPU::VReg_1024RegClass);
@@ -151,27 +155,17 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setBooleanVectorContents(ZeroOrOneBooleanContent);
// We need to custom lower vector stores from local memory
- setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
- setOperationAction(ISD::LOAD, MVT::v3i32, Custom);
- setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
- setOperationAction(ISD::LOAD, MVT::v5i32, Custom);
- setOperationAction(ISD::LOAD, MVT::v6i32, Custom);
- setOperationAction(ISD::LOAD, MVT::v7i32, Custom);
- setOperationAction(ISD::LOAD, MVT::v8i32, Custom);
- setOperationAction(ISD::LOAD, MVT::v16i32, Custom);
- setOperationAction(ISD::LOAD, MVT::i1, Custom);
- setOperationAction(ISD::LOAD, MVT::v32i32, Custom);
+ setOperationAction(ISD::LOAD,
+ {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
+ MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v16i32, MVT::i1,
+ MVT::v32i32},
+ Custom);
- setOperationAction(ISD::STORE, MVT::v2i32, Custom);
- setOperationAction(ISD::STORE, MVT::v3i32, Custom);
- setOperationAction(ISD::STORE, MVT::v4i32, Custom);
- setOperationAction(ISD::STORE, MVT::v5i32, Custom);
- setOperationAction(ISD::STORE, MVT::v6i32, Custom);
- setOperationAction(ISD::STORE, MVT::v7i32, Custom);
- setOperationAction(ISD::STORE, MVT::v8i32, Custom);
- setOperationAction(ISD::STORE, MVT::v16i32, Custom);
- setOperationAction(ISD::STORE, MVT::i1, Custom);
- setOperationAction(ISD::STORE, MVT::v32i32, Custom);
+ setOperationAction(ISD::STORE,
+ {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
+ MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v16i32, MVT::i1,
+ MVT::v32i32},
+ Custom);
setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
setTruncStoreAction(MVT::v3i32, MVT::v3i16, Expand);
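Most of the constructor changes in this file collapse runs of identical setOperationAction calls into the ArrayRef overloads, which take a list of opcodes and/or a list of types and apply the action to every combination. A minimal sketch of the equivalence, with illustrative opcodes and types:

  // One call over {ops} x {types} ...
  setOperationAction({ISD::FSIN, ISD::FCOS}, {MVT::f32, MVT::f16}, Custom);
  // ... behaves like the four single-op, single-type calls it replaces:
  setOperationAction(ISD::FSIN, MVT::f32, Custom);
  setOperationAction(ISD::FSIN, MVT::f16, Custom);
  setOperationAction(ISD::FCOS, MVT::f32, Custom);
  setOperationAction(ISD::FCOS, MVT::f16, Custom);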
@@ -198,81 +192,57 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setTruncStoreAction(MVT::v8i64, MVT::v8i32, Expand);
setTruncStoreAction(MVT::v16i64, MVT::v16i32, Expand);
- setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
- setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
+ setOperationAction(ISD::GlobalAddress, {MVT::i32, MVT::i64}, Custom);
setOperationAction(ISD::SELECT, MVT::i1, Promote);
setOperationAction(ISD::SELECT, MVT::i64, Custom);
setOperationAction(ISD::SELECT, MVT::f64, Promote);
AddPromotedToType(ISD::SELECT, MVT::f64, MVT::i64);
- setOperationAction(ISD::SELECT_CC, MVT::f32, Expand);
- setOperationAction(ISD::SELECT_CC, MVT::i32, Expand);
- setOperationAction(ISD::SELECT_CC, MVT::i64, Expand);
- setOperationAction(ISD::SELECT_CC, MVT::f64, Expand);
- setOperationAction(ISD::SELECT_CC, MVT::i1, Expand);
+ setOperationAction(ISD::SELECT_CC,
+ {MVT::f32, MVT::i32, MVT::i64, MVT::f64, MVT::i1}, Expand);
setOperationAction(ISD::SETCC, MVT::i1, Promote);
- setOperationAction(ISD::SETCC, MVT::v2i1, Expand);
- setOperationAction(ISD::SETCC, MVT::v4i1, Expand);
+ setOperationAction(ISD::SETCC, {MVT::v2i1, MVT::v4i1}, Expand);
AddPromotedToType(ISD::SETCC, MVT::i1, MVT::i32);
- setOperationAction(ISD::TRUNCATE, MVT::v2i32, Expand);
- setOperationAction(ISD::FP_ROUND, MVT::v2f32, Expand);
- setOperationAction(ISD::TRUNCATE, MVT::v3i32, Expand);
- setOperationAction(ISD::FP_ROUND, MVT::v3f32, Expand);
- setOperationAction(ISD::TRUNCATE, MVT::v4i32, Expand);
- setOperationAction(ISD::FP_ROUND, MVT::v4f32, Expand);
- setOperationAction(ISD::TRUNCATE, MVT::v5i32, Expand);
- setOperationAction(ISD::FP_ROUND, MVT::v5f32, Expand);
- setOperationAction(ISD::TRUNCATE, MVT::v6i32, Expand);
- setOperationAction(ISD::FP_ROUND, MVT::v6f32, Expand);
- setOperationAction(ISD::TRUNCATE, MVT::v7i32, Expand);
- setOperationAction(ISD::FP_ROUND, MVT::v7f32, Expand);
- setOperationAction(ISD::TRUNCATE, MVT::v8i32, Expand);
- setOperationAction(ISD::FP_ROUND, MVT::v8f32, Expand);
- setOperationAction(ISD::TRUNCATE, MVT::v16i32, Expand);
- setOperationAction(ISD::FP_ROUND, MVT::v16f32, Expand);
+ setOperationAction(ISD::TRUNCATE,
+ {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
+ MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v16i32},
+ Expand);
+ setOperationAction(ISD::FP_ROUND,
+ {MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32,
+ MVT::v6f32, MVT::v7f32, MVT::v8f32, MVT::v16f32},
+ Expand);
- setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i1, Custom);
- setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i1, Custom);
- setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i8, Custom);
- setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i8, Custom);
- setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i16, Custom);
- setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v3i16, Custom);
- setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i16, Custom);
- setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::Other, Custom);
+ setOperationAction(ISD::SIGN_EXTEND_INREG,
+ {MVT::v2i1, MVT::v4i1, MVT::v2i8, MVT::v4i8, MVT::v2i16,
+ MVT::v3i16, MVT::v4i16, MVT::Other},
+ Custom);
setOperationAction(ISD::BRCOND, MVT::Other, Custom);
- setOperationAction(ISD::BR_CC, MVT::i1, Expand);
- setOperationAction(ISD::BR_CC, MVT::i32, Expand);
- setOperationAction(ISD::BR_CC, MVT::i64, Expand);
- setOperationAction(ISD::BR_CC, MVT::f32, Expand);
- setOperationAction(ISD::BR_CC, MVT::f64, Expand);
+ setOperationAction(ISD::BR_CC,
+ {MVT::i1, MVT::i32, MVT::i64, MVT::f32, MVT::f64}, Expand);
- setOperationAction(ISD::UADDO, MVT::i32, Legal);
- setOperationAction(ISD::USUBO, MVT::i32, Legal);
+ setOperationAction({ISD::UADDO, ISD::USUBO}, MVT::i32, Legal);
- setOperationAction(ISD::ADDCARRY, MVT::i32, Legal);
- setOperationAction(ISD::SUBCARRY, MVT::i32, Legal);
+ setOperationAction({ISD::ADDCARRY, ISD::SUBCARRY}, MVT::i32, Legal);
- setOperationAction(ISD::SHL_PARTS, MVT::i64, Expand);
- setOperationAction(ISD::SRA_PARTS, MVT::i64, Expand);
- setOperationAction(ISD::SRL_PARTS, MVT::i64, Expand);
+ setOperationAction({ISD::SHL_PARTS, ISD::SRA_PARTS, ISD::SRL_PARTS}, MVT::i64,
+ Expand);
#if 0
- setOperationAction(ISD::ADDCARRY, MVT::i64, Legal);
- setOperationAction(ISD::SUBCARRY, MVT::i64, Legal);
+ setOperationAction({ISD::ADDCARRY, ISD::SUBCARRY}, MVT::i64, Legal);
#endif
// We only support LOAD/STORE and vector manipulation ops for vectors
// with > 4 elements.
- for (MVT VT : { MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32,
- MVT::v2i64, MVT::v2f64, MVT::v4i16, MVT::v4f16,
- MVT::v3i64, MVT::v3f64, MVT::v6i32, MVT::v6f32,
- MVT::v4i64, MVT::v4f64, MVT::v8i64, MVT::v8f64,
- MVT::v8i16, MVT::v8f16, MVT::v16i64, MVT::v16f64,
- MVT::v32i32, MVT::v32f32 }) {
+ for (MVT VT :
+ {MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32, MVT::v2i64,
+ MVT::v2f64, MVT::v4i16, MVT::v4f16, MVT::v3i64, MVT::v3f64,
+ MVT::v6i32, MVT::v6f32, MVT::v4i64, MVT::v4f64, MVT::v8i64,
+ MVT::v8f64, MVT::v8i16, MVT::v8f16, MVT::v16i16, MVT::v16f16,
+ MVT::v16i64, MVT::v16f64, MVT::v32i32, MVT::v32f32}) {
for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
switch (Op) {
case ISD::LOAD:
@@ -372,94 +342,63 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v32i32);
}
- setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i32, Expand);
- setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8f32, Expand);
- setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16i32, Expand);
- setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16f32, Expand);
+ setOperationAction(ISD::VECTOR_SHUFFLE,
+ {MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32},
+ Expand);
- setOperationAction(ISD::BUILD_VECTOR, MVT::v4f16, Custom);
- setOperationAction(ISD::BUILD_VECTOR, MVT::v4i16, Custom);
+ setOperationAction(ISD::BUILD_VECTOR, {MVT::v4f16, MVT::v4i16}, Custom);
// Avoid stack access for these.
// TODO: Generalize to more vector types.
- setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i16, Custom);
- setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f16, Custom);
- setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i16, Custom);
- setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f16, Custom);
-
- setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i8, Custom);
- setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i8, Custom);
- setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i8, Custom);
- setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i8, Custom);
- setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i8, Custom);
- setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i8, Custom);
-
- setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i16, Custom);
- setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f16, Custom);
- setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i16, Custom);
- setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f16, Custom);
+ setOperationAction({ISD::EXTRACT_VECTOR_ELT, ISD::INSERT_VECTOR_ELT},
+ {MVT::v2i16, MVT::v2f16, MVT::v2i8, MVT::v4i8, MVT::v8i8,
+ MVT::v4i16, MVT::v4f16, MVT::v16i16, MVT::v16f16},
+ Custom);
// Deal with vec3 vector operations when widened to vec4.
- setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v3i32, Custom);
- setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v3f32, Custom);
- setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v4i32, Custom);
- setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v4f32, Custom);
+ setOperationAction(ISD::INSERT_SUBVECTOR,
+ {MVT::v3i32, MVT::v3f32, MVT::v4i32, MVT::v4f32}, Custom);
// Deal with vec5/6/7 vector operations when widened to vec8.
- setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v5i32, Custom);
- setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v5f32, Custom);
- setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v6i32, Custom);
- setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v6f32, Custom);
- setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v7i32, Custom);
- setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v7f32, Custom);
- setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v8i32, Custom);
- setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v8f32, Custom);
+ setOperationAction(ISD::INSERT_SUBVECTOR,
+ {MVT::v5i32, MVT::v5f32, MVT::v6i32, MVT::v6f32,
+ MVT::v7i32, MVT::v7f32, MVT::v8i32, MVT::v8f32},
+ Custom);
// BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling,
// and output demarshalling
- setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, Custom);
- setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i64, Custom);
+ setOperationAction(ISD::ATOMIC_CMP_SWAP, {MVT::i32, MVT::i64}, Custom);
// We can't return success/failure, only the old value,
// let LLVM add the comparison
- setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i32, Expand);
- setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i64, Expand);
+ setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, {MVT::i32, MVT::i64},
+ Expand);
- if (Subtarget->hasFlatAddressSpace()) {
- setOperationAction(ISD::ADDRSPACECAST, MVT::i32, Custom);
- setOperationAction(ISD::ADDRSPACECAST, MVT::i64, Custom);
- }
+ if (Subtarget->hasFlatAddressSpace())
+ setOperationAction(ISD::ADDRSPACECAST, {MVT::i32, MVT::i64}, Custom);
- setOperationAction(ISD::BITREVERSE, MVT::i32, Legal);
- setOperationAction(ISD::BITREVERSE, MVT::i64, Legal);
+ setOperationAction(ISD::BITREVERSE, {MVT::i32, MVT::i64}, Legal);
// FIXME: This should be narrowed to i32, but that only happens if i64 is
// illegal.
// FIXME: Should lower sub-i32 bswaps to bit-ops without v_perm_b32.
- setOperationAction(ISD::BSWAP, MVT::i64, Legal);
- setOperationAction(ISD::BSWAP, MVT::i32, Legal);
+ setOperationAction(ISD::BSWAP, {MVT::i64, MVT::i32}, Legal);
// On SI this is s_memtime and s_memrealtime on VI.
setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Legal);
- setOperationAction(ISD::TRAP, MVT::Other, Custom);
- setOperationAction(ISD::DEBUGTRAP, MVT::Other, Custom);
+ setOperationAction({ISD::TRAP, ISD::DEBUGTRAP}, MVT::Other, Custom);
if (Subtarget->has16BitInsts()) {
- setOperationAction(ISD::FPOW, MVT::f16, Promote);
- setOperationAction(ISD::FPOWI, MVT::f16, Promote);
- setOperationAction(ISD::FLOG, MVT::f16, Custom);
- setOperationAction(ISD::FEXP, MVT::f16, Custom);
- setOperationAction(ISD::FLOG10, MVT::f16, Custom);
+ setOperationAction({ISD::FPOW, ISD::FPOWI}, MVT::f16, Promote);
+ setOperationAction({ISD::FLOG, ISD::FEXP, ISD::FLOG10}, MVT::f16, Custom);
}
if (Subtarget->hasMadMacF32Insts())
setOperationAction(ISD::FMAD, MVT::f32, Legal);
- if (!Subtarget->hasBFI()) {
+ if (!Subtarget->hasBFI())
// fcopysign can be done in a single instruction with BFI.
- setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
- setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
- }
+ setOperationAction(ISD::FCOPYSIGN, {MVT::f32, MVT::f64}, Expand);
if (!Subtarget->hasBCNT(32))
setOperationAction(ISD::CTPOP, MVT::i32, Expand);
@@ -467,15 +406,11 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
if (!Subtarget->hasBCNT(64))
setOperationAction(ISD::CTPOP, MVT::i64, Expand);
- if (Subtarget->hasFFBH()) {
- setOperationAction(ISD::CTLZ, MVT::i32, Custom);
- setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Custom);
- }
+ if (Subtarget->hasFFBH())
+ setOperationAction({ISD::CTLZ, ISD::CTLZ_ZERO_UNDEF}, MVT::i32, Custom);
- if (Subtarget->hasFFBL()) {
- setOperationAction(ISD::CTTZ, MVT::i32, Custom);
- setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Custom);
- }
+ if (Subtarget->hasFFBL())
+ setOperationAction({ISD::CTTZ, ISD::CTTZ_ZERO_UNDEF}, MVT::i32, Custom);
// We only really have 32-bit BFE instructions (and 16-bit on VI).
//
@@ -489,84 +424,48 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setHasExtractBitsInsn(true);
// Clamp modifier on add/sub
- if (Subtarget->hasIntClamp()) {
- setOperationAction(ISD::UADDSAT, MVT::i32, Legal);
- setOperationAction(ISD::USUBSAT, MVT::i32, Legal);
- }
-
- if (Subtarget->hasAddNoCarry()) {
- setOperationAction(ISD::SADDSAT, MVT::i16, Legal);
- setOperationAction(ISD::SSUBSAT, MVT::i16, Legal);
- setOperationAction(ISD::SADDSAT, MVT::i32, Legal);
- setOperationAction(ISD::SSUBSAT, MVT::i32, Legal);
- }
+ if (Subtarget->hasIntClamp())
+ setOperationAction({ISD::UADDSAT, ISD::USUBSAT}, MVT::i32, Legal);
- setOperationAction(ISD::FMINNUM, MVT::f32, Custom);
- setOperationAction(ISD::FMAXNUM, MVT::f32, Custom);
- setOperationAction(ISD::FMINNUM, MVT::f64, Custom);
- setOperationAction(ISD::FMAXNUM, MVT::f64, Custom);
+ if (Subtarget->hasAddNoCarry())
+ setOperationAction({ISD::SADDSAT, ISD::SSUBSAT}, {MVT::i16, MVT::i32},
+ Legal);
+ setOperationAction({ISD::FMINNUM, ISD::FMAXNUM}, {MVT::f32, MVT::f64},
+ Custom);
// These are really only legal for ieee_mode functions. We should be avoiding
// them for functions that don't have ieee_mode enabled, so just say they are
// legal.
- setOperationAction(ISD::FMINNUM_IEEE, MVT::f32, Legal);
- setOperationAction(ISD::FMAXNUM_IEEE, MVT::f32, Legal);
- setOperationAction(ISD::FMINNUM_IEEE, MVT::f64, Legal);
- setOperationAction(ISD::FMAXNUM_IEEE, MVT::f64, Legal);
+ setOperationAction({ISD::FMINNUM_IEEE, ISD::FMAXNUM_IEEE},
+ {MVT::f32, MVT::f64}, Legal);
-
- if (Subtarget->haveRoundOpsF64()) {
- setOperationAction(ISD::FTRUNC, MVT::f64, Legal);
- setOperationAction(ISD::FCEIL, MVT::f64, Legal);
- setOperationAction(ISD::FRINT, MVT::f64, Legal);
- } else {
- setOperationAction(ISD::FCEIL, MVT::f64, Custom);
- setOperationAction(ISD::FTRUNC, MVT::f64, Custom);
- setOperationAction(ISD::FRINT, MVT::f64, Custom);
- setOperationAction(ISD::FFLOOR, MVT::f64, Custom);
- }
+ if (Subtarget->haveRoundOpsF64())
+ setOperationAction({ISD::FTRUNC, ISD::FCEIL, ISD::FRINT}, MVT::f64, Legal);
+ else
+ setOperationAction({ISD::FCEIL, ISD::FTRUNC, ISD::FRINT, ISD::FFLOOR},
+ MVT::f64, Custom);
setOperationAction(ISD::FFLOOR, MVT::f64, Legal);
- setOperationAction(ISD::FSIN, MVT::f32, Custom);
- setOperationAction(ISD::FCOS, MVT::f32, Custom);
- setOperationAction(ISD::FDIV, MVT::f32, Custom);
+ setOperationAction({ISD::FSIN, ISD::FCOS, ISD::FDIV}, MVT::f32, Custom);
setOperationAction(ISD::FDIV, MVT::f64, Custom);
if (Subtarget->has16BitInsts()) {
- setOperationAction(ISD::Constant, MVT::i16, Legal);
-
- setOperationAction(ISD::SMIN, MVT::i16, Legal);
- setOperationAction(ISD::SMAX, MVT::i16, Legal);
-
- setOperationAction(ISD::UMIN, MVT::i16, Legal);
- setOperationAction(ISD::UMAX, MVT::i16, Legal);
+ setOperationAction({ISD::Constant, ISD::SMIN, ISD::SMAX, ISD::UMIN,
+ ISD::UMAX, ISD::UADDSAT, ISD::USUBSAT},
+ MVT::i16, Legal);
- setOperationAction(ISD::SIGN_EXTEND, MVT::i16, Promote);
AddPromotedToType(ISD::SIGN_EXTEND, MVT::i16, MVT::i32);
- setOperationAction(ISD::ROTR, MVT::i16, Expand);
- setOperationAction(ISD::ROTL, MVT::i16, Expand);
-
- setOperationAction(ISD::SDIV, MVT::i16, Promote);
- setOperationAction(ISD::UDIV, MVT::i16, Promote);
- setOperationAction(ISD::SREM, MVT::i16, Promote);
- setOperationAction(ISD::UREM, MVT::i16, Promote);
- setOperationAction(ISD::UADDSAT, MVT::i16, Legal);
- setOperationAction(ISD::USUBSAT, MVT::i16, Legal);
-
- setOperationAction(ISD::BITREVERSE, MVT::i16, Promote);
-
- setOperationAction(ISD::CTTZ, MVT::i16, Promote);
- setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16, Promote);
- setOperationAction(ISD::CTLZ, MVT::i16, Promote);
- setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16, Promote);
- setOperationAction(ISD::CTPOP, MVT::i16, Promote);
+ setOperationAction({ISD::ROTR, ISD::ROTL, ISD::SELECT_CC, ISD::BR_CC},
+ MVT::i16, Expand);
- setOperationAction(ISD::SELECT_CC, MVT::i16, Expand);
-
- setOperationAction(ISD::BR_CC, MVT::i16, Expand);
+ setOperationAction({ISD::SIGN_EXTEND, ISD::SDIV, ISD::UDIV, ISD::SREM,
+ ISD::UREM, ISD::BITREVERSE, ISD::CTTZ,
+ ISD::CTTZ_ZERO_UNDEF, ISD::CTLZ, ISD::CTLZ_ZERO_UNDEF,
+ ISD::CTPOP},
+ MVT::i16, Promote);
setOperationAction(ISD::LOAD, MVT::i16, Custom);
@@ -577,8 +476,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction(ISD::FP_TO_FP16, MVT::i16, Promote);
AddPromotedToType(ISD::FP_TO_FP16, MVT::i16, MVT::i32);
- setOperationAction(ISD::FP_TO_SINT, MVT::i16, Custom);
- setOperationAction(ISD::FP_TO_UINT, MVT::i16, Custom);
+ setOperationAction({ISD::FP_TO_SINT, ISD::FP_TO_UINT}, MVT::i16, Custom);
// F16 - Constant Actions.
setOperationAction(ISD::ConstantFP, MVT::f16, Legal);
@@ -590,22 +488,18 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
AddPromotedToType(ISD::STORE, MVT::f16, MVT::i16);
// F16 - VOP1 Actions.
- setOperationAction(ISD::FP_ROUND, MVT::f16, Custom);
- setOperationAction(ISD::FCOS, MVT::f16, Custom);
- setOperationAction(ISD::FSIN, MVT::f16, Custom);
+ setOperationAction(
+ {ISD::FP_ROUND, ISD::FCOS, ISD::FSIN, ISD::FROUND, ISD::FPTRUNC_ROUND},
+ MVT::f16, Custom);
- setOperationAction(ISD::SINT_TO_FP, MVT::i16, Custom);
- setOperationAction(ISD::UINT_TO_FP, MVT::i16, Custom);
+ setOperationAction({ISD::SINT_TO_FP, ISD::UINT_TO_FP}, MVT::i16, Custom);
- setOperationAction(ISD::FP_TO_SINT, MVT::f16, Promote);
- setOperationAction(ISD::FP_TO_UINT, MVT::f16, Promote);
- setOperationAction(ISD::SINT_TO_FP, MVT::f16, Promote);
- setOperationAction(ISD::UINT_TO_FP, MVT::f16, Promote);
- setOperationAction(ISD::FROUND, MVT::f16, Custom);
+ setOperationAction(
+ {ISD::FP_TO_SINT, ISD::FP_TO_UINT, ISD::SINT_TO_FP, ISD::UINT_TO_FP},
+ MVT::f16, Promote);
// F16 - VOP2 Actions.
- setOperationAction(ISD::BR_CC, MVT::f16, Expand);
- setOperationAction(ISD::SELECT_CC, MVT::f16, Expand);
+ setOperationAction({ISD::BR_CC, ISD::SELECT_CC}, MVT::f16, Expand);
setOperationAction(ISD::FDIV, MVT::f16, Custom);
@@ -615,7 +509,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction(ISD::FMAD, MVT::f16, Legal);
for (MVT VT : {MVT::v2i16, MVT::v2f16, MVT::v4i16, MVT::v4f16, MVT::v8i16,
- MVT::v8f16}) {
+ MVT::v8f16, MVT::v16i16, MVT::v16f16}) {
for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
switch (Op) {
case ISD::LOAD:
@@ -639,16 +533,13 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
}
// v_perm_b32 can handle either of these.
- setOperationAction(ISD::BSWAP, MVT::i16, Legal);
- setOperationAction(ISD::BSWAP, MVT::v2i16, Legal);
+ setOperationAction(ISD::BSWAP, {MVT::i16, MVT::v2i16}, Legal);
setOperationAction(ISD::BSWAP, MVT::v4i16, Custom);
// XXX - Do these do anything? Vector constants turn into build_vector.
- setOperationAction(ISD::Constant, MVT::v2i16, Legal);
- setOperationAction(ISD::ConstantFP, MVT::v2f16, Legal);
+ setOperationAction(ISD::Constant, {MVT::v2i16, MVT::v2f16}, Legal);
- setOperationAction(ISD::UNDEF, MVT::v2i16, Legal);
- setOperationAction(ISD::UNDEF, MVT::v2f16, Legal);
+ setOperationAction(ISD::UNDEF, {MVT::v2i16, MVT::v2f16}, Legal);
setOperationAction(ISD::STORE, MVT::v2i16, Promote);
AddPromotedToType(ISD::STORE, MVT::v2i16, MVT::i32);
@@ -692,140 +583,98 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction(ISD::STORE, MVT::v8f16, Promote);
AddPromotedToType(ISD::STORE, MVT::v8f16, MVT::v4i32);
- setOperationAction(ISD::ANY_EXTEND, MVT::v2i32, Expand);
- setOperationAction(ISD::ZERO_EXTEND, MVT::v2i32, Expand);
- setOperationAction(ISD::SIGN_EXTEND, MVT::v2i32, Expand);
+ setOperationAction(ISD::LOAD, MVT::v16i16, Promote);
+ AddPromotedToType(ISD::LOAD, MVT::v16i16, MVT::v8i32);
+ setOperationAction(ISD::LOAD, MVT::v16f16, Promote);
+ AddPromotedToType(ISD::LOAD, MVT::v16f16, MVT::v8i32);
+
+ setOperationAction(ISD::STORE, MVT::v16i16, Promote);
+ AddPromotedToType(ISD::STORE, MVT::v16i16, MVT::v8i32);
+ setOperationAction(ISD::STORE, MVT::v16f16, Promote);
+ AddPromotedToType(ISD::STORE, MVT::v16f16, MVT::v8i32);
+
+ setOperationAction({ISD::ANY_EXTEND, ISD::ZERO_EXTEND, ISD::SIGN_EXTEND},
+ MVT::v2i32, Expand);
setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Expand);
- setOperationAction(ISD::ANY_EXTEND, MVT::v4i32, Expand);
- setOperationAction(ISD::ZERO_EXTEND, MVT::v4i32, Expand);
- setOperationAction(ISD::SIGN_EXTEND, MVT::v4i32, Expand);
+ setOperationAction({ISD::ANY_EXTEND, ISD::ZERO_EXTEND, ISD::SIGN_EXTEND},
+ MVT::v4i32, Expand);
- setOperationAction(ISD::ANY_EXTEND, MVT::v8i32, Expand);
- setOperationAction(ISD::ZERO_EXTEND, MVT::v8i32, Expand);
- setOperationAction(ISD::SIGN_EXTEND, MVT::v8i32, Expand);
+ setOperationAction({ISD::ANY_EXTEND, ISD::ZERO_EXTEND, ISD::SIGN_EXTEND},
+ MVT::v8i32, Expand);
- if (!Subtarget->hasVOP3PInsts()) {
- setOperationAction(ISD::BUILD_VECTOR, MVT::v2i16, Custom);
- setOperationAction(ISD::BUILD_VECTOR, MVT::v2f16, Custom);
- }
+ if (!Subtarget->hasVOP3PInsts())
+ setOperationAction(ISD::BUILD_VECTOR, {MVT::v2i16, MVT::v2f16}, Custom);
setOperationAction(ISD::FNEG, MVT::v2f16, Legal);
// This isn't really legal, but this avoids the legalizer unrolling it (and
// allows matching fneg (fabs x) patterns)
setOperationAction(ISD::FABS, MVT::v2f16, Legal);
- setOperationAction(ISD::FMAXNUM, MVT::f16, Custom);
- setOperationAction(ISD::FMINNUM, MVT::f16, Custom);
- setOperationAction(ISD::FMAXNUM_IEEE, MVT::f16, Legal);
- setOperationAction(ISD::FMINNUM_IEEE, MVT::f16, Legal);
+ setOperationAction({ISD::FMAXNUM, ISD::FMINNUM}, MVT::f16, Custom);
+ setOperationAction({ISD::FMAXNUM_IEEE, ISD::FMINNUM_IEEE}, MVT::f16, Legal);
- setOperationAction(ISD::FMINNUM_IEEE, MVT::v4f16, Custom);
- setOperationAction(ISD::FMAXNUM_IEEE, MVT::v4f16, Custom);
- setOperationAction(ISD::FMINNUM_IEEE, MVT::v8f16, Custom);
- setOperationAction(ISD::FMAXNUM_IEEE, MVT::v8f16, Custom);
+ setOperationAction({ISD::FMINNUM_IEEE, ISD::FMAXNUM_IEEE},
+ {MVT::v4f16, MVT::v8f16, MVT::v16f16}, Custom);
- setOperationAction(ISD::FMINNUM, MVT::v4f16, Expand);
- setOperationAction(ISD::FMAXNUM, MVT::v4f16, Expand);
- setOperationAction(ISD::FMINNUM, MVT::v8f16, Expand);
- setOperationAction(ISD::FMAXNUM, MVT::v8f16, Expand);
+ setOperationAction({ISD::FMINNUM, ISD::FMAXNUM},
+ {MVT::v4f16, MVT::v8f16, MVT::v16f16}, Expand);
- for (MVT Vec16 : { MVT::v8i16, MVT::v8f16 }) {
- setOperationAction(ISD::BUILD_VECTOR, Vec16, Custom);
- setOperationAction(ISD::EXTRACT_VECTOR_ELT, Vec16, Custom);
+ for (MVT Vec16 : {MVT::v8i16, MVT::v8f16, MVT::v16i16, MVT::v16f16}) {
+ setOperationAction(
+ {ISD::BUILD_VECTOR, ISD::EXTRACT_VECTOR_ELT, ISD::SCALAR_TO_VECTOR},
+ Vec16, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, Vec16, Expand);
- setOperationAction(ISD::SCALAR_TO_VECTOR, Vec16, Expand);
}
}
if (Subtarget->hasVOP3PInsts()) {
- setOperationAction(ISD::ADD, MVT::v2i16, Legal);
- setOperationAction(ISD::SUB, MVT::v2i16, Legal);
- setOperationAction(ISD::MUL, MVT::v2i16, Legal);
- setOperationAction(ISD::SHL, MVT::v2i16, Legal);
- setOperationAction(ISD::SRL, MVT::v2i16, Legal);
- setOperationAction(ISD::SRA, MVT::v2i16, Legal);
- setOperationAction(ISD::SMIN, MVT::v2i16, Legal);
- setOperationAction(ISD::UMIN, MVT::v2i16, Legal);
- setOperationAction(ISD::SMAX, MVT::v2i16, Legal);
- setOperationAction(ISD::UMAX, MVT::v2i16, Legal);
-
- setOperationAction(ISD::UADDSAT, MVT::v2i16, Legal);
- setOperationAction(ISD::USUBSAT, MVT::v2i16, Legal);
- setOperationAction(ISD::SADDSAT, MVT::v2i16, Legal);
- setOperationAction(ISD::SSUBSAT, MVT::v2i16, Legal);
-
- setOperationAction(ISD::FADD, MVT::v2f16, Legal);
- setOperationAction(ISD::FMUL, MVT::v2f16, Legal);
- setOperationAction(ISD::FMA, MVT::v2f16, Legal);
+ setOperationAction({ISD::ADD, ISD::SUB, ISD::MUL, ISD::SHL, ISD::SRL,
+ ISD::SRA, ISD::SMIN, ISD::UMIN, ISD::SMAX, ISD::UMAX,
+ ISD::UADDSAT, ISD::USUBSAT, ISD::SADDSAT, ISD::SSUBSAT},
+ MVT::v2i16, Legal);
- setOperationAction(ISD::FMINNUM_IEEE, MVT::v2f16, Legal);
- setOperationAction(ISD::FMAXNUM_IEEE, MVT::v2f16, Legal);
+ setOperationAction({ISD::FADD, ISD::FMUL, ISD::FMA, ISD::FMINNUM_IEEE,
+ ISD::FMAXNUM_IEEE, ISD::FCANONICALIZE},
+ MVT::v2f16, Legal);
- setOperationAction(ISD::FCANONICALIZE, MVT::v2f16, Legal);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, {MVT::v2i16, MVT::v2f16},
+ Custom);
- setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i16, Custom);
- setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f16, Custom);
+ setOperationAction(ISD::VECTOR_SHUFFLE,
+ {MVT::v4f16, MVT::v4i16, MVT::v8f16, MVT::v8i16,
+ MVT::v16f16, MVT::v16i16},
+ Custom);
- setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f16, Custom);
- setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4i16, Custom);
- setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8f16, Custom);
- setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i16, Custom);
-
- for (MVT VT : { MVT::v4i16, MVT::v8i16 }) {
+ for (MVT VT : {MVT::v4i16, MVT::v8i16, MVT::v16i16})
// Split vector operations.
- setOperationAction(ISD::SHL, VT, Custom);
- setOperationAction(ISD::SRA, VT, Custom);
- setOperationAction(ISD::SRL, VT, Custom);
- setOperationAction(ISD::ADD, VT, Custom);
- setOperationAction(ISD::SUB, VT, Custom);
- setOperationAction(ISD::MUL, VT, Custom);
-
- setOperationAction(ISD::SMIN, VT, Custom);
- setOperationAction(ISD::SMAX, VT, Custom);
- setOperationAction(ISD::UMIN, VT, Custom);
- setOperationAction(ISD::UMAX, VT, Custom);
-
- setOperationAction(ISD::UADDSAT, VT, Custom);
- setOperationAction(ISD::SADDSAT, VT, Custom);
- setOperationAction(ISD::USUBSAT, VT, Custom);
- setOperationAction(ISD::SSUBSAT, VT, Custom);
- }
+ setOperationAction({ISD::SHL, ISD::SRA, ISD::SRL, ISD::ADD, ISD::SUB,
+ ISD::MUL, ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX,
+ ISD::UADDSAT, ISD::SADDSAT, ISD::USUBSAT,
+ ISD::SSUBSAT},
+ VT, Custom);
- for (MVT VT : { MVT::v4f16, MVT::v8f16 }) {
+ for (MVT VT : {MVT::v4f16, MVT::v8f16, MVT::v16f16})
// Split vector operations.
- setOperationAction(ISD::FADD, VT, Custom);
- setOperationAction(ISD::FMUL, VT, Custom);
- setOperationAction(ISD::FMA, VT, Custom);
- setOperationAction(ISD::FCANONICALIZE, VT, Custom);
- }
-
- setOperationAction(ISD::FMAXNUM, MVT::v2f16, Custom);
- setOperationAction(ISD::FMINNUM, MVT::v2f16, Custom);
+ setOperationAction({ISD::FADD, ISD::FMUL, ISD::FMA, ISD::FCANONICALIZE},
+ VT, Custom);
- setOperationAction(ISD::FMINNUM, MVT::v4f16, Custom);
- setOperationAction(ISD::FMAXNUM, MVT::v4f16, Custom);
+ setOperationAction({ISD::FMAXNUM, ISD::FMINNUM}, {MVT::v2f16, MVT::v4f16},
+ Custom);
setOperationAction(ISD::FEXP, MVT::v2f16, Custom);
- setOperationAction(ISD::SELECT, MVT::v4i16, Custom);
- setOperationAction(ISD::SELECT, MVT::v4f16, Custom);
+ setOperationAction(ISD::SELECT, {MVT::v4i16, MVT::v4f16}, Custom);
if (Subtarget->hasPackedFP32Ops()) {
- setOperationAction(ISD::FADD, MVT::v2f32, Legal);
- setOperationAction(ISD::FMUL, MVT::v2f32, Legal);
- setOperationAction(ISD::FMA, MVT::v2f32, Legal);
- setOperationAction(ISD::FNEG, MVT::v2f32, Legal);
-
- for (MVT VT : { MVT::v4f32, MVT::v8f32, MVT::v16f32, MVT::v32f32 }) {
- setOperationAction(ISD::FADD, VT, Custom);
- setOperationAction(ISD::FMUL, VT, Custom);
- setOperationAction(ISD::FMA, VT, Custom);
- }
+ setOperationAction({ISD::FADD, ISD::FMUL, ISD::FMA, ISD::FNEG},
+ MVT::v2f32, Legal);
+ setOperationAction({ISD::FADD, ISD::FMUL, ISD::FMA},
+ {MVT::v4f32, MVT::v8f32, MVT::v16f32, MVT::v32f32},
+ Custom);
}
}
- setOperationAction(ISD::FNEG, MVT::v4f16, Custom);
- setOperationAction(ISD::FABS, MVT::v4f16, Custom);
+ setOperationAction({ISD::FNEG, ISD::FABS}, MVT::v4f16, Custom);
if (Subtarget->has16BitInsts()) {
setOperationAction(ISD::SELECT, MVT::v2i16, Promote);
@@ -834,107 +683,88 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
AddPromotedToType(ISD::SELECT, MVT::v2f16, MVT::i32);
} else {
// Legalization hack.
- setOperationAction(ISD::SELECT, MVT::v2i16, Custom);
- setOperationAction(ISD::SELECT, MVT::v2f16, Custom);
+ setOperationAction(ISD::SELECT, {MVT::v2i16, MVT::v2f16}, Custom);
- setOperationAction(ISD::FNEG, MVT::v2f16, Custom);
- setOperationAction(ISD::FABS, MVT::v2f16, Custom);
+ setOperationAction({ISD::FNEG, ISD::FABS}, MVT::v2f16, Custom);
}
- for (MVT VT : { MVT::v4i16, MVT::v4f16, MVT::v2i8, MVT::v4i8, MVT::v8i8,
- MVT::v8i16, MVT::v8f16 }) {
- setOperationAction(ISD::SELECT, VT, Custom);
- }
+ setOperationAction(ISD::SELECT,
+ {MVT::v4i16, MVT::v4f16, MVT::v2i8, MVT::v4i8, MVT::v8i8,
+ MVT::v8i16, MVT::v8f16, MVT::v16i16, MVT::v16f16},
+ Custom);
- setOperationAction(ISD::SMULO, MVT::i64, Custom);
- setOperationAction(ISD::UMULO, MVT::i64, Custom);
+ setOperationAction({ISD::SMULO, ISD::UMULO}, MVT::i64, Custom);
- if (Subtarget->hasMad64_32()) {
- setOperationAction(ISD::SMUL_LOHI, MVT::i32, Custom);
- setOperationAction(ISD::UMUL_LOHI, MVT::i32, Custom);
- }
+ if (Subtarget->hasMad64_32())
+ setOperationAction({ISD::SMUL_LOHI, ISD::UMUL_LOHI}, MVT::i32, Custom);
- setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
- setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::f32, Custom);
- setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v4f32, Custom);
- setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i16, Custom);
- setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::f16, Custom);
- setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v2i16, Custom);
- setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v2f16, Custom);
+ setOperationAction(ISD::INTRINSIC_WO_CHAIN,
+ {MVT::Other, MVT::f32, MVT::v4f32, MVT::i16, MVT::f16,
+ MVT::v2i16, MVT::v2f16},
+ Custom);
- setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v2f16, Custom);
- setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v2i16, Custom);
- setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v3f16, Custom);
- setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v3i16, Custom);
- setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v4f16, Custom);
- setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v4i16, Custom);
- setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v8f16, Custom);
- setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
- setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::f16, Custom);
- setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i16, Custom);
- setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i8, Custom);
+ setOperationAction(ISD::INTRINSIC_W_CHAIN,
+ {MVT::v2f16, MVT::v2i16, MVT::v3f16, MVT::v3i16,
+ MVT::v4f16, MVT::v4i16, MVT::v8f16, MVT::Other, MVT::f16,
+ MVT::i16, MVT::i8},
+ Custom);
- setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
- setOperationAction(ISD::INTRINSIC_VOID, MVT::v2i16, Custom);
- setOperationAction(ISD::INTRINSIC_VOID, MVT::v2f16, Custom);
- setOperationAction(ISD::INTRINSIC_VOID, MVT::v3i16, Custom);
- setOperationAction(ISD::INTRINSIC_VOID, MVT::v3f16, Custom);
- setOperationAction(ISD::INTRINSIC_VOID, MVT::v4f16, Custom);
- setOperationAction(ISD::INTRINSIC_VOID, MVT::v4i16, Custom);
- setOperationAction(ISD::INTRINSIC_VOID, MVT::f16, Custom);
- setOperationAction(ISD::INTRINSIC_VOID, MVT::i16, Custom);
- setOperationAction(ISD::INTRINSIC_VOID, MVT::i8, Custom);
+ setOperationAction(ISD::INTRINSIC_VOID,
+ {MVT::Other, MVT::v2i16, MVT::v2f16, MVT::v3i16,
+ MVT::v3f16, MVT::v4f16, MVT::v4i16, MVT::f16, MVT::i16,
+ MVT::i8},
+ Custom);
- setTargetDAGCombine(ISD::ADD);
- setTargetDAGCombine(ISD::ADDCARRY);
- setTargetDAGCombine(ISD::SUB);
- setTargetDAGCombine(ISD::SUBCARRY);
- setTargetDAGCombine(ISD::FADD);
- setTargetDAGCombine(ISD::FSUB);
- setTargetDAGCombine(ISD::FMINNUM);
- setTargetDAGCombine(ISD::FMAXNUM);
- setTargetDAGCombine(ISD::FMINNUM_IEEE);
- setTargetDAGCombine(ISD::FMAXNUM_IEEE);
- setTargetDAGCombine(ISD::FMA);
- setTargetDAGCombine(ISD::SMIN);
- setTargetDAGCombine(ISD::SMAX);
- setTargetDAGCombine(ISD::UMIN);
- setTargetDAGCombine(ISD::UMAX);
- setTargetDAGCombine(ISD::SETCC);
- setTargetDAGCombine(ISD::AND);
- setTargetDAGCombine(ISD::OR);
- setTargetDAGCombine(ISD::XOR);
- setTargetDAGCombine(ISD::SINT_TO_FP);
- setTargetDAGCombine(ISD::UINT_TO_FP);
- setTargetDAGCombine(ISD::FCANONICALIZE);
- setTargetDAGCombine(ISD::SCALAR_TO_VECTOR);
- setTargetDAGCombine(ISD::ZERO_EXTEND);
- setTargetDAGCombine(ISD::SIGN_EXTEND_INREG);
- setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
- setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);
+ setTargetDAGCombine({ISD::ADD,
+ ISD::ADDCARRY,
+ ISD::SUB,
+ ISD::SUBCARRY,
+ ISD::FADD,
+ ISD::FSUB,
+ ISD::FMINNUM,
+ ISD::FMAXNUM,
+ ISD::FMINNUM_IEEE,
+ ISD::FMAXNUM_IEEE,
+ ISD::FMA,
+ ISD::SMIN,
+ ISD::SMAX,
+ ISD::UMIN,
+ ISD::UMAX,
+ ISD::SETCC,
+ ISD::AND,
+ ISD::OR,
+ ISD::XOR,
+ ISD::SINT_TO_FP,
+ ISD::UINT_TO_FP,
+ ISD::FCANONICALIZE,
+ ISD::SCALAR_TO_VECTOR,
+ ISD::ZERO_EXTEND,
+ ISD::SIGN_EXTEND_INREG,
+ ISD::EXTRACT_VECTOR_ELT,
+ ISD::INSERT_VECTOR_ELT});
// All memory operations. Some folding on the pointer operand is done to help
// matching the constant offsets in the addressing modes.
- setTargetDAGCombine(ISD::LOAD);
- setTargetDAGCombine(ISD::STORE);
- setTargetDAGCombine(ISD::ATOMIC_LOAD);
- setTargetDAGCombine(ISD::ATOMIC_STORE);
- setTargetDAGCombine(ISD::ATOMIC_CMP_SWAP);
- setTargetDAGCombine(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS);
- setTargetDAGCombine(ISD::ATOMIC_SWAP);
- setTargetDAGCombine(ISD::ATOMIC_LOAD_ADD);
- setTargetDAGCombine(ISD::ATOMIC_LOAD_SUB);
- setTargetDAGCombine(ISD::ATOMIC_LOAD_AND);
- setTargetDAGCombine(ISD::ATOMIC_LOAD_OR);
- setTargetDAGCombine(ISD::ATOMIC_LOAD_XOR);
- setTargetDAGCombine(ISD::ATOMIC_LOAD_NAND);
- setTargetDAGCombine(ISD::ATOMIC_LOAD_MIN);
- setTargetDAGCombine(ISD::ATOMIC_LOAD_MAX);
- setTargetDAGCombine(ISD::ATOMIC_LOAD_UMIN);
- setTargetDAGCombine(ISD::ATOMIC_LOAD_UMAX);
- setTargetDAGCombine(ISD::ATOMIC_LOAD_FADD);
- setTargetDAGCombine(ISD::INTRINSIC_VOID);
- setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN);
+ setTargetDAGCombine({ISD::LOAD,
+ ISD::STORE,
+ ISD::ATOMIC_LOAD,
+ ISD::ATOMIC_STORE,
+ ISD::ATOMIC_CMP_SWAP,
+ ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS,
+ ISD::ATOMIC_SWAP,
+ ISD::ATOMIC_LOAD_ADD,
+ ISD::ATOMIC_LOAD_SUB,
+ ISD::ATOMIC_LOAD_AND,
+ ISD::ATOMIC_LOAD_OR,
+ ISD::ATOMIC_LOAD_XOR,
+ ISD::ATOMIC_LOAD_NAND,
+ ISD::ATOMIC_LOAD_MIN,
+ ISD::ATOMIC_LOAD_MAX,
+ ISD::ATOMIC_LOAD_UMIN,
+ ISD::ATOMIC_LOAD_UMAX,
+ ISD::ATOMIC_LOAD_FADD,
+ ISD::INTRINSIC_VOID,
+ ISD::INTRINSIC_W_CHAIN});
// FIXME: In other contexts we pretend this is a per-function property.
setStackPointerRegisterToSaveRestore(AMDGPU::SGPR32);
@@ -1118,6 +948,10 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
const CallInst &CI,
MachineFunction &MF,
unsigned IntrID) const {
+ Info.flags = MachineMemOperand::MONone;
+ if (CI.hasMetadata(LLVMContext::MD_invariant_load))
+ Info.flags |= MachineMemOperand::MOInvariant;
+
if (const AMDGPU::RsrcIntrinsic *RsrcIntr =
AMDGPU::lookupRsrcIntrinsic(IntrID)) {
AttributeList Attr = Intrinsic::getAttributes(CI.getContext(),
@@ -1127,16 +961,17 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+ const GCNTargetMachine &TM =
+ static_cast<const GCNTargetMachine &>(getTargetMachine());
+
if (RsrcIntr->IsImage) {
- Info.ptrVal =
- MFI->getImagePSV(*MF.getSubtarget<GCNSubtarget>().getInstrInfo());
+ Info.ptrVal = MFI->getImagePSV(TM);
Info.align.reset();
} else {
- Info.ptrVal =
- MFI->getBufferPSV(*MF.getSubtarget<GCNSubtarget>().getInstrInfo());
+ Info.ptrVal = MFI->getBufferPSV(TM);
}
- Info.flags = MachineMemOperand::MODereferenceable;
+ Info.flags |= MachineMemOperand::MODereferenceable;
if (Attr.hasFnAttr(Attribute::ReadOnly)) {
unsigned DMaskLanes = 4;
@@ -1178,12 +1013,23 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
Info.opc = CI.getType()->isVoidTy() ? ISD::INTRINSIC_VOID :
ISD::INTRINSIC_W_CHAIN;
Info.memVT = MVT::getVT(CI.getArgOperand(0)->getType());
- Info.flags = MachineMemOperand::MOLoad |
- MachineMemOperand::MOStore |
- MachineMemOperand::MODereferenceable;
+ Info.flags |= MachineMemOperand::MOLoad |
+ MachineMemOperand::MOStore |
+ MachineMemOperand::MODereferenceable;
// XXX - Should this be volatile without known ordering?
Info.flags |= MachineMemOperand::MOVolatile;
+
+ switch (IntrID) {
+ default:
+ break;
+ case Intrinsic::amdgcn_raw_buffer_load_lds:
+ case Intrinsic::amdgcn_struct_buffer_load_lds: {
+ unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();
+ Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8);
+ return true;
+ }
+ }
}
return true;
}
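Two details of the getTgtMemIntrinsic changes are easy to miss: Info.flags is now seeded before the switch (MONone, plus MOInvariant when the call site carries !invariant.load metadata), so the per-intrinsic cases use |= to preserve that bit; and for the *_buffer_load_lds intrinsics the memory VT is derived from the byte-width operand. A small illustration of the width mapping, with a hypothetical width and an assumed LLVMContext Ctx in scope:

  // Hypothetical: a 4-byte LDS transfer is described as an i32 memory access.
  unsigned Width = 4;                            // the intrinsic's size operand
  EVT MemVT = EVT::getIntegerVT(Ctx, Width * 8); // == MVT::i32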
@@ -1200,7 +1046,7 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
Info.memVT = MVT::getVT(CI.getType());
Info.ptrVal = CI.getOperand(0);
Info.align.reset();
- Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
+ Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
const ConstantInt *Vol = cast<ConstantInt>(CI.getOperand(4));
if (!Vol->isZero())
@@ -1211,12 +1057,14 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
case Intrinsic::amdgcn_buffer_atomic_fadd: {
SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+ const GCNTargetMachine &TM =
+ static_cast<const GCNTargetMachine &>(getTargetMachine());
+
Info.opc = ISD::INTRINSIC_W_CHAIN;
Info.memVT = MVT::getVT(CI.getOperand(0)->getType());
- Info.ptrVal =
- MFI->getBufferPSV(*MF.getSubtarget<GCNSubtarget>().getInstrInfo());
+ Info.ptrVal = MFI->getBufferPSV(TM);
Info.align.reset();
- Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
+ Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
const ConstantInt *Vol = dyn_cast<ConstantInt>(CI.getOperand(4));
if (!Vol || !Vol->isZero())
@@ -1230,7 +1078,7 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
Info.memVT = MVT::getVT(CI.getType());
Info.ptrVal = CI.getOperand(0);
Info.align.reset();
- Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
+ Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
const ConstantInt *Vol = cast<ConstantInt>(CI.getOperand(1));
if (!Vol->isZero())
@@ -1243,20 +1091,23 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
Info.memVT = MVT::getVT(CI.getType());
Info.ptrVal = CI.getOperand(0);
Info.align.reset();
- Info.flags = MachineMemOperand::MOLoad |
- MachineMemOperand::MOStore |
- MachineMemOperand::MOVolatile;
+ Info.flags |= MachineMemOperand::MOLoad |
+ MachineMemOperand::MOStore |
+ MachineMemOperand::MOVolatile;
return true;
}
case Intrinsic::amdgcn_image_bvh_intersect_ray: {
SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
Info.opc = ISD::INTRINSIC_W_CHAIN;
Info.memVT = MVT::getVT(CI.getType()); // XXX: what is correct VT?
- Info.ptrVal =
- MFI->getImagePSV(*MF.getSubtarget<GCNSubtarget>().getInstrInfo());
+
+ const GCNTargetMachine &TM =
+ static_cast<const GCNTargetMachine &>(getTargetMachine());
+
+ Info.ptrVal = MFI->getImagePSV(TM);
Info.align.reset();
- Info.flags = MachineMemOperand::MOLoad |
- MachineMemOperand::MODereferenceable;
+ Info.flags |= MachineMemOperand::MOLoad |
+ MachineMemOperand::MODereferenceable;
return true;
}
case Intrinsic::amdgcn_global_atomic_fadd:
@@ -1264,15 +1115,17 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
case Intrinsic::amdgcn_global_atomic_fmax:
case Intrinsic::amdgcn_flat_atomic_fadd:
case Intrinsic::amdgcn_flat_atomic_fmin:
- case Intrinsic::amdgcn_flat_atomic_fmax: {
+ case Intrinsic::amdgcn_flat_atomic_fmax:
+ case Intrinsic::amdgcn_global_atomic_fadd_v2bf16:
+ case Intrinsic::amdgcn_flat_atomic_fadd_v2bf16: {
Info.opc = ISD::INTRINSIC_W_CHAIN;
Info.memVT = MVT::getVT(CI.getType());
Info.ptrVal = CI.getOperand(0);
Info.align.reset();
- Info.flags = MachineMemOperand::MOLoad |
- MachineMemOperand::MOStore |
- MachineMemOperand::MODereferenceable |
- MachineMemOperand::MOVolatile;
+ Info.flags |= MachineMemOperand::MOLoad |
+ MachineMemOperand::MOStore |
+ MachineMemOperand::MODereferenceable |
+ MachineMemOperand::MOVolatile;
return true;
}
case Intrinsic::amdgcn_ds_gws_init:
@@ -1283,18 +1136,29 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
case Intrinsic::amdgcn_ds_gws_sema_release_all: {
Info.opc = ISD::INTRINSIC_VOID;
+ const GCNTargetMachine &TM =
+ static_cast<const GCNTargetMachine &>(getTargetMachine());
+
SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
- Info.ptrVal =
- MFI->getGWSPSV(*MF.getSubtarget<GCNSubtarget>().getInstrInfo());
+ Info.ptrVal = MFI->getGWSPSV(TM);
// This is an abstract access, but we need to specify a type and size.
Info.memVT = MVT::i32;
Info.size = 4;
Info.align = Align(4);
- Info.flags = MachineMemOperand::MOStore;
if (IntrID == Intrinsic::amdgcn_ds_gws_barrier)
- Info.flags = MachineMemOperand::MOLoad;
+ Info.flags |= MachineMemOperand::MOLoad;
+ else
+ Info.flags |= MachineMemOperand::MOStore;
+ return true;
+ }
+ case Intrinsic::amdgcn_global_load_lds: {
+ Info.opc = ISD::INTRINSIC_VOID;
+ unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();
+ Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8);
+ Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore |
+ MachineMemOperand::MOVolatile;
return true;
}
default:
@@ -1319,6 +1183,8 @@ bool SITargetLowering::getAddrModeArguments(IntrinsicInst *II,
case Intrinsic::amdgcn_flat_atomic_fadd:
case Intrinsic::amdgcn_flat_atomic_fmin:
case Intrinsic::amdgcn_flat_atomic_fmax:
+ case Intrinsic::amdgcn_global_atomic_fadd_v2bf16:
+ case Intrinsic::amdgcn_flat_atomic_fadd_v2bf16:
case Intrinsic::amdgcn_global_atomic_csub: {
Value *Ptr = II->getArgOperand(0);
AccessTy = II->getType();
@@ -1506,47 +1372,96 @@ bool SITargetLowering::allowsMisalignedMemoryAccessesImpl(
AddrSpace == AMDGPUAS::REGION_ADDRESS) {
// Check if alignment requirements for ds_read/write instructions are
// disabled.
- if (Subtarget->hasUnalignedDSAccessEnabled() &&
- !Subtarget->hasLDSMisalignedBug()) {
- if (IsFast)
- *IsFast = Alignment != Align(2);
- return true;
- }
+ if (!Subtarget->hasUnalignedDSAccessEnabled() && Alignment < Align(4))
+ return false;
+
+ Align RequiredAlignment(PowerOf2Ceil(Size/8)); // Natural alignment.
+ if (Subtarget->hasLDSMisalignedBug() && Size > 32 &&
+ Alignment < RequiredAlignment)
+ return false;
// Either, the alignment requirements are "enabled", or there is an
// unaligned LDS access related hardware bug though alignment requirements
// are "disabled". In either case, we need to check for proper alignment
// requirements.
//
- if (Size == 64) {
+ switch (Size) {
+ case 64:
+ // SI has a hardware bug in the LDS / GDS bounds checking: if the base
+ // address is negative, then the instruction is incorrectly treated as
+ // out-of-bounds even if base + offsets is in bounds. Split vectorized
+ // loads here to avoid emitting ds_read2_b32. We may re-combine the
+ // load later in the SILoadStoreOptimizer.
+ if (!Subtarget->hasUsableDSOffset() && Alignment < Align(8))
+ return false;
+
// 8 byte accessing via ds_read/write_b64 require 8-byte alignment, but we
// can do a 4 byte aligned, 8 byte access in a single operation using
// ds_read2/write2_b32 with adjacent offsets.
- bool AlignedBy4 = Alignment >= Align(4);
- if (IsFast)
- *IsFast = AlignedBy4;
+ RequiredAlignment = Align(4);
+
+ if (Subtarget->hasUnalignedDSAccessEnabled()) {
+ // We will either select ds_read_b64/ds_write_b64 or ds_read2_b32/
+ // ds_write2_b32 depending on the alignment. In either case with either
+ // alignment there is no faster way of doing this.
+ if (IsFast)
+ *IsFast = true;
+ return true;
+ }
+
+ break;
+ case 96:
+ if (!Subtarget->hasDS96AndDS128())
+ return false;
- return AlignedBy4;
- }
- if (Size == 96) {
// 12 byte accessing via ds_read/write_b96 require 16-byte alignment on
// gfx8 and older.
- bool AlignedBy16 = Alignment >= Align(16);
- if (IsFast)
- *IsFast = AlignedBy16;
- return AlignedBy16;
- }
- if (Size == 128) {
+ if (Subtarget->hasUnalignedDSAccessEnabled()) {
+ // Naturally aligned access is fastest. However, also report it as fast
+ // if memory is aligned to less than a DWORD. A narrow load or store will
+ // be just as slow as a single ds_read_b96/ds_write_b96, but there will
+ // be more of them, so overall we pay less penalty by issuing a single
+ // instruction.
+ if (IsFast)
+ *IsFast = Alignment >= RequiredAlignment || Alignment < Align(4);
+ return true;
+ }
+
+ break;
+ case 128:
+ if (!Subtarget->hasDS96AndDS128() || !Subtarget->useDS128())
+ return false;
+
// 16 byte accessing via ds_read/write_b128 require 16-byte alignment on
// gfx8 and older, but we can do a 8 byte aligned, 16 byte access in a
// single operation using ds_read2/write2_b64.
- bool AlignedBy8 = Alignment >= Align(8);
- if (IsFast)
- *IsFast = AlignedBy8;
+ RequiredAlignment = Align(8);
+
+ if (Subtarget->hasUnalignedDSAccessEnabled()) {
+ // Naturally aligned access is fastest. However, also report it as fast
+ // if memory is aligned to less than a DWORD. A narrow load or store will
+ // be just as slow as a single ds_read_b128/ds_write_b128, but there
+ // will be more of them, so overall we pay less penalty by issuing a
+ // single instruction.
+ if (IsFast)
+ *IsFast = Alignment >= RequiredAlignment || Alignment < Align(4);
+ return true;
+ }
+
+ break;
+ default:
+ if (Size > 32)
+ return false;
- return AlignedBy8;
+ break;
}
+
+ if (IsFast)
+ *IsFast = Alignment >= RequiredAlignment;
+
+ return Alignment >= RequiredAlignment ||
+ Subtarget->hasUnalignedDSAccessEnabled();
}
if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS) {
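The rewritten LDS path folds the per-size rules into RequiredAlignment and a single trailing check. Below is a simplified, self-contained restatement of that policy; it deliberately omits the DS96/DS128 feature gates, the LDS-misaligned-bug workaround, and the IsFast reporting, so treat it as a reference sketch rather than the actual helper:

  // Sketch of the LDS (ds_*) alignment policy encoded above.
  static bool allowsMisalignedLDSSketch(unsigned SizeInBits, unsigned AlignBytes,
                                        bool UnalignedDSEnabled) {
    if (!UnalignedDSEnabled && AlignBytes < 4)
      return false;                       // ds alignment checks are in force
    unsigned RequiredBytes;
    switch (SizeInBits) {
    case 64:  RequiredBytes = 4;  break;  // ds_read2/write2_b32, adjacent offsets
    case 96:  RequiredBytes = 16; break;  // ds_read/write_b96 needs 16 bytes on gfx8-
    case 128: RequiredBytes = 8;  break;  // ds_read2/write2_b64
    default:
      if (SizeInBits > 32)
        return false;                     // other wide sizes are rejected outright
      RequiredBytes = 1;                  // natural (power-of-two) alignment
      while (RequiredBytes * 8 < SizeInBits)
        RequiredBytes *= 2;
      break;
    }
    return AlignBytes >= RequiredBytes || UnalignedDSEnabled;
  }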
@@ -1571,14 +1486,12 @@ bool SITargetLowering::allowsMisalignedMemoryAccessesImpl(
return AlignedBy4;
}
- if (Subtarget->hasUnalignedBufferAccessEnabled() &&
- !(AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
- AddrSpace == AMDGPUAS::REGION_ADDRESS)) {
- // If we have an uniform constant load, it still requires using a slow
+ if (Subtarget->hasUnalignedBufferAccessEnabled()) {
+ // If we have a uniform constant load, it still requires using a slow
// buffer instruction if unaligned.
if (IsFast) {
// Accesses can really be issued as 1-byte aligned or 4-byte aligned, so
- // 2-byte alignment is worse than 1 unless doing a 2-byte accesss.
+ // 2-byte alignment is worse than 1 unless doing a 2-byte access.
*IsFast = (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS ||
AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) ?
Alignment >= Align(4) : Alignment != Align(2);
@@ -1603,20 +1516,22 @@ bool SITargetLowering::allowsMisalignedMemoryAccessesImpl(
bool SITargetLowering::allowsMisalignedMemoryAccesses(
EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
bool *IsFast) const {
- if (IsFast)
- *IsFast = false;
+ bool Allow = allowsMisalignedMemoryAccessesImpl(VT.getSizeInBits(), AddrSpace,
+ Alignment, Flags, IsFast);
- // TODO: I think v3i32 should allow unaligned accesses on CI with DS_READ_B96,
- // which isn't a simple VT.
- // Until MVT is extended to handle this, simply check for the size and
- // rely on the condition below: allow accesses if the size is a multiple of 4.
- if (VT == MVT::Other || (VT != MVT::Other && VT.getSizeInBits() > 1024 &&
- VT.getStoreSize() > 16)) {
- return false;
+ if (Allow && IsFast && Subtarget->hasUnalignedDSAccessEnabled() &&
+ (AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
+ AddrSpace == AMDGPUAS::REGION_ADDRESS)) {
+ // Lie that it is fast if +unaligned-access-mode is passed so that DS
+ // accesses get vectorized. We could use ds_read2_b*/ds_write2_b*
+ // instructions on misaligned data, which is faster than a pair of
+ // ds_read_b*/ds_write_b* instructions that would be equally misaligned.
+ // This is only used by the common passes, selection always calls the
+ // allowsMisalignedMemoryAccessesImpl version.
+ *IsFast = true;
}
- return allowsMisalignedMemoryAccessesImpl(VT.getSizeInBits(), AddrSpace,
- Alignment, Flags, IsFast);
+ return Allow;
}
EVT SITargetLowering::getOptimalMemOpType(
@@ -1639,9 +1554,7 @@ EVT SITargetLowering::getOptimalMemOpType(
bool SITargetLowering::isMemOpHasNoClobberedMemOperand(const SDNode *N) const {
const MemSDNode *MemNode = cast<MemSDNode>(N);
- const Value *Ptr = MemNode->getMemOperand()->getValue();
- const Instruction *I = dyn_cast_or_null<Instruction>(Ptr);
- return I && I->getMetadata("amdgpu.noclobber");
+ return MemNode->getMemOperand()->getFlags() & MONoClobber;
}
bool SITargetLowering::isNonGlobalAddrSpace(unsigned AS) {
@@ -1681,6 +1594,15 @@ bool SITargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
return true;
}
+bool SITargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
+ unsigned Index) const {
+ if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
+ return false;
+
+ // TODO: Add more cases that are cheap.
+ return Index == 0;
+}
+
bool SITargetLowering::isTypeDesirableForOp(unsigned Op, EVT VT) const {
if (Subtarget->has16BitInsts() && VT == MVT::i16) {
switch (Op) {
@@ -2106,7 +2028,7 @@ void SITargetLowering::allocateSpecialInputSGPRs(
if (Info.hasDispatchPtr())
allocateSGPR64Input(CCInfo, ArgInfo.DispatchPtr);
- if (Info.hasQueuePtr())
+ if (Info.hasQueuePtr() && AMDGPU::getAmdhsaCodeObjectVersion() < 5)
allocateSGPR64Input(CCInfo, ArgInfo.QueuePtr);
// Implicit arg ptr takes the place of the kernarg segment pointer. This is a
@@ -2153,7 +2075,7 @@ void SITargetLowering::allocateHSAUserSGPRs(CCState &CCInfo,
CCInfo.AllocateReg(DispatchPtrReg);
}
- if (Info.hasQueuePtr()) {
+ if (Info.hasQueuePtr() && AMDGPU::getAmdhsaCodeObjectVersion() < 5) {
Register QueuePtrReg = Info.addQueuePtr(TRI);
MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);
CCInfo.AllocateReg(QueuePtrReg);
@@ -2190,6 +2112,24 @@ void SITargetLowering::allocateSystemSGPRs(CCState &CCInfo,
SIMachineFunctionInfo &Info,
CallingConv::ID CallConv,
bool IsShader) const {
+ if (Subtarget->hasUserSGPRInit16Bug()) {
+ // Pad up the used user SGPRs with dead inputs.
+ unsigned CurrentUserSGPRs = Info.getNumUserSGPRs();
+
+ // Note we do not count the PrivateSegmentWaveByteOffset. We do not want to
+ // rely on it to reach 16 since if we end up having no stack usage, it will
+ // not really be added.
+ unsigned NumRequiredSystemSGPRs = Info.hasWorkGroupIDX() +
+ Info.hasWorkGroupIDY() +
+ Info.hasWorkGroupIDZ() +
+ Info.hasWorkGroupInfo();
+ for (unsigned i = NumRequiredSystemSGPRs + CurrentUserSGPRs; i < 16; ++i) {
+ Register Reg = Info.addReservedUserSGPR();
+ MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
+ CCInfo.AllocateReg(Reg);
+ }
+ }
+
if (Info.hasWorkGroupIDX()) {
Register Reg = Info.addWorkGroupIDX();
MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
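The padding loop above executes 16 - (NumRequiredSystemSGPRs + CurrentUserSGPRs) times, so the number of dead user SGPRs added depends on how many inputs the kernel already requests. A tiny worked example with hypothetical counts:

  // Hypothetical counts, for illustration only.
  unsigned CurrentUserSGPRs = 10;      // user SGPRs already allocated
  unsigned NumRequiredSystemSGPRs = 3; // work-group ID X, Y, Z; no work-group info
  unsigned DeadSGPRs = 16 - (NumRequiredSystemSGPRs + CurrentUserSGPRs); // == 3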
@@ -2234,6 +2174,8 @@ void SITargetLowering::allocateSystemSGPRs(CCState &CCInfo,
MF.addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass);
CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg);
}
+
+ assert(!Subtarget->hasUserSGPRInit16Bug() || Info.getNumPreloadedSGPRs() >= 16);
}
static void reservePrivateMemoryRegs(const TargetMachine &TM,
@@ -2388,7 +2330,7 @@ SDValue SITargetLowering::LowerFormalArguments(
return DAG.getEntryNode();
}
- Info->allocateModuleLDSGlobal(Fn.getParent());
+ Info->allocateModuleLDSGlobal(Fn);
SmallVector<ISD::InputArg, 16> Splits;
SmallVector<CCValAssign, 16> ArgLocs;
@@ -2538,7 +2480,13 @@ SDValue SITargetLowering::LowerFormalArguments(
assert(VA.isRegLoc() && "Parameter must be in a register!");
Register Reg = VA.getLocReg();
- const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg, VT);
+ const TargetRegisterClass *RC = nullptr;
+ if (AMDGPU::VGPR_32RegClass.contains(Reg))
+ RC = &AMDGPU::VGPR_32RegClass;
+ else if (AMDGPU::SGPR_32RegClass.contains(Reg))
+ RC = &AMDGPU::SGPR_32RegClass;
+ else
+ llvm_unreachable("Unexpected register class in LowerFormalArguments!");
EVT ValVT = VA.getValVT();
Reg = MF.addLiveIn(Reg, RC);
@@ -2657,24 +2605,6 @@ SITargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
SmallVector<SDValue, 48> RetOps;
RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
- // Add return address for callable functions.
- if (!Info->isEntryFunction()) {
- const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
- SDValue ReturnAddrReg = CreateLiveInRegister(
- DAG, &AMDGPU::SReg_64RegClass, TRI->getReturnAddressReg(MF), MVT::i64);
-
- SDValue ReturnAddrVirtualReg =
- DAG.getRegister(MF.getRegInfo().createVirtualRegister(
- CallConv != CallingConv::AMDGPU_Gfx
- ? &AMDGPU::CCR_SGPR_64RegClass
- : &AMDGPU::Gfx_CCR_SGPR_64RegClass),
- MVT::i64);
- Chain =
- DAG.getCopyToReg(Chain, DL, ReturnAddrVirtualReg, ReturnAddrReg, Flag);
- Flag = Chain.getValue(1);
- RetOps.push_back(ReturnAddrVirtualReg);
- }
-
// Copy the result values into the output registers.
for (unsigned I = 0, RealRVLocIdx = 0, E = RVLocs.size(); I != E;
++I, ++RealRVLocIdx) {
@@ -2731,15 +2661,8 @@ SITargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
RetOps.push_back(Flag);
unsigned Opc = AMDGPUISD::ENDPGM;
- if (!IsWaveEnd) {
- if (IsShader)
- Opc = AMDGPUISD::RETURN_TO_EPILOG;
- else if (CallConv == CallingConv::AMDGPU_Gfx)
- Opc = AMDGPUISD::RET_GFX_FLAG;
- else
- Opc = AMDGPUISD::RET_FLAG;
- }
-
+ if (!IsWaveEnd)
+ Opc = IsShader ? AMDGPUISD::RETURN_TO_EPILOG : AMDGPUISD::RET_FLAG;
return DAG.getNode(Opc, DL, MVT::Other, RetOps);
}
@@ -3321,21 +3244,6 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
}
- SDValue PhysReturnAddrReg;
- if (IsTailCall) {
- // Since the return is being combined with the call, we need to pass on the
- // return address.
-
- const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
- SDValue ReturnAddrReg = CreateLiveInRegister(
- DAG, &AMDGPU::SReg_64RegClass, TRI->getReturnAddressReg(MF), MVT::i64);
-
- PhysReturnAddrReg = DAG.getRegister(TRI->getReturnAddressReg(MF),
- MVT::i64);
- Chain = DAG.getCopyToReg(Chain, DL, PhysReturnAddrReg, ReturnAddrReg, InFlag);
- InFlag = Chain.getValue(1);
- }
-
// We don't usually want to end the call-sequence here because we would tidy
// the frame up *after* the call, however in the ABI-changing tail-call case
// we've carefully laid out the parameters so that when sp is reset they'll be
@@ -3365,8 +3273,6 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
// this information must travel along with the operation for eventual
// consumption by emitEpilogue.
Ops.push_back(DAG.getTargetConstant(FPDiff, DL, MVT::i32));
-
- Ops.push_back(PhysReturnAddrReg);
}
// Add argument registers to the end of the list so that they are known live
@@ -4104,6 +4010,21 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
bool IsAdd = (MI.getOpcode() == AMDGPU::V_ADD_U64_PSEUDO);
+ MachineOperand &Dest = MI.getOperand(0);
+ MachineOperand &Src0 = MI.getOperand(1);
+ MachineOperand &Src1 = MI.getOperand(2);
+
+ if (IsAdd && ST.hasLshlAddB64()) {
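+    // v_lshl_add_u64 computes (src0 << shift) + src1, so a shift amount of 0
+    // gives a plain 64-bit add in a single instruction.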
+ auto Add = BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_LSHL_ADD_U64_e64),
+ Dest.getReg())
+ .add(Src0)
+ .addImm(0)
+ .add(Src1);
+ TII->legalizeOperands(*Add);
+ MI.eraseFromParent();
+ return BB;
+ }
+
const auto *CarryRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
@@ -4112,10 +4033,6 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
Register CarryReg = MRI.createVirtualRegister(CarryRC);
Register DeadCarryReg = MRI.createVirtualRegister(CarryRC);
- MachineOperand &Dest = MI.getOperand(0);
- MachineOperand &Src0 = MI.getOperand(1);
- MachineOperand &Src1 = MI.getOperand(2);
-
const TargetRegisterClass *Src0RC = Src0.isReg()
? MRI.getRegClass(Src0.getReg())
: &AMDGPU::VReg_64RegClass;
@@ -4390,29 +4307,7 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
case AMDGPU::DS_GWS_INIT:
case AMDGPU::DS_GWS_SEMA_BR:
case AMDGPU::DS_GWS_BARRIER:
- if (Subtarget->needsAlignedVGPRs()) {
- // Add implicit aligned super-reg to force alignment on the data operand.
- const DebugLoc &DL = MI.getDebugLoc();
- MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
- const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
- MachineOperand *Op = TII->getNamedOperand(MI, AMDGPU::OpName::data0);
- Register DataReg = Op->getReg();
- bool IsAGPR = TRI->isAGPR(MRI, DataReg);
- Register Undef = MRI.createVirtualRegister(
- IsAGPR ? &AMDGPU::AGPR_32RegClass : &AMDGPU::VGPR_32RegClass);
- BuildMI(*BB, MI, DL, TII->get(AMDGPU::IMPLICIT_DEF), Undef);
- Register NewVR =
- MRI.createVirtualRegister(IsAGPR ? &AMDGPU::AReg_64_Align2RegClass
- : &AMDGPU::VReg_64_Align2RegClass);
- BuildMI(*BB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE), NewVR)
- .addReg(DataReg, 0, Op->getSubReg())
- .addImm(AMDGPU::sub0)
- .addReg(Undef)
- .addImm(AMDGPU::sub1);
- Op->setReg(NewVR);
- Op->setSubReg(AMDGPU::sub0);
- MI.addOperand(MachineOperand::CreateReg(NewVR, false, true));
- }
+ TII->enforceOperandRCAlignment(MI, AMDGPU::OpName::data0);
LLVM_FALLTHROUGH;
case AMDGPU::DS_GWS_SEMA_V:
case AMDGPU::DS_GWS_SEMA_P:
@@ -4500,6 +4395,18 @@ bool SITargetLowering::hasBitPreservingFPLogic(EVT VT) const {
return isTypeLegal(VT.getScalarType());
}
+bool SITargetLowering::hasAtomicFaddRtnForTy(SDValue &Op) const {
+ switch (Op.getValue(0).getSimpleValueType().SimpleTy) {
+ case MVT::f32:
+ return Subtarget->hasAtomicFaddRtnInsts();
+ case MVT::v2f16:
+ case MVT::f64:
+ return Subtarget->hasGFX90AInsts();
+ default:
+ return false;
+ }
+}
+
bool SITargetLowering::enableAggressiveFMAFusion(EVT VT) const {
// This currently forces unfolding various combinations of fsub into fma with
// free fneg'd operands. As long as we have fast FMA (controlled by
@@ -4560,7 +4467,7 @@ bool SITargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
// Otherwise f32 mad is always full rate and returns the same result as
// the separate operations so should be preferred over fma.
- // However does not support denomals.
+  // However, it does not support denormals.
if (hasFP32Denormals(MF))
return Subtarget->hasFastFMAF32() || Subtarget->hasDLInsts();
@@ -4653,8 +4560,9 @@ SDValue SITargetLowering::splitBinaryVectorOp(SDValue Op,
unsigned Opc = Op.getOpcode();
EVT VT = Op.getValueType();
assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4f32 ||
- VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8f32 ||
- VT == MVT::v16f32 || VT == MVT::v32f32);
+ VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i16 ||
+ VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
+ VT == MVT::v32f32);
SDValue Lo0, Hi0;
std::tie(Lo0, Hi0) = DAG.SplitVectorOperand(Op.getNode(), 0);
@@ -4676,8 +4584,9 @@ SDValue SITargetLowering::splitTernaryVectorOp(SDValue Op,
unsigned Opc = Op.getOpcode();
EVT VT = Op.getValueType();
assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v8i16 ||
- VT == MVT::v8f16 || VT == MVT::v4f32 || VT == MVT::v8f32 ||
- VT == MVT::v16f32 || VT == MVT::v32f32);
+ VT == MVT::v8f16 || VT == MVT::v4f32 || VT == MVT::v16i16 ||
+ VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
+ VT == MVT::v32f32);
SDValue Lo0, Hi0;
SDValue Op0 = Op.getOperand(0);
@@ -4738,10 +4647,30 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
return lowerEXTRACT_VECTOR_ELT(Op, DAG);
case ISD::VECTOR_SHUFFLE:
return lowerVECTOR_SHUFFLE(Op, DAG);
+ case ISD::SCALAR_TO_VECTOR:
+ return lowerSCALAR_TO_VECTOR(Op, DAG);
case ISD::BUILD_VECTOR:
return lowerBUILD_VECTOR(Op, DAG);
case ISD::FP_ROUND:
return lowerFP_ROUND(Op, DAG);
+ case ISD::FPTRUNC_ROUND: {
+ unsigned Opc;
+ SDLoc DL(Op);
+
+ if (Op.getOperand(0)->getValueType(0) != MVT::f32)
+ return SDValue();
+
+ // Get the rounding mode from the last operand
+ int RoundMode = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
+ if (RoundMode == (int)RoundingMode::TowardPositive)
+ Opc = AMDGPUISD::FPTRUNC_ROUND_UPWARD;
+ else if (RoundMode == (int)RoundingMode::TowardNegative)
+ Opc = AMDGPUISD::FPTRUNC_ROUND_DOWNWARD;
+ else
+ return SDValue();
+
+ return DAG.getNode(Opc, DL, Op.getNode()->getVTList(), Op->getOperand(0));
+ }
case ISD::TRAP:
return lowerTRAP(Op, DAG);
case ISD::DEBUGTRAP:
@@ -5356,7 +5285,7 @@ SDValue SITargetLowering::lowerFMINNUM_FMAXNUM(SDValue Op,
if (IsIEEEMode)
return expandFMINNUM_FMAXNUM(Op.getNode(), DAG);
- if (VT == MVT::v4f16 || VT == MVT::v8f16)
+ if (VT == MVT::v4f16 || VT == MVT::v8f16 || VT == MVT::v16f16)
return splitBinaryVectorOp(Op, DAG);
return Op;
}
@@ -5439,24 +5368,41 @@ SDValue SITargetLowering::lowerTrapEndpgm(
return DAG.getNode(AMDGPUISD::ENDPGM, SL, MVT::Other, Chain);
}
+SDValue SITargetLowering::loadImplicitKernelArgument(SelectionDAG &DAG, MVT VT,
+ const SDLoc &DL, Align Alignment, ImplicitParameter Param) const {
+ MachineFunction &MF = DAG.getMachineFunction();
+ uint64_t Offset = getImplicitParameterOffset(MF, Param);
+ SDValue Ptr = lowerKernArgParameterPtr(DAG, DL, DAG.getEntryNode(), Offset);
+ MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
+ return DAG.getLoad(VT, DL, DAG.getEntryNode(), Ptr, PtrInfo, Alignment,
+ MachineMemOperand::MODereferenceable |
+ MachineMemOperand::MOInvariant);
+}
+
SDValue SITargetLowering::lowerTrapHsaQueuePtr(
SDValue Op, SelectionDAG &DAG) const {
SDLoc SL(Op);
SDValue Chain = Op.getOperand(0);
- MachineFunction &MF = DAG.getMachineFunction();
- SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
- Register UserSGPR = Info->getQueuePtrUserSGPR();
-
SDValue QueuePtr;
- if (UserSGPR == AMDGPU::NoRegister) {
- // We probably are in a function incorrectly marked with
- // amdgpu-no-queue-ptr. This is undefined. We don't want to delete the trap,
- // so just use a null pointer.
- QueuePtr = DAG.getConstant(0, SL, MVT::i64);
+ // For code object version 5, QueuePtr is passed through implicit kernarg.
+ if (AMDGPU::getAmdhsaCodeObjectVersion() == 5) {
+ QueuePtr =
+ loadImplicitKernelArgument(DAG, MVT::i64, SL, Align(8), QUEUE_PTR);
} else {
- QueuePtr = CreateLiveInRegister(
- DAG, &AMDGPU::SReg_64RegClass, UserSGPR, MVT::i64);
+ MachineFunction &MF = DAG.getMachineFunction();
+ SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
+ Register UserSGPR = Info->getQueuePtrUserSGPR();
+
+ if (UserSGPR == AMDGPU::NoRegister) {
+ // We probably are in a function incorrectly marked with
+ // amdgpu-no-queue-ptr. This is undefined. We don't want to delete the
+ // trap, so just use a null pointer.
+ QueuePtr = DAG.getConstant(0, SL, MVT::i64);
+ } else {
+ QueuePtr = CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass, UserSGPR,
+ MVT::i64);
+ }
}
SDValue SGPR01 = DAG.getRegister(AMDGPU::SGPR0_SGPR1, MVT::i64);
@@ -5532,6 +5478,14 @@ SDValue SITargetLowering::getSegmentAperture(unsigned AS, const SDLoc &DL,
return DAG.getNode(ISD::SHL, DL, MVT::i32, ApertureReg, ShiftAmount);
}
+ // For code object version 5, private_base and shared_base are passed through
+ // implicit kernargs.
+ if (AMDGPU::getAmdhsaCodeObjectVersion() == 5) {
+ ImplicitParameter Param =
+ (AS == AMDGPUAS::LOCAL_ADDRESS) ? SHARED_BASE : PRIVATE_BASE;
+ return loadImplicitKernelArgument(DAG, MVT::i32, DL, Align(4), Param);
+ }
+
MachineFunction &MF = DAG.getMachineFunction();
SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
Register UserSGPR = Info->getQueuePtrUserSGPR();
@@ -5691,14 +5645,11 @@ SDValue SITargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op,
EVT EltVT = VecVT.getVectorElementType();
unsigned VecSize = VecVT.getSizeInBits();
unsigned EltSize = EltVT.getSizeInBits();
+ SDLoc SL(Op);
-
- assert(VecSize <= 64);
-
+ // Specially handle the case of v4i16 with static indexing.
unsigned NumElts = VecVT.getVectorNumElements();
- SDLoc SL(Op);
auto KIdx = dyn_cast<ConstantSDNode>(Idx);
-
if (NumElts == 4 && EltSize == 16 && KIdx) {
SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Vec);
@@ -5726,35 +5677,41 @@ SDValue SITargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op,
return DAG.getNode(ISD::BITCAST, SL, VecVT, Concat);
}
+  // Static indexing does not lower to stack access, so there is no need for
+  // special custom lowering to avoid it.
if (isa<ConstantSDNode>(Idx))
return SDValue();
- MVT IntVT = MVT::getIntegerVT(VecSize);
-
- // Avoid stack access for dynamic indexing.
+ // Avoid stack access for dynamic indexing by custom lowering to
// v_bfi_b32 (v_bfm_b32 16, (shl idx, 16)), val, vec
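+  // For example, inserting into element 2 of a v4i16 gives ScaledIdx == 32 and
+  // BFM == 0xffff << 32, so the BFI keeps element 2 from the splatted value
+  // and all other elements from the original vector.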
- // Create a congruent vector with the target value in each element so that
- // the required element can be masked and ORed into the target vector.
- SDValue ExtVal = DAG.getNode(ISD::BITCAST, SL, IntVT,
- DAG.getSplatBuildVector(VecVT, SL, InsVal));
+ assert(VecSize <= 64 && "Expected target vector size to be <= 64 bits");
+
+ MVT IntVT = MVT::getIntegerVT(VecSize);
+ // Convert vector index to bit-index and get the required bit mask.
assert(isPowerOf2_32(EltSize));
SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32);
-
- // Convert vector index to bit-index.
SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, ScaleFactor);
-
- SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, IntVT, Vec);
SDValue BFM = DAG.getNode(ISD::SHL, SL, IntVT,
DAG.getConstant(0xffff, SL, IntVT),
ScaledIdx);
+ // 1. Create a congruent vector with the target value in each element.
+ SDValue ExtVal = DAG.getNode(ISD::BITCAST, SL, IntVT,
+ DAG.getSplatBuildVector(VecVT, SL, InsVal));
+
+  // 2. Mask off all other indices except the required index within (1).
SDValue LHS = DAG.getNode(ISD::AND, SL, IntVT, BFM, ExtVal);
+
+ // 3. Mask off the required index within the target vector.
+ SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, IntVT, Vec);
SDValue RHS = DAG.getNode(ISD::AND, SL, IntVT,
DAG.getNOT(SL, BFM, IntVT), BCVec);
+  // 4. OR (2) and (3) together to form the result vector.
SDValue BFI = DAG.getNode(ISD::OR, SL, IntVT, LHS, RHS);
+
return DAG.getNode(ISD::BITCAST, SL, VecVT, BFI);
}
@@ -5778,17 +5735,35 @@ SDValue SITargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
if (SDValue Combined = performExtractVectorEltCombine(Op.getNode(), DCI))
return Combined;
- if (VecSize == 128) {
+ if (VecSize == 128 || VecSize == 256) {
SDValue Lo, Hi;
EVT LoVT, HiVT;
- SDValue V2 = DAG.getBitcast(MVT::v2i64, Vec);
std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VecVT);
- Lo =
- DAG.getBitcast(LoVT, DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64,
- V2, DAG.getConstant(0, SL, MVT::i32)));
- Hi =
- DAG.getBitcast(HiVT, DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64,
- V2, DAG.getConstant(1, SL, MVT::i32)));
+
+ if (VecSize == 128) {
+ SDValue V2 = DAG.getBitcast(MVT::v2i64, Vec);
+ Lo = DAG.getBitcast(LoVT,
+ DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
+ DAG.getConstant(0, SL, MVT::i32)));
+ Hi = DAG.getBitcast(HiVT,
+ DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
+ DAG.getConstant(1, SL, MVT::i32)));
+ } else {
+ assert(VecSize == 256);
+
+ SDValue V2 = DAG.getBitcast(MVT::v4i64, Vec);
+ SDValue Parts[4];
+ for (unsigned P = 0; P < 4; ++P) {
+ Parts[P] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
+ DAG.getConstant(P, SL, MVT::i32));
+ }
+
+ Lo = DAG.getBitcast(LoVT, DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i64,
+ Parts[0], Parts[1]));
+ Hi = DAG.getBitcast(HiVT, DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i64,
+ Parts[2], Parts[3]));
+ }
+
EVT IdxVT = Idx.getValueType();
unsigned NElem = VecVT.getVectorNumElements();
assert(isPowerOf2_32(NElem));
@@ -5800,10 +5775,19 @@ SDValue SITargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
assert(VecSize <= 64);
+ MVT IntVT = MVT::getIntegerVT(VecSize);
+
+ // If Vec is just a SCALAR_TO_VECTOR, then use the scalar integer directly.
+ SDValue VecBC = peekThroughBitcasts(Vec);
+ if (VecBC.getOpcode() == ISD::SCALAR_TO_VECTOR) {
+ SDValue Src = VecBC.getOperand(0);
+ Src = DAG.getBitcast(Src.getValueType().changeTypeToInteger(), Src);
+ Vec = DAG.getAnyExtOrTrunc(Src, SL, IntVT);
+ }
+
unsigned EltSize = EltVT.getSizeInBits();
assert(isPowerOf2_32(EltSize));
- MVT IntVT = MVT::getIntegerVT(VecSize);
SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32);
// Convert vector index to bit-index (* EltSize)
@@ -5877,6 +5861,22 @@ SDValue SITargetLowering::lowerVECTOR_SHUFFLE(SDValue Op,
return DAG.getNode(ISD::CONCAT_VECTORS, SL, ResultVT, Pieces);
}
+SDValue SITargetLowering::lowerSCALAR_TO_VECTOR(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDValue SVal = Op.getOperand(0);
+ EVT ResultVT = Op.getValueType();
+ EVT SValVT = SVal.getValueType();
+ SDValue UndefVal = DAG.getUNDEF(SValVT);
+ SDLoc SL(Op);
+
+ SmallVector<SDValue, 8> VElts;
+ VElts.push_back(SVal);
+ for (int I = 1, E = ResultVT.getVectorNumElements(); I < E; ++I)
+ VElts.push_back(UndefVal);
+
+ return DAG.getBuildVector(ResultVT, SL, VElts);
+}
+
SDValue SITargetLowering::lowerBUILD_VECTOR(SDValue Op,
SelectionDAG &DAG) const {
SDLoc SL(Op);
@@ -5906,6 +5906,27 @@ SDValue SITargetLowering::lowerBUILD_VECTOR(SDValue Op,
return DAG.getNode(ISD::BITCAST, SL, VT, Blend);
}
+ if (VT == MVT::v16i16 || VT == MVT::v16f16) {
+ EVT QuarterVT = MVT::getVectorVT(VT.getVectorElementType().getSimpleVT(),
+ VT.getVectorNumElements() / 4);
+ MVT QuarterIntVT = MVT::getIntegerVT(QuarterVT.getSizeInBits());
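+    // e.g. for v16f16 each quarter is a v4f16 piece, so QuarterIntVT is i64
+    // and the blend below is a v4i64 that is cast back to v16f16.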
+
+ SmallVector<SDValue, 4> Parts[4];
+ for (unsigned I = 0, E = VT.getVectorNumElements() / 4; I != E; ++I) {
+ for (unsigned P = 0; P < 4; ++P)
+ Parts[P].push_back(Op.getOperand(I + P * E));
+ }
+ SDValue Casts[4];
+ for (unsigned P = 0; P < 4; ++P) {
+ SDValue Vec = DAG.getBuildVector(QuarterVT, SL, Parts[P]);
+ Casts[P] = DAG.getNode(ISD::BITCAST, SL, QuarterIntVT, Vec);
+ }
+
+ SDValue Blend =
+ DAG.getBuildVector(MVT::getVectorVT(QuarterIntVT, 4), SL, Casts);
+ return DAG.getNode(ISD::BITCAST, SL, VT, Blend);
+ }
+
assert(VT == MVT::v2f16 || VT == MVT::v2i16);
assert(!Subtarget->hasVOP3PInsts() && "this should be legal");
@@ -6277,6 +6298,7 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim);
unsigned IntrOpcode = Intr->BaseOpcode;
bool IsGFX10Plus = AMDGPU::isGFX10Plus(*Subtarget);
+ bool IsGFX11Plus = AMDGPU::isGFX11Plus(*Subtarget);
SmallVector<EVT, 3> ResultTypes(Op->values());
SmallVector<EVT, 3> OrigResultTypes(Op->values());
@@ -6455,6 +6477,10 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
//
// SIShrinkInstructions will convert NSA encodings to non-NSA after register
// allocation when possible.
+ //
+ // TODO: we can actually allow partial NSA where the final register is a
+ // contiguous set of the remaining addresses.
+ // This could help where there are more addresses than supported.
bool UseNSA = ST->hasFeature(AMDGPU::FeatureNSAEncoding) &&
VAddrs.size() >= 3 &&
VAddrs.size() <= (unsigned)ST->getNSAMaxSize();
@@ -6561,7 +6587,12 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
UseNSA ? VAddrs.size() : VAddr.getValueType().getSizeInBits() / 32;
int Opcode = -1;
- if (IsGFX10Plus) {
+ if (IsGFX11Plus) {
+ Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
+ UseNSA ? AMDGPU::MIMGEncGfx11NSA
+ : AMDGPU::MIMGEncGfx11Default,
+ NumVDataDwords, NumVAddrDwords);
+ } else if (IsGFX10Plus) {
Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
UseNSA ? AMDGPU::MIMGEncGfx10NSA
: AMDGPU::MIMGEncGfx10Default,
@@ -6685,6 +6716,32 @@ SDValue SITargetLowering::lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc,
return Loads[0];
}
+SDValue SITargetLowering::lowerWorkitemID(SelectionDAG &DAG, SDValue Op,
+ unsigned Dim,
+ const ArgDescriptor &Arg) const {
+ SDLoc SL(Op);
+ MachineFunction &MF = DAG.getMachineFunction();
+ unsigned MaxID = Subtarget->getMaxWorkitemID(MF.getFunction(), Dim);
+ if (MaxID == 0)
+ return DAG.getConstant(0, SL, MVT::i32);
+
+ SDValue Val = loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32,
+ SDLoc(DAG.getEntryNode()), Arg);
+
+ // Don't bother inserting AssertZext for packed IDs since we're emitting the
+ // masking operations anyway.
+ //
+ // TODO: We could assert the top bit is 0 for the source copy.
+ if (Arg.isMasked())
+ return Val;
+
+ // Preserve the known bits after expansion to a copy.
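+  // e.g. a maximum workitem ID of 1023 gives an AssertZext to i10.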
+ EVT SmallVT =
+ EVT::getIntegerVT(*DAG.getContext(), 32 - countLeadingZeros(MaxID));
+ return DAG.getNode(ISD::AssertZext, SL, MVT::i32, Val,
+ DAG.getValueType(SmallVT));
+}
+
SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
SelectionDAG &DAG) const {
MachineFunction &MF = DAG.getMachineFunction();
@@ -6831,26 +6888,11 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
return getPreloadedValue(DAG, *MFI, VT,
AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
case Intrinsic::amdgcn_workitem_id_x:
- if (Subtarget->getMaxWorkitemID(MF.getFunction(), 0) == 0)
- return DAG.getConstant(0, DL, MVT::i32);
-
- return loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32,
- SDLoc(DAG.getEntryNode()),
- MFI->getArgInfo().WorkItemIDX);
+ return lowerWorkitemID(DAG, Op, 0, MFI->getArgInfo().WorkItemIDX);
case Intrinsic::amdgcn_workitem_id_y:
- if (Subtarget->getMaxWorkitemID(MF.getFunction(), 1) == 0)
- return DAG.getConstant(0, DL, MVT::i32);
-
- return loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32,
- SDLoc(DAG.getEntryNode()),
- MFI->getArgInfo().WorkItemIDY);
+ return lowerWorkitemID(DAG, Op, 1, MFI->getArgInfo().WorkItemIDY);
case Intrinsic::amdgcn_workitem_id_z:
- if (Subtarget->getMaxWorkitemID(MF.getFunction(), 2) == 0)
- return DAG.getConstant(0, DL, MVT::i32);
-
- return loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32,
- SDLoc(DAG.getEntryNode()),
- MFI->getArgInfo().WorkItemIDZ);
+ return lowerWorkitemID(DAG, Op, 2, MFI->getArgInfo().WorkItemIDZ);
case Intrinsic::amdgcn_wavefrontsize:
return DAG.getConstant(MF.getSubtarget<GCNSubtarget>().getWavefrontSize(),
SDLoc(Op), MVT::i32);
@@ -7157,12 +7199,14 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
unsigned ShaderType =
SIInstrInfo::getDSShaderTypeValue(DAG.getMachineFunction());
unsigned Offset0 = OrderedCountIndex << 2;
- unsigned Offset1 = WaveRelease | (WaveDone << 1) | (ShaderType << 2) |
- (Instruction << 4);
+ unsigned Offset1 = WaveRelease | (WaveDone << 1) | (Instruction << 4);
if (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10)
Offset1 |= (CountDw - 1) << 6;
+ if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX11)
+ Offset1 |= ShaderType << 2;
+
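+    // Offset1 bit layout: [0] wave release, [1] wave done, [3:2] shader type
+    // (pre-GFX11 only), [5:4] instruction, [7:6] dword count - 1 (GFX10+).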
unsigned Offset = Offset0 | (Offset1 << 8);
SDValue Ops[] = {
@@ -7441,7 +7485,7 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
Opcode = AMDGPUISD::BUFFER_ATOMIC_XOR;
break;
case Intrinsic::amdgcn_buffer_atomic_fadd:
- if (!Op.getValue(0).use_empty() && !Subtarget->hasGFX90AInsts()) {
+ if (!Op.getValue(0).use_empty() && !hasAtomicFaddRtnForTy(Op)) {
DiagnosticInfoUnsupported
NoFpRet(DAG.getMachineFunction().getFunction(),
"return versions of fp atomics not supported",
@@ -7609,12 +7653,14 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
return SDValue();
}
+ const bool IsGFX11Plus = AMDGPU::isGFX11Plus(*Subtarget);
const bool IsA16 = RayDir.getValueType().getVectorElementType() == MVT::f16;
const bool Is64 = NodePtr.getValueType() == MVT::i64;
const unsigned NumVDataDwords = 4;
const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
- const bool UseNSA = Subtarget->hasNSAEncoding() &&
- NumVAddrDwords <= Subtarget->getNSAMaxSize();
+ const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords;
+ const bool UseNSA =
+ Subtarget->hasNSAEncoding() && NumVAddrs <= Subtarget->getNSAMaxSize();
const unsigned BaseOpcodes[2][2] = {
{AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
{AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
@@ -7622,12 +7668,15 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
int Opcode;
if (UseNSA) {
Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
- AMDGPU::MIMGEncGfx10NSA, NumVDataDwords,
- NumVAddrDwords);
+ IsGFX11Plus ? AMDGPU::MIMGEncGfx11NSA
+ : AMDGPU::MIMGEncGfx10NSA,
+ NumVDataDwords, NumVAddrDwords);
} else {
- Opcode = AMDGPU::getMIMGOpcode(
- BaseOpcodes[Is64][IsA16], AMDGPU::MIMGEncGfx10Default, NumVDataDwords,
- PowerOf2Ceil(NumVAddrDwords));
+ Opcode =
+ AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
+ IsGFX11Plus ? AMDGPU::MIMGEncGfx11Default
+ : AMDGPU::MIMGEncGfx10Default,
+ NumVDataDwords, PowerOf2Ceil(NumVAddrDwords));
}
assert(Opcode != -1);
@@ -7660,15 +7709,36 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
}
};
- if (Is64)
- DAG.ExtractVectorElements(DAG.getBitcast(MVT::v2i32, NodePtr), Ops, 0, 2);
- else
+ if (UseNSA && IsGFX11Plus) {
Ops.push_back(NodePtr);
+ Ops.push_back(DAG.getBitcast(MVT::i32, RayExtent));
+ Ops.push_back(RayOrigin);
+ if (IsA16) {
+ SmallVector<SDValue, 3> DirLanes, InvDirLanes, MergedLanes;
+ DAG.ExtractVectorElements(RayDir, DirLanes, 0, 3);
+ DAG.ExtractVectorElements(RayInvDir, InvDirLanes, 0, 3);
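+      // Merge per axis: lane I packs DirLanes[I] and InvDirLanes[I] as the
+      // two f16 halves of a single dword.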
+ for (unsigned I = 0; I < 3; ++I) {
+ MergedLanes.push_back(DAG.getBitcast(
+ MVT::i32, DAG.getBuildVector(MVT::v2f16, DL,
+ {DirLanes[I], InvDirLanes[I]})));
+ }
+ Ops.push_back(DAG.getBuildVector(MVT::v3i32, DL, MergedLanes));
+ } else {
+ Ops.push_back(RayDir);
+ Ops.push_back(RayInvDir);
+ }
+ } else {
+ if (Is64)
+ DAG.ExtractVectorElements(DAG.getBitcast(MVT::v2i32, NodePtr), Ops, 0,
+ 2);
+ else
+ Ops.push_back(NodePtr);
- Ops.push_back(DAG.getBitcast(MVT::i32, RayExtent));
- packLanes(RayOrigin, true);
- packLanes(RayDir, true);
- packLanes(RayInvDir, false);
+ Ops.push_back(DAG.getBitcast(MVT::i32, RayExtent));
+ packLanes(RayOrigin, true);
+ packLanes(RayDir, true);
+ packLanes(RayInvDir, false);
+ }
if (!UseNSA) {
// Build a single vector containing all the operands so far prepared.
@@ -7868,6 +7938,12 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
switch (IntrinsicID) {
case Intrinsic::amdgcn_exp_compr: {
+ if (!Subtarget->hasCompressedExport()) {
+ DiagnosticInfoUnsupported BadIntrin(
+ DAG.getMachineFunction().getFunction(),
+ "intrinsic not supported on subtarget", DL.getDebugLoc());
+ DAG.getContext()->diagnose(BadIntrin);
+ }
SDValue Src0 = Op.getOperand(4);
SDValue Src1 = Op.getOperand(5);
// Hack around illegal type on SI by directly selecting it.
@@ -8110,6 +8186,160 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
M->getMemoryVT(), M->getMemOperand());
}
+ case Intrinsic::amdgcn_raw_buffer_load_lds:
+ case Intrinsic::amdgcn_struct_buffer_load_lds: {
+ unsigned Opc;
+ bool HasVIndex = IntrinsicID == Intrinsic::amdgcn_struct_buffer_load_lds;
+ unsigned OpOffset = HasVIndex ? 1 : 0;
+ SDValue VOffset = Op.getOperand(5 + OpOffset);
+ auto CVOffset = dyn_cast<ConstantSDNode>(VOffset);
+ bool HasVOffset = !CVOffset || !CVOffset->isZero();
+ unsigned Size = Op->getConstantOperandVal(4);
+
+ switch (Size) {
+ default:
+ return SDValue();
+ case 1:
+ Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_BOTHEN
+ : AMDGPU::BUFFER_LOAD_UBYTE_LDS_IDXEN
+ : HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFEN
+ : AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFSET;
+ break;
+ case 2:
+ Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_BOTHEN
+ : AMDGPU::BUFFER_LOAD_USHORT_LDS_IDXEN
+ : HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFEN
+ : AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFSET;
+ break;
+ case 4:
+ Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_BOTHEN
+ : AMDGPU::BUFFER_LOAD_DWORD_LDS_IDXEN
+ : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFEN
+ : AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFSET;
+ break;
+ }
+
+ SDValue M0Val = copyToM0(DAG, Chain, DL, Op.getOperand(3));
+
+ SmallVector<SDValue, 8> Ops;
+
+ if (HasVIndex && HasVOffset)
+ Ops.push_back(DAG.getBuildVector(MVT::v2i32, DL,
+ { Op.getOperand(5), // VIndex
+ VOffset }));
+ else if (HasVIndex)
+ Ops.push_back(Op.getOperand(5));
+ else if (HasVOffset)
+ Ops.push_back(VOffset);
+
+ Ops.push_back(Op.getOperand(2)); // rsrc
+ Ops.push_back(Op.getOperand(6 + OpOffset)); // soffset
+ Ops.push_back(Op.getOperand(7 + OpOffset)); // imm offset
+ unsigned Aux = Op.getConstantOperandVal(8 + OpOffset);
+ Ops.push_back(
+ DAG.getTargetConstant(Aux & AMDGPU::CPol::ALL, DL, MVT::i8)); // cpol
+ Ops.push_back(
+ DAG.getTargetConstant((Aux >> 3) & 1, DL, MVT::i8)); // swz
+ Ops.push_back(M0Val.getValue(0)); // Chain
+ Ops.push_back(M0Val.getValue(1)); // Glue
+
+ auto *M = cast<MemSDNode>(Op);
+ MachineMemOperand *LoadMMO = M->getMemOperand();
+ MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
+ LoadPtrI.Offset = Op->getConstantOperandVal(7 + OpOffset);
+ MachinePointerInfo StorePtrI = LoadPtrI;
+ StorePtrI.V = nullptr;
+ StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
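+    // The operation reads from the buffer and writes to LDS, so it carries
+    // two memory operands below: a load MMO for the buffer access and a
+    // 4-byte store MMO for the LDS write.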
+
+ auto F = LoadMMO->getFlags() &
+ ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad);
+ LoadMMO = MF.getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad,
+ Size, LoadMMO->getBaseAlign());
+
+ MachineMemOperand *StoreMMO =
+ MF.getMachineMemOperand(StorePtrI, F | MachineMemOperand::MOStore,
+ sizeof(int32_t), LoadMMO->getBaseAlign());
+
+ auto Load = DAG.getMachineNode(Opc, DL, M->getVTList(), Ops);
+ DAG.setNodeMemRefs(Load, {LoadMMO, StoreMMO});
+
+ return SDValue(Load, 0);
+ }
+ case Intrinsic::amdgcn_global_load_lds: {
+ unsigned Opc;
+ unsigned Size = Op->getConstantOperandVal(4);
+ switch (Size) {
+ default:
+ return SDValue();
+ case 1:
+ Opc = AMDGPU::GLOBAL_LOAD_LDS_UBYTE;
+ break;
+ case 2:
+ Opc = AMDGPU::GLOBAL_LOAD_LDS_USHORT;
+ break;
+ case 4:
+ Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORD;
+ break;
+ }
+
+ auto *M = cast<MemSDNode>(Op);
+ SDValue M0Val = copyToM0(DAG, Chain, DL, Op.getOperand(3));
+
+ SmallVector<SDValue, 6> Ops;
+
+ SDValue Addr = Op.getOperand(2); // Global ptr
+ SDValue VOffset;
+ // Try to split SAddr and VOffset. Global and LDS pointers share the same
+ // immediate offset, so we cannot use a regular SelectGlobalSAddr().
+ if (Addr->isDivergent() && Addr.getOpcode() == ISD::ADD) {
+ SDValue LHS = Addr.getOperand(0);
+ SDValue RHS = Addr.getOperand(1);
+
+ if (LHS->isDivergent())
+ std::swap(LHS, RHS);
+
+ if (!LHS->isDivergent() && RHS.getOpcode() == ISD::ZERO_EXTEND &&
+ RHS.getOperand(0).getValueType() == MVT::i32) {
+ // add (i64 sgpr), (zero_extend (i32 vgpr))
+ Addr = LHS;
+ VOffset = RHS.getOperand(0);
+ }
+ }
+
+ Ops.push_back(Addr);
+ if (!Addr->isDivergent()) {
+ Opc = AMDGPU::getGlobalSaddrOp(Opc);
+ if (!VOffset)
+ VOffset = SDValue(
+ DAG.getMachineNode(AMDGPU::V_MOV_B32_e32, DL, MVT::i32,
+ DAG.getTargetConstant(0, DL, MVT::i32)), 0);
+ Ops.push_back(VOffset);
+ }
+
+ Ops.push_back(Op.getOperand(5)); // Offset
+ Ops.push_back(Op.getOperand(6)); // CPol
+ Ops.push_back(M0Val.getValue(0)); // Chain
+ Ops.push_back(M0Val.getValue(1)); // Glue
+
+ MachineMemOperand *LoadMMO = M->getMemOperand();
+ MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
+ LoadPtrI.Offset = Op->getConstantOperandVal(5);
+ MachinePointerInfo StorePtrI = LoadPtrI;
+ LoadPtrI.AddrSpace = AMDGPUAS::GLOBAL_ADDRESS;
+ StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
+ auto F = LoadMMO->getFlags() &
+ ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad);
+ LoadMMO = MF.getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad,
+ Size, LoadMMO->getBaseAlign());
+ MachineMemOperand *StoreMMO =
+ MF.getMachineMemOperand(StorePtrI, F | MachineMemOperand::MOStore,
+ sizeof(int32_t), Align(4));
+
+ auto Load = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
+ DAG.setNodeMemRefs(Load, {LoadMMO, StoreMMO});
+
+ return SDValue(Load, 0);
+ }
case Intrinsic::amdgcn_end_cf:
return SDValue(DAG.getMachineNode(AMDGPU::SI_END_CF, DL, MVT::Other,
Op->getOperand(2), Chain), 0);
@@ -8271,7 +8501,7 @@ static SDValue getLoadExtOrTrunc(SelectionDAG &DAG,
SDValue SITargetLowering::widenLoad(LoadSDNode *Ld, DAGCombinerInfo &DCI) const {
SelectionDAG &DAG = DCI.DAG;
- if (Ld->getAlignment() < 4 || Ld->isDivergent())
+ if (Ld->getAlign() < Align(4) || Ld->isDivergent())
return SDValue();
// FIXME: Constant loads should all be marked invariant.
@@ -8296,14 +8526,11 @@ SDValue SITargetLowering::widenLoad(LoadSDNode *Ld, DAGCombinerInfo &DCI) const
// TODO: Drop only high part of range.
SDValue Ptr = Ld->getBasePtr();
- SDValue NewLoad = DAG.getLoad(ISD::UNINDEXED, ISD::NON_EXTLOAD,
- MVT::i32, SL, Ld->getChain(), Ptr,
- Ld->getOffset(),
- Ld->getPointerInfo(), MVT::i32,
- Ld->getAlignment(),
- Ld->getMemOperand()->getFlags(),
- Ld->getAAInfo(),
- nullptr); // Drop ranges
+ SDValue NewLoad = DAG.getLoad(
+ ISD::UNINDEXED, ISD::NON_EXTLOAD, MVT::i32, SL, Ld->getChain(), Ptr,
+ Ld->getOffset(), Ld->getPointerInfo(), MVT::i32, Ld->getAlign(),
+ Ld->getMemOperand()->getFlags(), Ld->getAAInfo(),
+ nullptr); // Drop ranges
EVT TruncVT = EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits());
if (MemVT.isFloatingPoint()) {
@@ -8392,17 +8619,16 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
assert(Op.getValueType().getVectorElementType() == MVT::i32 &&
"Custom lowering for non-i32 vectors hasn't been implemented.");
- unsigned Alignment = Load->getAlignment();
+ Align Alignment = Load->getAlign();
unsigned AS = Load->getAddressSpace();
- if (Subtarget->hasLDSMisalignedBug() &&
- AS == AMDGPUAS::FLAT_ADDRESS &&
- Alignment < MemVT.getStoreSize() && MemVT.getSizeInBits() > 32) {
+ if (Subtarget->hasLDSMisalignedBug() && AS == AMDGPUAS::FLAT_ADDRESS &&
+ Alignment.value() < MemVT.getStoreSize() && MemVT.getSizeInBits() > 32) {
return SplitVectorLoad(Op, DAG);
}
MachineFunction &MF = DAG.getMachineFunction();
SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
- // If there is a possibilty that flat instruction access scratch memory
+ // If there is a possibility that flat instruction access scratch memory
// then we need to use the same legalization rules we use for private.
if (AS == AMDGPUAS::FLAT_ADDRESS &&
!Subtarget->hasMultiDwordFlatScratchAddressing())
@@ -8413,7 +8639,7 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
- if (!Op->isDivergent() && Alignment >= 4 && NumElements < 32) {
+ if (!Op->isDivergent() && Alignment >= Align(4) && NumElements < 32) {
if (MemVT.isPow2VectorType())
return SDValue();
return WidenOrSplitVectorLoad(Op, DAG);
@@ -8429,7 +8655,7 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
AS == AMDGPUAS::GLOBAL_ADDRESS) {
if (Subtarget->getScalarizeGlobalBehavior() && !Op->isDivergent() &&
Load->isSimple() && isMemOpHasNoClobberedMemOperand(Load) &&
- Alignment >= 4 && NumElements < 32) {
+ Alignment >= Align(4) && NumElements < 32) {
if (MemVT.isPow2VectorType())
return SDValue();
return WidenOrSplitVectorLoad(Op, DAG);
@@ -8479,27 +8705,15 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
llvm_unreachable("unsupported private_element_size");
}
} else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
- // Use ds_read_b128 or ds_read_b96 when possible.
- if (Subtarget->hasDS96AndDS128() &&
- ((Subtarget->useDS128() && MemVT.getStoreSize() == 16) ||
- MemVT.getStoreSize() == 12) &&
- allowsMisalignedMemoryAccessesImpl(MemVT.getSizeInBits(), AS,
- Load->getAlign()))
+ bool Fast = false;
+ auto Flags = Load->getMemOperand()->getFlags();
+ if (allowsMisalignedMemoryAccessesImpl(MemVT.getSizeInBits(), AS,
+ Load->getAlign(), Flags, &Fast) &&
+ Fast)
return SDValue();
- if (NumElements > 2)
+ if (MemVT.isVector())
return SplitVectorLoad(Op, DAG);
-
- // SI has a hardware bug in the LDS / GDS boounds checking: if the base
- // address is negative, then the instruction is incorrectly treated as
- // out-of-bounds even if base + offsets is in bounds. Split vectorized
- // loads here to avoid emitting ds_read2_b32. We may re-combine the
- // load later in the SILoadStoreOptimizer.
- if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS &&
- NumElements == 2 && MemVT.getStoreSize() == 8 &&
- Load->getAlignment() < 8) {
- return SplitVectorLoad(Op, DAG);
- }
}
if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
@@ -8514,7 +8728,7 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
EVT VT = Op.getValueType();
- if (VT.getSizeInBits() == 128)
+ if (VT.getSizeInBits() == 128 || VT.getSizeInBits() == 256)
return splitTernaryVectorOp(Op, DAG);
assert(VT.getSizeInBits() == 64);
@@ -8946,13 +9160,13 @@ SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
unsigned AS = Store->getAddressSpace();
if (Subtarget->hasLDSMisalignedBug() &&
AS == AMDGPUAS::FLAT_ADDRESS &&
- Store->getAlignment() < VT.getStoreSize() && VT.getSizeInBits() > 32) {
+ Store->getAlign().value() < VT.getStoreSize() && VT.getSizeInBits() > 32) {
return SplitVectorStore(Op, DAG);
}
MachineFunction &MF = DAG.getMachineFunction();
SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
- // If there is a possibilty that flat instruction access scratch memory
+ // If there is a possibility that flat instruction access scratch memory
// then we need to use the same legalization rules we use for private.
if (AS == AMDGPUAS::FLAT_ADDRESS &&
!Subtarget->hasMultiDwordFlatScratchAddressing())
@@ -8990,39 +9204,21 @@ SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
llvm_unreachable("unsupported private_element_size");
}
} else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
- // Use ds_write_b128 or ds_write_b96 when possible.
- if (Subtarget->hasDS96AndDS128() &&
- ((Subtarget->useDS128() && VT.getStoreSize() == 16) ||
- (VT.getStoreSize() == 12)) &&
- allowsMisalignedMemoryAccessesImpl(VT.getSizeInBits(), AS,
- Store->getAlign()))
+ bool Fast = false;
+ auto Flags = Store->getMemOperand()->getFlags();
+ if (allowsMisalignedMemoryAccessesImpl(VT.getSizeInBits(), AS,
+ Store->getAlign(), Flags, &Fast) &&
+ Fast)
return SDValue();
- if (NumElements > 2)
+ if (VT.isVector())
return SplitVectorStore(Op, DAG);
- // SI has a hardware bug in the LDS / GDS boounds checking: if the base
- // address is negative, then the instruction is incorrectly treated as
- // out-of-bounds even if base + offsets is in bounds. Split vectorized
- // stores here to avoid emitting ds_write2_b32. We may re-combine the
- // store later in the SILoadStoreOptimizer.
- if (!Subtarget->hasUsableDSOffset() &&
- NumElements == 2 && VT.getStoreSize() == 8 &&
- Store->getAlignment() < 8) {
- return SplitVectorStore(Op, DAG);
- }
-
- if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
- VT, *Store->getMemOperand())) {
- if (VT.isVector())
- return SplitVectorStore(Op, DAG);
- return expandUnalignedStore(Store, DAG);
- }
-
- return SDValue();
- } else {
- llvm_unreachable("unhandled address space");
+ return expandUnalignedStore(Store, DAG);
}
+
+  // Probably an invalid store. If so, we'll end up emitting a selection error.
+ return SDValue();
}
SDValue SITargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
@@ -10041,7 +10237,7 @@ SDValue SITargetLowering::performFCanonicalizeCombine(
}
}
- // If one half is undef, and one is constant, perfer a splat vector rather
+ // If one half is undef, and one is constant, prefer a splat vector rather
// than the normal qNaN. If it's a register, prefer 0.0 since that's
// cheaper to use and may be free with a packed operation.
if (NewElts[0].isUndef()) {
@@ -10349,7 +10545,8 @@ SDValue SITargetLowering::performCvtPkRTZCombine(SDNode *N,
// expanded into a set of cmp/select instructions.
bool SITargetLowering::shouldExpandVectorDynExt(unsigned EltSize,
unsigned NumElem,
- bool IsDivergentIdx) {
+ bool IsDivergentIdx,
+ const GCNSubtarget *Subtarget) {
if (UseDivergentRegisterIndexing)
return false;
@@ -10371,10 +10568,18 @@ bool SITargetLowering::shouldExpandVectorDynExt(unsigned EltSize,
// Large vectors would yield too many compares and v_cndmask_b32 instructions.
unsigned NumInsts = NumElem /* Number of compares */ +
((EltSize + 31) / 32) * NumElem /* Number of cndmasks */;
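+  // e.g. a dynamic extract from v8i32 needs 8 compares and 8 cndmasks, so
+  // NumInsts == 16.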
- return NumInsts <= 16;
+
+ // On some architectures (GFX9) movrel is not available and it's better
+ // to expand.
+ if (!Subtarget->hasMovrel())
+ return NumInsts <= 16;
+
+  // If movrel is available, use it instead of expanding for a vector of 8
+  // elements.
+ return NumInsts <= 15;
}
-static bool shouldExpandVectorDynExt(SDNode *N) {
+bool SITargetLowering::shouldExpandVectorDynExt(SDNode *N) const {
SDValue Idx = N->getOperand(N->getNumOperands() - 1);
if (isa<ConstantSDNode>(Idx))
return false;
@@ -10385,8 +10590,8 @@ static bool shouldExpandVectorDynExt(SDNode *N) {
unsigned EltSize = EltVT.getSizeInBits();
unsigned NumElem = VecVT.getVectorNumElements();
- return SITargetLowering::shouldExpandVectorDynExt(EltSize, NumElem,
- Idx->isDivergent());
+ return SITargetLowering::shouldExpandVectorDynExt(
+ EltSize, NumElem, Idx->isDivergent(), getSubtarget());
}
SDValue SITargetLowering::performExtractVectorEltCombine(
@@ -10450,7 +10655,7 @@ SDValue SITargetLowering::performExtractVectorEltCombine(
unsigned EltSize = EltVT.getSizeInBits();
// EXTRACT_VECTOR_ELT (<n x e>, var-idx) => n x select (e, const-idx)
- if (::shouldExpandVectorDynExt(N)) {
+ if (shouldExpandVectorDynExt(N)) {
SDLoc SL(N);
SDValue Idx = N->getOperand(1);
SDValue V;
@@ -10513,7 +10718,7 @@ SITargetLowering::performInsertVectorEltCombine(SDNode *N,
// INSERT_VECTOR_ELT (<n x e>, var-idx)
// => BUILD_VECTOR n x select (e, const-idx)
- if (!::shouldExpandVectorDynExt(N))
+ if (!shouldExpandVectorDynExt(N))
return SDValue();
SelectionDAG &DAG = DCI.DAG;
@@ -10603,39 +10808,145 @@ static SDValue getMad64_32(SelectionDAG &DAG, const SDLoc &SL,
return DAG.getNode(ISD::TRUNCATE, SL, VT, Mad);
}
-SDValue SITargetLowering::performAddCombine(SDNode *N,
+// Fold (add (mul x, y), z) --> (mad_[iu]64_[iu]32 x, y, z) plus high
+// multiplies, if any.
+//
+// Full 64-bit multiplies that feed into an addition are lowered here instead
+// of using the generic expansion. The generic expansion ends up with
+// a tree of ADD nodes that prevents us from using the "add" part of the
+// MAD instruction. The expansion produced here results in a chain of ADDs
+// instead of a tree.
+SDValue SITargetLowering::tryFoldToMad64_32(SDNode *N,
DAGCombinerInfo &DCI) const {
+ assert(N->getOpcode() == ISD::ADD);
+
SelectionDAG &DAG = DCI.DAG;
EVT VT = N->getValueType(0);
SDLoc SL(N);
SDValue LHS = N->getOperand(0);
SDValue RHS = N->getOperand(1);
- if ((LHS.getOpcode() == ISD::MUL || RHS.getOpcode() == ISD::MUL)
- && Subtarget->hasMad64_32() &&
- !VT.isVector() && VT.getScalarSizeInBits() > 32 &&
- VT.getScalarSizeInBits() <= 64) {
- if (LHS.getOpcode() != ISD::MUL)
- std::swap(LHS, RHS);
+ if (VT.isVector())
+ return SDValue();
+
+ // S_MUL_HI_[IU]32 was added in gfx9, which allows us to keep the overall
+ // result in scalar registers for uniform values.
+ if (!N->isDivergent() && Subtarget->hasSMulHi())
+ return SDValue();
+
+ unsigned NumBits = VT.getScalarSizeInBits();
+ if (NumBits <= 32 || NumBits > 64)
+ return SDValue();
+
+ if (LHS.getOpcode() != ISD::MUL) {
+ assert(RHS.getOpcode() == ISD::MUL);
+ std::swap(LHS, RHS);
+ }
+
+ // Avoid the fold if it would unduly increase the number of multiplies due to
+ // multiple uses, except on hardware with full-rate multiply-add (which is
+ // part of full-rate 64-bit ops).
+ if (!Subtarget->hasFullRate64Ops()) {
+ unsigned NumUsers = 0;
+ for (SDNode *Use : LHS->uses()) {
+ // There is a use that does not feed into addition, so the multiply can't
+ // be removed. We prefer MUL + ADD + ADDC over MAD + MUL.
+ if (Use->getOpcode() != ISD::ADD)
+ return SDValue();
+
+ // We prefer 2xMAD over MUL + 2xADD + 2xADDC (code density), and prefer
+ // MUL + 3xADD + 3xADDC over 3xMAD.
+ ++NumUsers;
+ if (NumUsers >= 3)
+ return SDValue();
+ }
+ }
+
+ SDValue MulLHS = LHS.getOperand(0);
+ SDValue MulRHS = LHS.getOperand(1);
+ SDValue AddRHS = RHS;
+
+ // Always check whether operands are small unsigned values, since that
+ // knowledge is useful in more cases. Check for small signed values only if
+ // doing so can unlock a shorter code sequence.
+ bool MulLHSUnsigned32 = numBitsUnsigned(MulLHS, DAG) <= 32;
+ bool MulRHSUnsigned32 = numBitsUnsigned(MulRHS, DAG) <= 32;
+
+ bool MulSignedLo = false;
+ if (!MulLHSUnsigned32 || !MulRHSUnsigned32) {
+ MulSignedLo = numBitsSigned(MulLHS, DAG) <= 32 &&
+ numBitsSigned(MulRHS, DAG) <= 32;
+ }
+
+ // The operands and final result all have the same number of bits. If
+ // operands need to be extended, they can be extended with garbage. The
+ // resulting garbage in the high bits of the mad_[iu]64_[iu]32 result is
+ // truncated away in the end.
+ if (VT != MVT::i64) {
+ MulLHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i64, MulLHS);
+ MulRHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i64, MulRHS);
+ AddRHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i64, AddRHS);
+ }
+
+ // The basic code generated is conceptually straightforward. Pseudo code:
+ //
+ // accum = mad_64_32 lhs.lo, rhs.lo, accum
+ // accum.hi = add (mul lhs.hi, rhs.lo), accum.hi
+ // accum.hi = add (mul lhs.lo, rhs.hi), accum.hi
+ //
+ // The second and third lines are optional, depending on whether the factors
+ // are {sign,zero}-extended or not.
+ //
+ // The actual DAG is noisier than the pseudo code, but only due to
+ // instructions that disassemble values into low and high parts, and
+ // assemble the final result.
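+  //
+  // For example, when both factors are known to fit in 32 unsigned bits, only
+  // the first line is needed and a single mad_u64_u32 is emitted.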
+ SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
+ SDValue One = DAG.getConstant(1, SL, MVT::i32);
+
+ auto MulLHSLo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, MulLHS);
+ auto MulRHSLo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, MulRHS);
+ SDValue Accum =
+ getMad64_32(DAG, SL, MVT::i64, MulLHSLo, MulRHSLo, AddRHS, MulSignedLo);
- SDValue MulLHS = LHS.getOperand(0);
- SDValue MulRHS = LHS.getOperand(1);
- SDValue AddRHS = RHS;
+ if (!MulSignedLo && (!MulLHSUnsigned32 || !MulRHSUnsigned32)) {
+ auto AccumLo = DAG.getNode(ISD::EXTRACT_ELEMENT, SL, MVT::i32, Accum, Zero);
+ auto AccumHi = DAG.getNode(ISD::EXTRACT_ELEMENT, SL, MVT::i32, Accum, One);
- // TODO: Maybe restrict if SGPR inputs.
- if (numBitsUnsigned(MulLHS, DAG) <= 32 &&
- numBitsUnsigned(MulRHS, DAG) <= 32) {
- MulLHS = DAG.getZExtOrTrunc(MulLHS, SL, MVT::i32);
- MulRHS = DAG.getZExtOrTrunc(MulRHS, SL, MVT::i32);
- AddRHS = DAG.getZExtOrTrunc(AddRHS, SL, MVT::i64);
- return getMad64_32(DAG, SL, VT, MulLHS, MulRHS, AddRHS, false);
+ if (!MulLHSUnsigned32) {
+ auto MulLHSHi =
+ DAG.getNode(ISD::EXTRACT_ELEMENT, SL, MVT::i32, MulLHS, One);
+ SDValue MulHi = DAG.getNode(ISD::MUL, SL, MVT::i32, MulLHSHi, MulRHSLo);
+ AccumHi = DAG.getNode(ISD::ADD, SL, MVT::i32, MulHi, AccumHi);
}
- if (numBitsSigned(MulLHS, DAG) <= 32 && numBitsSigned(MulRHS, DAG) <= 32) {
- MulLHS = DAG.getSExtOrTrunc(MulLHS, SL, MVT::i32);
- MulRHS = DAG.getSExtOrTrunc(MulRHS, SL, MVT::i32);
- AddRHS = DAG.getSExtOrTrunc(AddRHS, SL, MVT::i64);
- return getMad64_32(DAG, SL, VT, MulLHS, MulRHS, AddRHS, true);
+ if (!MulRHSUnsigned32) {
+ auto MulRHSHi =
+ DAG.getNode(ISD::EXTRACT_ELEMENT, SL, MVT::i32, MulRHS, One);
+ SDValue MulHi = DAG.getNode(ISD::MUL, SL, MVT::i32, MulLHSLo, MulRHSHi);
+ AccumHi = DAG.getNode(ISD::ADD, SL, MVT::i32, MulHi, AccumHi);
+ }
+
+ Accum = DAG.getBuildVector(MVT::v2i32, SL, {AccumLo, AccumHi});
+ Accum = DAG.getBitcast(MVT::i64, Accum);
+ }
+
+ if (VT != MVT::i64)
+ Accum = DAG.getNode(ISD::TRUNCATE, SL, VT, Accum);
+ return Accum;
+}
+
+SDValue SITargetLowering::performAddCombine(SDNode *N,
+ DAGCombinerInfo &DCI) const {
+ SelectionDAG &DAG = DCI.DAG;
+ EVT VT = N->getValueType(0);
+ SDLoc SL(N);
+ SDValue LHS = N->getOperand(0);
+ SDValue RHS = N->getOperand(1);
+
+ if (LHS.getOpcode() == ISD::MUL || RHS.getOpcode() == ISD::MUL) {
+ if (Subtarget->hasMad64_32()) {
+ if (SDValue Folded = tryFoldToMad64_32(N, DCI))
+ return Folded;
}
return SDValue();
@@ -10763,7 +11074,7 @@ SDValue SITargetLowering::performFAddCombine(SDNode *N,
SDValue RHS = N->getOperand(1);
// These should really be instruction patterns, but writing patterns with
- // source modiifiers is a pain.
+ // source modifiers is a pain.
// fadd (fadd (a, a), b) -> mad 2.0, a, b
if (LHS.getOpcode() == ISD::FADD) {
@@ -10860,8 +11171,8 @@ SDValue SITargetLowering::performFMACombine(SDNode *N,
return SDValue();
// fdot2_f32_f16 always flushes fp32 denormal operand and output to zero,
- // regardless of the denorm mode setting. Therefore, unsafe-fp-math/fp-contract
- // is sufficient to allow generaing fdot2.
+ // regardless of the denorm mode setting. Therefore,
+ // unsafe-fp-math/fp-contract is sufficient to allow generating fdot2.
const TargetOptions &Options = DAG.getTarget().Options;
if (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath ||
(N->getFlags().hasAllowContract() &&
@@ -11562,7 +11873,7 @@ void SITargetLowering::AddIMGInit(MachineInstr &MI) const {
if (DstSize < InitIdx)
return;
- // Create a register for the intialization value.
+ // Create a register for the initialization value.
Register PrevDst = MRI.createVirtualRegister(TII->getOpRegClass(MI, DstIdx));
unsigned NewDst = 0; // Final initialized value will be in here
@@ -11608,7 +11919,7 @@ void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
TII->legalizeOperandsVOP3(MRI, MI);
// Prefer VGPRs over AGPRs in mAI instructions where possible.
- // This saves a chain-copy of registers and better ballance register
+  // This saves a chain-copy of registers and better balances register
// use between vgpr and agpr as agpr tuples tend to be big.
if (MI.getDesc().OpInfo) {
unsigned Opc = MI.getOpcode();
@@ -11633,54 +11944,29 @@ void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
// so no use checks are needed.
MRI.setRegClass(Op.getReg(), NewRC);
}
- }
-
- return;
- }
- // Replace unused atomics with the no return version.
- int NoRetAtomicOp = AMDGPU::getAtomicNoRetOp(MI.getOpcode());
- if (NoRetAtomicOp != -1) {
- if (!Node->hasAnyUseOfValue(0)) {
- int CPolIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
- AMDGPU::OpName::cpol);
- if (CPolIdx != -1) {
- MachineOperand &CPol = MI.getOperand(CPolIdx);
- CPol.setImm(CPol.getImm() & ~AMDGPU::CPol::GLC);
+ // Resolve the rest of AV operands to AGPRs.
+ if (auto *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2)) {
+ if (Src2->isReg() && Src2->getReg().isVirtual()) {
+ auto *RC = TRI->getRegClassForReg(MRI, Src2->getReg());
+ if (TRI->isVectorSuperClass(RC)) {
+ auto *NewRC = TRI->getEquivalentAGPRClass(RC);
+ MRI.setRegClass(Src2->getReg(), NewRC);
+ if (Src2->isTied())
+ MRI.setRegClass(MI.getOperand(0).getReg(), NewRC);
+ }
+ }
}
- MI.RemoveOperand(0);
- MI.setDesc(TII->get(NoRetAtomicOp));
- return;
}
- // For mubuf_atomic_cmpswap, we need to have tablegen use an extract_subreg
- // instruction, because the return type of these instructions is a vec2 of
- // the memory type, so it can be tied to the input operand.
- // This means these instructions always have a use, so we need to add a
- // special case to check if the atomic has only one extract_subreg use,
- // which itself has no uses.
- if ((Node->hasNUsesOfValue(1, 0) &&
- Node->use_begin()->isMachineOpcode() &&
- Node->use_begin()->getMachineOpcode() == AMDGPU::EXTRACT_SUBREG &&
- !Node->use_begin()->hasAnyUseOfValue(0))) {
- Register Def = MI.getOperand(0).getReg();
-
- // Change this into a noret atomic.
- MI.setDesc(TII->get(NoRetAtomicOp));
- MI.RemoveOperand(0);
-
- // If we only remove the def operand from the atomic instruction, the
- // extract_subreg will be left with a use of a vreg without a def.
- // So we need to insert an implicit_def to avoid machine verifier
- // errors.
- BuildMI(*MI.getParent(), MI, MI.getDebugLoc(),
- TII->get(AMDGPU::IMPLICIT_DEF), Def);
- }
return;
}
- if (TII->isMIMG(MI) && !MI.mayStore())
- AddIMGInit(MI);
+ if (TII->isMIMG(MI)) {
+ if (!MI.mayStore())
+ AddIMGInit(MI);
+ TII->enforceOperandRCAlignment(MI, AMDGPU::OpName::vaddr);
+ }
}
static SDValue buildSMovImm32(SelectionDAG &DAG, const SDLoc &DL,
@@ -12243,13 +12529,17 @@ Align SITargetLowering::getPrefLoopAlignment(MachineLoop *ML) const {
MachineBasicBlock *Exit = ML->getExitBlock();
if (Pre && Exit) {
- BuildMI(*Pre, Pre->getFirstTerminator(), DebugLoc(),
- TII->get(AMDGPU::S_INST_PREFETCH))
- .addImm(1); // prefetch 2 lines behind PC
+ auto PreTerm = Pre->getFirstTerminator();
+ if (PreTerm == Pre->begin() ||
+ std::prev(PreTerm)->getOpcode() != AMDGPU::S_INST_PREFETCH)
+ BuildMI(*Pre, PreTerm, DebugLoc(), TII->get(AMDGPU::S_INST_PREFETCH))
+ .addImm(1); // prefetch 2 lines behind PC
- BuildMI(*Exit, Exit->getFirstNonDebugInstr(), DebugLoc(),
- TII->get(AMDGPU::S_INST_PREFETCH))
- .addImm(2); // prefetch 1 line behind PC
+ auto ExitHead = Exit->getFirstNonDebugInstr();
+ if (ExitHead == Exit->end() ||
+ ExitHead->getOpcode() != AMDGPU::S_INST_PREFETCH)
+ BuildMI(*Exit, ExitHead, DebugLoc(), TII->get(AMDGPU::S_INST_PREFETCH))
+ .addImm(2); // prefetch 1 line behind PC
}
return CacheLineAlign;
@@ -12390,6 +12680,9 @@ static bool fpModeMatchesGlobalFPAtomicMode(const AtomicRMWInst *RMW) {
TargetLowering::AtomicExpansionKind
SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
+ unsigned AS = RMW->getPointerAddressSpace();
+ if (AS == AMDGPUAS::PRIVATE_ADDRESS)
+ return AtomicExpansionKind::NotAtomic;
auto ReportUnsafeHWInst = [&](TargetLowering::AtomicExpansionKind Kind) {
OptimizationRemarkEmitter ORE(RMW->getFunction());
@@ -12421,10 +12714,11 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
if (!Ty->isFloatTy() && (!Subtarget->hasGFX90AInsts() || !Ty->isDoubleTy()))
return AtomicExpansionKind::CmpXChg;
- unsigned AS = RMW->getPointerAddressSpace();
-
if ((AS == AMDGPUAS::GLOBAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS) &&
- Subtarget->hasAtomicFaddInsts()) {
+ Subtarget->hasAtomicFaddNoRtnInsts()) {
+ if (Subtarget->hasGFX940Insts())
+ return AtomicExpansionKind::None;
+
// The amdgpu-unsafe-fp-atomics attribute enables generation of unsafe
// floating point atomic instructions. May generate more efficient code,
// but may not respect rounding and denormal modes, and may give incorrect
@@ -12453,8 +12747,8 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
: AtomicExpansionKind::CmpXChg;
}
- // DS FP atomics do repect the denormal mode, but the rounding mode is fixed
- // to round-to-nearest-even.
+ // DS FP atomics do respect the denormal mode, but the rounding mode is
+ // fixed to round-to-nearest-even.
// The only exception is DS_ADD_F64 which never flushes regardless of mode.
if (AS == AMDGPUAS::LOCAL_ADDRESS && Subtarget->hasLDSFPAtomicAdd()) {
if (!Ty->isDoubleTy())
@@ -12479,6 +12773,27 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
return AMDGPUTargetLowering::shouldExpandAtomicRMWInIR(RMW);
}
+TargetLowering::AtomicExpansionKind
+SITargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
+ return LI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS
+ ? AtomicExpansionKind::NotAtomic
+ : AtomicExpansionKind::None;
+}
+
+TargetLowering::AtomicExpansionKind
+SITargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
+ return SI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS
+ ? AtomicExpansionKind::NotAtomic
+ : AtomicExpansionKind::None;
+}
+
+TargetLowering::AtomicExpansionKind
+SITargetLowering::shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *CmpX) const {
+ return CmpX->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS
+ ? AtomicExpansionKind::NotAtomic
+ : AtomicExpansionKind::None;
+}
+
const TargetRegisterClass *
SITargetLowering::getRegClassFor(MVT VT, bool isDivergent) const {
const TargetRegisterClass *RC = TargetLoweringBase::getRegClassFor(VT, false);
@@ -12500,7 +12815,7 @@ SITargetLowering::getRegClassFor(MVT VT, bool isDivergent) const {
// always uniform.
static bool hasCFUser(const Value *V, SmallPtrSet<const Value *, 16> &Visited,
unsigned WaveSize) {
- // FIXME: We asssume we never cast the mask results of a control flow
+ // FIXME: We assume we never cast the mask results of a control flow
// intrinsic.
// Early exit if the type won't be consistent as a compile time hack.
IntegerType *IT = dyn_cast<IntegerType>(V->getType());
@@ -12604,7 +12919,7 @@ bool SITargetLowering::isReassocProfitable(SelectionDAG &DAG, SDValue N0,
SDValue N1) const {
if (!N0.hasOneUse())
return false;
- // Take care of the oportunity to keep N0 uniform
+ // Take care of the opportunity to keep N0 uniform
if (N0->isDivergent() || !N1->isDivergent())
return true;
// Check if we have a good chance to form the memory access pattern with the
@@ -12612,3 +12927,11 @@ bool SITargetLowering::isReassocProfitable(SelectionDAG &DAG, SDValue N0,
return (DAG.isBaseWithConstantOffset(N0) &&
hasMemSDNodeUser(*N0->use_begin()));
}
+
+MachineMemOperand::Flags
+SITargetLowering::getTargetMMOFlags(const Instruction &I) const {
+  // Propagate metadata set by AMDGPUAnnotateUniformValues to the MMO of a
+  // load.
+ if (I.getMetadata("amdgpu.noclobber"))
+ return MONoClobber;
+ return MachineMemOperand::MONone;
+}
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
index bf81e082b478..4fbccf0c5850 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -53,6 +53,9 @@ private:
uint64_t Offset, Align Alignment,
bool Signed,
const ISD::InputArg *Arg = nullptr) const;
+ SDValue loadImplicitKernelArgument(SelectionDAG &DAG, MVT VT, const SDLoc &DL,
+ Align Alignment,
+ ImplicitParameter Param) const;
SDValue lowerStackParameter(SelectionDAG &DAG, CCValAssign &VA,
const SDLoc &SL, SDValue Chain,
@@ -76,6 +79,9 @@ private:
SDValue lowerStructBufferAtomicIntrin(SDValue Op, SelectionDAG &DAG,
unsigned NewOpcode) const;
+ SDValue lowerWorkitemID(SelectionDAG &DAG, SDValue Op, unsigned Dim,
+ const ArgDescriptor &ArgDesc) const;
+
SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerINTRINSIC_VOID(SDValue Op, SelectionDAG &DAG) const;
@@ -145,6 +151,7 @@ private:
SDValue lowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerTRAP(SDValue Op, SelectionDAG &DAG) const;
@@ -191,6 +198,7 @@ private:
SDValue reassociateScalarOps(SDNode *N, SelectionDAG &DAG) const;
unsigned getFusedOpcode(const SelectionDAG &DAG,
const SDNode *N0, const SDNode *N1) const;
+ SDValue tryFoldToMad64_32(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performAddCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performAddCarrySubCarryCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performSubCombine(SDNode *N, DAGCombinerInfo &DCI) const;
@@ -227,7 +235,10 @@ public:
/// Check if EXTRACT_VECTOR_ELT/INSERT_VECTOR_ELT (<n x e>, var-idx) should be
/// expanded into a set of cmp/select instructions.
static bool shouldExpandVectorDynExt(unsigned EltSize, unsigned NumElem,
- bool IsDivergentIdx);
+ bool IsDivergentIdx,
+ const GCNSubtarget *Subtarget);
+
+ bool shouldExpandVectorDynExt(SDNode *N) const;
private:
// Analyze a combined offset from an amdgcn_buffer_ intrinsic and store the
@@ -310,6 +321,9 @@ public:
bool shouldConvertConstantLoadToIntImm(const APInt &Imm,
Type *Ty) const override;
+ bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
+ unsigned Index) const override;
+
bool isTypeDesirableForOp(unsigned Op, EVT VT) const override;
bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override;
@@ -380,6 +394,7 @@ public:
MachineBasicBlock *BB) const override;
bool hasBitPreservingFPLogic(EVT VT) const override;
+ bool hasAtomicFaddRtnForTy(SDValue &Op) const;
bool enableAggressiveFMAFusion(EVT VT) const override;
bool enableAggressiveFMAFusion(LLT Ty) const override;
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context,
@@ -466,6 +481,10 @@ public:
bool SNaN = false,
unsigned Depth = 0) const override;
AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *) const override;
+ AtomicExpansionKind shouldExpandAtomicLoadInIR(LoadInst *LI) const override;
+ AtomicExpansionKind shouldExpandAtomicStoreInIR(StoreInst *SI) const override;
+ AtomicExpansionKind
+ shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const override;
virtual const TargetRegisterClass *
getRegClassFor(MVT VT, bool isDivergent) const override;
@@ -505,6 +524,9 @@ public:
std::pair<InstructionCost, MVT> getTypeLegalizationCost(const DataLayout &DL,
Type *Ty) const;
+
+ MachineMemOperand::Flags
+ getTargetMMOFlags(const Instruction &I) const override;
};
} // End namespace llvm
diff --git a/llvm/lib/Target/AMDGPU/SIInsertHardClauses.cpp b/llvm/lib/Target/AMDGPU/SIInsertHardClauses.cpp
index 125f006a1d1d..50f8ad4433c6 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertHardClauses.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertHardClauses.cpp
@@ -35,6 +35,7 @@
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
using namespace llvm;
@@ -42,11 +43,39 @@ using namespace llvm;
namespace {
+// A clause length of 64 instructions could be encoded in the s_clause
+// instruction, but the hardware documentation (at least for GFX11) says that
+// 63 is the maximum allowed.
+constexpr unsigned MaxInstructionsInClause = 63;
+
enum HardClauseType {
+ // For GFX10:
+
// Texture, buffer, global or scratch memory instructions.
HARDCLAUSE_VMEM,
// Flat (not global or scratch) memory instructions.
HARDCLAUSE_FLAT,
+
+ // For GFX11:
+
+ // Texture memory instructions.
+ HARDCLAUSE_MIMG_LOAD,
+ HARDCLAUSE_MIMG_STORE,
+ HARDCLAUSE_MIMG_ATOMIC,
+ HARDCLAUSE_MIMG_SAMPLE,
+ // Buffer, global or scratch memory instructions.
+ HARDCLAUSE_VMEM_LOAD,
+ HARDCLAUSE_VMEM_STORE,
+ HARDCLAUSE_VMEM_ATOMIC,
+ // Flat (not global or scratch) memory instructions.
+ HARDCLAUSE_FLAT_LOAD,
+ HARDCLAUSE_FLAT_STORE,
+ HARDCLAUSE_FLAT_ATOMIC,
+ // BVH instructions.
+ HARDCLAUSE_BVH,
+
+ // Common:
+
// Instructions that access LDS.
HARDCLAUSE_LDS,
// Scalar memory instructions.
@@ -78,19 +107,43 @@ public:
}
HardClauseType getHardClauseType(const MachineInstr &MI) {
-
- // On current architectures we only get a benefit from clausing loads.
- if (MI.mayLoad()) {
- if (SIInstrInfo::isVMEM(MI) || SIInstrInfo::isSegmentSpecificFLAT(MI)) {
- if (ST->hasNSAClauseBug()) {
+ if (MI.mayLoad() || (MI.mayStore() && ST->shouldClusterStores())) {
+ if (ST->getGeneration() == AMDGPUSubtarget::GFX10) {
+ if (SIInstrInfo::isVMEM(MI) || SIInstrInfo::isSegmentSpecificFLAT(MI)) {
+ if (ST->hasNSAClauseBug()) {
+ const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(MI.getOpcode());
+ if (Info && Info->MIMGEncoding == AMDGPU::MIMGEncGfx10NSA)
+ return HARDCLAUSE_ILLEGAL;
+ }
+ return HARDCLAUSE_VMEM;
+ }
+ if (SIInstrInfo::isFLAT(MI))
+ return HARDCLAUSE_FLAT;
+ } else {
+ assert(ST->getGeneration() >= AMDGPUSubtarget::GFX11);
+ if (SIInstrInfo::isMIMG(MI)) {
const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(MI.getOpcode());
- if (Info && Info->MIMGEncoding == AMDGPU::MIMGEncGfx10NSA)
- return HARDCLAUSE_ILLEGAL;
+ const AMDGPU::MIMGBaseOpcodeInfo *BaseInfo =
+ AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode);
+ if (BaseInfo->BVH)
+ return HARDCLAUSE_BVH;
+ if (BaseInfo->Sampler)
+ return HARDCLAUSE_MIMG_SAMPLE;
+ return MI.mayLoad() ? MI.mayStore() ? HARDCLAUSE_MIMG_ATOMIC
+ : HARDCLAUSE_MIMG_LOAD
+ : HARDCLAUSE_MIMG_STORE;
+ }
+ if (SIInstrInfo::isVMEM(MI) || SIInstrInfo::isSegmentSpecificFLAT(MI)) {
+ return MI.mayLoad() ? MI.mayStore() ? HARDCLAUSE_VMEM_ATOMIC
+ : HARDCLAUSE_VMEM_LOAD
+ : HARDCLAUSE_VMEM_STORE;
+ }
+ if (SIInstrInfo::isFLAT(MI)) {
+ return MI.mayLoad() ? MI.mayStore() ? HARDCLAUSE_FLAT_ATOMIC
+ : HARDCLAUSE_FLAT_LOAD
+ : HARDCLAUSE_FLAT_STORE;
}
- return HARDCLAUSE_VMEM;
}
- if (SIInstrInfo::isFLAT(MI))
- return HARDCLAUSE_FLAT;
// TODO: LDS
if (SIInstrInfo::isSMRD(MI))
return HARDCLAUSE_SMEM;
@@ -129,7 +182,7 @@ public:
bool emitClause(const ClauseInfo &CI, const SIInstrInfo *SII) {
if (CI.First == CI.Last)
return false;
- assert(CI.Length <= 64 && "Hard clause is too long!");
+ assert(CI.Length <= MaxInstructionsInClause && "Hard clause is too long!");
auto &MBB = *CI.First->getParent();
auto ClauseMI =
@@ -170,7 +223,7 @@ public:
}
}
- if (CI.Length == 64 ||
+ if (CI.Length == MaxInstructionsInClause ||
(CI.Length && Type != HARDCLAUSE_INTERNAL &&
Type != HARDCLAUSE_IGNORE &&
(Type != CI.Type ||
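For context on the MaxInstructionsInClause change above, here is a standalone sketch of the grouping constraint the pass enforces: consecutive memory operations of the same clause type are batched into clauses of at most 63 instructions, and a type change starts a new clause. The helper and types below are illustrative only; the real pass also handles the ILLEGAL/INTERNAL/IGNORE types and emits S_CLAUSE ahead of each group.

#include <cstdio>
#include <vector>

constexpr unsigned MaxInstructionsInClause = 63; // matches the new constant

struct Clause {
  int Type;        // a HardClauseType-like tag
  unsigned Length; // number of instructions in the clause
};

// Batch a run of same-typed memory ops into clauses of at most 63.
static std::vector<Clause> formClauses(const std::vector<int> &Types) {
  std::vector<Clause> Clauses;
  for (int T : Types) {
    if (!Clauses.empty() && Clauses.back().Type == T &&
        Clauses.back().Length < MaxInstructionsInClause)
      ++Clauses.back().Length; // extend the current clause
    else
      Clauses.push_back({T, 1}); // type change or 63 reached: new clause
  }
  return Clauses;
}

int main() {
  std::vector<int> Types(70, 0); // 70 loads of one type -> clauses of 63 and 7
  for (const Clause &C : formClauses(Types))
    std::printf("type %d, length %u\n", C.Type, C.Length);
  return 0;
}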
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index f8a10bc8ef6f..349bcbf82195 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -31,6 +31,7 @@
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/Sequence.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/CodeGen/MachinePostDominators.h"
#include "llvm/InitializePasses.h"
#include "llvm/Support/DebugCounter.h"
@@ -87,29 +88,29 @@ struct RegisterEncoding {
};
enum WaitEventType {
- VMEM_ACCESS, // vector-memory read & write
- VMEM_READ_ACCESS, // vector-memory read
- VMEM_WRITE_ACCESS,// vector-memory write
- LDS_ACCESS, // lds read & write
- GDS_ACCESS, // gds read & write
- SQ_MESSAGE, // send message
- SMEM_ACCESS, // scalar-memory read & write
- EXP_GPR_LOCK, // export holding on its data src
- GDS_GPR_LOCK, // GDS holding on its data and addr src
- EXP_POS_ACCESS, // write to export position
- EXP_PARAM_ACCESS, // write to export parameter
- VMW_GPR_LOCK, // vector-memory write holding on its data src
+ VMEM_ACCESS, // vector-memory read & write
+ VMEM_READ_ACCESS, // vector-memory read
+ VMEM_WRITE_ACCESS, // vector-memory write
+ LDS_ACCESS, // lds read & write
+ GDS_ACCESS, // gds read & write
+ SQ_MESSAGE, // send message
+ SMEM_ACCESS, // scalar-memory read & write
+ EXP_GPR_LOCK, // export holding on its data src
+ GDS_GPR_LOCK, // GDS holding on its data and addr src
+ EXP_POS_ACCESS, // write to export position
+ EXP_PARAM_ACCESS, // write to export parameter
+ VMW_GPR_LOCK, // vector-memory write holding on its data src
+ EXP_LDS_ACCESS, // read by ldsdir counting as export
NUM_WAIT_EVENTS,
};
static const unsigned WaitEventMaskForInst[NUM_INST_CNTS] = {
- (1 << VMEM_ACCESS) | (1 << VMEM_READ_ACCESS),
- (1 << SMEM_ACCESS) | (1 << LDS_ACCESS) | (1 << GDS_ACCESS) |
- (1 << SQ_MESSAGE),
- (1 << EXP_GPR_LOCK) | (1 << GDS_GPR_LOCK) | (1 << VMW_GPR_LOCK) |
- (1 << EXP_PARAM_ACCESS) | (1 << EXP_POS_ACCESS),
- (1 << VMEM_WRITE_ACCESS)
-};
+ (1 << VMEM_ACCESS) | (1 << VMEM_READ_ACCESS),
+ (1 << SMEM_ACCESS) | (1 << LDS_ACCESS) | (1 << GDS_ACCESS) |
+ (1 << SQ_MESSAGE),
+ (1 << EXP_GPR_LOCK) | (1 << GDS_GPR_LOCK) | (1 << VMW_GPR_LOCK) |
+ (1 << EXP_PARAM_ACCESS) | (1 << EXP_POS_ACCESS) | (1 << EXP_LDS_ACCESS),
+ (1 << VMEM_WRITE_ACCESS)};
// The mapping is:
// 0 .. SQ_MAX_PGM_VGPRS-1 real VGPRs
@@ -119,10 +120,10 @@ static const unsigned WaitEventMaskForInst[NUM_INST_CNTS] = {
// special tokens like SCMEM_LDS (needed for buffer load to LDS).
enum RegisterMapping {
SQ_MAX_PGM_VGPRS = 512, // Maximum programmable VGPRs across all targets.
- AGPR_OFFSET = 226, // Maximum programmable ArchVGPRs across all targets.
+ AGPR_OFFSET = 256, // Maximum programmable ArchVGPRs across all targets.
SQ_MAX_PGM_SGPRS = 256, // Maximum programmable SGPRs across all targets.
NUM_EXTRA_VGPRS = 1, // A reserved slot for DS.
- EXTRA_VGPR_LDS = 0, // This is a placeholder the Shader algorithm uses.
+ EXTRA_VGPR_LDS = 0, // An artificial register to track LDS writes.
NUM_ALL_VGPRS = SQ_MAX_PGM_VGPRS + NUM_EXTRA_VGPRS, // Where SGPR starts.
};
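The RegisterMapping constants above imply a flat array of score slots: ArchVGPRs in [0, AGPR_OFFSET), AGPRs shifted up by AGPR_OFFSET, and a single artificial slot just past SQ_MAX_PGM_VGPRS that tracks LDS writes. The sketch below spells out that layout; the helper is a hypothetical stand-in for what WaitcntBrackets::getRegInterval computes, not the real lookup.

#include <cassert>

constexpr unsigned SQ_MAX_PGM_VGPRS = 512;
constexpr unsigned AGPR_OFFSET = 256;
constexpr unsigned EXTRA_VGPR_LDS = 0;

enum class RegKind { ArchVGPR, AGPR, LDSExtra };

// Hypothetical helper: map a register to its slot in the score array.
static unsigned scoreSlot(RegKind Kind, unsigned Index = 0) {
  switch (Kind) {
  case RegKind::ArchVGPR:
    return Index;               // slots [0, AGPR_OFFSET)
  case RegKind::AGPR:
    return AGPR_OFFSET + Index; // AGPRs live above the ArchVGPR range
  case RegKind::LDSExtra:
    return SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS; // artificial LDS slot (512)
  }
  return ~0u;
}

int main() {
  assert(scoreSlot(RegKind::ArchVGPR, 3) == 3);
  assert(scoreSlot(RegKind::AGPR, 3) == 259);
  assert(scoreSlot(RegKind::LDSExtra) == 512);
  return 0;
}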
@@ -355,6 +356,8 @@ private:
DenseSet<MachineInstr *> TrackedWaitcntSet;
DenseMap<const Value *, MachineBasicBlock *> SLoadAddresses;
+ DenseMap<MachineBasicBlock *, bool> PreheadersToFlush;
+ MachineLoopInfo *MLI;
MachinePostDominatorTree *PDT;
struct BlockInfo {
@@ -381,6 +384,9 @@ public:
(void)ForceVMCounter;
}
+ bool shouldFlushVmCnt(MachineLoop *ML, WaitcntBrackets &Brackets);
+ bool isPreheaderToFlush(MachineBasicBlock &MBB,
+ WaitcntBrackets &ScoreBrackets);
bool runOnMachineFunction(MachineFunction &MF) override;
StringRef getPassName() const override {
@@ -389,6 +395,7 @@ public:
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesCFG();
+ AU.addRequired<MachineLoopInfo>();
AU.addRequired<MachinePostDominatorTree>();
MachineFunctionPass::getAnalysisUsage(AU);
}
@@ -431,14 +438,23 @@ public:
bool mayAccessLDSThroughFlat(const MachineInstr &MI) const;
bool generateWaitcntInstBefore(MachineInstr &MI,
WaitcntBrackets &ScoreBrackets,
- MachineInstr *OldWaitcntInstr);
+ MachineInstr *OldWaitcntInstr,
+ bool FlushVmCnt);
+ bool generateWaitcntBlockEnd(MachineBasicBlock &Block,
+ WaitcntBrackets &ScoreBrackets,
+ MachineInstr *OldWaitcntInstr);
+ bool generateWaitcnt(AMDGPU::Waitcnt Wait,
+ MachineBasicBlock::instr_iterator It,
+ MachineBasicBlock &Block, WaitcntBrackets &ScoreBrackets,
+ MachineInstr *OldWaitcntInstr);
void updateEventWaitcntAfter(MachineInstr &Inst,
WaitcntBrackets *ScoreBrackets);
bool insertWaitcntInBlock(MachineFunction &MF, MachineBasicBlock &Block,
WaitcntBrackets &ScoreBrackets);
bool applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
MachineInstr &OldWaitcntInstr,
- AMDGPU::Waitcnt &Wait, const MachineInstr *MI);
+ AMDGPU::Waitcnt &Wait,
+ MachineBasicBlock::instr_iterator It);
};
} // end anonymous namespace
@@ -496,6 +512,14 @@ void WaitcntBrackets::setExpScore(const MachineInstr *MI,
}
}
+// MUBUF and FLAT LDS DMA operations need a wait on vmcnt before the LDS they
+// write can be accessed. A load from LDS to VMEM does not need a wait.
+static bool mayWriteLDSThroughDMA(const MachineInstr &MI) {
+ return SIInstrInfo::isVALU(MI) &&
+ (SIInstrInfo::isMUBUF(MI) || SIInstrInfo::isFLAT(MI)) &&
+ MI.getOpcode() != AMDGPU::BUFFER_STORE_LDS_DWORD;
+}
+
void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
const SIRegisterInfo *TRI,
const MachineRegisterInfo *MRI,
@@ -588,6 +612,12 @@ void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
CurrScore);
}
+ } else if (TII->isLDSDIR(Inst)) {
+ // LDSDIR instructions attach the score to the destination.
+ setExpScore(
+ &Inst, TII, TRI, MRI,
+ AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::vdst),
+ CurrScore);
} else {
if (TII->isEXP(Inst)) {
// For export the destination registers are really temps that
@@ -644,7 +674,7 @@ void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
setRegScore(RegNo, T, CurrScore);
}
}
- if (TII->isDS(Inst) && Inst.mayStore()) {
+ if (Inst.mayStore() && (TII->isDS(Inst) || mayWriteLDSThroughDMA(Inst))) {
setRegScore(SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS, T, CurrScore);
}
}
@@ -784,6 +814,7 @@ bool WaitcntBrackets::counterOutOfOrder(InstCounterType T) const {
INITIALIZE_PASS_BEGIN(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false,
false)
+INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree)
INITIALIZE_PASS_END(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false,
false)
@@ -796,53 +827,53 @@ FunctionPass *llvm::createSIInsertWaitcntsPass() {
return new SIInsertWaitcnts();
}
-/// Combine consecutive waitcnt instructions that precede \p MI and follow
+/// Combine consecutive waitcnt instructions that precede \p It and follow
/// \p OldWaitcntInstr and apply any extra wait from waitcnt that were added
/// by previous passes. Currently this pass conservatively assumes that these
/// preexisting waitcnt are required for correctness.
-bool SIInsertWaitcnts::applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
- MachineInstr &OldWaitcntInstr,
- AMDGPU::Waitcnt &Wait,
- const MachineInstr *MI) {
+bool SIInsertWaitcnts::applyPreexistingWaitcnt(
+ WaitcntBrackets &ScoreBrackets, MachineInstr &OldWaitcntInstr,
+ AMDGPU::Waitcnt &Wait, MachineBasicBlock::instr_iterator It) {
bool Modified = false;
MachineInstr *WaitcntInstr = nullptr;
MachineInstr *WaitcntVsCntInstr = nullptr;
- for (auto II = OldWaitcntInstr.getIterator(), NextI = std::next(II);
- &*II != MI; II = NextI, ++NextI) {
- if (II->isMetaInstruction())
+
+ for (auto &II :
+ make_early_inc_range(make_range(OldWaitcntInstr.getIterator(), It))) {
+ if (II.isMetaInstruction())
continue;
- if (II->getOpcode() == AMDGPU::S_WAITCNT) {
+ if (II.getOpcode() == AMDGPU::S_WAITCNT) {
// Conservatively update required wait if this waitcnt was added in an
// earlier pass. In this case it will not exist in the tracked waitcnt
// set.
- if (!TrackedWaitcntSet.count(&*II)) {
- unsigned IEnc = II->getOperand(0).getImm();
+ if (!TrackedWaitcntSet.count(&II)) {
+ unsigned IEnc = II.getOperand(0).getImm();
AMDGPU::Waitcnt OldWait = AMDGPU::decodeWaitcnt(IV, IEnc);
Wait = Wait.combined(OldWait);
}
// Merge consecutive waitcnt of the same type by erasing multiples.
if (!WaitcntInstr) {
- WaitcntInstr = &*II;
+ WaitcntInstr = &II;
} else {
- II->eraseFromParent();
+ II.eraseFromParent();
Modified = true;
}
} else {
- assert(II->getOpcode() == AMDGPU::S_WAITCNT_VSCNT);
- assert(II->getOperand(0).getReg() == AMDGPU::SGPR_NULL);
- if (!TrackedWaitcntSet.count(&*II)) {
+ assert(II.getOpcode() == AMDGPU::S_WAITCNT_VSCNT);
+ assert(II.getOperand(0).getReg() == AMDGPU::SGPR_NULL);
+ if (!TrackedWaitcntSet.count(&II)) {
unsigned OldVSCnt =
- TII->getNamedOperand(*II, AMDGPU::OpName::simm16)->getImm();
+ TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
Wait.VsCnt = std::min(Wait.VsCnt, OldVSCnt);
}
if (!WaitcntVsCntInstr) {
- WaitcntVsCntInstr = &*II;
+ WaitcntVsCntInstr = &II;
} else {
- II->eraseFromParent();
+ II.eraseFromParent();
Modified = true;
}
}
@@ -862,9 +893,14 @@ bool SIInsertWaitcnts::applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
Wait.LgkmCnt = ~0u;
Wait.ExpCnt = ~0u;
- LLVM_DEBUG(dbgs() << "generateWaitcntInstBefore\n"
- << "Old Instr: " << *MI << "New Instr: " << *WaitcntInstr
- << '\n');
+ LLVM_DEBUG(It == OldWaitcntInstr.getParent()->end()
+ ? dbgs() << "applyPreexistingWaitcnt\n"
+ << "New Instr at block end: " << *WaitcntInstr
+ << '\n'
+ : dbgs() << "applyPreexistingWaitcnt\n"
+ << "Old Instr: " << *It
+ << "New Instr: " << *WaitcntInstr << '\n');
+
} else {
WaitcntInstr->eraseFromParent();
Modified = true;
@@ -885,9 +921,13 @@ bool SIInsertWaitcnts::applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
ScoreBrackets.applyWaitcnt(Wait);
Wait.VsCnt = ~0u;
- LLVM_DEBUG(dbgs() << "generateWaitcntInstBefore\n"
- << "Old Instr: " << *MI
- << "New Instr: " << *WaitcntVsCntInstr << '\n');
+ LLVM_DEBUG(It == OldWaitcntInstr.getParent()->end()
+ ? dbgs() << "applyPreexistingWaitcnt\n"
+ << "New Instr at block end: "
+ << *WaitcntVsCntInstr << '\n'
+ : dbgs() << "applyPreexistingWaitcnt\n"
+ << "Old Instr: " << *It
+ << "New Instr: " << *WaitcntVsCntInstr << '\n');
} else {
WaitcntVsCntInstr->eraseFromParent();
Modified = true;
@@ -928,16 +968,18 @@ static bool callWaitsOnFunctionReturn(const MachineInstr &MI) {
/// and if so what the value of each counter is.
/// The "score bracket" is bound by the lower bound and upper bound
/// scores (*_score_LB and *_score_ub respectively).
-bool SIInsertWaitcnts::generateWaitcntInstBefore(
- MachineInstr &MI, WaitcntBrackets &ScoreBrackets,
- MachineInstr *OldWaitcntInstr) {
+/// If FlushVmCnt is true, that means that we want to generate a s_waitcnt to
+/// flush the vmcnt counter here.
+bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
+ WaitcntBrackets &ScoreBrackets,
+ MachineInstr *OldWaitcntInstr,
+ bool FlushVmCnt) {
setForceEmitWaitcnt();
if (MI.isMetaInstruction())
return false;
AMDGPU::Waitcnt Wait;
- bool Modified = false;
// FIXME: This should have already been handled by the memory legalizer.
// Removing this currently doesn't affect any lit tests, but we need to
@@ -955,16 +997,17 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(
// NOTE: this could be improved with knowledge of all call sites or
// with knowledge of the called routines.
if (MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG ||
+ MI.getOpcode() == AMDGPU::SI_RETURN ||
MI.getOpcode() == AMDGPU::S_SETPC_B64_return ||
- MI.getOpcode() == AMDGPU::S_SETPC_B64_return_gfx ||
(MI.isReturn() && MI.isCall() && !callWaitsOnFunctionEntry(MI))) {
Wait = Wait.combined(AMDGPU::Waitcnt::allZero(ST->hasVscnt()));
}
// Resolve vm waits before gs-done.
else if ((MI.getOpcode() == AMDGPU::S_SENDMSG ||
MI.getOpcode() == AMDGPU::S_SENDMSGHALT) &&
- ((MI.getOperand(0).getImm() & AMDGPU::SendMsg::ID_MASK_) ==
- AMDGPU::SendMsg::ID_GS_DONE)) {
+ ST->hasLegacyGeometry() &&
+ ((MI.getOperand(0).getImm() & AMDGPU::SendMsg::ID_MASK_PreGFX11_) ==
+ AMDGPU::SendMsg::ID_GS_DONE_PreGFX11)) {
Wait.VmCnt = 0;
}
#if 0 // TODO: the following blocks of logic when we have fence.
@@ -1040,7 +1083,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(
if (MI.isCall() && callWaitsOnFunctionEntry(MI)) {
// The function is going to insert a wait on everything in its prolog.
// This still needs to be careful if the call target is a load (e.g. a GOT
- // load). We also need to check WAW depenancy with saved PC.
+ // load). We also need to check WAW dependency with saved PC.
Wait = AMDGPU::Waitcnt();
int CallAddrOpIdx =
@@ -1089,7 +1132,10 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(
SLoadAddresses.erase(Ptr);
}
unsigned AS = Memop->getAddrSpace();
- if (AS != AMDGPUAS::LOCAL_ADDRESS)
+ if (AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS::FLAT_ADDRESS)
+ continue;
+ // No need to wait before load from VMEM to LDS.
+ if (mayWriteLDSThroughDMA(MI))
continue;
unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS;
// VM_CNT is only relevant to vgpr or LDS.
@@ -1123,7 +1169,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(
VM_CNT, ScoreBrackets.getRegScore(RegNo, VM_CNT), Wait);
ScoreBrackets.clearVgprVmemTypes(RegNo);
}
- if (Op.isDef()) {
+ if (Op.isDef() || ScoreBrackets.hasPendingEvent(EXP_LDS_ACCESS)) {
ScoreBrackets.determineWait(
EXP_CNT, ScoreBrackets.getRegScore(RegNo, EXP_CNT), Wait);
}
@@ -1170,47 +1216,93 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(
if (ForceEmitWaitcnt[VS_CNT])
Wait.VsCnt = 0;
- if (OldWaitcntInstr) {
+ if (FlushVmCnt) {
+ unsigned UB = ScoreBrackets.getScoreUB(VM_CNT);
+ unsigned LB = ScoreBrackets.getScoreLB(VM_CNT);
+ if (UB - LB != 0)
+ Wait.VmCnt = 0;
+ }
+
+ return generateWaitcnt(Wait, MI.getIterator(), *MI.getParent(), ScoreBrackets,
+ OldWaitcntInstr);
+}
+
+// Add a waitcnt to flush the vmcnt counter at the end of the given block if
+// needed.
+bool SIInsertWaitcnts::generateWaitcntBlockEnd(MachineBasicBlock &Block,
+ WaitcntBrackets &ScoreBrackets,
+ MachineInstr *OldWaitcntInstr) {
+ AMDGPU::Waitcnt Wait;
+
+ unsigned UB = ScoreBrackets.getScoreUB(VM_CNT);
+ unsigned LB = ScoreBrackets.getScoreLB(VM_CNT);
+ if (UB - LB == 0)
+ return false;
+
+ Wait.VmCnt = 0;
+
+ return generateWaitcnt(Wait, Block.instr_end(), Block, ScoreBrackets,
+ OldWaitcntInstr);
+}
+
+bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt Wait,
+ MachineBasicBlock::instr_iterator It,
+ MachineBasicBlock &Block,
+ WaitcntBrackets &ScoreBrackets,
+ MachineInstr *OldWaitcntInstr) {
+ bool Modified = false;
+ const DebugLoc &DL = Block.findDebugLoc(It);
+
+ if (OldWaitcntInstr)
// Try to merge the required wait with preexisting waitcnt instructions.
// Also erase redundant waitcnt.
Modified =
- applyPreexistingWaitcnt(ScoreBrackets, *OldWaitcntInstr, Wait, &MI);
- } else {
- // Update waitcnt brackets after determining the required wait.
+ applyPreexistingWaitcnt(ScoreBrackets, *OldWaitcntInstr, Wait, It);
+ else
ScoreBrackets.applyWaitcnt(Wait);
+
+ // ExpCnt can be merged into VINTERP.
+ if (Wait.ExpCnt != ~0u && It != Block.instr_end() &&
+ SIInstrInfo::isVINTERP(*It)) {
+ MachineOperand *WaitExp =
+ TII->getNamedOperand(*It, AMDGPU::OpName::waitexp);
+ if (Wait.ExpCnt < WaitExp->getImm()) {
+ WaitExp->setImm(Wait.ExpCnt);
+ Modified = true;
+ }
+ Wait.ExpCnt = ~0u;
+
+ LLVM_DEBUG(dbgs() << "generateWaitcntInstBefore\n"
+ << "Update Instr: " << *It);
}
// Build new waitcnt instructions unless no wait is needed or the old waitcnt
// instruction was modified to handle the required wait.
if (Wait.hasWaitExceptVsCnt()) {
unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait);
- auto SWaitInst = BuildMI(*MI.getParent(), MI.getIterator(),
- MI.getDebugLoc(), TII->get(AMDGPU::S_WAITCNT))
- .addImm(Enc);
+ auto SWaitInst =
+ BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(Enc);
TrackedWaitcntSet.insert(SWaitInst);
Modified = true;
- LLVM_DEBUG(dbgs() << "generateWaitcntInstBefore\n"
- << "Old Instr: " << MI
- << "New Instr: " << *SWaitInst << '\n');
+ LLVM_DEBUG(dbgs() << "generateWaitcnt\n";
+ if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
+ dbgs() << "New Instr: " << *SWaitInst << '\n');
}
if (Wait.hasWaitVsCnt()) {
assert(ST->hasVscnt());
- auto SWaitInst =
- BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
- TII->get(AMDGPU::S_WAITCNT_VSCNT))
- .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
- .addImm(Wait.VsCnt);
+ auto SWaitInst = BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT))
+ .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
+ .addImm(Wait.VsCnt);
TrackedWaitcntSet.insert(SWaitInst);
Modified = true;
- LLVM_DEBUG(dbgs() << "generateWaitcntInstBefore\n"
- << "Old Instr: " << MI
- << "New Instr: " << *SWaitInst << '\n');
+ LLVM_DEBUG(dbgs() << "generateWaitcnt\n";
+ if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
+ dbgs() << "New Instr: " << *SWaitInst << '\n');
}
-
return Modified;
}
@@ -1338,6 +1430,11 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
      // May need to wait for anything.
ScoreBrackets->applyWaitcnt(AMDGPU::Waitcnt());
}
+ } else if (SIInstrInfo::isLDSDIR(Inst)) {
+ ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_LDS_ACCESS, Inst);
+ } else if (TII->isVINTERP(Inst)) {
+ int64_t Imm = TII->getNamedOperand(Inst, AMDGPU::OpName::waitexp)->getImm();
+ ScoreBrackets->applyWaitcnt(EXP_CNT, Imm);
} else if (SIInstrInfo::isEXP(Inst)) {
unsigned Imm = TII->getNamedOperand(Inst, AMDGPU::OpName::tgt)->getImm();
if (Imm >= AMDGPU::Exp::ET_PARAM0 && Imm <= AMDGPU::Exp::ET_PARAM31)
@@ -1349,6 +1446,8 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
} else {
switch (Inst.getOpcode()) {
case AMDGPU::S_SENDMSG:
+ case AMDGPU::S_SENDMSG_RTN_B32:
+ case AMDGPU::S_SENDMSG_RTN_B64:
case AMDGPU::S_SENDMSGHALT:
ScoreBrackets->updateByEvent(TII, TRI, MRI, SQ_MESSAGE, Inst);
break;
@@ -1476,8 +1575,12 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
continue;
}
+ bool FlushVmCnt = Block.getFirstTerminator() == Inst &&
+ isPreheaderToFlush(Block, ScoreBrackets);
+
// Generate an s_waitcnt instruction to be placed before Inst, if needed.
- Modified |= generateWaitcntInstBefore(Inst, ScoreBrackets, OldWaitcntInstr);
+ Modified |= generateWaitcntInstBefore(Inst, ScoreBrackets, OldWaitcntInstr,
+ FlushVmCnt);
OldWaitcntInstr = nullptr;
// Restore vccz if it's not known to be correct already.
@@ -1562,9 +1665,101 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
++Iter;
}
+ if (Block.getFirstTerminator() == Block.end() &&
+ isPreheaderToFlush(Block, ScoreBrackets))
+ Modified |= generateWaitcntBlockEnd(Block, ScoreBrackets, OldWaitcntInstr);
+
return Modified;
}
+// Return true if the given machine basic block is a preheader of a loop in
+// which we want to flush the vmcnt counter, and false otherwise.
+bool SIInsertWaitcnts::isPreheaderToFlush(MachineBasicBlock &MBB,
+ WaitcntBrackets &ScoreBrackets) {
+ if (PreheadersToFlush.count(&MBB))
+ return PreheadersToFlush[&MBB];
+
+ auto UpdateCache = [&](bool val) {
+ PreheadersToFlush[&MBB] = val;
+ return val;
+ };
+
+ MachineBasicBlock *Succ = MBB.getSingleSuccessor();
+ if (!Succ)
+ return UpdateCache(false);
+
+ MachineLoop *Loop = MLI->getLoopFor(Succ);
+ if (!Loop)
+ return UpdateCache(false);
+
+ if (Loop->getLoopPreheader() == &MBB && shouldFlushVmCnt(Loop, ScoreBrackets))
+ return UpdateCache(true);
+
+ return UpdateCache(false);
+}
+
+// Return true if it is better to flush the vmcnt counter in the preheader of
+// the given loop. We currently decide to flush in two situations:
+// 1. The loop contains vmem store(s), no vmem load and at least one use of a
+// vgpr containing a value that is loaded outside of the loop. (Only on
+// targets with no vscnt counter).
+// 2. The loop contains vmem load(s), but the loaded values are not used in the
+// loop, and at least one use of a vgpr containing a value that is loaded
+// outside of the loop.
+bool SIInsertWaitcnts::shouldFlushVmCnt(MachineLoop *ML,
+ WaitcntBrackets &Brackets) {
+ bool HasVMemLoad = false;
+ bool HasVMemStore = false;
+ bool UsesVgprLoadedOutside = false;
+ DenseSet<Register> VgprUse;
+ DenseSet<Register> VgprDef;
+
+ for (MachineBasicBlock *MBB : ML->blocks()) {
+ for (MachineInstr &MI : *MBB) {
+ if (SIInstrInfo::isVMEM(MI)) {
+ if (MI.mayLoad())
+ HasVMemLoad = true;
+ if (MI.mayStore())
+ HasVMemStore = true;
+ }
+ for (unsigned I = 0; I < MI.getNumOperands(); I++) {
+ MachineOperand &Op = MI.getOperand(I);
+ if (!Op.isReg() || !TRI->isVectorRegister(*MRI, Op.getReg()))
+ continue;
+ RegInterval Interval = Brackets.getRegInterval(&MI, TII, MRI, TRI, I);
+ // Vgpr use
+ if (Op.isUse()) {
+ for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
+ // If we find a register that is loaded inside the loop, 1. and 2.
+ // are invalidated and we can exit.
+ if (VgprDef.contains(RegNo))
+ return false;
+ VgprUse.insert(RegNo);
+ // If at least one of Op's registers is in the score brackets, the
+ // value is likely loaded outside of the loop.
+ if (Brackets.getRegScore(RegNo, VM_CNT) > 0) {
+ UsesVgprLoadedOutside = true;
+ break;
+ }
+ }
+ }
+ // VMem load vgpr def
+ else if (SIInstrInfo::isVMEM(MI) && MI.mayLoad() && Op.isDef())
+ for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
+ // If we find a register that is loaded inside the loop, 1. and 2.
+ // are invalidated and we can exit.
+ if (VgprUse.contains(RegNo))
+ return false;
+ VgprDef.insert(RegNo);
+ }
+ }
+ }
+ }
+ if (!ST->hasVscnt() && HasVMemStore && !HasVMemLoad && UsesVgprLoadedOutside)
+ return true;
+ return HasVMemLoad && UsesVgprLoadedOutside;
+}
+
bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
ST = &MF.getSubtarget<GCNSubtarget>();
TII = ST->getInstrInfo();
@@ -1572,6 +1767,7 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
MRI = &MF.getRegInfo();
IV = AMDGPU::getIsaVersion(ST->getCPU());
const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+ MLI = &getAnalysis<MachineLoopInfo>();
PDT = &getAnalysis<MachinePostDominatorTree>();
ForceEmitZeroWaitcnts = ForceEmitZeroFlag;
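To summarize the new preheader logic in this file: shouldFlushVmCnt decides whether a loop preheader should end with a vmcnt-flushing s_waitcnt, based on whether the loop stores or loads through VMEM and whether it consumes VGPRs whose values were loaded before the loop. Below is a standalone sketch of that final decision, with booleans standing in for the per-loop scan the pass performs; it is illustrative only.

#include <cassert>

// Booleans summarize what the pass gathers by scanning the loop body.
static bool shouldFlushVmCntSketch(bool HasVscnt, bool HasVMemLoad,
                                   bool HasVMemStore,
                                   bool UsesVgprLoadedOutside) {
  // Case 1: stores only (on targets without a separate vscnt counter), while
  // the loop keeps waiting on values loaded before it was entered.
  if (!HasVscnt && HasVMemStore && !HasVMemLoad && UsesVgprLoadedOutside)
    return true;
  // Case 2: the loop loads through VMEM (results unused inside it, which the
  // real scan verifies) and also uses values loaded outside the loop.
  return HasVMemLoad && UsesVgprLoadedOutside;
}

int main() {
  assert(shouldFlushVmCntSketch(false, false, true, true)); // case 1
  assert(shouldFlushVmCntSketch(true, true, false, true));  // case 2
  assert(!shouldFlushVmCntSketch(true, false, true, true)); // vscnt, no load
  return 0;
}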
diff --git a/llvm/lib/Target/AMDGPU/SIInstrFormats.td b/llvm/lib/Target/AMDGPU/SIInstrFormats.td
index e39f52875f1f..b398e108bf62 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrFormats.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrFormats.td
@@ -48,6 +48,12 @@ class InstSI <dag outs, dag ins, string asm = "",
field bit VGPRSpill = 0;
field bit SGPRSpill = 0;
+ // LDSDIR instruction format.
+ field bit LDSDIR = 0;
+
+ // VINTERP instruction format.
+ field bit VINTERP = 0;
+
// High bits - other information.
field bit VM_CNT = 0;
field bit EXP_CNT = 0;
@@ -141,6 +147,9 @@ class InstSI <dag outs, dag ins, string asm = "",
// Atomic with return.
field bit IsAtomicRet = 0;
+ // This bit indicates that this is one of WMMA instructions.
+ field bit IsWMMA = 0;
+
// These need to be kept in sync with the enum in SIInstrFlags.
let TSFlags{0} = SALU;
let TSFlags{1} = VALU;
@@ -173,6 +182,9 @@ class InstSI <dag outs, dag ins, string asm = "",
let TSFlags{24} = VGPRSpill;
let TSFlags{25} = SGPRSpill;
+ let TSFlags{26} = LDSDIR;
+ let TSFlags{27} = VINTERP;
+
let TSFlags{32} = VM_CNT;
let TSFlags{33} = EXP_CNT;
let TSFlags{34} = LGKM_CNT;
@@ -215,6 +227,8 @@ class InstSI <dag outs, dag ins, string asm = "",
let TSFlags{58} = IsAtomicRet;
+ let TSFlags{59} = IsWMMA;
+
let SchedRW = [Write32Bit];
let AsmVariantName = AMDGPUAsmVariants.Default;
@@ -261,6 +275,11 @@ class Enc64 {
int Size = 8;
}
+class Enc96 {
+ field bits<96> Inst;
+ int Size = 12;
+}
+
def CPolBit {
int GLC = 0;
int SLC = 1;
@@ -284,7 +303,7 @@ class VINTRPe <bits<2> op> : Enc32 {
let Inst{31-26} = 0x32; // encoding
}
-class MIMGe : Enc64 {
+class MIMGe_gfxpre11 : Enc64 {
bits<10> vdata;
bits<4> dmask;
bits<1> unorm;
@@ -309,7 +328,7 @@ class MIMGe : Enc64 {
let Inst{63} = d16;
}
-class MIMGe_gfx6789 <bits<8> op> : MIMGe {
+class MIMGe_gfx6789 <bits<8> op> : MIMGe_gfxpre11 {
bits<8> vaddr;
bits<1> da;
@@ -321,7 +340,7 @@ class MIMGe_gfx6789 <bits<8> op> : MIMGe {
let Inst{39-32} = vaddr;
}
-class MIMGe_gfx90a <bits<8> op> : MIMGe {
+class MIMGe_gfx90a <bits<8> op> : MIMGe_gfxpre11 {
bits<8> vaddr;
bits<1> da;
@@ -333,7 +352,7 @@ class MIMGe_gfx90a <bits<8> op> : MIMGe {
let Inst{39-32} = vaddr;
}
-class MIMGe_gfx10 <bits<8> op> : MIMGe {
+class MIMGe_gfx10 <bits<8> op> : MIMGe_gfxpre11 {
bits<8> vaddr0;
bits<3> dim;
bits<2> nsa;
@@ -349,12 +368,46 @@ class MIMGe_gfx10 <bits<8> op> : MIMGe {
let Inst{62} = a16;
}
+class MIMGe_gfx11 <bits<8> op> : Enc64 {
+ bits<8> vdata;
+ bits<4> dmask;
+ bits<1> unorm;
+ bits<5> cpol;
+ bits<1> r128;
+ bits<1> tfe;
+ bits<1> lwe;
+ bits<7> srsrc;
+ bits<7> ssamp;
+ bit d16;
+ bits<1> a16;
+ bits<8> vaddr0;
+ bits<3> dim;
+ bits<1> nsa;
+
+ let Inst{0} = nsa;
+ let Inst{4-2} = dim;
+ let Inst{7} = unorm;
+ let Inst{11-8} = dmask;
+ let Inst{12} = cpol{CPolBit.SLC};
+ let Inst{13} = cpol{CPolBit.DLC};
+ let Inst{14} = cpol{CPolBit.GLC};
+ let Inst{15} = r128;
+ let Inst{16} = a16;
+ let Inst{17} = d16;
+ let Inst{25-18} = op;
+ let Inst{31-26} = 0x3c;
+ let Inst{39-32} = vaddr0;
+ let Inst{47-40} = vdata;
+ let Inst{52-48} = srsrc{6-2};
+ let Inst{53} = tfe;
+ let Inst{54} = lwe;
+ let Inst{62-58} = ssamp{6-2};
+}
+
class EXPe : Enc64 {
bits<4> en;
bits<6> tgt;
- bits<1> compr;
bits<1> done;
- bits<1> vm;
bits<8> src0;
bits<8> src1;
bits<8> src2;
@@ -362,9 +415,7 @@ class EXPe : Enc64 {
let Inst{3-0} = en;
let Inst{9-4} = tgt;
- let Inst{10} = compr;
let Inst{11} = done;
- let Inst{12} = vm;
let Inst{31-26} = 0x3e;
let Inst{39-32} = src0;
let Inst{47-40} = src1;
@@ -372,6 +423,22 @@ class EXPe : Enc64 {
let Inst{63-56} = src3;
}
+// Pre-GFX11 encoding has compr and vm bits.
+class EXPe_ComprVM : EXPe {
+ bits<1> compr;
+ bits<1> vm;
+
+ let Inst{10} = compr;
+ let Inst{12} = vm;
+}
+
+// GFX11+ encoding has row bit.
+class EXPe_Row : EXPe {
+ bits<1> row;
+
+ let Inst{13} = row;
+}
+
let Uses = [EXEC] in {
class VINTRPCommon <dag outs, dag ins, string asm, list<dag> pattern> :
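The EXPe/EXPe_ComprVM/EXPe_Row split above moves the compr and vm bits out of the shared base class and adds the GFX11 row bit. As a reading aid, here is a small standalone packer for the low 32 bits of that encoding using exactly the bit positions listed in the TableGen classes; it is a sketch, not the MC emitter.

#include <cstdint>
#include <cstdio>

// Pack the low 32 bits of an EXP instruction from the fields above.
static uint32_t packExpLow(unsigned En, unsigned Tgt, bool Done, bool Compr,
                           bool Vm, bool Row) {
  uint32_t Inst = 0;
  Inst |= (En & 0xfu);           // Inst{3-0}   = en
  Inst |= (Tgt & 0x3fu) << 4;    // Inst{9-4}   = tgt
  Inst |= uint32_t(Compr) << 10; // Inst{10}    = compr (pre-GFX11 only)
  Inst |= uint32_t(Done) << 11;  // Inst{11}    = done
  Inst |= uint32_t(Vm) << 12;    // Inst{12}    = vm (pre-GFX11 only)
  Inst |= uint32_t(Row) << 13;   // Inst{13}    = row (GFX11+ only)
  Inst |= 0x3eu << 26;           // Inst{31-26} = EXP encoding
  return Inst;
}

int main() {
  // Example: export to target 12 with all four components enabled, done set.
  std::printf("0x%08x\n", packExpLow(0xf, 12, /*Done=*/true, /*Compr=*/false,
                                     /*Vm=*/false, /*Row=*/false));
  return 0;
}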
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 0a2f9381e71f..814a7c446889 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -16,12 +16,12 @@
#include "AMDGPUInstrInfo.h"
#include "GCNHazardRecognizer.h"
#include "GCNSubtarget.h"
-#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/LiveVariables.h"
#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/CodeGen/RegisterScavenging.h"
#include "llvm/CodeGen/ScheduleDAG.h"
@@ -130,9 +130,31 @@ bool SIInstrInfo::isReallyTriviallyReMaterializable(const MachineInstr &MI,
return false;
}
-static bool readsExecAsData(const MachineInstr &MI) {
- if (MI.isCompare())
- return true;
+// Returns true if the scalar result of a VALU instruction depends on exec.
+static bool resultDependsOnExec(const MachineInstr &MI) {
+ // Ignore comparisons which are only used masked with exec.
+ // This allows some hoisting/sinking of VALU comparisons.
+ if (MI.isCompare()) {
+ const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
+ Register DstReg = MI.getOperand(0).getReg();
+ if (!DstReg.isVirtual())
+ return true;
+ for (MachineInstr &Use : MRI.use_nodbg_instructions(DstReg)) {
+ switch (Use.getOpcode()) {
+ case AMDGPU::S_AND_SAVEEXEC_B32:
+ case AMDGPU::S_AND_SAVEEXEC_B64:
+ break;
+ case AMDGPU::S_AND_B32:
+ case AMDGPU::S_AND_B64:
+ if (!Use.readsRegister(AMDGPU::EXEC))
+ return true;
+ break;
+ default:
+ return true;
+ }
+ }
+ return false;
+ }
switch (MI.getOpcode()) {
default:
@@ -147,7 +169,7 @@ static bool readsExecAsData(const MachineInstr &MI) {
bool SIInstrInfo::isIgnorableUse(const MachineOperand &MO) const {
// Any implicit use of exec by VALU is not a real register read.
return MO.getReg() == AMDGPU::EXEC && MO.isImplicit() &&
- isVALU(*MO.getParent()) && !readsExecAsData(*MO.getParent());
+ isVALU(*MO.getParent()) && !resultDependsOnExec(*MO.getParent());
}
bool SIInstrInfo::areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1,
@@ -181,7 +203,7 @@ bool SIInstrInfo::areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1,
if (Offset0Idx == -1 || Offset1Idx == -1)
return false;
- // XXX - be careful of datalesss loads
+ // XXX - be careful of dataless loads
// getNamedOperandIdx returns the index for MachineInstrs. Since they
// include the output in the operand list, but SDNodes don't, we need to
// subtract the index by one.
@@ -362,6 +384,8 @@ bool SIInstrInfo::getMemOperandsWithOffsetWidth(
DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
if (DataOpIdx == -1)
DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata);
+ if (DataOpIdx == -1) // LDS DMA
+ return false;
Width = getOpSize(LdSt, DataOpIdx);
return true;
}
@@ -410,6 +434,8 @@ bool SIInstrInfo::getMemOperandsWithOffsetWidth(
DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
if (DataOpIdx == -1)
DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata);
+ if (DataOpIdx == -1) // LDS DMA
+ return false;
Width = getOpSize(LdSt, DataOpIdx);
return true;
}
@@ -464,7 +490,7 @@ bool SIInstrInfo::shouldClusterMemOps(ArrayRef<const MachineOperand *> BaseOps1,
return false;
}
- // In order to avoid regester pressure, on an average, the number of DWORDS
+ // In order to avoid register pressure, on an average, the number of DWORDS
// loaded together by all clustered mem ops should not exceed 8. This is an
// empirical value based on certain observations and performance related
// experiments.
@@ -517,8 +543,9 @@ static void reportIllegalCopy(const SIInstrInfo *TII, MachineBasicBlock &MBB,
.addReg(SrcReg, getKillRegState(KillSrc));
}
-/// Handle copying from SGPR to AGPR, or from AGPR to AGPR. It is not possible
-/// to directly copy, so an intermediate VGPR needs to be used.
+/// Handle copying from SGPR to AGPR, or from AGPR to AGPR on GFX908. It is not
+/// possible to have a direct copy in these cases on GFX908, so an intermediate
+/// VGPR copy is required.
static void indirectCopyToAGPR(const SIInstrInfo &TII,
MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI,
@@ -527,10 +554,18 @@ static void indirectCopyToAGPR(const SIInstrInfo &TII,
RegScavenger &RS,
Register ImpDefSuperReg = Register(),
Register ImpUseSuperReg = Register()) {
- const SIRegisterInfo &RI = TII.getRegisterInfo();
+ assert((TII.getSubtarget().hasMAIInsts() &&
+ !TII.getSubtarget().hasGFX90AInsts()) &&
+ "Expected GFX908 subtarget.");
- assert(AMDGPU::SReg_32RegClass.contains(SrcReg) ||
- AMDGPU::AGPR_32RegClass.contains(SrcReg));
+ assert((AMDGPU::SReg_32RegClass.contains(SrcReg) ||
+ AMDGPU::AGPR_32RegClass.contains(SrcReg)) &&
+ "Source register of the copy should be either an SGPR or an AGPR.");
+
+ assert(AMDGPU::AGPR_32RegClass.contains(DestReg) &&
+ "Destination register of the copy should be an AGPR.");
+
+ const SIRegisterInfo &RI = TII.getRegisterInfo();
// First try to find defining accvgpr_write to avoid temporary registers.
for (auto Def = MI, E = MBB.begin(); Def != E; ) {
@@ -581,23 +616,21 @@ static void indirectCopyToAGPR(const SIInstrInfo &TII,
// Registers in the sequence are allocated contiguously so we can just
// use register number to pick one of three round-robin temps.
- unsigned RegNo = DestReg % 3;
- Register Tmp = RS.scavengeRegister(&AMDGPU::VGPR_32RegClass, 0);
- if (!Tmp)
- report_fatal_error("Cannot scavenge VGPR to copy to AGPR");
- RS.setRegUsed(Tmp);
+ unsigned RegNo = (DestReg - AMDGPU::AGPR0) % 3;
+ Register Tmp =
+ MBB.getParent()->getInfo<SIMachineFunctionInfo>()->getVGPRForAGPRCopy();
+ assert(MBB.getParent()->getRegInfo().isReserved(Tmp) &&
+ "VGPR used for an intermediate copy should have been reserved.");
- if (!TII.getSubtarget().hasGFX90AInsts()) {
- // Only loop through if there are any free registers left, otherwise
- // scavenger may report a fatal error without emergency spill slot
- // or spill with the slot.
- while (RegNo-- && RS.FindUnusedReg(&AMDGPU::VGPR_32RegClass)) {
- Register Tmp2 = RS.scavengeRegister(&AMDGPU::VGPR_32RegClass, 0);
- if (!Tmp2 || RI.getHWRegIndex(Tmp2) >= MaxVGPRs)
- break;
- Tmp = Tmp2;
- RS.setRegUsed(Tmp);
- }
+ // Only loop through if there are any free registers left, otherwise
+ // scavenger may report a fatal error without emergency spill slot
+ // or spill with the slot.
+ while (RegNo-- && RS.FindUnusedReg(&AMDGPU::VGPR_32RegClass)) {
+ Register Tmp2 = RS.scavengeRegister(&AMDGPU::VGPR_32RegClass, 0);
+ if (!Tmp2 || RI.getHWRegIndex(Tmp2) >= MaxVGPRs)
+ break;
+ Tmp = Tmp2;
+ RS.setRegUsed(Tmp);
}
// Insert copy to temporary VGPR.
@@ -796,7 +829,8 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
}
if (RC == &AMDGPU::AGPR_32RegClass) {
- if (AMDGPU::VGPR_32RegClass.contains(SrcReg)) {
+ if (AMDGPU::VGPR_32RegClass.contains(SrcReg) ||
+ (ST.hasGFX90AInsts() && AMDGPU::SReg_32RegClass.contains(SrcReg))) {
BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
.addReg(SrcReg, getKillRegState(KillSrc));
return;
@@ -884,6 +918,11 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
const TargetRegisterClass *SrcRC = RI.getPhysRegClass(SrcReg);
if (RC == RI.getVGPR64Class() && (SrcRC == RC || RI.isSGPRClass(SrcRC))) {
+ if (ST.hasMovB64()) {
+ BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_e32), DestReg)
+ .addReg(SrcReg, getKillRegState(KillSrc));
+ return;
+ }
if (ST.hasPackedFP32Ops()) {
BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), DestReg)
.addImm(SISrcMods::OP_SEL_1)
@@ -906,7 +945,9 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
return;
}
- expandSGPRCopy(*this, MBB, MI, DL, DestReg, SrcReg, KillSrc, RC, Forward);
+ const bool CanKillSuperReg = KillSrc && !RI.regsOverlap(SrcReg, DestReg);
+ expandSGPRCopy(*this, MBB, MI, DL, DestReg, SrcReg, CanKillSuperReg, RC,
+ Forward);
return;
}
@@ -915,7 +956,8 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
if (RI.isAGPRClass(RC)) {
if (ST.hasGFX90AInsts() && RI.isAGPRClass(SrcRC))
Opcode = AMDGPU::V_ACCVGPR_MOV_B32;
- else if (RI.hasVGPRs(SrcRC))
+ else if (RI.hasVGPRs(SrcRC) ||
+ (ST.hasGFX90AInsts() && RI.isSGPRClass(SrcRC)))
Opcode = AMDGPU::V_ACCVGPR_WRITE_B32_e64;
else
Opcode = AMDGPU::INSTRUCTION_LIST_END;
@@ -925,7 +967,10 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
(RI.isProperlyAlignedRC(*RC) &&
(SrcRC == RC || RI.isSGPRClass(SrcRC)))) {
// TODO: In 96-bit case, could do a 64-bit mov and then a 32-bit mov.
- if (ST.hasPackedFP32Ops()) {
+ if (ST.hasMovB64()) {
+ Opcode = AMDGPU::V_MOV_B64_e32;
+ EltSize = 8;
+ } else if (ST.hasPackedFP32Ops()) {
Opcode = AMDGPU::V_PK_MOV_B32;
EltSize = 8;
}
@@ -1725,13 +1770,8 @@ unsigned SIInstrInfo::getNumWaitStates(const MachineInstr &MI) {
case AMDGPU::S_NOP:
return MI.getOperand(0).getImm() + 1;
-
- // FIXME: Any other pseudo instruction?
// SI_RETURN_TO_EPILOG is a fallthrough to code outside of the function. The
  // hazard, even if one exists, won't really be visible. Should we handle it?
- case AMDGPU::SI_MASKED_UNREACHABLE:
- case AMDGPU::WAVE_BARRIER:
- return 0;
}
}
@@ -1807,6 +1847,11 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
const MachineOperand &SrcOp = MI.getOperand(1);
// FIXME: Will this work for 64-bit floating point immediates?
assert(!SrcOp.isFPImm());
+ if (ST.hasMovB64()) {
+ MI.setDesc(get(AMDGPU::V_MOV_B64_e32));
+ if (!isLiteralConstant(MI, 1) || isUInt<32>(SrcOp.getImm()))
+ break;
+ }
if (SrcOp.isImm()) {
APInt Imm(64, SrcOp.getImm());
APInt Lo(32, Imm.getLoBits(32).getZExtValue());
@@ -1887,6 +1932,10 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
case AMDGPU::V_SET_INACTIVE_B32: {
unsigned NotOpc = ST.isWave32() ? AMDGPU::S_NOT_B32 : AMDGPU::S_NOT_B64;
unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
+ // FIXME: We may possibly optimize the COPY once we find ways to make LLVM
+ // optimizations (mainly Register Coalescer) aware of WWM register liveness.
+ BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), MI.getOperand(0).getReg())
+ .add(MI.getOperand(1));
auto FirstNot = BuildMI(MBB, MI, DL, get(NotOpc), Exec).addReg(Exec);
FirstNot->addRegisterDead(AMDGPU::SCC, TRI); // SCC is overwritten
BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), MI.getOperand(0).getReg())
@@ -1899,11 +1948,15 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
case AMDGPU::V_SET_INACTIVE_B64: {
unsigned NotOpc = ST.isWave32() ? AMDGPU::S_NOT_B32 : AMDGPU::S_NOT_B64;
unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
- auto FirstNot = BuildMI(MBB, MI, DL, get(NotOpc), Exec).addReg(Exec);
- FirstNot->addRegisterDead(AMDGPU::SCC, TRI); // SCC is overwritten
MachineInstr *Copy = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_PSEUDO),
MI.getOperand(0).getReg())
- .add(MI.getOperand(2));
+ .add(MI.getOperand(1));
+ expandPostRAPseudo(*Copy);
+ auto FirstNot = BuildMI(MBB, MI, DL, get(NotOpc), Exec).addReg(Exec);
+ FirstNot->addRegisterDead(AMDGPU::SCC, TRI); // SCC is overwritten
+ Copy = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_PSEUDO),
+ MI.getOperand(0).getReg())
+ .add(MI.getOperand(2));
expandPostRAPseudo(*Copy);
BuildMI(MBB, MI, DL, get(NotOpc), Exec)
.addReg(Exec);
@@ -2085,6 +2138,23 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
MI.setDesc(get(ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64));
break;
}
+ case AMDGPU::SI_RETURN: {
+ const MachineFunction *MF = MBB.getParent();
+ const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
+ const SIRegisterInfo *TRI = ST.getRegisterInfo();
+ // Hiding the return address use with SI_RETURN may lead to extra kills in
+ // the function and missing live-ins. We are fine in practice because callee
+ // saved register handling ensures the register value is restored before
+ // RET, but we need the undef flag here to appease the MachineVerifier
+ // liveness checks.
+ MachineInstrBuilder MIB =
+ BuildMI(MBB, MI, DL, get(AMDGPU::S_SETPC_B64_return))
+ .addReg(TRI->getReturnAddressReg(*MF), RegState::Undef);
+
+ MIB.copyImplicitOps(MI);
+ MI.eraseFromParent();
+ break;
+ }
}
return true;
}
@@ -2093,6 +2163,13 @@ std::pair<MachineInstr*, MachineInstr*>
SIInstrInfo::expandMovDPP64(MachineInstr &MI) const {
assert (MI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO);
+ if (ST.hasMovB64() &&
+ AMDGPU::isLegal64BitDPPControl(
+ getNamedOperand(MI, AMDGPU::OpName::dpp_ctrl)->getImm())) {
+ MI.setDesc(get(AMDGPU::V_MOV_B64_dpp));
+ return std::make_pair(&MI, nullptr);
+ }
+
MachineBasicBlock &MBB = *MI.getParent();
DebugLoc DL = MBB.findDebugLoc(MI);
MachineFunction *MF = MBB.getParent();
@@ -2789,6 +2866,8 @@ bool SIInstrInfo::isFoldableCopy(const MachineInstr &MI) {
case AMDGPU::V_MOV_B32_e32:
case AMDGPU::V_MOV_B32_e64:
case AMDGPU::V_MOV_B64_PSEUDO:
+ case AMDGPU::V_MOV_B64_e32:
+ case AMDGPU::V_MOV_B64_e64:
case AMDGPU::S_MOV_B32:
case AMDGPU::S_MOV_B64:
case AMDGPU::COPY:
@@ -2801,35 +2880,15 @@ bool SIInstrInfo::isFoldableCopy(const MachineInstr &MI) {
}
}
-unsigned SIInstrInfo::getAddressSpaceForPseudoSourceKind(
- unsigned Kind) const {
- switch(Kind) {
- case PseudoSourceValue::Stack:
- case PseudoSourceValue::FixedStack:
- return AMDGPUAS::PRIVATE_ADDRESS;
- case PseudoSourceValue::ConstantPool:
- case PseudoSourceValue::GOT:
- case PseudoSourceValue::JumpTable:
- case PseudoSourceValue::GlobalValueCallEntry:
- case PseudoSourceValue::ExternalSymbolCallEntry:
- case PseudoSourceValue::TargetCustom:
- return AMDGPUAS::CONSTANT_ADDRESS;
- }
- return AMDGPUAS::FLAT_ADDRESS;
-}
+static constexpr unsigned ModifierOpNames[] = {
+ AMDGPU::OpName::src0_modifiers, AMDGPU::OpName::src1_modifiers,
+ AMDGPU::OpName::src2_modifiers, AMDGPU::OpName::clamp,
+ AMDGPU::OpName::omod};
-static void removeModOperands(MachineInstr &MI) {
+void SIInstrInfo::removeModOperands(MachineInstr &MI) const {
unsigned Opc = MI.getOpcode();
- int Src0ModIdx = AMDGPU::getNamedOperandIdx(Opc,
- AMDGPU::OpName::src0_modifiers);
- int Src1ModIdx = AMDGPU::getNamedOperandIdx(Opc,
- AMDGPU::OpName::src1_modifiers);
- int Src2ModIdx = AMDGPU::getNamedOperandIdx(Opc,
- AMDGPU::OpName::src2_modifiers);
-
- MI.RemoveOperand(Src2ModIdx);
- MI.RemoveOperand(Src1ModIdx);
- MI.RemoveOperand(Src0ModIdx);
+ for (unsigned Name : reverse(ModifierOpNames))
+ MI.removeOperand(AMDGPU::getNamedOperandIdx(Opc, Name));
}
bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
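The removeModOperands rewrite above iterates ModifierOpNames in reverse because removing an operand shifts every later index down; deleting the highest-indexed operands (omod, then clamp, then the source modifiers) first keeps the remaining named-operand indices valid. A tiny standalone illustration, with std::vector standing in for the operand list:

#include <cassert>
#include <vector>

int main() {
  std::vector<int> Operands = {10, 11, 12, 13, 14};
  std::vector<unsigned> ToRemove = {1, 3}; // indices gathered up front

  // Erase from the back: removing index 3 first leaves index 1 still valid.
  for (auto It = ToRemove.rbegin(); It != ToRemove.rend(); ++It)
    Operands.erase(Operands.begin() + *It);

  assert((Operands == std::vector<int>{10, 12, 14}));
  return 0;
}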
@@ -2841,7 +2900,7 @@ bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
default:
return false;
case AMDGPU::S_MOV_B64:
- // TODO: We could fold 64-bit immediates, but this get compilicated
+    // TODO: We could fold 64-bit immediates, but this gets complicated
// when there are sub-registers.
return false;
@@ -2921,7 +2980,7 @@ bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
MachineOperand *Src2 = getNamedOperand(UseMI, AMDGPU::OpName::src2);
// Multiplied part is the constant: Use v_madmk_{f16, f32}.
- // We should only expect these to be on src0 due to canonicalizations.
+ // We should only expect these to be on src0 due to canonicalization.
if (Src0->isReg() && Src0->getReg() == Reg) {
if (!Src1->isReg() || RI.isSGPRClass(MRI->getRegClass(Src1->getReg())))
return false;
@@ -2942,12 +3001,6 @@ bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
// FIXME: This would be a lot easier if we could return a new instruction
// instead of having to modify in place.
- // Remove these first since they are at the end.
- UseMI.RemoveOperand(
- AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::omod));
- UseMI.RemoveOperand(
- AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::clamp));
-
Register Src1Reg = Src1->getReg();
unsigned Src1SubReg = Src1->getSubReg();
Src0->setReg(Src1Reg);
@@ -2966,7 +3019,7 @@ bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
removeModOperands(UseMI);
UseMI.setDesc(get(NewOpc));
- bool DeleteDef = MRI->hasOneNonDBGUse(Reg);
+ bool DeleteDef = MRI->use_nodbg_empty(Reg);
if (DeleteDef)
DefMI.eraseFromParent();
@@ -3025,12 +3078,6 @@ bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
// FIXME: This would be a lot easier if we could return a new instruction
// instead of having to modify in place.
- // Remove these first since they are at the end.
- UseMI.RemoveOperand(
- AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::omod));
- UseMI.RemoveOperand(
- AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::clamp));
-
if (Opc == AMDGPU::V_MAC_F32_e64 ||
Opc == AMDGPU::V_MAC_F16_e64 ||
Opc == AMDGPU::V_FMAC_F32_e64 ||
@@ -3049,7 +3096,7 @@ bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
// constant and SGPR are illegal.
legalizeOperands(UseMI);
- bool DeleteDef = MRI->hasOneNonDBGUse(Reg);
+ bool DeleteDef = MRI->use_nodbg_empty(Reg);
if (DeleteDef)
DefMI.eraseFromParent();
@@ -3192,34 +3239,68 @@ static void updateLiveVariables(LiveVariables *LV, MachineInstr &MI,
MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
LiveVariables *LV,
LiveIntervals *LIS) const {
+ MachineBasicBlock &MBB = *MI.getParent();
unsigned Opc = MI.getOpcode();
- bool IsF16 = false;
+
+ // Handle MFMA.
+ int NewMFMAOpc = AMDGPU::getMFMAEarlyClobberOp(Opc);
+ if (NewMFMAOpc != -1) {
+ MachineInstrBuilder MIB =
+ BuildMI(MBB, MI, MI.getDebugLoc(), get(NewMFMAOpc));
+ for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I)
+ MIB.add(MI.getOperand(I));
+ updateLiveVariables(LV, MI, *MIB);
+ if (LIS)
+ LIS->ReplaceMachineInstrInMaps(MI, *MIB);
+ return MIB;
+ }
+
+ if (SIInstrInfo::isWMMA(MI)) {
+ unsigned NewOpc = AMDGPU::mapWMMA2AddrTo3AddrOpcode(MI.getOpcode());
+ MachineInstrBuilder MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
+ .setMIFlags(MI.getFlags());
+ for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I)
+ MIB->addOperand(MI.getOperand(I));
+
+ updateLiveVariables(LV, MI, *MIB);
+ if (LIS)
+ LIS->ReplaceMachineInstrInMaps(MI, *MIB);
+
+ return MIB;
+ }
+
+ // Handle MAC/FMAC.
+ bool IsF16 = Opc == AMDGPU::V_MAC_F16_e32 || Opc == AMDGPU::V_MAC_F16_e64 ||
+ Opc == AMDGPU::V_FMAC_F16_e32 || Opc == AMDGPU::V_FMAC_F16_e64;
bool IsFMA = Opc == AMDGPU::V_FMAC_F32_e32 || Opc == AMDGPU::V_FMAC_F32_e64 ||
+ Opc == AMDGPU::V_FMAC_LEGACY_F32_e32 ||
+ Opc == AMDGPU::V_FMAC_LEGACY_F32_e64 ||
Opc == AMDGPU::V_FMAC_F16_e32 || Opc == AMDGPU::V_FMAC_F16_e64 ||
Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64;
bool IsF64 = Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64;
- int NewMFMAOpc = -1;
+ bool IsLegacy = Opc == AMDGPU::V_MAC_LEGACY_F32_e32 ||
+ Opc == AMDGPU::V_MAC_LEGACY_F32_e64 ||
+ Opc == AMDGPU::V_FMAC_LEGACY_F32_e32 ||
+ Opc == AMDGPU::V_FMAC_LEGACY_F32_e64;
+ bool Src0Literal = false;
switch (Opc) {
default:
- NewMFMAOpc = AMDGPU::getMFMAEarlyClobberOp(Opc);
- if (NewMFMAOpc == -1)
- return nullptr;
- break;
+ return nullptr;
case AMDGPU::V_MAC_F16_e64:
case AMDGPU::V_FMAC_F16_e64:
- IsF16 = true;
- LLVM_FALLTHROUGH;
case AMDGPU::V_MAC_F32_e64:
+ case AMDGPU::V_MAC_LEGACY_F32_e64:
case AMDGPU::V_FMAC_F32_e64:
+ case AMDGPU::V_FMAC_LEGACY_F32_e64:
case AMDGPU::V_FMAC_F64_e64:
break;
case AMDGPU::V_MAC_F16_e32:
case AMDGPU::V_FMAC_F16_e32:
- IsF16 = true;
- LLVM_FALLTHROUGH;
case AMDGPU::V_MAC_F32_e32:
+ case AMDGPU::V_MAC_LEGACY_F32_e32:
case AMDGPU::V_FMAC_F32_e32:
+ case AMDGPU::V_FMAC_LEGACY_F32_e32:
case AMDGPU::V_FMAC_F64_e32: {
int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
AMDGPU::OpName::src0);
@@ -3228,25 +3309,13 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
return nullptr;
if (Src0->isImm() && !isInlineConstant(MI, Src0Idx, *Src0))
- return nullptr;
+ Src0Literal = true;
break;
}
}
MachineInstrBuilder MIB;
- MachineBasicBlock &MBB = *MI.getParent();
-
- if (NewMFMAOpc != -1) {
- MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewMFMAOpc));
- for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I)
- MIB.add(MI.getOperand(I));
- updateLiveVariables(LV, MI, *MIB);
- if (LIS)
- LIS->ReplaceMachineInstrInMaps(MI, *MIB);
- return MIB;
- }
-
const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
const MachineOperand *Src0 = getNamedOperand(MI, AMDGPU::OpName::src0);
const MachineOperand *Src0Mods =
@@ -3255,10 +3324,13 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
const MachineOperand *Src1Mods =
getNamedOperand(MI, AMDGPU::OpName::src1_modifiers);
const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
+ const MachineOperand *Src2Mods =
+ getNamedOperand(MI, AMDGPU::OpName::src2_modifiers);
const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp);
const MachineOperand *Omod = getNamedOperand(MI, AMDGPU::OpName::omod);
- if (!Src0Mods && !Src1Mods && !Clamp && !Omod && !IsF64 &&
+ if (!Src0Mods && !Src1Mods && !Src2Mods && !Clamp && !Omod && !IsF64 &&
+ !IsLegacy &&
// If we have an SGPR input, we will violate the constant bus restriction.
(ST.getConstantBusLimit(Opc) > 1 || !Src0->isReg() ||
!RI.isSGPRReg(MBB.getParent()->getRegInfo(), Src0->getReg()))) {
@@ -3271,11 +3343,11 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
// We cannot just remove the DefMI here, calling pass will crash.
DefMI->setDesc(get(AMDGPU::IMPLICIT_DEF));
for (unsigned I = DefMI->getNumOperands() - 1; I != 0; --I)
- DefMI->RemoveOperand(I);
+ DefMI->removeOperand(I);
};
int64_t Imm;
- if (getFoldableImm(Src2, Imm, &DefMI)) {
+ if (!Src0Literal && getFoldableImm(Src2, Imm, &DefMI)) {
unsigned NewOpc =
IsFMA ? (IsF16 ? AMDGPU::V_FMAAK_F16 : AMDGPU::V_FMAAK_F32)
: (IsF16 ? AMDGPU::V_MADAK_F16 : AMDGPU::V_MADAK_F32);
@@ -3295,7 +3367,7 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
unsigned NewOpc = IsFMA
? (IsF16 ? AMDGPU::V_FMAMK_F16 : AMDGPU::V_FMAMK_F32)
: (IsF16 ? AMDGPU::V_MADMK_F16 : AMDGPU::V_MADMK_F32);
- if (getFoldableImm(Src1, Imm, &DefMI)) {
+ if (!Src0Literal && getFoldableImm(Src1, Imm, &DefMI)) {
if (pseudoToMCOpcode(NewOpc) != -1) {
MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
.add(*Dst)
@@ -3309,7 +3381,11 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
return MIB;
}
}
- if (getFoldableImm(Src0, Imm, &DefMI)) {
+ if (Src0Literal || getFoldableImm(Src0, Imm, &DefMI)) {
+ if (Src0Literal) {
+ Imm = Src0->getImm();
+ DefMI = nullptr;
+ }
if (pseudoToMCOpcode(NewOpc) != -1 &&
isOperandLegal(
MI, AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::src0),
@@ -3322,16 +3398,27 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
updateLiveVariables(LV, MI, *MIB);
if (LIS)
LIS->ReplaceMachineInstrInMaps(MI, *MIB);
- killDef();
+ if (DefMI)
+ killDef();
return MIB;
}
}
}
- unsigned NewOpc = IsFMA ? (IsF16 ? AMDGPU::V_FMA_F16_gfx9_e64
- : IsF64 ? AMDGPU::V_FMA_F64_e64
- : AMDGPU::V_FMA_F32_e64)
- : (IsF16 ? AMDGPU::V_MAD_F16_e64 : AMDGPU::V_MAD_F32_e64);
+ // VOP2 mac/fmac with a literal operand cannot be converted to VOP3 mad/fma
+ // because VOP3 does not allow a literal operand.
+ // TODO: Remove this restriction for GFX10.
+ if (Src0Literal)
+ return nullptr;
+
+ unsigned NewOpc = IsFMA ? IsF16 ? AMDGPU::V_FMA_F16_gfx9_e64
+ : IsF64 ? AMDGPU::V_FMA_F64_e64
+ : IsLegacy
+ ? AMDGPU::V_FMA_LEGACY_F32_e64
+ : AMDGPU::V_FMA_F32_e64
+ : IsF16 ? AMDGPU::V_MAD_F16_e64
+ : IsLegacy ? AMDGPU::V_MAD_LEGACY_F32_e64
+ : AMDGPU::V_MAD_F32_e64;
if (pseudoToMCOpcode(NewOpc) == -1)
return nullptr;
@@ -3341,7 +3428,7 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
.add(*Src0)
.addImm(Src1Mods ? Src1Mods->getImm() : 0)
.add(*Src1)
- .addImm(0) // Src mods
+ .addImm(Src2Mods ? Src2Mods->getImm() : 0)
.add(*Src2)
.addImm(Clamp ? Clamp->getImm() : 0)
.addImm(Omod ? Omod->getImm() : 0);
@@ -3383,6 +3470,9 @@ bool SIInstrInfo::isSchedulingBoundary(const MachineInstr &MI,
if (MI.getOpcode() == TargetOpcode::INLINEASM_BR)
return true;
+ if (MI.getOpcode() == AMDGPU::SCHED_BARRIER && MI.getOperand(0).getImm() == 0)
+ return true;
+
// Target-independent instructions do not have an implicit-use of EXEC, even
// when they operate on VGPRs. Treating EXEC modifications as scheduling
// boundaries prevents incorrect movements of such instructions.
@@ -3676,11 +3766,8 @@ bool SIInstrInfo::hasModifiersSet(const MachineInstr &MI,
}
bool SIInstrInfo::hasAnyModifiersSet(const MachineInstr &MI) const {
- return hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers) ||
- hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers) ||
- hasModifiersSet(MI, AMDGPU::OpName::src2_modifiers) ||
- hasModifiersSet(MI, AMDGPU::OpName::clamp) ||
- hasModifiersSet(MI, AMDGPU::OpName::omod);
+ return any_of(ModifierOpNames,
+ [&](unsigned Name) { return hasModifiersSet(MI, Name); });
}
bool SIInstrInfo::canShrink(const MachineInstr &MI,
@@ -3754,18 +3841,19 @@ static void copyFlagsToImplicitVCC(MachineInstr &MI,
MachineInstr *SIInstrInfo::buildShrunkInst(MachineInstr &MI,
unsigned Op32) const {
- MachineBasicBlock *MBB = MI.getParent();;
+ MachineBasicBlock *MBB = MI.getParent();
MachineInstrBuilder Inst32 =
BuildMI(*MBB, MI, MI.getDebugLoc(), get(Op32))
.setMIFlags(MI.getFlags());
// Add the dst operand if the 32-bit encoding also has an explicit $vdst.
// For VOPC instructions, this is replaced by an implicit def of vcc.
- int Op32DstIdx = AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::vdst);
- if (Op32DstIdx != -1) {
+ if (AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::vdst) != -1) {
// dst
Inst32.add(MI.getOperand(0));
- } else {
+ } else if (AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::sdst) != -1) {
+ // VOPCX instructions won't be writing to an explicit dst, so this should
+ // not fail for these instructions.
assert(((MI.getOperand(0).getReg() == AMDGPU::VCC) ||
(MI.getOperand(0).getReg() == AMDGPU::VCC_LO)) &&
"Unexpected case");
@@ -3816,7 +3904,7 @@ bool SIInstrInfo::usesConstantBus(const MachineRegisterInfo &MRI,
return RI.isSGPRClass(MRI.getRegClass(MO.getReg()));
// Null is free
- if (MO.getReg() == AMDGPU::SGPR_NULL)
+ if (MO.getReg() == AMDGPU::SGPR_NULL || MO.getReg() == AMDGPU::SGPR_NULL64)
return false;
// SGPRs use the constant bus
@@ -3951,6 +4039,7 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
case AMDGPU::OPERAND_REG_IMM_INT32:
case AMDGPU::OPERAND_REG_IMM_FP32:
case AMDGPU::OPERAND_REG_IMM_FP32_DEFERRED:
+ case AMDGPU::OPERAND_REG_IMM_V2FP32:
break;
case AMDGPU::OPERAND_REG_INLINE_C_INT32:
case AMDGPU::OPERAND_REG_INLINE_C_FP32:
@@ -4031,9 +4120,7 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
int DstIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdst);
- const int OpIndicies[] = { DstIdx, Src0Idx, Src1Idx, Src2Idx };
-
- for (int OpIdx: OpIndicies) {
+ for (int OpIdx : {DstIdx, Src0Idx, Src1Idx, Src2Idx}) {
if (OpIdx == -1)
continue;
const MachineOperand &MO = MI.getOperand(OpIdx);
@@ -4150,24 +4237,25 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
}
// Verify VOP*. Ignore multiple sgpr operands on writelane.
- if (Desc.getOpcode() != AMDGPU::V_WRITELANE_B32
- && (isVOP1(MI) || isVOP2(MI) || isVOP3(MI) || isVOPC(MI) || isSDWA(MI))) {
- // Only look at the true operands. Only a real operand can use the constant
- // bus, and we don't want to check pseudo-operands like the source modifier
- // flags.
- const int OpIndices[] = { Src0Idx, Src1Idx, Src2Idx };
-
+ if (isVALU(MI) && Desc.getOpcode() != AMDGPU::V_WRITELANE_B32) {
unsigned ConstantBusCount = 0;
bool UsesLiteral = false;
const MachineOperand *LiteralVal = nullptr;
- if (AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::imm) != -1)
+ int ImmIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::imm);
+ if (ImmIdx != -1) {
++ConstantBusCount;
+ UsesLiteral = true;
+ LiteralVal = &MI.getOperand(ImmIdx);
+ }
SmallVector<Register, 2> SGPRsUsed;
Register SGPRUsed;
- for (int OpIdx : OpIndices) {
+ // Only look at the true operands. Only a real operand can use the constant
+ // bus, and we don't want to check pseudo-operands like the source modifier
+ // flags.
+ for (int OpIdx : {Src0Idx, Src1Idx, Src2Idx}) {
if (OpIdx == -1)
break;
const MachineOperand &MO = MI.getOperand(OpIdx);
@@ -4186,8 +4274,8 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
UsesLiteral = true;
LiteralVal = &MO;
} else if (!MO.isIdenticalTo(*LiteralVal)) {
- assert(isVOP3(MI));
- ErrInfo = "VOP3 instruction uses more than one literal";
+ assert(isVOP2(MI) || isVOP3(MI));
+ ErrInfo = "VOP2/VOP3 instruction uses more than one literal";
return false;
}
}
@@ -4196,7 +4284,7 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
SGPRUsed = findImplicitSGPRRead(MI);
if (SGPRUsed != AMDGPU::NoRegister) {
- // Implicit uses may safely overlap true overands
+ // Implicit uses may safely overlap true operands
if (llvm::all_of(SGPRsUsed, [this, SGPRUsed](unsigned SGPR) {
return !RI.regsOverlap(SGPRUsed, SGPR);
})) {
@@ -4225,7 +4313,7 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
unsigned SGPRCount = 0;
Register SGPRUsed = AMDGPU::NoRegister;
- for (int OpIdx : {Src0Idx, Src1Idx, Src2Idx}) {
+ for (int OpIdx : {Src0Idx, Src1Idx}) {
if (OpIdx == -1)
break;
@@ -4272,16 +4360,11 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
if (isSOP2(MI) || isSOPC(MI)) {
const MachineOperand &Src0 = MI.getOperand(Src0Idx);
const MachineOperand &Src1 = MI.getOperand(Src1Idx);
- unsigned Immediates = 0;
- if (!Src0.isReg() &&
- !isInlineConstant(Src0, Desc.OpInfo[Src0Idx].OperandType))
- Immediates++;
- if (!Src1.isReg() &&
- !isInlineConstant(Src1, Desc.OpInfo[Src1Idx].OperandType))
- Immediates++;
-
- if (Immediates > 1) {
+ if (!Src0.isReg() && !Src1.isReg() &&
+ !isInlineConstant(Src0, Desc.OpInfo[Src0Idx].OperandType) &&
+ !isInlineConstant(Src1, Desc.OpInfo[Src1Idx].OperandType) &&
+ !Src0.isIdenticalTo(Src1)) {
ErrInfo = "SOP2/SOPC instruction requires too many immediate constants";
return false;
}
@@ -4364,10 +4447,11 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
}
if (isSMRD(MI)) {
- if (MI.mayStore()) {
+ if (MI.mayStore() &&
+ ST.getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS) {
// The register offset form of scalar stores may only use m0 as the
// soffset register.
- const MachineOperand *Soff = getNamedOperand(MI, AMDGPU::OpName::soff);
+ const MachineOperand *Soff = getNamedOperand(MI, AMDGPU::OpName::soffset);
if (Soff && Soff->getReg() != AMDGPU::M0) {
ErrInfo = "scalar stores must use m0 as offset register";
return false;
@@ -4477,7 +4561,6 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
}
int DstIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdst);
- int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0);
if (Opcode != AMDGPU::V_MOV_B64_DPP_PSEUDO &&
((DstIdx >= 0 &&
@@ -4527,24 +4610,45 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
}
}
- if (ST.needsAlignedVGPRs() &&
- (MI.getOpcode() == AMDGPU::DS_GWS_INIT ||
- MI.getOpcode() == AMDGPU::DS_GWS_SEMA_BR ||
- MI.getOpcode() == AMDGPU::DS_GWS_BARRIER)) {
- const MachineOperand *Op = getNamedOperand(MI, AMDGPU::OpName::data0);
- Register Reg = Op->getReg();
- bool Aligned = true;
- if (Reg.isPhysical()) {
- Aligned = !(RI.getHWRegIndex(Reg) & 1);
- } else {
+ if (ST.needsAlignedVGPRs()) {
+ const auto isAlignedReg = [&MI, &MRI, this](unsigned OpName) -> bool {
+ const MachineOperand *Op = getNamedOperand(MI, OpName);
+ if (!Op)
+ return true;
+ Register Reg = Op->getReg();
+ if (Reg.isPhysical())
+ return !(RI.getHWRegIndex(Reg) & 1);
const TargetRegisterClass &RC = *MRI.getRegClass(Reg);
- Aligned = RI.getRegSizeInBits(RC) > 32 && RI.isProperlyAlignedRC(RC) &&
- !(RI.getChannelFromSubReg(Op->getSubReg()) & 1);
+ return RI.getRegSizeInBits(RC) > 32 && RI.isProperlyAlignedRC(RC) &&
+ !(RI.getChannelFromSubReg(Op->getSubReg()) & 1);
+ };
+
+ if (MI.getOpcode() == AMDGPU::DS_GWS_INIT ||
+ MI.getOpcode() == AMDGPU::DS_GWS_SEMA_BR ||
+ MI.getOpcode() == AMDGPU::DS_GWS_BARRIER) {
+
+ if (!isAlignedReg(AMDGPU::OpName::data0)) {
+ ErrInfo = "Subtarget requires even aligned vector registers "
+ "for DS_GWS instructions";
+ return false;
+ }
+ }
+
+ if (isMIMG(MI)) {
+ if (!isAlignedReg(AMDGPU::OpName::vaddr)) {
+ ErrInfo = "Subtarget requires even aligned vector registers "
+ "for vaddr operand of image instructions";
+ return false;
+ }
}
+ }
- if (!Aligned) {
- ErrInfo = "Subtarget requires even aligned vector registers "
- "for DS_GWS instructions";
+ if (MI.getOpcode() == AMDGPU::V_ACCVGPR_WRITE_B32_e64 &&
+ !ST.hasGFX90AInsts()) {
+ const MachineOperand *Src = getNamedOperand(MI, AMDGPU::OpName::src0);
+ if (Src->isReg() && RI.isSGPRReg(MRI, Src->getReg())) {
+ ErrInfo = "Invalid register class: "
+ "v_accvgpr_write with an SGPR is not supported on this GPU";
return false;
}
}
@@ -4641,26 +4745,40 @@ unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) const {
"Unexpected scalar opcode without corresponding vector one!");
}
-static unsigned adjustAllocatableRegClass(const GCNSubtarget &ST,
- const MachineRegisterInfo &MRI,
- const MCInstrDesc &TID,
- unsigned RCID,
- bool IsAllocatable) {
+static const TargetRegisterClass *
+adjustAllocatableRegClass(const GCNSubtarget &ST, const SIRegisterInfo &RI,
+ const MachineRegisterInfo &MRI,
+ const MCInstrDesc &TID, unsigned RCID,
+ bool IsAllocatable) {
if ((IsAllocatable || !ST.hasGFX90AInsts() || !MRI.reservedRegsFrozen()) &&
(((TID.mayLoad() || TID.mayStore()) &&
!(TID.TSFlags & SIInstrFlags::VGPRSpill)) ||
(TID.TSFlags & (SIInstrFlags::DS | SIInstrFlags::MIMG)))) {
switch (RCID) {
- case AMDGPU::AV_32RegClassID: return AMDGPU::VGPR_32RegClassID;
- case AMDGPU::AV_64RegClassID: return AMDGPU::VReg_64RegClassID;
- case AMDGPU::AV_96RegClassID: return AMDGPU::VReg_96RegClassID;
- case AMDGPU::AV_128RegClassID: return AMDGPU::VReg_128RegClassID;
- case AMDGPU::AV_160RegClassID: return AMDGPU::VReg_160RegClassID;
+ case AMDGPU::AV_32RegClassID:
+ RCID = AMDGPU::VGPR_32RegClassID;
+ break;
+ case AMDGPU::AV_64RegClassID:
+ RCID = AMDGPU::VReg_64RegClassID;
+ break;
+ case AMDGPU::AV_96RegClassID:
+ RCID = AMDGPU::VReg_96RegClassID;
+ break;
+ case AMDGPU::AV_128RegClassID:
+ RCID = AMDGPU::VReg_128RegClassID;
+ break;
+ case AMDGPU::AV_160RegClassID:
+ RCID = AMDGPU::VReg_160RegClassID;
+ break;
+ case AMDGPU::AV_512RegClassID:
+ RCID = AMDGPU::VReg_512RegClassID;
+ break;
default:
break;
}
}
- return RCID;
+
+ return RI.getProperlyAlignedRC(RI.getRegClass(RCID));
}
const TargetRegisterClass *SIInstrInfo::getRegClass(const MCInstrDesc &TID,
@@ -4673,7 +4791,7 @@ const TargetRegisterClass *SIInstrInfo::getRegClass(const MCInstrDesc &TID,
bool IsAllocatable = false;
if (TID.TSFlags & (SIInstrFlags::DS | SIInstrFlags::FLAT)) {
// vdst and vdata should be both VGPR or AGPR, same for the DS instructions
- // with two data operands. Request register class constainted to VGPR only
+ // with two data operands. Request register class constrained to VGPR only
// of both operands present as Machine Copy Propagation can not check this
// constraint and possibly other passes too.
//
@@ -4690,9 +4808,8 @@ const TargetRegisterClass *SIInstrInfo::getRegClass(const MCInstrDesc &TID,
AMDGPU::OpName::data1) != -1;
}
}
- RegClass = adjustAllocatableRegClass(ST, MF.getRegInfo(), TID, RegClass,
- IsAllocatable);
- return RI.getRegClass(RegClass);
+ return adjustAllocatableRegClass(ST, RI, MF.getRegInfo(), TID, RegClass,
+ IsAllocatable);
}
const TargetRegisterClass *SIInstrInfo::getOpRegClass(const MachineInstr &MI,
@@ -4709,8 +4826,7 @@ const TargetRegisterClass *SIInstrInfo::getOpRegClass(const MachineInstr &MI,
}
unsigned RCID = Desc.OpInfo[OpNo].RegClass;
- RCID = adjustAllocatableRegClass(ST, MRI, Desc, RCID, true);
- return RI.getRegClass(RCID);
+ return adjustAllocatableRegClass(ST, RI, MRI, Desc, RCID, true);
}
void SIInstrInfo::legalizeOpWithMove(MachineInstr &MI, unsigned OpIdx) const {
@@ -4797,7 +4913,7 @@ MachineOperand SIInstrInfo::buildExtractSubRegOrImm(
void SIInstrInfo::swapOperands(MachineInstr &Inst) const {
assert(Inst.getNumExplicitOperands() == 3);
MachineOperand Op1 = Inst.getOperand(1);
- Inst.RemoveOperand(1);
+ Inst.removeOperand(1);
Inst.addOperand(Op1);
}
@@ -4851,9 +4967,9 @@ bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx,
MO = &MI.getOperand(OpIdx);
int ConstantBusLimit = ST.getConstantBusLimit(MI.getOpcode());
- int VOP3LiteralLimit = ST.hasVOP3Literal() ? 1 : 0;
+ int LiteralLimit = !isVOP3(MI) || ST.hasVOP3Literal() ? 1 : 0;
if (isVALU(MI) && usesConstantBus(MRI, *MO, OpInfo)) {
- if (isVOP3(MI) && isLiteralConstantLike(*MO, OpInfo) && !VOP3LiteralLimit--)
+ if (isLiteralConstantLike(*MO, OpInfo) && !LiteralLimit--)
return false;
SmallDenseSet<RegSubRegPair> SGPRsUsed;
@@ -4872,12 +4988,10 @@ bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx,
return false;
SGPRsUsed.insert(SGPR);
}
- } else if (InstDesc.OpInfo[i].OperandType == AMDGPU::OPERAND_KIMM32) {
- if (--ConstantBusLimit <= 0)
- return false;
- } else if (isVOP3(MI) && AMDGPU::isSISrcOperand(InstDesc, i) &&
- isLiteralConstantLike(Op, InstDesc.OpInfo[i])) {
- if (!VOP3LiteralLimit--)
+ } else if (InstDesc.OpInfo[i].OperandType == AMDGPU::OPERAND_KIMM32 ||
+ (AMDGPU::isSISrcOperand(InstDesc, i) &&
+ isLiteralConstantLike(Op, InstDesc.OpInfo[i]))) {
+ if (!LiteralLimit--)
return false;
if (--ConstantBusLimit <= 0)
return false;
@@ -4886,7 +5000,10 @@ bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx,
}
if (MO->isReg()) {
- assert(DefinedRC);
+ if (!DefinedRC) {
+ // This operand allows any register.
+ return true;
+ }
if (!isLegalRegOperand(MRI, OpInfo, *MO))
return false;
bool IsAGPR = RI.isAGPR(MRI, MO->getReg());
@@ -4916,7 +5033,7 @@ bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx,
RI.isAGPR(MRI, MI.getOperand(Data1Idx).getReg()) != IsAGPR)
return false;
}
- if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64 &&
+ if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64 && !ST.hasGFX90AInsts() &&
(int)OpIdx == AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0) &&
RI.isSGPRReg(MRI, MO->getReg()))
return false;
@@ -5186,7 +5303,7 @@ void SIInstrInfo::legalizeOperandsSMRD(MachineRegisterInfo &MRI,
Register SGPR = readlaneVGPRToSGPR(SBase->getReg(), MI, MRI);
SBase->setReg(SGPR);
}
- MachineOperand *SOff = getNamedOperand(MI, AMDGPU::OpName::soff);
+ MachineOperand *SOff = getNamedOperand(MI, AMDGPU::OpName::soffset);
if (SOff && !RI.isSGPRClass(MRI.getRegClass(SOff->getReg()))) {
Register SGPR = readlaneVGPRToSGPR(SOff->getReg(), MI, MRI);
SOff->setReg(SGPR);
@@ -5232,16 +5349,16 @@ bool SIInstrInfo::moveFlatAddrToVGPR(MachineInstr &Inst) const {
const MCInstrDesc &NewDesc = get(NewOpc);
Inst.setDesc(NewDesc);
- // Callers expect interator to be valid after this call, so modify the
+ // Callers expect iterator to be valid after this call, so modify the
// instruction in place.
if (OldVAddrIdx == NewVAddrIdx) {
MachineOperand &NewVAddr = Inst.getOperand(NewVAddrIdx);
// Clear use list from the old vaddr holding a zero register.
MRI.removeRegOperandFromUseList(&NewVAddr);
MRI.moveOperands(&NewVAddr, &SAddr, 1);
- Inst.RemoveOperand(OldSAddrIdx);
+ Inst.removeOperand(OldSAddrIdx);
// Update the use list with the pointer we have just moved from vaddr to
- // saddr poisition. Otherwise new vaddr will be missing from the use list.
+ // saddr position. Otherwise new vaddr will be missing from the use list.
MRI.removeRegOperandFromUseList(&NewVAddr);
MRI.addRegOperandToUseList(&NewVAddr);
} else {
@@ -5251,14 +5368,14 @@ bool SIInstrInfo::moveFlatAddrToVGPR(MachineInstr &Inst) const {
int NewVDstIn = AMDGPU::getNamedOperandIdx(NewOpc,
AMDGPU::OpName::vdst_in);
- // RemoveOperand doesn't try to fixup tied operand indexes at it goes, so
+ // removeOperand doesn't try to fix up tied operand indexes as it goes, so
// it asserts. Untie the operands for now and retie them afterwards.
if (NewVDstIn != -1) {
int OldVDstIn = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst_in);
Inst.untieRegOperand(OldVDstIn);
}
- Inst.RemoveOperand(OldVAddrIdx);
+ Inst.removeOperand(OldVAddrIdx);
if (NewVDstIn != -1) {
int NewVDst = AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::vdst);
@@ -5340,7 +5457,8 @@ void SIInstrInfo::legalizeGenericOperand(MachineBasicBlock &InsertMBB,
static void
emitLoadSRsrcFromVGPRLoop(const SIInstrInfo &TII, MachineRegisterInfo &MRI,
MachineBasicBlock &OrigBB, MachineBasicBlock &LoopBB,
- const DebugLoc &DL, MachineOperand &Rsrc) {
+ MachineBasicBlock &BodyBB, const DebugLoc &DL,
+ MachineOperand &Rsrc) {
MachineFunction &MF = *OrigBB.getParent();
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
const SIRegisterInfo *TRI = ST.getRegisterInfo();
@@ -5398,7 +5516,7 @@ emitLoadSRsrcFromVGPRLoop(const SIInstrInfo &TII, MachineRegisterInfo &MRI,
else
Cmp.addReg(VRsrc, VRsrcUndef, TRI->getSubRegFromChannel(Idx, 2));
- // Combine the comparision results with AND.
+ // Combine the comparison results with AND.
if (CondReg == AMDGPU::NoRegister) // First.
CondReg = NewCondReg;
else { // If not the first, we create an AND.
@@ -5433,14 +5551,14 @@ emitLoadSRsrcFromVGPRLoop(const SIInstrInfo &TII, MachineRegisterInfo &MRI,
.addReg(CondReg, RegState::Kill);
// The original instruction is here; we insert the terminators after it.
- I = LoopBB.end();
+ I = BodyBB.end();
// Update EXEC, switch all done bits to 0 and all todo bits to 1.
- BuildMI(LoopBB, I, DL, TII.get(XorTermOpc), Exec)
+ BuildMI(BodyBB, I, DL, TII.get(XorTermOpc), Exec)
.addReg(Exec)
.addReg(SaveExec);
- BuildMI(LoopBB, I, DL, TII.get(AMDGPU::SI_WATERFALL_LOOP)).addMBB(&LoopBB);
+ BuildMI(BodyBB, I, DL, TII.get(AMDGPU::SI_WATERFALL_LOOP)).addMBB(&LoopBB);
}
// Build a waterfall loop around \p MI, replacing the VGPR \p Rsrc register
@@ -5487,31 +5605,35 @@ loadSRsrcFromVGPR(const SIInstrInfo &TII, MachineInstr &MI,
// To insert the loop we need to split the block. Move everything after this
// point to a new block, and insert a new empty block between the two.
MachineBasicBlock *LoopBB = MF.CreateMachineBasicBlock();
+ MachineBasicBlock *BodyBB = MF.CreateMachineBasicBlock();
MachineBasicBlock *RemainderBB = MF.CreateMachineBasicBlock();
MachineFunction::iterator MBBI(MBB);
++MBBI;
MF.insert(MBBI, LoopBB);
+ MF.insert(MBBI, BodyBB);
MF.insert(MBBI, RemainderBB);
- LoopBB->addSuccessor(LoopBB);
- LoopBB->addSuccessor(RemainderBB);
+ LoopBB->addSuccessor(BodyBB);
+ BodyBB->addSuccessor(LoopBB);
+ BodyBB->addSuccessor(RemainderBB);
- // Move Begin to MI to the LoopBB, and the remainder of the block to
+ // Move Begin to MI to the BodyBB, and the remainder of the block to
// RemainderBB.
RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
RemainderBB->splice(RemainderBB->begin(), &MBB, End, MBB.end());
- LoopBB->splice(LoopBB->begin(), &MBB, Begin, MBB.end());
+ BodyBB->splice(BodyBB->begin(), &MBB, Begin, MBB.end());
MBB.addSuccessor(LoopBB);
// Update dominators. We know that MBB immediately dominates LoopBB, that
- // LoopBB immediately dominates RemainderBB, and that RemainderBB immediately
- // dominates all of the successors transferred to it from MBB that MBB used
- // to properly dominate.
+ // LoopBB immediately dominates BodyBB, and BodyBB immediately dominates
+ // RemainderBB. RemainderBB immediately dominates all of the successors
+ // transferred to it from MBB that MBB used to properly dominate.
if (MDT) {
MDT->addNewBlock(LoopBB, &MBB);
- MDT->addNewBlock(RemainderBB, LoopBB);
+ MDT->addNewBlock(BodyBB, LoopBB);
+ MDT->addNewBlock(RemainderBB, BodyBB);
for (auto &Succ : RemainderBB->successors()) {
if (MDT->properlyDominates(&MBB, Succ)) {
MDT->changeImmediateDominator(Succ, RemainderBB);
@@ -5519,12 +5641,12 @@ loadSRsrcFromVGPR(const SIInstrInfo &TII, MachineInstr &MI,
}
}
- emitLoadSRsrcFromVGPRLoop(TII, MRI, MBB, *LoopBB, DL, Rsrc);
+ emitLoadSRsrcFromVGPRLoop(TII, MRI, MBB, *LoopBB, *BodyBB, DL, Rsrc);
// Restore the EXEC mask
MachineBasicBlock::iterator First = RemainderBB->begin();
BuildMI(*RemainderBB, First, DL, TII.get(MovExecOpc), Exec).addReg(SaveExec);
- return LoopBB;
+ return BodyBB;
}
// Extract pointer from Rsrc and return a zero-value Rsrc replacement.
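The LoopBB/BodyBB split above is a MIR-level restructuring of the waterfall loop; as a lane-level illustration of the same idea, here is a minimal self-contained sketch. The eight-lane wavefront model, the waterfall() helper, and the 8-bit exec mask are all invented for illustration and are not LLVM API.

#include <array>
#include <cstdint>
#include <cstdio>

constexpr int NumLanes = 8;

// Pick a uniform value from the first active lane, run the body for every
// lane that matches it, then retire those lanes; repeat until EXEC is empty.
template <typename BodyFn>
void waterfall(const std::array<uint32_t, NumLanes> &Rsrc, uint8_t ExecMask,
               BodyFn Body) {
  while (ExecMask) {
    // "LoopBB": readfirstlane -- take the value of the lowest active lane.
    int FirstLane = 0;
    while (!(ExecMask & (1u << FirstLane)))
      ++FirstLane;
    uint32_t Uniform = Rsrc[FirstLane];
    // Compare every active lane against the uniform copy.
    uint8_t Match = 0;
    for (int L = 0; L < NumLanes; ++L)
      if ((ExecMask & (1u << L)) && Rsrc[L] == Uniform)
        Match |= uint8_t(1u << L);
    // "BodyBB": execute the original instruction once with the uniform value.
    Body(Uniform, Match);
    // Mask off the serviced lanes; loop back while any remain.
    ExecMask &= uint8_t(~Match);
  }
}

int main() {
  std::array<uint32_t, NumLanes> Rsrc = {7, 7, 3, 7, 3, 9, 9, 7};
  waterfall(Rsrc, 0xFF, [](uint32_t R, uint8_t Exec) {
    std::printf("body: rsrc=%u exec=0x%02x\n", R, Exec);
  });
}

Each iteration services every lane that agrees with the first active lane, so a fully uniform input finishes in one pass while a fully divergent one degrades to one pass per lane.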
@@ -5762,7 +5884,7 @@ SIInstrInfo::legalizeOperands(MachineInstr &MI,
if (RI.getCommonSubClass(MRI.getRegClass(Rsrc->getReg()),
RI.getRegClass(RsrcRC))) {
// The operands are legal.
- // FIXME: We may need to legalize operands besided srsrc.
+ // FIXME: We may need to legalize operands besides srsrc.
return CreatedBB;
}
@@ -5836,7 +5958,7 @@ SIInstrInfo::legalizeOperands(MachineInstr &MI,
MachineOperand *SOffset = getNamedOperand(MI, AMDGPU::OpName::soffset);
unsigned Addr64Opcode = AMDGPU::getAddr64Inst(MI.getOpcode());
- // Atomics rith return have have an additional tied operand and are
+ // Atomics with return have an additional tied operand and are
// missing some of the special bits.
MachineOperand *VDataIn = getNamedOperand(MI, AMDGPU::OpName::vdata_in);
MachineInstr *Addr64;
@@ -6050,7 +6172,7 @@ MachineBasicBlock *SIInstrInfo::moveToVALU(MachineInstr &TopInst,
BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(Opc), VCC)
.addReg(EXEC)
.addReg(IsSCC ? VCC : CondReg);
- Inst.RemoveOperand(1);
+ Inst.removeOperand(1);
}
break;
@@ -6060,6 +6182,7 @@ MachineBasicBlock *SIInstrInfo::moveToVALU(MachineInstr &TopInst,
case AMDGPU::S_PACK_LL_B32_B16:
case AMDGPU::S_PACK_LH_B32_B16:
+ case AMDGPU::S_PACK_HL_B32_B16:
case AMDGPU::S_PACK_HH_B32_B16:
movePackToVALU(Worklist, MRI, Inst);
Inst.eraseFromParent();
@@ -6217,7 +6340,7 @@ MachineBasicBlock *SIInstrInfo::moveToVALU(MachineInstr &TopInst,
addSCCDefUsersToVALUWorklist(Op, Inst, Worklist);
if (Op.isUse())
addSCCDefsToVALUWorklist(Op, Worklist);
- Inst.RemoveOperand(i);
+ Inst.removeOperand(i);
}
}
@@ -6247,7 +6370,7 @@ MachineBasicBlock *SIInstrInfo::moveToVALU(MachineInstr &TopInst,
uint32_t Offset = Imm & 0x3f; // Extract bits [5:0].
uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].
- Inst.RemoveOperand(2); // Remove old immediate.
+ Inst.removeOperand(2); // Remove old immediate.
Inst.addOperand(MachineOperand::CreateImm(Offset));
Inst.addOperand(MachineOperand::CreateImm(BitWidth));
}
@@ -6281,7 +6404,7 @@ MachineBasicBlock *SIInstrInfo::moveToVALU(MachineInstr &TopInst,
// these are deleted later, but at -O0 it would leave a suspicious
// looking illegal copy of an undef register.
for (unsigned I = Inst.getNumOperands() - 1; I != 0; --I)
- Inst.RemoveOperand(I);
+ Inst.removeOperand(I);
Inst.setDesc(get(AMDGPU::IMPLICIT_DEF));
continue;
}
@@ -6323,7 +6446,7 @@ SIInstrInfo::moveScalarAddSub(SetVectorType &Worklist, MachineInstr &Inst,
AMDGPU::V_ADD_U32_e64 : AMDGPU::V_SUB_U32_e64;
assert(Inst.getOperand(3).getReg() == AMDGPU::SCC);
- Inst.RemoveOperand(3);
+ Inst.removeOperand(3);
Inst.setDesc(get(NewOpc));
Inst.addOperand(MachineOperand::CreateImm(0)); // clamp bit
@@ -6467,7 +6590,7 @@ void SIInstrInfo::lowerScalarXnor(SetVectorType &Worklist,
// Using the identity !(x ^ y) == (!x ^ y) == (x ^ !y), we can
// invert either source and then perform the XOR. If either source is a
// scalar register, then we can leave the inversion on the scalar unit to
- // acheive a better distrubution of scalar and vector instructions.
+ // achieve a better distribution of scalar and vector instructions.
bool Src0IsSGPR = Src0.isReg() &&
RI.isSGPRClass(MRI.getRegClass(Src0.getReg()));
bool Src1IsSGPR = Src1.isReg() &&
@@ -6689,7 +6812,7 @@ void SIInstrInfo::splitScalar64BitAddSub(SetVectorType &Worklist,
legalizeOperands(*LoHalf, MDT);
legalizeOperands(*HiHalf, MDT);
- // Move all users of this moved vlaue.
+ // Move all users of this moved value.
addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
}
@@ -6753,7 +6876,7 @@ void SIInstrInfo::splitScalar64BitBinaryOp(SetVectorType &Worklist,
Worklist.insert(&LoHalf);
Worklist.insert(&HiHalf);
- // Move all users of this moved vlaue.
+ // Move all users of this moved value.
addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
}
@@ -6831,7 +6954,7 @@ void SIInstrInfo::splitScalar64BitBCNT(
MRI.replaceRegWith(Dest.getReg(), ResultReg);
- // We don't need to legalize operands here. src0 for etiher instruction can be
+ // We don't need to legalize operands here. src0 for either instruction can be
// an SGPR, and the second input is unused or determined here.
addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
}
@@ -6973,6 +7096,17 @@ void SIInstrInfo::movePackToVALU(SetVectorType &Worklist,
.add(Src1);
break;
}
+ case AMDGPU::S_PACK_HL_B32_B16: {
+ Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg)
+ .addImm(16)
+ .add(Src0);
+ BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHL_OR_B32_e64), ResultReg)
+ .add(Src1)
+ .addImm(16)
+ .addReg(TmpReg, RegState::Kill);
+ break;
+ }
case AMDGPU::S_PACK_HH_B32_B16: {
Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
@@ -7045,7 +7179,7 @@ void SIInstrInfo::addSCCDefsToVALUWorklist(MachineOperand &Op,
assert(Op.isReg() && Op.getReg() == AMDGPU::SCC && Op.isUse());
MachineInstr *SCCUseInst = Op.getParent();
- // Look for a preceeding instruction that either defines VCC or SCC. If VCC
+ // Look for a preceding instruction that either defines VCC or SCC. If VCC
// then there is nothing to do because the defining instruction has been
// converted to a VALU already. If SCC then that instruction needs to be
// converted to a VALU.
@@ -7191,7 +7325,10 @@ MachineOperand *SIInstrInfo::getNamedOperand(MachineInstr &MI,
uint64_t SIInstrInfo::getDefaultRsrcDataFormat() const {
if (ST.getGeneration() >= AMDGPUSubtarget::GFX10) {
- return (AMDGPU::MTBUFFormat::UFMT_32_FLOAT << 44) |
+ int64_t Format = ST.getGeneration() >= AMDGPUSubtarget::GFX11 ?
+ AMDGPU::UfmtGFX11::UFMT_32_FLOAT :
+ AMDGPU::UfmtGFX10::UFMT_32_FLOAT;
+ return (Format << 44) |
(1ULL << 56) | // RESOURCE_LEVEL = 1
(3ULL << 60); // OOB_SELECT = 3
}
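As a worked illustration of the bit layout the hunk above produces, the sketch below packs a unified format field at bit 44, RESOURCE_LEVEL at bit 56, and the two OOB_SELECT bits at bit 60. The two UFMT_32_FLOAT constants are placeholders, not the real GFX10/GFX11 encodings, and the helper name is made up.

#include <cstdint>
#include <cstdio>

// Placeholder unified-format encodings; the real UFMT_32_FLOAT values for
// GFX10 and GFX11 live in AMDGPUBaseInfo and are not these numbers.
enum class Gen { GFX10, GFX11 };
constexpr uint64_t kUfmt32FloatGfx10 = 0x16; // assumed
constexpr uint64_t kUfmt32FloatGfx11 = 0x10; // assumed

// Pack the default buffer resource word: format at bit 44, RESOURCE_LEVEL at
// bit 56, OOB_SELECT (2 bits) at bit 60.
uint64_t defaultRsrcDataFormat(Gen G) {
  uint64_t Format = (G == Gen::GFX11) ? kUfmt32FloatGfx11 : kUfmt32FloatGfx10;
  return (Format << 44) |
         (1ULL << 56) |  // RESOURCE_LEVEL = 1
         (3ULL << 60);   // OOB_SELECT = 3
}

int main() {
  std::printf("gfx10: 0x%016llx\n",
              (unsigned long long)defaultRsrcDataFormat(Gen::GFX10));
  std::printf("gfx11: 0x%016llx\n",
              (unsigned long long)defaultRsrcDataFormat(Gen::GFX11));
}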
@@ -7332,7 +7469,9 @@ unsigned SIInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
return DescSize;
bool HasLiteral = false;
for (int I = 0, E = MI.getNumExplicitOperands(); I != E; ++I) {
- if (isLiteralConstant(MI, I)) {
+ const MachineOperand &Op = MI.getOperand(I);
+ const MCOperandInfo &OpInfo = Desc.OpInfo[I];
+ if (isLiteralConstantLike(Op, OpInfo)) {
HasLiteral = true;
break;
}
@@ -7513,6 +7652,16 @@ SIInstrInfo::getSerializableDirectMachineOperandTargetFlags() const {
return makeArrayRef(TargetFlags);
}
+ArrayRef<std::pair<MachineMemOperand::Flags, const char *>>
+SIInstrInfo::getSerializableMachineMemOperandTargetFlags() const {
+ static const std::pair<MachineMemOperand::Flags, const char *> TargetFlags[] =
+ {
+ {MONoClobber, "amdgpu-noclobber"},
+ };
+
+ return makeArrayRef(TargetFlags);
+}
+
bool SIInstrInfo::isBasicBlockPrologue(const MachineInstr &MI) const {
return !MI.isTerminator() && MI.getOpcode() != AMDGPU::COPY &&
MI.modifiesRegister(AMDGPU::EXEC, &RI);
@@ -7690,6 +7839,7 @@ SIInstrInfo::splitFlatOffset(int64_t COffsetVal, unsigned AddrSpace,
}
// This must be kept in sync with the SIEncodingFamily class in SIInstrInfo.td
+// and the columns of the getMCOpcodeGen table.
enum SIEncodingFamily {
SI = 0,
VI = 1,
@@ -7699,7 +7849,9 @@ enum SIEncodingFamily {
GFX9 = 5,
GFX10 = 6,
SDWA10 = 7,
- GFX90A = 8
+ GFX90A = 8,
+ GFX940 = 9,
+ GFX11 = 10,
};
static SIEncodingFamily subtargetEncodingFamily(const GCNSubtarget &ST) {
@@ -7714,6 +7866,8 @@ static SIEncodingFamily subtargetEncodingFamily(const GCNSubtarget &ST) {
return SIEncodingFamily::VI;
case AMDGPUSubtarget::GFX10:
return SIEncodingFamily::GFX10;
+ case AMDGPUSubtarget::GFX11:
+ return SIEncodingFamily::GFX11;
}
llvm_unreachable("Unknown subtarget generation!");
}
@@ -7779,6 +7933,9 @@ int SIInstrInfo::pseudoToMCOpcode(int Opcode) const {
if (ST.hasGFX90AInsts()) {
uint16_t NMCOp = (uint16_t)-1;
+ if (ST.hasGFX940Insts())
+ NMCOp = AMDGPU::getMCOpcode(Opcode, SIEncodingFamily::GFX940);
+ if (NMCOp == (uint16_t)-1)
NMCOp = AMDGPU::getMCOpcode(Opcode, SIEncodingFamily::GFX90A);
if (NMCOp == (uint16_t)-1)
NMCOp = AMDGPU::getMCOpcode(Opcode, SIEncodingFamily::GFX9);
@@ -7925,7 +8082,7 @@ bool llvm::execMayBeModifiedBeforeAnyUse(const MachineRegisterInfo &MRI,
auto &UseInst = *Use.getParent();
// Don't bother searching between blocks, although it is possible this block
// doesn't modify exec.
- if (UseInst.getParent() != DefBB)
+ if (UseInst.getParent() != DefBB || UseInst.isPHI())
return true;
if (++NumUse > MaxUseScan)
@@ -8150,7 +8307,7 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
const auto optimizeCmpAnd = [&CmpInstr, SrcReg, CmpValue, MRI,
this](int64_t ExpectedValue, unsigned SrcSize,
- bool IsReversable, bool IsSigned) -> bool {
+ bool IsReversible, bool IsSigned) -> bool {
// s_cmp_eq_u32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
// s_cmp_eq_i32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
// s_cmp_ge_u32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
@@ -8208,7 +8365,7 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
bool IsReversedCC = false;
if (CmpValue != ExpectedValue) {
- if (!IsReversable)
+ if (!IsReversible)
return false;
IsReversedCC = CmpValue == (ExpectedValue ^ Mask);
if (!IsReversedCC)
@@ -8284,3 +8441,37 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
return false;
}
+
+void SIInstrInfo::enforceOperandRCAlignment(MachineInstr &MI,
+ unsigned OpName) const {
+ if (!ST.needsAlignedVGPRs())
+ return;
+
+ int OpNo = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpName);
+ if (OpNo < 0)
+ return;
+ MachineOperand &Op = MI.getOperand(OpNo);
+ if (getOpSize(MI, OpNo) > 4)
+ return;
+
+ // Add implicit aligned super-reg to force alignment on the data operand.
+ const DebugLoc &DL = MI.getDebugLoc();
+ MachineBasicBlock *BB = MI.getParent();
+ MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
+ Register DataReg = Op.getReg();
+ bool IsAGPR = RI.isAGPR(MRI, DataReg);
+ Register Undef = MRI.createVirtualRegister(
+ IsAGPR ? &AMDGPU::AGPR_32RegClass : &AMDGPU::VGPR_32RegClass);
+ BuildMI(*BB, MI, DL, get(AMDGPU::IMPLICIT_DEF), Undef);
+ Register NewVR =
+ MRI.createVirtualRegister(IsAGPR ? &AMDGPU::AReg_64_Align2RegClass
+ : &AMDGPU::VReg_64_Align2RegClass);
+ BuildMI(*BB, MI, DL, get(AMDGPU::REG_SEQUENCE), NewVR)
+ .addReg(DataReg, 0, Op.getSubReg())
+ .addImm(AMDGPU::sub0)
+ .addReg(Undef)
+ .addImm(AMDGPU::sub1);
+ Op.setReg(NewVR);
+ Op.setSubReg(AMDGPU::sub0);
+ MI.addOperand(MachineOperand::CreateReg(NewVR, false, true));
+}
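The GFX940 handling added to pseudoToMCOpcode above falls back through progressively older encoding families until a column of the opcode table has an entry. A standalone sketch of that fallback chain, using a toy std::map in place of the generated getMCOpcodeGen table and assumed family numbers, could read:

#include <cstdint>
#include <map>
#include <utility>

// Assumed encoding-family tags mirroring the extended SIEncodingFamily list.
enum Family : int { GFX9 = 5, GFX90A = 8, GFX940 = 9 };

// Toy stand-in for the generated getMCOpcodeGen table; a missing column is
// the same (uint16_t)-1 sentinel the real code tests for.
using OpcodeTable = std::map<std::pair<int, Family>, uint16_t>;

static uint16_t lookup(const OpcodeTable &T, int Pseudo, Family F) {
  auto It = T.find({Pseudo, F});
  return It == T.end() ? uint16_t(-1) : It->second;
}

// Prefer the newest family that has an encoding, then fall back to older ones.
uint16_t pseudoToMC(const OpcodeTable &T, int Pseudo, bool HasGFX940Insts) {
  uint16_t MCOp = uint16_t(-1);
  if (HasGFX940Insts)
    MCOp = lookup(T, Pseudo, GFX940);
  if (MCOp == uint16_t(-1))
    MCOp = lookup(T, Pseudo, GFX90A);
  if (MCOp == uint16_t(-1))
    MCOp = lookup(T, Pseudo, GFX9);
  return MCOp;
}

Checking the newest family first means a subtarget only needs a new column where an encoding actually changed; unchanged opcodes keep resolving through the older columns.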
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index e551d6c7223f..311f9f68e675 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -15,6 +15,7 @@
#define LLVM_LIB_TARGET_AMDGPU_SIINSTRINFO_H
#include "AMDGPUMIRFormatter.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIRegisterInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/SetVector.h"
@@ -35,6 +36,11 @@ class RegScavenger;
class TargetRegisterClass;
class ScheduleHazardRecognizer;
+/// Mark the MMO of a uniform load if there are no potentially clobbering stores
+/// on any path from the start of an entry function to this load.
+static const MachineMemOperand::Flags MONoClobber =
+ MachineMemOperand::MOTargetFlag1;
+
class SIInstrInfo final : public AMDGPUGenInstrInfo {
private:
const SIRegisterInfo RI;
@@ -323,15 +329,14 @@ public:
Register SrcReg2, int64_t CmpMask, int64_t CmpValue,
const MachineRegisterInfo *MRI) const override;
- unsigned getAddressSpaceForPseudoSourceKind(
- unsigned Kind) const override;
-
bool
areMemAccessesTriviallyDisjoint(const MachineInstr &MIa,
const MachineInstr &MIb) const override;
static bool isFoldableCopy(const MachineInstr &MI);
+ void removeModOperands(MachineInstr &MI) const;
+
bool FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, Register Reg,
MachineRegisterInfo *MRI) const final;
@@ -549,6 +554,14 @@ public:
return MI.getDesc().TSFlags & SIInstrFlags::EXP;
}
+ static bool isDualSourceBlendEXP(const MachineInstr &MI) {
+ if (!isEXP(MI))
+ return false;
+ unsigned Target = MI.getOperand(0).getImm();
+ return Target == AMDGPU::Exp::ET_DUAL_SRC_BLEND0 ||
+ Target == AMDGPU::Exp::ET_DUAL_SRC_BLEND1;
+ }
+
bool isEXP(uint16_t Opcode) const {
return get(Opcode).TSFlags & SIInstrFlags::EXP;
}
@@ -651,14 +664,43 @@ public:
return get(Opcode).TSFlags & SIInstrFlags::IsMAI;
}
+ static bool isMFMA(const MachineInstr &MI) {
+ return isMAI(MI) && MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64 &&
+ MI.getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64;
+ }
+
static bool isDOT(const MachineInstr &MI) {
return MI.getDesc().TSFlags & SIInstrFlags::IsDOT;
}
+ static bool isWMMA(const MachineInstr &MI) {
+ return MI.getDesc().TSFlags & SIInstrFlags::IsWMMA;
+ }
+
+ bool isWMMA(uint16_t Opcode) const {
+ return get(Opcode).TSFlags & SIInstrFlags::IsWMMA;
+ }
+
bool isDOT(uint16_t Opcode) const {
return get(Opcode).TSFlags & SIInstrFlags::IsDOT;
}
+ static bool isLDSDIR(const MachineInstr &MI) {
+ return MI.getDesc().TSFlags & SIInstrFlags::LDSDIR;
+ }
+
+ bool isLDSDIR(uint16_t Opcode) const {
+ return get(Opcode).TSFlags & SIInstrFlags::LDSDIR;
+ }
+
+ static bool isVINTERP(const MachineInstr &MI) {
+ return MI.getDesc().TSFlags & SIInstrFlags::VINTERP;
+ }
+
+ bool isVINTERP(uint16_t Opcode) const {
+ return get(Opcode).TSFlags & SIInstrFlags::VINTERP;
+ }
+
static bool isScalarUnit(const MachineInstr &MI) {
return MI.getDesc().TSFlags & (SIInstrFlags::SALU | SIInstrFlags::SMRD);
}
@@ -1036,6 +1078,9 @@ public:
ArrayRef<std::pair<unsigned, const char *>>
getSerializableDirectMachineOperandTargetFlags() const override;
+ ArrayRef<std::pair<MachineMemOperand::Flags, const char *>>
+ getSerializableMachineMemOperandTargetFlags() const override;
+
ScheduleHazardRecognizer *
CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II,
const ScheduleDAG *DAG) const override;
@@ -1132,6 +1177,11 @@ public:
static unsigned getDSShaderTypeValue(const MachineFunction &MF);
const TargetSchedModel &getSchedModel() const { return SchedModel; }
+
+ // Enforce even register alignment of operand \p OpName if required by the
+ // target. This is used when an operand is a 32-bit register but must still
+ // be even-aligned.
+ void enforceOperandRCAlignment(MachineInstr &MI, unsigned OpName) const;
};
/// \brief Returns true if a reg:subreg pair P has a TRC class
@@ -1210,9 +1260,6 @@ namespace AMDGPU {
int getIfAddr64Inst(uint16_t Opcode);
LLVM_READONLY
- int getMUBUFNoLdsInst(uint16_t Opcode);
-
- LLVM_READONLY
int getAtomicNoRetOp(uint16_t Opcode);
LLVM_READONLY
@@ -1236,6 +1283,11 @@ namespace AMDGPU {
LLVM_READONLY
int getFlatScratchInstSTfromSS(uint16_t Opcode);
+ /// \returns SV (VADDR) form of a FLAT Scratch instruction given an \p Opcode
+ /// of an SVS (SADDR + VADDR) form.
+ LLVM_READONLY
+ int getFlatScratchInstSVfromSVS(uint16_t Opcode);
+
/// \returns SS (SADDR) form of a FLAT Scratch instruction given an \p Opcode
/// of an SV (VADDR) form.
LLVM_READONLY
@@ -1250,6 +1302,10 @@ namespace AMDGPU {
LLVM_READONLY
int getMFMAEarlyClobberOp(uint16_t Opcode);
+ /// \returns v_cmpx version of a v_cmp instruction.
+ LLVM_READONLY
+ int getVCMPXOpFromVCMP(uint16_t Opcode);
+
const uint64_t RSRC_DATA_FORMAT = 0xf00000000000LL;
const uint64_t RSRC_ELEMENT_SIZE_SHIFT = (32 + 19);
const uint64_t RSRC_INDEX_STRIDE_SHIFT = (32 + 21);
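The new isWMMA/isLDSDIR/isVINTERP predicates above all follow the same pattern: one bit reserved in the instruction's TSFlags word and a single AND to test it. A minimal sketch under assumed flag values (the real SIInstrFlags bits are generated from the .td files and are not these) might be:

#include <cstdint>

// Assumed flag bits; the real SIInstrFlags values are generated from the .td
// files and do not use these positions.
namespace Flags {
constexpr uint64_t IsWMMA  = 1ull << 0;
constexpr uint64_t LDSDIR  = 1ull << 1;
constexpr uint64_t VINTERP = 1ull << 2;
} // namespace Flags

// Minimal stand-in for an instruction description carrying a TSFlags mask.
struct InstrDesc {
  uint64_t TSFlags;
};

// Each classifier is a single AND against the per-opcode flag word.
bool isWMMA(const InstrDesc &D)    { return D.TSFlags & Flags::IsWMMA; }
bool isLDSDIR(const InstrDesc &D)  { return D.TSFlags & Flags::LDSDIR; }
bool isVINTERP(const InstrDesc &D) { return D.TSFlags & Flags::VINTERP; }

Because the flags travel with the per-opcode description, classification stays a constant-time bit test and needs no opcode switch.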
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index 713a08907e99..29ee9f12b12d 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -1,4 +1,4 @@
-//===-- SIInstrInfo.td - SI Instruction Infos -------------*- tablegen -*--===//
+//===-- SIInstrInfo.td -----------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
@@ -17,7 +17,8 @@ class GCNPredicateControl : PredicateControl {
}
// Except for the NONE field, this must be kept in sync with the
-// SIEncodingFamily enum in AMDGPUInstrInfo.cpp
+// SIEncodingFamily enum in SIInstrInfo.cpp and the columns of the
+// getMCOpcodeGen table.
def SIEncodingFamily {
int NONE = -1;
int SI = 0;
@@ -29,6 +30,8 @@ def SIEncodingFamily {
int GFX10 = 6;
int SDWA10 = 7;
int GFX90A = 8;
+ int GFX940 = 9;
+ int GFX11 = 10;
}
//===----------------------------------------------------------------------===//
@@ -190,6 +193,44 @@ def SIbuffer_atomic_fadd : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_FADD">;
def SIbuffer_atomic_fmin : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_FMIN">;
def SIbuffer_atomic_fmax : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_FMAX">;
+multiclass SDBufferAtomicRetNoRet {
+ def "_ret" : PatFrag<
+ (ops node:$vdata_in, node:$rsrc, node:$vindex, node:$voffset, node:$soffset,
+ node:$offset, node:$cachepolicy, node:$idxen),
+ (!cast<SDNode>(NAME) node:$vdata_in, node:$rsrc, node:$vindex,
+ node:$voffset, node:$soffset, node:$offset, node:$cachepolicy,
+ node:$idxen)> {
+ let PredicateCode = [{ return !(SDValue(N, 0).use_empty()); }];
+ let GISelPredicateCode = [{ return true; }];
+ }
+
+ def "_noret" : PatFrag<
+ (ops node:$vdata_in, node:$rsrc, node:$vindex, node:$voffset, node:$soffset,
+ node:$offset, node:$cachepolicy, node:$idxen),
+ (!cast<SDNode>(NAME) node:$vdata_in, node:$rsrc, node:$vindex,
+ node:$voffset, node:$soffset, node:$offset, node:$cachepolicy,
+ node:$idxen)> {
+ let PredicateCode = [{ return SDValue(N, 0).use_empty(); }];
+ let GISelPredicateCode = [{ return false; }];
+ }
+}
+
+defm SIbuffer_atomic_swap : SDBufferAtomicRetNoRet;
+defm SIbuffer_atomic_add : SDBufferAtomicRetNoRet;
+defm SIbuffer_atomic_sub : SDBufferAtomicRetNoRet;
+defm SIbuffer_atomic_smin : SDBufferAtomicRetNoRet;
+defm SIbuffer_atomic_umin : SDBufferAtomicRetNoRet;
+defm SIbuffer_atomic_smax : SDBufferAtomicRetNoRet;
+defm SIbuffer_atomic_umax : SDBufferAtomicRetNoRet;
+defm SIbuffer_atomic_and : SDBufferAtomicRetNoRet;
+defm SIbuffer_atomic_or : SDBufferAtomicRetNoRet;
+defm SIbuffer_atomic_xor : SDBufferAtomicRetNoRet;
+defm SIbuffer_atomic_inc : SDBufferAtomicRetNoRet;
+defm SIbuffer_atomic_dec : SDBufferAtomicRetNoRet;
+defm SIbuffer_atomic_fadd : SDBufferAtomicRetNoRet;
+defm SIbuffer_atomic_fmin : SDBufferAtomicRetNoRet;
+defm SIbuffer_atomic_fmax : SDBufferAtomicRetNoRet;
+
def SIbuffer_atomic_cmpswap : SDNode <"AMDGPUISD::BUFFER_ATOMIC_CMPSWAP",
SDTypeProfile<1, 9,
[SDTCisVT<0, i32>, // dst
@@ -205,6 +246,26 @@ def SIbuffer_atomic_cmpswap : SDNode <"AMDGPUISD::BUFFER_ATOMIC_CMPSWAP",
[SDNPMemOperand, SDNPHasChain, SDNPMayLoad, SDNPMayStore]
>;
+def SIbuffer_atomic_cmpswap_ret : PatFrag<
+ (ops node:$src, node:$cmp, node:$rsrc, node:$vindex, node:$voffset,
+ node:$soffset, node:$offset, node:$cachepolicy, node:$idxen),
+ (SIbuffer_atomic_cmpswap node:$src, node:$cmp, node:$rsrc, node:$vindex,
+ node:$voffset, node:$soffset, node:$offset, node:$cachepolicy,
+ node:$idxen)> {
+ let PredicateCode = [{ return !(SDValue(N, 0).use_empty()); }];
+ let GISelPredicateCode = [{ return true; }];
+}
+
+def SIbuffer_atomic_cmpswap_noret : PatFrag<
+ (ops node:$src, node:$cmp, node:$rsrc, node:$vindex, node:$voffset,
+ node:$soffset, node:$offset, node:$cachepolicy, node:$idxen),
+ (SIbuffer_atomic_cmpswap node:$src, node:$cmp, node:$rsrc, node:$vindex,
+ node:$voffset, node:$soffset, node:$offset, node:$cachepolicy,
+ node:$idxen)> {
+ let PredicateCode = [{ return SDValue(N, 0).use_empty(); }];
+ let GISelPredicateCode = [{ return false; }];
+}
+
class SDGlobalAtomicNoRtn<string opcode, ValueType ty> : SDNode <opcode,
SDTypeProfile<0, 2,
[SDTCisPtrTy<0>, // vaddr
@@ -255,35 +316,57 @@ def SIdenorm_mode : SDNode<"AMDGPUISD::DENORM_MODE",
[SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]
>;
+def SIfptrunc_round_upward : SDNode<"AMDGPUISD::FPTRUNC_ROUND_UPWARD",
+ SDTFPRoundOp
+>;
+
+def SIfptrunc_round_downward : SDNode<"AMDGPUISD::FPTRUNC_ROUND_DOWNWARD",
+ SDTFPRoundOp
+>;
+
//===----------------------------------------------------------------------===//
// ValueType helpers
//===----------------------------------------------------------------------===//
// Returns 1 if the source arguments have modifiers, 0 if they do not.
-// XXX - do f16 instructions?
class isFloatType<ValueType SrcVT> {
bit ret = !or(!eq(SrcVT.Value, f16.Value),
!eq(SrcVT.Value, f32.Value),
!eq(SrcVT.Value, f64.Value),
!eq(SrcVT.Value, v2f16.Value),
!eq(SrcVT.Value, v4f16.Value),
+ !eq(SrcVT.Value, v8f16.Value),
+ !eq(SrcVT.Value, v16f16.Value),
!eq(SrcVT.Value, v2f32.Value),
+ !eq(SrcVT.Value, v4f32.Value),
+ !eq(SrcVT.Value, v8f32.Value),
!eq(SrcVT.Value, v2f64.Value),
!eq(SrcVT.Value, v4f64.Value));
}
+// XXX - do v2i16 instructions?
class isIntType<ValueType SrcVT> {
bit ret = !or(!eq(SrcVT.Value, i16.Value),
!eq(SrcVT.Value, i32.Value),
!eq(SrcVT.Value, i64.Value),
- !eq(SrcVT.Value, v2i32.Value));
+ !eq(SrcVT.Value, v4i16.Value),
+ !eq(SrcVT.Value, v8i16.Value),
+ !eq(SrcVT.Value, v16i16.Value),
+ !eq(SrcVT.Value, v2i32.Value),
+ !eq(SrcVT.Value, v4i32.Value),
+ !eq(SrcVT.Value, v8i32.Value));
}
class isPackedType<ValueType SrcVT> {
bit ret = !or(!eq(SrcVT.Value, v2i16.Value),
!eq(SrcVT.Value, v2f16.Value),
!eq(SrcVT.Value, v4f16.Value),
- !eq(SrcVT.Value, v2f32.Value));
+ !eq(SrcVT.Value, v2i32.Value),
+ !eq(SrcVT.Value, v2f32.Value),
+ !eq(SrcVT.Value, v4i32.Value),
+ !eq(SrcVT.Value, v4f32.Value),
+ !eq(SrcVT.Value, v8i32.Value),
+ !eq(SrcVT.Value, v8f32.Value));
}
@@ -291,19 +374,10 @@ class isPackedType<ValueType SrcVT> {
// PatFrags for global memory operations
//===----------------------------------------------------------------------===//
-foreach as = [ "global", "flat", "constant", "local", "private", "region" ] in {
-let AddressSpaces = !cast<AddressSpaceList>("LoadAddress_"#as).AddrSpaces in {
-
-
-defm atomic_inc_#as : binary_atomic_op<SIatomic_inc>;
-defm atomic_dec_#as : binary_atomic_op<SIatomic_dec>;
-defm atomic_load_fmin_#as : binary_atomic_op<SIatomic_fmin, 0>;
-defm atomic_load_fmax_#as : binary_atomic_op<SIatomic_fmax, 0>;
-
-
-} // End let AddressSpaces = ...
-} // End foreach AddrSpace
-
+defm atomic_inc : binary_atomic_op_all_as<SIatomic_inc>;
+defm atomic_dec : binary_atomic_op_all_as<SIatomic_dec>;
+defm atomic_load_fmin : binary_atomic_op_all_as<SIatomic_fmin, 0>;
+defm atomic_load_fmax : binary_atomic_op_all_as<SIatomic_fmax, 0>;
//===----------------------------------------------------------------------===//
// SDNodes PatFrags for loads/stores with a glue input.
@@ -408,50 +482,36 @@ def load_local_m0 : PatFrag<(ops node:$ptr), (load_glue node:$ptr)> {
let IsNonExtLoad = 1;
}
-let MemoryVT = i8 in {
def extloadi8_local_m0 : PatFrag<(ops node:$ptr), (extloadi8_glue node:$ptr)>;
def sextloadi8_local_m0 : PatFrag<(ops node:$ptr), (sextloadi8_glue node:$ptr)>;
def zextloadi8_local_m0 : PatFrag<(ops node:$ptr), (zextloadi8_glue node:$ptr)>;
-}
-let MemoryVT = i16 in {
def extloadi16_local_m0 : PatFrag<(ops node:$ptr), (extloadi16_glue node:$ptr)>;
def sextloadi16_local_m0 : PatFrag<(ops node:$ptr), (sextloadi16_glue node:$ptr)>;
def zextloadi16_local_m0 : PatFrag<(ops node:$ptr), (zextloadi16_glue node:$ptr)>;
-}
+} // End IsLoad = 1, AddressSpaces = LoadAddress_local.AddrSpaces
def load_align8_local_m0 : PatFrag<(ops node:$ptr),
- (load_local_m0 node:$ptr)>, Aligned<8> {
+ (load_local_m0 node:$ptr)> {
let IsLoad = 1;
- let IsNonExtLoad = 1;
+ int MinAlignment = 8;
}
def load_align16_local_m0 : PatFrag<(ops node:$ptr),
- (load_local_m0 node:$ptr)>, Aligned<16> {
+ (load_local_m0 node:$ptr)> {
let IsLoad = 1;
- let IsNonExtLoad = 1;
+ int MinAlignment = 16;
}
-} // End IsLoad = 1
-
let IsAtomic = 1, AddressSpaces = LoadAddress_local.AddrSpaces in {
def atomic_load_8_local_m0 : PatFrag<(ops node:$ptr),
- (atomic_load_8_glue node:$ptr)> {
- let MemoryVT = i8;
-}
+ (atomic_load_8_glue node:$ptr)>;
def atomic_load_16_local_m0 : PatFrag<(ops node:$ptr),
- (atomic_load_16_glue node:$ptr)> {
- let MemoryVT = i16;
-}
+ (atomic_load_16_glue node:$ptr)>;
def atomic_load_32_local_m0 : PatFrag<(ops node:$ptr),
- (atomic_load_32_glue node:$ptr)> {
- let MemoryVT = i32;
-}
+ (atomic_load_32_glue node:$ptr)>;
def atomic_load_64_local_m0 : PatFrag<(ops node:$ptr),
- (atomic_load_64_glue node:$ptr)> {
- let MemoryVT = i64;
-}
-
+ (atomic_load_64_glue node:$ptr)>;
} // End let AddressSpaces = LoadAddress_local.AddrSpaces
@@ -485,75 +545,103 @@ def truncstorei8_glue : PatFrag<(ops node:$val, node:$ptr),
(truncstore_glue node:$val, node:$ptr)> {
let IsStore = 1;
let MemoryVT = i8;
+ let IsTruncStore = 1;
}
def truncstorei16_glue : PatFrag<(ops node:$val, node:$ptr),
(truncstore_glue node:$val, node:$ptr)> {
let IsStore = 1;
let MemoryVT = i16;
+ let IsTruncStore = 1;
}
let IsStore = 1, AddressSpaces = StoreAddress_local.AddrSpaces in {
def store_local_m0 : PatFrag<(ops node:$val, node:$ptr),
- (store_glue node:$val, node:$ptr)> {
- let IsStore = 1;
- let IsTruncStore = 0;
-}
-
+ (store_glue node:$val, node:$ptr)>;
def truncstorei8_local_m0 : PatFrag<(ops node:$val, node:$ptr),
- (unindexedstore_glue node:$val, node:$ptr)> {
- let IsStore = 1;
- let MemoryVT = i8;
-}
-
+ (truncstorei8_glue node:$val, node:$ptr)>;
def truncstorei16_local_m0 : PatFrag<(ops node:$val, node:$ptr),
- (unindexedstore_glue node:$val, node:$ptr)> {
- let IsStore = 1;
- let MemoryVT = i16;
-}
+ (truncstorei16_glue node:$val, node:$ptr)>;
}
def store_align8_local_m0 : PatFrag <(ops node:$value, node:$ptr),
(store_local_m0 node:$value, node:$ptr)>,
Aligned<8> {
let IsStore = 1;
- let IsTruncStore = 0;
}
def store_align16_local_m0 : PatFrag <(ops node:$value, node:$ptr),
(store_local_m0 node:$value, node:$ptr)>,
Aligned<16> {
let IsStore = 1;
+}
+
+let PredicateCode = [{return cast<MemSDNode>(N)->getAlignment() < 4;}],
+ GISelPredicateCode = [{return (*MI.memoperands_begin())->getAlign() < 4;}],
+ AddressSpaces = [ AddrSpaces.Local ] in {
+def load_align_less_than_4_local : PatFrag<(ops node:$ptr),
+ (load_local node:$ptr)> {
+ let IsLoad = 1;
+ let IsNonExtLoad = 1;
+}
+
+def load_align_less_than_4_local_m0 : PatFrag<(ops node:$ptr),
+ (load_local_m0 node:$ptr)> {
+ let IsLoad = 1;
+ let IsNonExtLoad = 1;
+}
+
+def store_align_less_than_4_local : PatFrag <(ops node:$value, node:$ptr),
+ (store_local node:$value, node:$ptr)> {
+ let IsStore = 1;
let IsTruncStore = 0;
}
-let AddressSpaces = StoreAddress_local.AddrSpaces in {
+def store_align_less_than_4_local_m0 : PatFrag <(ops node:$value, node:$ptr),
+ (store_local_m0 node:$value, node:$ptr)> {
+ let IsStore = 1;
+ let IsTruncStore = 0;
+}
+}
-def atomic_store_local_8_m0 : PatFrag <
- (ops node:$value, node:$ptr),
- (AMDGPUatomic_st_glue node:$value, node:$ptr)> {
+def atomic_store_8_glue : PatFrag <
+ (ops node:$ptr, node:$value),
+ (AMDGPUatomic_st_glue node:$ptr, node:$value)> {
let IsAtomic = 1;
let MemoryVT = i8;
}
-def atomic_store_local_16_m0 : PatFrag <
- (ops node:$value, node:$ptr),
- (AMDGPUatomic_st_glue node:$value, node:$ptr)> {
+
+def atomic_store_16_glue : PatFrag <
+ (ops node:$ptr, node:$value),
+ (AMDGPUatomic_st_glue node:$ptr, node:$value)> {
let IsAtomic = 1;
let MemoryVT = i16;
}
-def atomic_store_local_32_m0 : PatFrag <
- (ops node:$value, node:$ptr),
- (AMDGPUatomic_st_glue node:$value, node:$ptr)> {
+
+def atomic_store_32_glue : PatFrag <
+ (ops node:$ptr, node:$value),
+ (AMDGPUatomic_st_glue node:$ptr, node:$value)> {
let IsAtomic = 1;
let MemoryVT = i32;
}
-def atomic_store_local_64_m0 : PatFrag <
- (ops node:$value, node:$ptr),
- (AMDGPUatomic_st_glue node:$value, node:$ptr)> {
+
+def atomic_store_64_glue : PatFrag <
+ (ops node:$ptr, node:$value),
+ (AMDGPUatomic_st_glue node:$ptr, node:$value)> {
let IsAtomic = 1;
let MemoryVT = i64;
}
-} // End let AddressSpaces = StoreAddress_local.AddrSpaces
+
+let IsAtomic = 1, AddressSpaces = StoreAddress_local.AddrSpaces in {
+def atomic_store_8_local_m0 : PatFrag<(ops node:$ptr, node:$val),
+ (atomic_store_8_glue node:$ptr, node:$val)>;
+def atomic_store_16_local_m0 : PatFrag<(ops node:$ptr, node:$val),
+ (atomic_store_16_glue node:$ptr, node:$val)>;
+def atomic_store_32_local_m0 : PatFrag<(ops node:$ptr, node:$val),
+ (atomic_store_32_glue node:$ptr, node:$val)>;
+def atomic_store_64_local_m0 : PatFrag<(ops node:$ptr, node:$val),
+ (atomic_store_64_glue node:$ptr, node:$val)>;
+} // End let IsAtomic = 1, AddressSpaces = StoreAddress_local.AddrSpaces
def si_setcc_uniform : PatFrag <
@@ -686,10 +774,14 @@ multiclass SIAtomicM0Glue2 <string op_name, bit is_amdgpu = 0,
let AddressSpaces = StoreAddress_local.AddrSpaces in {
defm _local_m0 : binary_atomic_op <!cast<SDNode>(NAME#"_glue"), IsInt>;
+ defm _local_m0 : ret_noret_binary_atomic_op <!cast<SDNode>(NAME#"_glue"),
+ IsInt>;
}
let AddressSpaces = StoreAddress_region.AddrSpaces in {
defm _region_m0 : binary_atomic_op <!cast<SDNode>(NAME#"_glue"), IsInt>;
+ defm _region_m0 : ret_noret_binary_atomic_op <!cast<SDNode>(NAME#"_glue"),
+ IsInt>;
}
}
@@ -954,6 +1046,18 @@ def SWaitMatchClass : AsmOperandClass {
let ParserMethod = "parseSWaitCntOps";
}
+def DepCtrMatchClass : AsmOperandClass {
+ let Name = "DepCtr";
+ let RenderMethod = "addImmOperands";
+ let ParserMethod = "parseDepCtrOps";
+}
+
+def SDelayMatchClass : AsmOperandClass {
+ let Name = "SDelayAlu";
+ let RenderMethod = "addImmOperands";
+ let ParserMethod = "parseSDelayAluOps";
+}
+
def VReg32OrOffClass : AsmOperandClass {
let Name = "VReg32OrOff";
let ParserMethod = "parseVReg32OrOff";
@@ -979,6 +1083,16 @@ def WAIT_FLAG : Operand <i32> {
let ParserMatchClass = SWaitMatchClass;
let PrintMethod = "printWaitFlag";
}
+
+def DepCtrImm : Operand <i32> {
+ let ParserMatchClass = DepCtrMatchClass;
+ let PrintMethod = "printDepCtr";
+}
+
+def DELAY_FLAG : Operand <i32> {
+ let ParserMatchClass = SDelayMatchClass;
+ let PrintMethod = "printDelayFlag";
+}
} // End OperandType = "OPERAND_IMMEDIATE"
include "SIInstrFormats.td"
@@ -1163,14 +1277,6 @@ def FORMAT : NamedOperandU8<"FORMAT", NamedMatchClass<"FORMAT", 0>>;
def DMask : NamedOperandU16<"DMask", NamedMatchClass<"DMask">>;
def Dim : NamedOperandU8<"Dim", NamedMatchClass<"Dim", 0>>;
-def dpp8 : NamedOperandU32<"DPP8", NamedMatchClass<"DPP8", 0>>;
-
-def dpp_ctrl : NamedOperandU32<"DPPCtrl", NamedMatchClass<"DPPCtrl", 0>>;
-def row_mask : NamedOperandU32<"RowMask", NamedMatchClass<"RowMask">>;
-def bank_mask : NamedOperandU32<"BankMask", NamedMatchClass<"BankMask">>;
-def bound_ctrl : NamedOperandBit<"BoundCtrl", NamedMatchClass<"BoundCtrl">>;
-def FI : NamedOperandU32<"FI", NamedMatchClass<"FI">>;
-
def dst_sel : NamedOperandU32<"SDWADstSel", NamedMatchClass<"SDWADstSel">>;
def src0_sel : NamedOperandU32<"SDWASrc0Sel", NamedMatchClass<"SDWASrc0Sel">>;
def src1_sel : NamedOperandU32<"SDWASrc1Sel", NamedMatchClass<"SDWASrc1Sel">>;
@@ -1181,6 +1287,14 @@ def op_sel_hi0 : NamedOperandU32Default0<"OpSelHi", NamedMatchClass<"OpSelHi">>;
def neg_lo0 : NamedOperandU32Default0<"NegLo", NamedMatchClass<"NegLo">>;
def neg_hi0 : NamedOperandU32Default0<"NegHi", NamedMatchClass<"NegHi">>;
+def dpp8 : NamedOperandU32<"DPP8", NamedMatchClass<"DPP8", 0>>;
+def dpp_ctrl : NamedOperandU32<"DPPCtrl", NamedMatchClass<"DPPCtrl", 0>>;
+
+def row_mask : NamedOperandU32<"RowMask", NamedMatchClass<"RowMask">>;
+def bank_mask : NamedOperandU32<"BankMask", NamedMatchClass<"BankMask">>;
+def bound_ctrl : NamedOperandBit<"BoundCtrl", NamedMatchClass<"BoundCtrl">>;
+def FI : NamedOperandU32<"FI", NamedMatchClass<"FI">>;
+
def blgp : NamedOperandU32<"BLGP", NamedMatchClass<"BLGP">>;
def cbsz : NamedOperandU32<"CBSZ", NamedMatchClass<"CBSZ">>;
def abid : NamedOperandU32<"ABID", NamedMatchClass<"ABID">>;
@@ -1191,6 +1305,9 @@ def exp_tgt : NamedOperandU32<"ExpTgt", NamedMatchClass<"ExpTgt", 0>> {
}
+def wait_vdst : NamedOperandU8<"WaitVDST", NamedMatchClass<"WaitVDST">>;
+def wait_exp : NamedOperandU8<"WaitEXP", NamedMatchClass<"WaitEXP">>;
+
} // End OperandType = "OPERAND_IMMEDIATE"
class KImmMatchClass<int size> : AsmOperandClass {
@@ -1223,10 +1340,18 @@ class FPInputModsMatchClass <int opSize> : AsmOperandClass {
let PredicateMethod = "isRegOrImmWithFP"#opSize#"InputMods";
}
+class FPVCSrcInputModsMatchClass <int opSize> : FPInputModsMatchClass <opSize> {
+ let Name = "RegOrInlineImmWithFP"#opSize#"InputMods";
+ let PredicateMethod = "isRegOrInlineImmWithFP"#opSize#"InputMods";
+}
+
def FP16InputModsMatchClass : FPInputModsMatchClass<16>;
def FP32InputModsMatchClass : FPInputModsMatchClass<32>;
def FP64InputModsMatchClass : FPInputModsMatchClass<64>;
+def FP16VCSrcInputModsMatchClass : FPVCSrcInputModsMatchClass<16>;
+def FP32VCSrcInputModsMatchClass : FPVCSrcInputModsMatchClass<32>;
+
class InputMods <AsmOperandClass matchClass> : Operand <i32> {
let OperandNamespace = "AMDGPU";
let OperandType = "OPERAND_INPUT_MODS";
@@ -1241,19 +1366,28 @@ def FP16InputMods : FPInputMods<FP16InputModsMatchClass>;
def FP32InputMods : FPInputMods<FP32InputModsMatchClass>;
def FP64InputMods : FPInputMods<FP64InputModsMatchClass>;
+def FP16VCSrcInputMods : FPInputMods<FP16VCSrcInputModsMatchClass>;
+def FP32VCSrcInputMods : FPInputMods<FP32VCSrcInputModsMatchClass>;
+
class IntInputModsMatchClass <int opSize> : AsmOperandClass {
let Name = "RegOrImmWithInt"#opSize#"InputMods";
let ParserMethod = "parseRegOrImmWithIntInputMods";
let PredicateMethod = "isRegOrImmWithInt"#opSize#"InputMods";
}
+class IntVCSrcInputModsMatchClass <int opSize> : IntInputModsMatchClass <opSize> {
+ let Name = "RegOrInlineImmWithInt"#opSize#"InputMods";
+ let PredicateMethod = "isRegOrInlineImmWithInt"#opSize#"InputMods";
+}
def Int32InputModsMatchClass : IntInputModsMatchClass<32>;
def Int64InputModsMatchClass : IntInputModsMatchClass<64>;
+def Int32VCSrcInputModsMatchClass : IntVCSrcInputModsMatchClass<32>;
class IntInputMods <IntInputModsMatchClass matchClass> : InputMods <matchClass> {
let PrintMethod = "printOperandAndIntInputMods";
}
def Int32InputMods : IntInputMods<Int32InputModsMatchClass>;
def Int64InputMods : IntInputMods<Int64InputModsMatchClass>;
+def Int32VCSrcInputMods : IntInputMods<Int32VCSrcInputModsMatchClass>;
class OpSelModsMatchClass : AsmOperandClass {
let Name = "OpSelMods";
@@ -1366,12 +1500,19 @@ def VOP3OMods : ComplexPattern<untyped, 3, "SelectVOP3OMods">;
def VOP3PMods : ComplexPattern<untyped, 2, "SelectVOP3PMods">;
+def VOP3PModsDOT : ComplexPattern<untyped, 2, "SelectVOP3PModsDOT">;
+def DotIUVOP3PMods : ComplexPattern<untyped, 1, "SelectDotIUVOP3PMods">;
+def WMMAOpSelVOP3PMods : ComplexPattern<untyped, 1, "SelectWMMAOpSelVOP3PMods">;
+
def VOP3OpSel : ComplexPattern<untyped, 2, "SelectVOP3OpSel">;
def VOP3OpSelMods : ComplexPattern<untyped, 2, "SelectVOP3OpSelMods">;
def VOP3PMadMixMods : ComplexPattern<untyped, 2, "SelectVOP3PMadMixMods">;
+def VINTERPMods : ComplexPattern<untyped, 2, "SelectVINTERPMods">;
+def VINTERPModsHi : ComplexPattern<untyped, 2, "SelectVINTERPModsHi">;
+
//===----------------------------------------------------------------------===//
// SI assembler operands
//===----------------------------------------------------------------------===//
@@ -1575,6 +1716,19 @@ class getVOP3SrcForVT<ValueType VT> {
);
}
+// Src2 of VOP3 DPP instructions cannot be a literal
+class getVOP3DPPSrcForVT<ValueType VT> {
+ bit isFP = isFloatType<VT>.ret;
+ RegisterOperand ret =
+ !if (!eq(VT.Value, i1.Value), SSrc_i1,
+ !if (isFP,
+ !if (!eq(VT.Value, f16.Value), VCSrc_f16,
+ !if (!eq(VT.Value, v2f16.Value), VCSrc_v2f16, VCSrc_f32)),
+ !if (!eq(VT.Value, i16.Value), VCSrc_b16,
+ !if (!eq(VT.Value, v2i16.Value), VCSrc_v2b16,
+ VCSrc_b32))));
+}
+
// Float or packed int
class isModifierType<ValueType SrcVT> {
bit ret = !or(!eq(SrcVT.Value, f16.Value),
@@ -1583,7 +1737,17 @@ class isModifierType<ValueType SrcVT> {
!eq(SrcVT.Value, v2f16.Value),
!eq(SrcVT.Value, v2i16.Value),
!eq(SrcVT.Value, v2f32.Value),
- !eq(SrcVT.Value, v2i32.Value));
+ !eq(SrcVT.Value, v2i32.Value),
+ !eq(SrcVT.Value, v4f16.Value),
+ !eq(SrcVT.Value, v4i16.Value),
+ !eq(SrcVT.Value, v4f32.Value),
+ !eq(SrcVT.Value, v4i32.Value),
+ !eq(SrcVT.Value, v8f16.Value),
+ !eq(SrcVT.Value, v8i16.Value),
+ !eq(SrcVT.Value, v8f32.Value),
+ !eq(SrcVT.Value, v8i32.Value),
+ !eq(SrcVT.Value, v16f16.Value),
+ !eq(SrcVT.Value, v16i16.Value));
}
// Return type of input modifiers operand for specified input operand
@@ -1611,6 +1775,17 @@ class getSrcModDPP <ValueType VT> {
Operand ret = !if(isFP, FPVRegInputMods, IntVRegInputMods);
}
+// Return type of input modifiers operand for specified input operand for DPP
+class getSrcModVOP3DPP <ValueType VT, bit EnableF32SrcMods> {
+ bit isFP = isFloatType<VT>.ret;
+ bit isPacked = isPackedType<VT>.ret;
+ Operand ret =
+ !if (isFP,
+ !if (!eq(VT.Value, f16.Value), FP16VCSrcInputMods,
+ FP32VCSrcInputMods),
+ !if (EnableF32SrcMods, FP32VCSrcInputMods, Int32VCSrcInputMods));
+}
+
// Return type of input modifiers operand specified input operand for SDWA
class getSrcModSDWA <ValueType VT> {
Operand ret = !if(!eq(VT.Value, f16.Value), FP16SDWAInputMods,
@@ -1620,7 +1795,7 @@ class getSrcModSDWA <ValueType VT> {
}
// Returns the input arguments for VOP[12C] instructions for the given SrcVT.
-class getIns32 <RegisterOperand Src0RC, RegisterClass Src1RC, int NumSrcArgs> {
+class getIns32 <RegisterOperand Src0RC, RegisterOperand Src1RC, int NumSrcArgs> {
dag ret = !if(!eq(NumSrcArgs, 1), (ins Src0RC:$src0), // VOP1
!if(!eq(NumSrcArgs, 2), (ins Src0RC:$src0, Src1RC:$src1), // VOP2
(ins)));
@@ -1715,19 +1890,21 @@ class getInsVOP3Base<RegisterOperand Src0RC, RegisterOperand Src1RC,
HasClamp, HasModifiers, HasSrc2Mods, HasOMod,
Src0Mod, Src1Mod, Src2Mod>.ret;
dag opsel = (ins op_sel0:$op_sel);
- dag vop3pFields = (ins op_sel_hi0:$op_sel_hi, neg_lo0:$neg_lo, neg_hi0:$neg_hi);
+ dag vop3pOpsel = (ins op_sel_hi0:$op_sel_hi);
+ dag vop3pFields = !con(!if(HasOpSel, vop3pOpsel, (ins)), (ins neg_lo0:$neg_lo, neg_hi0:$neg_hi));
+
dag ret = !con(base,
!if(HasOpSel, opsel,(ins)),
!if(IsVOP3P, vop3pFields,(ins)));
}
class getInsVOP3P <RegisterOperand Src0RC, RegisterOperand Src1RC,
- RegisterOperand Src2RC, int NumSrcArgs, bit HasClamp,
+ RegisterOperand Src2RC, int NumSrcArgs, bit HasClamp, bit HasOpSel,
Operand Src0Mod, Operand Src1Mod, Operand Src2Mod> {
dag ret = getInsVOP3Base<Src0RC, Src1RC, Src2RC, NumSrcArgs,
HasClamp, 1/*HasModifiers*/, 1/*HasSrc2Mods*/,
0/*HasOMod*/, Src0Mod, Src1Mod, Src2Mod,
- 1/*HasOpSel*/, 1/*IsVOP3P*/>.ret;
+ HasOpSel, 1/*IsVOP3P*/>.ret;
}
class getInsVOP3OpSel <RegisterOperand Src0RC, RegisterOperand Src1RC,
@@ -1741,8 +1918,8 @@ class getInsVOP3OpSel <RegisterOperand Src0RC, RegisterOperand Src1RC,
}
class getInsDPPBase <RegisterOperand OldRC, RegisterClass Src0RC, RegisterClass Src1RC,
- int NumSrcArgs, bit HasModifiers,
- Operand Src0Mod, Operand Src1Mod> {
+ RegisterClass Src2RC, int NumSrcArgs, bit HasModifiers,
+ Operand Src0Mod, Operand Src1Mod, Operand Src2Mod> {
dag ret = !if (!eq(NumSrcArgs, 0),
// VOP1 without input operands (V_NOP)
@@ -1756,6 +1933,7 @@ class getInsDPPBase <RegisterOperand OldRC, RegisterClass Src0RC, RegisterClass
// VOP1_DPP without modifiers
(ins OldRC:$old, Src0RC:$src0)
/* endif */),
+ !if (!eq(NumSrcArgs, 2),
!if (HasModifiers,
// VOP2_DPP with modifiers
(ins OldRC:$old,
@@ -1765,34 +1943,72 @@ class getInsDPPBase <RegisterOperand OldRC, RegisterClass Src0RC, RegisterClass
// VOP2_DPP without modifiers
(ins OldRC:$old,
Src0RC:$src0, Src1RC:$src1)
- )));
+ )
+ /* NumSrcArgs == 3, VOP3 */,
+ !if (HasModifiers,
+ // VOP3_DPP with modifiers
+ (ins OldRC:$old,
+ Src0Mod:$src0_modifiers, Src0RC:$src0,
+ Src1Mod:$src1_modifiers, Src1RC:$src1,
+ Src2Mod:$src2_modifiers, Src2RC:$src2)
+ /* else */,
+ // VOP3_DPP without modifiers
+ (ins OldRC:$old,
+ Src0RC:$src0, Src1RC:$src1,
+ Src2RC:$src2)
+ )
+ /* endif */)));
}
class getInsDPP <RegisterOperand OldRC, RegisterClass Src0RC, RegisterClass Src1RC,
- int NumSrcArgs, bit HasModifiers,
- Operand Src0Mod, Operand Src1Mod> {
- dag ret = !con(getInsDPPBase<OldRC, Src0RC, Src1RC, NumSrcArgs,
- HasModifiers, Src0Mod, Src1Mod>.ret,
+ RegisterClass Src2RC, int NumSrcArgs, bit HasModifiers,
+ Operand Src0Mod, Operand Src1Mod, Operand Src2Mod> {
+ dag ret = !con(getInsDPPBase<OldRC, Src0RC, Src1RC, Src2RC, NumSrcArgs,
+ HasModifiers, Src0Mod, Src1Mod, Src2Mod>.ret,
(ins dpp_ctrl:$dpp_ctrl, row_mask:$row_mask,
- bank_mask:$bank_mask, bound_ctrl:$bound_ctrl));
+ bank_mask:$bank_mask, bound_ctrl:$bound_ctrl));
}
class getInsDPP16 <RegisterOperand OldRC, RegisterClass Src0RC, RegisterClass Src1RC,
- int NumSrcArgs, bit HasModifiers,
- Operand Src0Mod, Operand Src1Mod> {
- dag ret = !con(getInsDPP<OldRC, Src0RC, Src1RC, NumSrcArgs,
- HasModifiers, Src0Mod, Src1Mod>.ret,
+ RegisterClass Src2RC, int NumSrcArgs, bit HasModifiers,
+ Operand Src0Mod, Operand Src1Mod, Operand Src2Mod> {
+ dag ret = !con(getInsDPP<OldRC, Src0RC, Src1RC, Src2RC, NumSrcArgs,
+ HasModifiers, Src0Mod, Src1Mod, Src2Mod>.ret,
(ins FI:$fi));
}
class getInsDPP8 <RegisterOperand OldRC, RegisterClass Src0RC, RegisterClass Src1RC,
- int NumSrcArgs, bit HasModifiers,
- Operand Src0Mod, Operand Src1Mod> {
- dag ret = !con(getInsDPPBase<OldRC, Src0RC, Src1RC, NumSrcArgs,
- HasModifiers, Src0Mod, Src1Mod>.ret,
+ RegisterClass Src2RC, int NumSrcArgs, bit HasModifiers,
+ Operand Src0Mod, Operand Src1Mod, Operand Src2Mod> {
+ dag ret = !con(getInsDPPBase<OldRC, Src0RC, Src1RC, Src2RC, NumSrcArgs,
+ HasModifiers, Src0Mod, Src1Mod, Src2Mod>.ret,
(ins dpp8:$dpp8, FI:$fi));
}
+class getInsVOP3DPPBase<dag VOP3Base, RegisterOperand OldRC, int NumSrcArgs> {
+ dag old = ( ins OldRC:$old );
+ dag base = VOP3Base;
+ dag ret = !con(
+ !if(!ne(NumSrcArgs, 0), old, (ins)),
+ base
+ );
+}
+
+class getInsVOP3DPP<dag VOP3Base, RegisterOperand OldRC, int NumSrcArgs> {
+ dag ret = !con(getInsVOP3DPPBase<VOP3Base,OldRC,NumSrcArgs>.ret,
+ (ins dpp_ctrl:$dpp_ctrl, row_mask:$row_mask,
+ bank_mask:$bank_mask, bound_ctrl:$bound_ctrl));
+}
+
+class getInsVOP3DPP16<dag VOP3Base, RegisterOperand OldRC, int NumSrcArgs> {
+ dag ret = !con(getInsVOP3DPP<VOP3Base,OldRC,NumSrcArgs>.ret,
+ (ins FI:$fi));
+}
+
+class getInsVOP3DPP8<dag VOP3Base, RegisterOperand OldRC, int NumSrcArgs> {
+ dag ret = !con(getInsVOP3DPPBase<VOP3Base,OldRC,NumSrcArgs>.ret,
+ (ins dpp8:$dpp8, FI:$fi));
+}
// Ins for SDWA
class getInsSDWA <RegisterOperand Src0RC, RegisterOperand Src1RC, int NumSrcArgs,
@@ -1870,6 +2086,15 @@ class getAsm32 <bit HasDst, int NumSrcArgs, ValueType DstVT = i32> {
!if(!eq(NumSrcArgs, 3), src0#src1#src2, "");
}
+class getAsmVOPDPart <int NumSrcArgs, string XorY> {
+ string dst = "$vdst" # XorY;
+ string src0 = ", $src0" # XorY;
+ string src1 = ", $vsrc1" # XorY;
+ string ret = dst #
+ !if(!ge(NumSrcArgs, 1), src0, "") #
+ !if(!ge(NumSrcArgs, 2), src1, "");
+}
+
// Returns the assembly string for the inputs and outputs of a VOP3
// instruction.
class getAsm64 <bit HasDst, int NumSrcArgs, bit HasIntClamp, bit HasModifiers,
@@ -1890,7 +2115,7 @@ class getAsm64 <bit HasDst, int NumSrcArgs, bit HasIntClamp, bit HasModifiers,
// Returns the assembly string for the inputs and outputs of a VOP3P
// instruction.
class getAsmVOP3P <int NumSrcArgs, bit HasModifiers,
- bit HasClamp> {
+ bit HasClamp, bit HasOpSel> {
string dst = "$vdst";
string src0 = !if(!eq(NumSrcArgs, 1), "$src0", "$src0,");
string src1 = !if(!eq(NumSrcArgs, 1), "",
@@ -1900,10 +2125,11 @@ class getAsmVOP3P <int NumSrcArgs, bit HasModifiers,
string mods = !if(HasModifiers, "$neg_lo$neg_hi", "");
string clamp = !if(HasClamp, "$clamp", "");
+ string opsel = !if(HasOpSel, "$op_sel$op_sel_hi", "");
// Each modifier is printed as an array of bits for each operand, so
// all operands are printed as part of src0_modifiers.
- string ret = dst#", "#src0#src1#src2#"$op_sel$op_sel_hi"#mods#clamp;
+ string ret = dst#", "#src0#src1#src2#opsel#mods#clamp;
}
class getAsmVOP3OpSel <int NumSrcArgs,
@@ -1930,8 +2156,8 @@ class getAsmVOP3OpSel <int NumSrcArgs,
string src2 = !if(Src2HasMods, fsrc2, isrc2);
string clamp = !if(HasClamp, "$clamp", "");
-
- string ret = dst#", "#src0#src1#src2#"$op_sel"#clamp;
+ string omod = "";
+ string ret = dst#", "#src0#src1#src2#"$op_sel"#clamp#omod;
}
class getAsmDPP <bit HasDst, int NumSrcArgs, bit HasModifiers, ValueType DstVT = i32> {
@@ -1955,15 +2181,63 @@ class getAsmDPP16 <bit HasDst, int NumSrcArgs, bit HasModifiers, ValueType DstVT
}
class getAsmDPP8 <bit HasDst, int NumSrcArgs, bit HasModifiers, ValueType DstVT = i32>
- : getAsmDPP<HasDst, NumSrcArgs, HasModifiers, DstVT> {
+ : getAsmDPP<HasDst, NumSrcArgs, HasModifiers, DstVT>{
let ret = dst#args#" $dpp8$fi";
}
+class getAsmVOP3DPPBase <int NumSrcArgs, bit HasDst, bit HasClamp,
+ bit HasOpSel, bit HasOMod, bit IsVOP3P,
+ bit HasModifiers, bit Src0HasMods,
+ bit Src1HasMods, bit Src2HasMods, ValueType DstVT = i32> {
+ string dst = !if(HasDst,
+ !if(!eq(DstVT.Size, 1),
+ "$sdst",
+ "$vdst"),
+ ""); // use $sdst for VOPC
+ string isrc0 = !if(!eq(NumSrcArgs, 1), "$src0", "$src0,");
+ string isrc1 = !if(!eq(NumSrcArgs, 1), "",
+ !if(!eq(NumSrcArgs, 2), " $src1",
+ " $src1,"));
+ string isrc2 = !if(!eq(NumSrcArgs, 3), " $src2", "");
+
+ string fsrc0 = !if(!eq(NumSrcArgs, 1), "$src0_modifiers", "$src0_modifiers,");
+ string fsrc1 = !if(!eq(NumSrcArgs, 1), "",
+ !if(!eq(NumSrcArgs, 2), " $src1_modifiers",
+ " $src1_modifiers,"));
+ string fsrc2 = !if(!eq(NumSrcArgs, 3), " $src2_modifiers", "");
+
+ string src0 = !if(Src0HasMods, fsrc0, isrc0);
+ string src1 = !if(Src1HasMods, fsrc1, isrc1);
+ string src2 = !if(Src2HasMods, fsrc2, isrc2);
+ string opsel = !if(HasOpSel, "$op_sel", "");
+ string 3PMods = !if(IsVOP3P,
+ !if(HasOpSel, "$op_sel_hi", "")
+ #!if(HasModifiers, "$neg_lo$neg_hi", ""),
+ "");
+ string clamp = !if(HasClamp, "$clamp", "");
+ string omod = !if(HasOMod, "$omod", "");
+
+ string ret = dst#", "#src0#src1#src2#opsel#3PMods#clamp#omod;
+
+}
+
+class getAsmVOP3DPP<string base> {
+ string ret = base # " $dpp_ctrl$row_mask$bank_mask$bound_ctrl";
+}
+
+class getAsmVOP3DPP16<string base> {
+ string ret = getAsmVOP3DPP<base>.ret # "$fi";
+}
+
+class getAsmVOP3DPP8<string base> {
+ string ret = base # " $dpp8$fi";
+}
+
class getAsmSDWA <bit HasDst, int NumSrcArgs, ValueType DstVT = i32> {
string dst = !if(HasDst,
!if(!eq(DstVT.Size, 1),
- " vcc", // use vcc token as dst for VOPC instructioins
+ " vcc", // use vcc token as dst for VOPC instructions
"$vdst"),
"");
string src0 = "$src0_modifiers";
@@ -2056,6 +2330,12 @@ class getHasDPP <int NumSrcArgs> {
1);
}
+class getHasExt32BitDPP <int NumSrcArgs, ValueType DstVT = i32, ValueType Src0VT = i32,
+ ValueType Src1VT = i32> {
+ bit ret = !and(getHasDPP<NumSrcArgs>.ret,
+ !not(getHas64BitOps<NumSrcArgs, DstVT, Src0VT, Src1VT>.ret));
+}
+
class getHasExt64BitDPP <int NumSrcArgs, ValueType DstVT = i32, ValueType Src0VT = i32,
ValueType Src1VT = i32> {
bit ret = !and(getHasDPP<NumSrcArgs>.ret,
@@ -2089,6 +2369,24 @@ class BitAnd<bit a, bit b> {
bit ret = !if(a, !if(b, 1, 0), 0);
}
+class getHasVOP3DPP <ValueType DstVT = i32, ValueType Src0VT = i32,
+ ValueType Src1VT = i32, ValueType Src2VT = i32> {
+ bit ret = !if(!eq(DstVT.Size, 64),
+                0, // 64-bit dst. No DPP for 64-bit operands
+ !if(!eq(Src0VT.Size, 64),
+ 0, // 64-bit src0
+ !if(!eq(Src1VT.Size, 64),
+ 0, // 64-bit src1
+ !if(!eq(Src2VT.Size, 64),
+ 0, // 64-bit src2
+ 1
+ )
+ )
+ )
+ );
+}
+
+
def PatGenMode {
int NoPattern = 0;
int Pattern = 1;
@@ -2106,15 +2404,20 @@ class VOPProfile <list<ValueType> _ArgVT, bit _EnableF32SrcMods = 0,
field ValueType Src1VT = ArgVT[2];
field ValueType Src2VT = ArgVT[3];
field RegisterOperand DstRC = getVALUDstForVT<DstVT>.ret;
+ field RegisterOperand DstRC64 = DstRC;
field RegisterOperand DstRCDPP = getVALUDstForVT<DstVT>.ret;
field RegisterOperand DstRCSDWA = getSDWADstForVT<DstVT>.ret;
field RegisterOperand Src0RC32 = getVOPSrc0ForVT<Src0VT>.ret;
- field RegisterClass Src1RC32 = getVregSrcForVT<Src1VT>.ret;
+ field RegisterOperand Src1RC32 = RegisterOperand<getVregSrcForVT<Src1VT>.ret>;
field RegisterOperand Src0RC64 = getVOP3SrcForVT<Src0VT>.ret;
field RegisterOperand Src1RC64 = getVOP3SrcForVT<Src1VT>.ret;
field RegisterOperand Src2RC64 = getVOP3SrcForVT<Src2VT>.ret;
field RegisterClass Src0DPP = getVregSrcForVT<Src0VT>.ret;
field RegisterClass Src1DPP = getVregSrcForVT<Src1VT>.ret;
+ field RegisterClass Src2DPP = getVregSrcForVT<Src2VT>.ret;
+ field RegisterOperand Src0VOP3DPP = VGPRSrc_32;
+ field RegisterOperand Src1VOP3DPP = VGPRSrc_32;
+ field RegisterOperand Src2VOP3DPP = getVOP3DPPSrcForVT<Src2VT>.ret;
field RegisterOperand Src0SDWA = getSDWASrcForVT<Src0VT>.ret;
field RegisterOperand Src1SDWA = getSDWASrcForVT<Src0VT>.ret;
field Operand Src0Mod = getSrcMod<Src0VT, EnableF32SrcMods>.ret;
@@ -2122,6 +2425,8 @@ class VOPProfile <list<ValueType> _ArgVT, bit _EnableF32SrcMods = 0,
field Operand Src2Mod = getSrcMod<Src2VT, EnableF32SrcMods>.ret;
field Operand Src0ModDPP = getSrcModDPP<Src0VT>.ret;
field Operand Src1ModDPP = getSrcModDPP<Src1VT>.ret;
+ field Operand Src2ModDPP = getSrcModDPP<Src2VT>.ret;
+ field Operand Src2ModVOP3DPP = getSrcModVOP3DPP<Src2VT, EnableF32SrcMods>.ret;
field Operand Src0ModSDWA = getSrcModSDWA<Src0VT>.ret;
field Operand Src1ModSDWA = getSrcModSDWA<Src1VT>.ret;
@@ -2169,15 +2474,20 @@ class VOPProfile <list<ValueType> _ArgVT, bit _EnableF32SrcMods = 0,
field bit HasSrc2Mods = !if(HasModifiers, !or(HasSrc2FloatMods, HasSrc2IntMods), 0);
field bit HasExt = getHasExt<NumSrcArgs, DstVT, Src0VT, Src1VT>.ret;
- field bit HasExtDPP = getHasDPP<NumSrcArgs>.ret;
+ field bit HasExtVOP3DPP = getHasVOP3DPP<DstVT, Src0VT, Src1VT, Src2VT>.ret;
+ field bit HasExtDPP = !if(!or(getHasDPP<NumSrcArgs>.ret,
+ HasExtVOP3DPP), 1, 0);
+ field bit HasExt32BitDPP = getHasExt32BitDPP<NumSrcArgs, DstVT, Src0VT, Src1VT>.ret;
field bit HasExt64BitDPP = getHasExt64BitDPP<NumSrcArgs, DstVT, Src0VT, Src1VT>.ret;
field bit HasExtSDWA = getHasSDWA<NumSrcArgs, DstVT, Src0VT, Src1VT>.ret;
field bit HasExtSDWA9 = HasExtSDWA;
field int NeedPatGen = PatGenMode.NoPattern;
field bit IsMAI = 0;
+ field bit IsVOP3P = 0;
field bit IsDOT = 0;
field bit IsSingle = 0;
+ field bit IsWMMA = 0;
field Operand Src0PackedMod = !if(HasSrc0FloatMods, PackedF16InputMods, PackedI16InputMods);
field Operand Src1PackedMod = !if(HasSrc1FloatMods, PackedF16InputMods, PackedI16InputMods);
@@ -2188,9 +2498,11 @@ class VOPProfile <list<ValueType> _ArgVT, bit _EnableF32SrcMods = 0,
// VOP3b instructions are a special case with a second explicit
// output. This is manually overridden for them.
field dag Outs32 = Outs;
- field dag Outs64 = Outs;
+ field dag Outs64 = !if(HasDst,(outs DstRC64:$vdst),(outs));
field dag OutsDPP = getOutsDPP<HasDst, DstVT, DstRCDPP>.ret;
field dag OutsDPP8 = getOutsDPP<HasDst, DstVT, DstRCDPP>.ret;
+ field dag OutsVOP3DPP = OutsDPP;
+ field dag OutsVOP3DPP8 = OutsDPP8;
field dag OutsSDWA = getOutsSDWA<HasDst, DstVT, DstRCSDWA>.ret;
field dag Ins32 = getIns32<Src0RC32, Src1RC32, NumSrcArgs>.ret;
@@ -2198,7 +2510,7 @@ class VOPProfile <list<ValueType> _ArgVT, bit _EnableF32SrcMods = 0,
HasIntClamp, HasModifiers, HasSrc2Mods,
HasOMod, Src0Mod, Src1Mod, Src2Mod>.ret;
field dag InsVOP3P = getInsVOP3P<Src0RC64, Src1RC64, Src2RC64,
- NumSrcArgs, HasClamp,
+ NumSrcArgs, HasClamp, HasOpSel,
Src0PackedMod, Src1PackedMod, Src2PackedMod>.ret;
field dag InsVOP3OpSel = getInsVOP3OpSel<Src0RC64, Src1RC64, Src2RC64,
NumSrcArgs, HasClamp, HasOMod,
@@ -2206,21 +2518,35 @@ class VOPProfile <list<ValueType> _ArgVT, bit _EnableF32SrcMods = 0,
getOpSelMod<Src1VT>.ret,
getOpSelMod<Src2VT>.ret>.ret;
field dag InsDPP = !if(HasExtDPP,
- getInsDPP<DstRCDPP, Src0DPP, Src1DPP, NumSrcArgs,
- HasModifiers, Src0ModDPP, Src1ModDPP>.ret,
+ getInsDPP<DstRCDPP, Src0DPP, Src1DPP, Src2DPP, NumSrcArgs,
+ HasModifiers, Src0ModDPP, Src1ModDPP, Src2ModDPP>.ret,
(ins));
- field dag InsDPP16 = getInsDPP16<DstRCDPP, Src0DPP, Src1DPP, NumSrcArgs,
- HasModifiers, Src0ModDPP, Src1ModDPP>.ret;
- field dag InsDPP8 = getInsDPP8<DstRCDPP, Src0DPP, Src1DPP, NumSrcArgs, 0,
- Src0ModDPP, Src1ModDPP>.ret;
+ field dag InsDPP16 = getInsDPP16<DstRCDPP, Src0DPP, Src1DPP, Src2DPP, NumSrcArgs,
+ HasModifiers, Src0ModDPP, Src1ModDPP, Src2ModDPP>.ret;
+ field dag InsDPP8 = getInsDPP8<DstRCDPP, Src0DPP, Src1DPP, Src2DPP,
+ NumSrcArgs, HasModifiers,
+ Src0ModDPP, Src1ModDPP, Src2ModDPP>.ret;
+ field dag InsVOP3Base = getInsVOP3Base<Src0VOP3DPP, Src1VOP3DPP,
+ Src2VOP3DPP, NumSrcArgs, HasClamp, HasModifiers, HasSrc2Mods, HasOMod,
+ Src0ModDPP, Src1ModDPP, Src2ModVOP3DPP, HasOpSel, IsVOP3P>.ret;
+ field dag InsVOP3DPP = getInsVOP3DPP<InsVOP3Base, DstRCDPP, NumSrcArgs>.ret;
+ field dag InsVOP3DPP16 = getInsVOP3DPP16<InsVOP3Base, DstRCDPP, NumSrcArgs>.ret;
+ field dag InsVOP3DPP8 = getInsVOP3DPP8<InsVOP3Base, DstRCDPP, NumSrcArgs>.ret;
field dag InsSDWA = getInsSDWA<Src0SDWA, Src1SDWA, NumSrcArgs,
HasSDWAOMod, Src0ModSDWA, Src1ModSDWA,
DstVT>.ret;
+ field dag InsVOPDX = (ins Src0RC32:$src0X, Src1RC32:$vsrc1X);
+ // It is a slight misnomer to use the deferred f32 operand type for non-float
+ // operands, but this operand type will only be used if the other dual
+ // component is FMAAK or FMAMK
+ field dag InsVOPDXDeferred = (ins !if(!eq(Src0VT.Size, 32), VSrc_f32_Deferred, VSrc_f16_Deferred):$src0X, VGPR_32:$vsrc1X);
+ field dag InsVOPDY = (ins Src0RC32:$src0Y, Src1RC32:$vsrc1Y);
+ field dag InsVOPDYDeferred = (ins !if(!eq(Src1VT.Size, 32), VSrc_f32_Deferred, VSrc_f16_Deferred):$src0Y, VGPR_32:$vsrc1Y);
field string Asm32 = getAsm32<HasDst, NumSrcArgs, DstVT>.ret;
field string Asm64 = getAsm64<HasDst, NumSrcArgs, HasIntClamp, HasModifiers, HasOMod, DstVT>.ret;
- field string AsmVOP3P = getAsmVOP3P<NumSrcArgs, HasModifiers, HasClamp>.ret;
+ field string AsmVOP3P = getAsmVOP3P<NumSrcArgs, HasModifiers, HasClamp, HasOpSel>.ret;
field string AsmVOP3OpSel = getAsmVOP3OpSel<NumSrcArgs,
HasClamp,
HasSrc0FloatMods,
@@ -2232,15 +2558,24 @@ class VOPProfile <list<ValueType> _ArgVT, bit _EnableF32SrcMods = 0,
// DPP8 encoding has no fields for modifiers, and it is enforced by setting
// the asm operand name via this HasModifiers flag
field string AsmDPP8 = getAsmDPP8<HasDst, NumSrcArgs, 0 /*HasModifiers*/, DstVT>.ret;
+ field string AsmVOP3DPPBase = getAsmVOP3DPPBase<NumSrcArgs, HasDst, HasClamp,
+ HasOpSel, HasOMod, IsVOP3P, HasModifiers, HasSrc0FloatMods, HasSrc1FloatMods,
+ HasSrc2FloatMods, DstVT >.ret;
+ field string AsmVOP3DPP = getAsmVOP3DPP<AsmVOP3DPPBase>.ret;
+ field string AsmVOP3DPP16 = getAsmVOP3DPP16<AsmVOP3DPPBase>.ret;
+ field string AsmVOP3DPP8 = getAsmVOP3DPP8<AsmVOP3DPPBase>.ret;
field string AsmSDWA = getAsmSDWA<HasDst, NumSrcArgs, DstVT>.ret;
field string AsmSDWA9 = getAsmSDWA9<HasDst, HasSDWAOMod, NumSrcArgs, DstVT>.ret;
-
+ field string AsmVOPDX = getAsmVOPDPart<NumSrcArgs, "X">.ret;
+ field string AsmVOPDY = getAsmVOPDPart<NumSrcArgs, "Y">.ret;
field string TieRegDPP = "$old";
}
-class VOP_NO_EXT <VOPProfile p> : VOPProfile <p.ArgVT> {
+ class VOP_NO_EXT <VOPProfile p> : VOPProfile <p.ArgVT> {
let HasExt = 0;
let HasExtDPP = 0;
+ let HasExtVOP3DPP = 0;
+ let HasExt32BitDPP = 0;
let HasExt64BitDPP = 0;
let HasExtSDWA = 0;
let HasExtSDWA9 = 0;
@@ -2249,10 +2584,10 @@ class VOP_NO_EXT <VOPProfile p> : VOPProfile <p.ArgVT> {
class VOP_PAT_GEN <VOPProfile p, int mode=PatGenMode.NoPattern> : VOPProfile <p.ArgVT> {
let NeedPatGen = mode;
}
-
def VOP_F16_F16 : VOPProfile <[f16, f16, untyped, untyped]>;
def VOP_F16_I16 : VOPProfile <[f16, i16, untyped, untyped]>;
def VOP_I16_F16 : VOPProfile <[i16, f16, untyped, untyped]>;
+def VOP_I16_I16 : VOPProfile <[i16, i16, untyped, untyped]>;
def VOP_F16_F16_F16 : VOPProfile <[f16, f16, f16, untyped]>;
def VOP_F16_F16_I16 : VOPProfile <[f16, f16, i16, untyped]>;
@@ -2264,6 +2599,7 @@ def VOP_I16_I16_I16_I16 : VOPProfile <[i16, i16, i16, i16, untyped]>;
def VOP_F16_F16_F16_F16 : VOPProfile <[f16, f16, f16, f16, untyped]>;
def VOP_I32_I16_I16_I32 : VOPProfile <[i32, i16, i16, i32, untyped]>;
+def VOP_I32_I16 : VOPProfile <[i32, i16, untyped, untyped]>;
def VOP_V2F16_V2F16_V2F16 : VOPProfile <[v2f16, v2f16, v2f16, untyped]>;
def VOP_V2I16_V2I16_V2I16 : VOPProfile <[v2i16, v2i16, v2i16, untyped]>;
@@ -2274,6 +2610,10 @@ def VOP_V2I16_V2I16_V2I16_V2I16 : VOPProfile <[v2i16, v2i16, v2i16, v2i16]>;
def VOP_V2I16_F32_F32 : VOPProfile <[v2i16, f32, f32, untyped]>;
def VOP_V2I16_I32_I32 : VOPProfile <[v2i16, i32, i32, untyped]>;
+def VOP_F16_V2F16_V2F16_F16 : VOPProfile <[f16, v2f16, v2f16, f16]>;
+def VOP_I16_V2I16_V2I16_I16 : VOPProfile <[i16, v2i16, v2i16, i16]>;
+def VOP_F32_V2I16_V2I16_F32 : VOPProfile <[f32, v2i16, v2i16, f32]>;
+
def VOP_F32_V2F16_V2F16_V2F16 : VOPProfile <[f32, v2f16, v2f16, v2f16]>;
def VOP_NONE : VOPProfile <[untyped, untyped, untyped, untyped]>;
@@ -2343,6 +2683,18 @@ def VOP_V4F32_V4I16_V4I16_V4F32 : VOPProfile <[v4f32, v4i16, v4i16, v4f32]>;
def VOP_V16F32_V4I16_V4I16_V16F32 : VOPProfile <[v16f32, v4i16, v4i16, v16f32]>;
def VOP_V32F32_V4I16_V4I16_V32F32 : VOPProfile <[v32f32, v4i16, v4i16, v32f32]>;
+def VOP_V4I32_I64_I64_V4I32 : VOPProfile <[v4i32, i64, i64, v4i32]>;
+def VOP_V16I32_I64_I64_V16I32 : VOPProfile <[v16i32, i64, i64, v16i32]>;
+def VOP_V4F32_V2F32_V2F32_V4F32 : VOPProfile <[v4f32, v2f32, v2f32, v4f32]>;
+def VOP_V16F32_V2F32_V2F32_V16F32 : VOPProfile <[v16f32, v2f32, v2f32, v16f32]>;
+
+def VOP_V4F32_V4F16_V8F16_I32 : VOPProfile <[v4f32, v4f16, v8f16, i32]>;
+def VOP_V16F32_V4F16_V8F16_I32 : VOPProfile <[v16f32, v4f16, v8f16, i32]>;
+def VOP_V4F32_V4I16_V8I16_I32 : VOPProfile <[v4f32, v4i16, v8i16, i32]>;
+def VOP_V16F32_V4I16_V8I16_I32 : VOPProfile <[v16f32, v4i16, v8i16, i32]>;
+def VOP_V4I32_V2I32_V4I32_I32 : VOPProfile <[v4i32, v2i32, v4i32, i32]>;
+def VOP_V16I32_V2I32_V4I32_I32 : VOPProfile <[v16i32, v2i32, v4i32, i32]>;
+
class Commutable_REV <string revOp, bit isOrig> {
string RevOp = revOp;
bit IsOrig = isOrig;
@@ -2394,10 +2746,11 @@ multiclass VINTRP_m <bits <2> op, dag outs, dag ins, string asm,
def _vi : VINTRP_Real_vi <op, NAME, outs, ins, asm>;
- let AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" in {
+ let AssemblerPredicate = isGFX10Only, DecoderNamespace = "GFX10" in {
def _gfx10 : VINTRP_Real_si<op, NAME, outs, ins, asm, SIEncodingFamily.GFX10>;
- } // End AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10"
+ } // End AssemblerPredicate = isGFX10Only, DecoderNamespace = "GFX10"
}
+
//===----------------------------------------------------------------------===//
// Vector instruction mappings
//===----------------------------------------------------------------------===//
@@ -2470,6 +2823,7 @@ def getMCOpcodeGen : InstrMapping {
let RowFields = ["PseudoInstr"];
let ColFields = ["Subtarget"];
let KeyCol = [!cast<string>(SIEncodingFamily.NONE)];
+ // These columns must be kept in sync with the SIEncodingFamily enumeration.
let ValueCols = [[!cast<string>(SIEncodingFamily.SI)],
[!cast<string>(SIEncodingFamily.VI)],
[!cast<string>(SIEncodingFamily.SDWA)],
@@ -2482,7 +2836,9 @@ def getMCOpcodeGen : InstrMapping {
[!cast<string>(SIEncodingFamily.GFX9)],
[!cast<string>(SIEncodingFamily.GFX10)],
[!cast<string>(SIEncodingFamily.SDWA10)],
- [!cast<string>(SIEncodingFamily.GFX90A)]];
+ [!cast<string>(SIEncodingFamily.GFX90A)],
+ [!cast<string>(SIEncodingFamily.GFX940)],
+ [!cast<string>(SIEncodingFamily.GFX11)]];
}
// Get equivalent SOPK instruction.
@@ -2510,14 +2866,6 @@ def getIfAddr64Inst : InstrMapping {
let ValueCols = [["1"]];
}
-def getMUBUFNoLdsInst : InstrMapping {
- let FilterClass = "MUBUFLdsTable";
- let RowFields = ["OpName"];
- let ColFields = ["IsLds"];
- let KeyCol = ["1"];
- let ValueCols = [["0"]];
-}
-
// Maps an atomic opcode to its returnless version.
def getAtomicNoRetOp : InstrMapping {
let FilterClass = "AtomicNoRet";
@@ -2580,6 +2928,14 @@ def getFlatScratchInstSSfromSV : InstrMapping {
let ValueCols = [["SS"]];
}
+def getFlatScratchInstSVfromSVS : InstrMapping {
+ let FilterClass = "FlatScratchInst";
+ let RowFields = ["SVOp"];
+ let ColFields = ["Mode"];
+ let KeyCol = ["SVS"];
+ let ValueCols = [["SV"]];
+}
+
def getFlatScratchInstSVfromSS : InstrMapping {
let FilterClass = "FlatScratchInst";
let RowFields = ["SVOp"];
@@ -2596,6 +2952,15 @@ def getMFMAEarlyClobberOp : InstrMapping {
let ValueCols = [["0"]];
}
+// Maps a v_cmp instruction to its v_cmpx equivalent.
+def getVCMPXOpFromVCMP : InstrMapping {
+ let FilterClass = "VCMPVCMPXTable";
+ let RowFields = ["VCMPOp"];
+ let ColFields = ["IsVCMPX"];
+ let KeyCol = ["0"];
+ let ValueCols = [["1"]];
+}
+
include "SIInstructions.td"
include "DSInstructions.td"
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 7be63ae6964b..829669157893 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -14,12 +14,24 @@ class GCNPat<dag pattern, dag result> : Pat<pattern, result>, GCNPredicateContro
}
+class UniformSextInreg<ValueType VT> : PatFrag<
+ (ops node:$src),
+ (sext_inreg $src, VT),
+ [{ return !N->isDivergent(); }]>;
+
+class DivergentSextInreg<ValueType VT> : PatFrag<
+ (ops node:$src),
+ (sext_inreg $src, VT),
+ [{ return N->isDivergent(); }]>;
+
include "SOPInstructions.td"
include "VOPInstructions.td"
include "SMInstructions.td"
include "FLATInstructions.td"
include "BUFInstructions.td"
include "EXPInstructions.td"
+include "LDSDIRInstructions.td"
+include "VINTERPInstructions.td"
//===----------------------------------------------------------------------===//
// VINTRP Instructions
@@ -176,19 +188,33 @@ def EXIT_STRICT_WQM : SPseudoInstSI <(outs SReg_1:$sdst), (ins SReg_1:$src0)> {
let mayStore = 0;
}
+// Pseudo instructions used for @llvm.fptrunc.round upward
+// and @llvm.fptrunc.round downward.
+// These intrinsics will be legalized to G_FPTRUNC_ROUND_UPWARD
+// and G_FPTRUNC_ROUND_DOWNWARD before being lowered to
+// FPTRUNC_UPWARD_PSEUDO and FPTRUNC_DOWNWARD_PSEUDO.
+// The final codegen is done in the ModeRegister pass.
+let Uses = [MODE, EXEC] in {
+def FPTRUNC_UPWARD_PSEUDO : VPseudoInstSI <(outs VGPR_32:$vdst),
+ (ins VGPR_32:$src0),
+ [(set f16:$vdst, (SIfptrunc_round_upward f32:$src0))]>;
+
+def FPTRUNC_DOWNWARD_PSEUDO : VPseudoInstSI <(outs VGPR_32:$vdst),
+ (ins VGPR_32:$src0),
+ [(set f16:$vdst, (SIfptrunc_round_downward f32:$src0))]>;
+} // End Uses = [MODE, EXEC]
+
// Invert the exec mask and overwrite the inactive lanes of dst with inactive,
// restoring it after we're done.
let Defs = [SCC] in {
def V_SET_INACTIVE_B32 : VPseudoInstSI <(outs VGPR_32:$vdst),
- (ins VGPR_32: $src, VSrc_b32:$inactive),
+ (ins VSrc_b32: $src, VSrc_b32:$inactive),
[(set i32:$vdst, (int_amdgcn_set_inactive i32:$src, i32:$inactive))]> {
- let Constraints = "$src = $vdst";
}
def V_SET_INACTIVE_B64 : VPseudoInstSI <(outs VReg_64:$vdst),
- (ins VReg_64: $src, VSrc_b64:$inactive),
+ (ins VSrc_b64: $src, VSrc_b64:$inactive),
[(set i64:$vdst, (int_amdgcn_set_inactive i64:$src, i64:$inactive))]> {
- let Constraints = "$src = $vdst";
}
} // End Defs = [SCC]
@@ -287,6 +313,20 @@ def WAVE_BARRIER : SPseudoInstSI<(outs), (ins),
let isConvergent = 1;
let FixedSize = 1;
let Size = 0;
+ let isMeta = 1;
+}
+
+def SCHED_BARRIER : SPseudoInstSI<(outs), (ins i32imm:$mask),
+ [(int_amdgcn_sched_barrier (i32 timm:$mask))]> {
+ let SchedRW = [];
+ let hasNoSchedulingInfo = 1;
+ let hasSideEffects = 1;
+ let mayLoad = 0;
+ let mayStore = 0;
+ let isConvergent = 1;
+ let FixedSize = 1;
+ let Size = 0;
+ let isMeta = 1;
}
// SI pseudo instructions. These are used by the CFG structurizer pass
@@ -424,6 +464,7 @@ def SI_MASKED_UNREACHABLE : SPseudoInstSI <(outs), (ins),
let Size = 0;
let hasNoSchedulingInfo = 1;
let FixedSize = 1;
+ let isMeta = 1;
}
// Used as an isel pseudo to directly emit initialization with an
@@ -459,11 +500,14 @@ def SI_RETURN_TO_EPILOG : SPseudoInstSI <
let hasNoSchedulingInfo = 1;
let DisableWQM = 1;
let FixedSize = 1;
+
+ // TODO: Should this be true?
+ let isMeta = 0;
}
// Return for returning function calls.
def SI_RETURN : SPseudoInstSI <
- (outs), (ins), [],
+ (outs), (ins), [(AMDGPUret_flag)],
"; return"> {
let isTerminator = 1;
let isBarrier = 1;
@@ -496,6 +540,7 @@ def : GCNPat<
def SI_CALL : SPseudoInstSI <
(outs SReg_64:$dst), (ins SSrc_b64:$src0, unknown:$callee)> {
let Size = 4;
+ let FixedSize = 1;
let isCall = 1;
let UseNamedOperandTable = 1;
let SchedRW = [WriteBranch];
@@ -508,6 +553,7 @@ def SI_TCRETURN : SPseudoInstSI <(outs),
(ins SReg_64:$src0, unknown:$callee, i32imm:$fpdiff),
[(AMDGPUtc_return i64:$src0, tglobaladdr:$callee, i32:$fpdiff)]> {
let Size = 4;
+ let FixedSize = 1;
let isCall = 1;
let isTerminator = 1;
let isReturn = 1;
@@ -1212,6 +1258,26 @@ def : Pat <
(v4f16 (EXTRACT_SUBREG v8f16:$vec, sub2_sub3))
>;
+def : Pat <
+ (extract_subvector v16i16:$vec, (i32 0)),
+ (v8i16 (EXTRACT_SUBREG v16i16:$vec, sub0_sub1_sub2_sub3))
+>;
+
+def : Pat <
+ (extract_subvector v16i16:$vec, (i32 8)),
+ (v8i16 (EXTRACT_SUBREG v16i16:$vec, sub4_sub5_sub6_sub7))
+>;
+
+def : Pat <
+ (extract_subvector v16f16:$vec, (i32 0)),
+ (v8f16 (EXTRACT_SUBREG v16f16:$vec, sub0_sub1_sub2_sub3))
+>;
+
+def : Pat <
+ (extract_subvector v16f16:$vec, (i32 8)),
+ (v8f16 (EXTRACT_SUBREG v16f16:$vec, sub4_sub5_sub6_sub7))
+>;
+
foreach Index = 0-31 in {
def Extract_Element_v32i32_#Index : Extract_Element <
i32, v32i32, Index, !cast<SubRegIndex>(sub#Index)
@@ -1371,7 +1437,18 @@ def : BitConvert <v8i32, v4i64, VReg_256>;
def : BitConvert <v8f32, v4i64, VReg_256>;
def : BitConvert <v8i32, v4f64, VReg_256>;
def : BitConvert <v8f32, v4f64, VReg_256>;
-
+def : BitConvert <v16i16, v16f16, SReg_256>;
+def : BitConvert <v16f16, v16i16, SReg_256>;
+def : BitConvert <v16i16, v16f16, VReg_256>;
+def : BitConvert <v16f16, v16i16, VReg_256>;
+def : BitConvert <v16f16, v8i32, VReg_256>;
+def : BitConvert <v16i16, v8i32, VReg_256>;
+def : BitConvert <v16f16, v8f32, VReg_256>;
+def : BitConvert <v16i16, v8f32, VReg_256>;
+def : BitConvert <v8i32, v16f16, VReg_256>;
+def : BitConvert <v8i32, v16i16, VReg_256>;
+def : BitConvert <v8f32, v16f16, VReg_256>;
+def : BitConvert <v8f32, v16i16, VReg_256>;
// 512-bit bitcast
def : BitConvert <v16i32, v16f32, VReg_512>;
@@ -1941,12 +2018,6 @@ def : GCNPat <
//===----------------------------------------------------------------------===//
// Conversion Patterns
//===----------------------------------------------------------------------===//
-
-class UniformSextInreg<ValueType VT> : PatFrag<
- (ops node:$src),
- (sext_inreg $src, VT),
- [{ return !N->isDivergent(); }]>;
-
def : GCNPat<(i32 (UniformSextInreg<i1> i32:$src)),
(S_BFE_I32 i32:$src, (i32 65536))>; // 0 | 1 << 16
@@ -1981,23 +2052,28 @@ def : GCNPat <
(S_BFE_I64 i64:$src, (i32 0x200000)) // 0 | 32 << 16
>;
-
-class DivergentSextInreg<ValueType VT> : PatFrag<
- (ops node:$src),
- (sext_inreg $src, VT),
- [{ return N->isDivergent(); }]>;
-
-def : GCNPat<(i32 (DivergentSextInreg<i1> i32:$src)),
+def : GCNPat<
+ (i32 (DivergentSextInreg<i1> i32:$src)),
(V_BFE_I32_e64 i32:$src, (i32 0), (i32 1))>;
def : GCNPat <
(i16 (DivergentSextInreg<i1> i16:$src)),
- (V_BFE_I32_e64 $src, (i32 0), (i32 1)) // 0 | 1 << 16
+ (V_BFE_I32_e64 $src, (i32 0), (i32 1))
>;
def : GCNPat <
(i16 (DivergentSextInreg<i8> i16:$src)),
- (V_BFE_I32_e64 $src, (i32 0), (i32 8)) // 0 | 8 << 16
+ (V_BFE_I32_e64 $src, (i32 0), (i32 8))
+>;
+
+def : GCNPat<
+ (i32 (DivergentSextInreg<i8> i32:$src)),
+ (V_BFE_I32_e64 i32:$src, (i32 0), (i32 8))
+>;
+
+def : GCNPat <
+ (i32 (DivergentSextInreg<i16> i32:$src)),
+ (V_BFE_I32_e64 $src, (i32 0), (i32 16))
>;
def : GCNPat <
@@ -2010,14 +2086,14 @@ def : GCNPat <
def : GCNPat <
(i64 (DivergentSextInreg<i8> i64:$src)),
(REG_SEQUENCE VReg_64,
- (V_BFE_I32_e64 (i32 (EXTRACT_SUBREG i64:$src, sub0)), (i32 0), (i32 8)/* 0 | 8 << 16 */), sub0,
+ (V_BFE_I32_e64 (i32 (EXTRACT_SUBREG i64:$src, sub0)), (i32 0), (i32 8)), sub0,
(V_ASHRREV_I32_e32 (i32 31), (V_BFE_I32_e64 (i32 (EXTRACT_SUBREG i64:$src, sub0)), (i32 0), (i32 8))), sub1)
>;
def : GCNPat <
(i64 (DivergentSextInreg<i16> i64:$src)),
(REG_SEQUENCE VReg_64,
- (V_BFE_I32_e64 (i32 (EXTRACT_SUBREG i64:$src, sub0)), (i32 0), (i32 16)/* 0 | 16 << 16 */), sub0,
+ (V_BFE_I32_e64 (i32 (EXTRACT_SUBREG i64:$src, sub0)), (i32 0), (i32 16)), sub0,
(V_ASHRREV_I32_e32 (i32 31), (V_BFE_I32_e64 (i32 (EXTRACT_SUBREG i64:$src, sub0)), (i32 0), (i32 16))), sub1)
>;
@@ -2053,12 +2129,18 @@ def : ZExt_i64_i1_Pat<anyext>;
// FIXME: We need to use COPY_TO_REGCLASS to work-around the fact that
// REG_SEQUENCE patterns don't support instructions with multiple outputs.
def : GCNPat <
- (i64 (sext i32:$src)),
+ (i64 (UniformUnaryFrag<sext> i32:$src)),
(REG_SEQUENCE SReg_64, $src, sub0,
(i32 (COPY_TO_REGCLASS (S_ASHR_I32 $src, (i32 31)), SReg_32_XM0)), sub1)
>;
def : GCNPat <
+ (i64 (DivergentUnaryFrag<sext> i32:$src)),
+ (REG_SEQUENCE VReg_64, $src, sub0,
+ (i32 (COPY_TO_REGCLASS (V_ASHRREV_I32_e64 (i32 31), $src), VGPR_32)), sub1)
+>;
+
+def : GCNPat <
(i64 (sext i1:$src)),
(REG_SEQUENCE VReg_64,
(V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0),
@@ -2235,6 +2317,30 @@ def : GCNPat <
// the src is lowered. e.g. fptrunc + fma may be lowered to a
// v_fma_mix* instruction which does not zero, or may not.
def : GCNPat<
+ (i32 (DivergentUnaryFrag<abs> i32:$src)),
+ (V_MAX_I32_e64 (V_SUB_CO_U32_e32 (i32 0), $src), $src)>;
+
+let AddedComplexity = 1 in {
+def : GCNPat<
+ (i32 (DivergentUnaryFrag<abs> i32:$src)),
+ (V_MAX_I32_e64 (V_SUB_U32_e32 (i32 0), $src), $src)>{
+ let SubtargetPredicate = HasAddNoCarryInsts;
+}
+} // AddedComplexity = 1
+
+def : GCNPat<
+ (i32 (DivergentUnaryFrag<zext> i16:$src)),
+ (V_AND_B32_e64 (S_MOV_B32 (i32 0xffff)), $src)
+>;
+
+def : GCNPat<
+ (i64 (DivergentUnaryFrag<zext> i16:$src)),
+ (REG_SEQUENCE VReg_64,
+ (V_AND_B32_e64 (S_MOV_B32 (i32 0xffff)), $src), sub0,
+ (S_MOV_B32 (i32 0)), sub1)
+>;
+
+def : GCNPat<
(i32 (zext (i16 (bitconvert fp16_zeros_high_16bits:$src)))),
(COPY VSrc_b16:$src)>;
@@ -2269,6 +2375,34 @@ def : GCNPat <
(V_CMP_EQ_U32_e64 (V_AND_B32_e64 (i32 1), $a), (i32 1))
>;
+def IMMBitSelConst : SDNodeXForm<imm, [{
+ return CurDAG->getTargetConstant(1ULL << N->getZExtValue(), SDLoc(N),
+ MVT::i32);
+}]>;
+
+// Matching separate SRL and TRUNC instructions with dependent operands
+// (the SRL result feeds the TRUNC) would generate three instructions.
+// Instead, fold the shift amount into an immediate bit mask so that the
+// compare can consume the V_AND_B32_e64 result directly:
+// (trunc i32 (srl i32 $a, i32 $b)) ->
+// v_and_b32_e64 $a, (1 << $b), $a
+// v_cmp_ne_u32_e64 $a, 0, $a
+
+// Handle the VALU case.
+def : GCNPat <
+ (i1 (DivergentUnaryFrag<trunc> (i32 (srl i32:$a, (i32 imm:$b))))),
+ (V_CMP_NE_U32_e64 (V_AND_B32_e64 (i32 (IMMBitSelConst $b)), $a),
+ (i32 0))
+>;
+
+// Handle the scalar case.
+def : GCNPat <
+ (i1 (UniformUnaryFrag<trunc> (i32 (srl i32:$a, (i32 imm:$b))))),
+ (S_CMP_LG_U32 (S_AND_B32 (i32 (IMMBitSelConst $b)), $a),
+ (i32 0))
+>;
+
def : GCNPat <
(i1 (DivergentUnaryFrag<trunc> i64:$a)),
(V_CMP_EQ_U32_e64 (V_AND_B32_e64 (i32 1),
@@ -2350,6 +2484,11 @@ def : GCNPat <
}
+def : GCNPat<
+ (i64 (DivergentUnaryFrag<bitreverse> i64:$a)),
+ (REG_SEQUENCE VReg_64,
+ (V_BFREV_B32_e64 (i32 (EXTRACT_SUBREG VReg_64:$a, sub1))), sub0,
+ (V_BFREV_B32_e64 (i32 (EXTRACT_SUBREG VReg_64:$a, sub0))), sub1)>;
// Prefer selecting to max when legal, but using mul is always valid.
let AddedComplexity = -5 in {
@@ -2508,12 +2647,12 @@ def : GCNPat <
>;
def : GCNPat <
- (v2i16 (build_vector (i16 SReg_32:$src0), (i16 undef))),
+ (v2i16 (UniformBinFrag<build_vector> (i16 SReg_32:$src0), (i16 undef))),
(COPY_TO_REGCLASS SReg_32:$src0, SReg_32)
>;
def : GCNPat <
- (v2i16 (build_vector (i16 VGPR_32:$src0), (i16 undef))),
+ (v2i16 (DivergentBinFrag<build_vector> (i16 VGPR_32:$src0), (i16 undef))),
(COPY_TO_REGCLASS VGPR_32:$src0, VGPR_32)
>;
@@ -2597,6 +2736,15 @@ def : GCNPat <
>;
} // End SubtargetPredicate = HasVOP3PInsts
+// With multiple uses of the shift, this will duplicate the shift and
+// increase register pressure.
+let SubtargetPredicate = isGFX11Plus in
+def : GCNPat <
+ (v2i16 (build_vector (i16 (trunc (srl_oneuse SReg_32:$src0, (i32 16)))), (i16 SReg_32:$src1))),
+ (v2i16 (S_PACK_HL_B32_B16 SReg_32:$src0, SReg_32:$src1))
+>;
+
+
def : GCNPat <
(v2f16 (scalar_to_vector f16:$src0)),
(COPY $src0)
@@ -2678,18 +2826,18 @@ def : GCNPat <
// an inline immediate than -c.
// TODO: Also do for 64-bit.
def : GCNPat<
- (add i32:$src0, (i32 NegSubInlineConst32:$src1)),
+ (UniformBinFrag<add> i32:$src0, (i32 NegSubInlineConst32:$src1)),
(S_SUB_I32 SReg_32:$src0, NegSubInlineConst32:$src1)
>;
def : GCNPat<
- (add i32:$src0, (i32 NegSubInlineConst32:$src1)),
+ (DivergentBinFrag<add> i32:$src0, (i32 NegSubInlineConst32:$src1)),
(V_SUB_U32_e64 VS_32:$src0, NegSubInlineConst32:$src1)> {
let SubtargetPredicate = HasAddNoCarryInsts;
}
def : GCNPat<
- (add i32:$src0, (i32 NegSubInlineConst32:$src1)),
+ (DivergentBinFrag<add> i32:$src0, (i32 NegSubInlineConst32:$src1)),
(V_SUB_CO_U32_e64 VS_32:$src0, NegSubInlineConst32:$src1)> {
let SubtargetPredicate = NotHasAddNoCarryInsts;
}
@@ -2703,20 +2851,21 @@ def : GCNPat<
(S_MOV_B32 SReg_32:$src)
>;
-multiclass BFMPatterns <ValueType vt, InstSI BFM, InstSI MOV> {
+multiclass BFMPatterns <ValueType vt, PatFrag SHL, PatFrag ADD, InstSI BFM> {
def : GCNPat <
- (vt (shl (vt (add (vt (shl 1, vt:$a)), -1)), vt:$b)),
+ (vt (SHL (vt (add (vt (shl 1, vt:$a)), -1)), vt:$b)),
(BFM $a, $b)
>;
def : GCNPat <
- (vt (add (vt (shl 1, vt:$a)), -1)),
- (BFM $a, (MOV (i32 0)))
+ (vt (ADD (vt (shl 1, vt:$a)), -1)),
+ (BFM $a, (i32 0))
>;
}
-defm : BFMPatterns <i32, S_BFM_B32, S_MOV_B32>;
-// FIXME: defm : BFMPatterns <i64, S_BFM_B64, S_MOV_B64>;
+defm : BFMPatterns <i32, UniformBinFrag<shl>, UniformBinFrag<add>, S_BFM_B32>;
+// FIXME: defm : BFMPatterns <i64, UniformBinFrag<shl>, UniformBinFrag<add>, S_BFM_B64>;
+defm : BFMPatterns <i32, DivergentBinFrag<shl>, DivergentBinFrag<add>, V_BFM_B32_e64>;
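// Editor's note (illustrative only, not part of this patch): both patterns rest
// on the identity ((1 << a) - 1) << b, a bitfield mask of width a at offset b,
// which is what s_bfm_b32 / v_bfm_b32 compute. For example, a = 4, b = 8 gives
// ((1 << 4) - 1) << 8 = 0x0F00.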
// Bitfield extract patterns
@@ -3007,6 +3156,19 @@ def G_AMDGPU_CLAMP : AMDGPUGenericInstruction {
let hasSideEffects = 0;
}
+// Integer multiply-add: arg0 * arg1 + arg2.
+//
+// arg0 and arg1 are 32-bit integers (interpreted as signed or unsigned),
+// arg2 is a 64-bit integer. Result is a 64-bit integer and a 1-bit carry-out.
+class G_AMDGPU_MAD_64_32 : AMDGPUGenericInstruction {
+ let OutOperandList = (outs type0:$dst, type1:$carry_out);
+ let InOperandList = (ins type2:$arg0, type2:$arg1, type0:$arg2);
+ let hasSideEffects = 0;
+}
+
+def G_AMDGPU_MAD_U64_U32 : G_AMDGPU_MAD_64_32;
+def G_AMDGPU_MAD_I64_I32 : G_AMDGPU_MAD_64_32;
+
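// Editor's note (illustrative only, not part of this patch): a minimal C++
// sketch of the semantics described above, for the unsigned variant:
//   #include <cstdint>
//   #include <utility>
//   std::pair<uint64_t, bool> mad_u64_u32(uint32_t a0, uint32_t a1, uint64_t a2) {
//     uint64_t prod = uint64_t(a0) * a1; // full 32x32 -> 64-bit product
//     uint64_t res = prod + a2;
//     return {res, res < prod};          // 64-bit result plus 1-bit carry-out
//   }
// The signed variant widens arg0 and arg1 as int32_t before the multiply.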
// Atomic cmpxchg. $cmpval and $newval are packed in a single vector
// operand. Expects a MachineMemOperand in addition to explicit
// operands.
@@ -3130,3 +3292,15 @@ def G_SI_CALL : AMDGPUGenericInstruction {
// TODO: Should really base this on the call target
let isConvergent = 1;
}
+
+def G_FPTRUNC_ROUND_UPWARD : AMDGPUGenericInstruction {
+ let OutOperandList = (outs type0:$vdst);
+ let InOperandList = (ins type1:$src0);
+ let hasSideEffects = 0;
+}
+
+def G_FPTRUNC_ROUND_DOWNWARD : AMDGPUGenericInstruction {
+ let OutOperandList = (outs type0:$vdst);
+ let InOperandList = (ins type1:$src0);
+ let hasSideEffects = 0;
+}
diff --git a/llvm/lib/Target/AMDGPU/SILateBranchLowering.cpp b/llvm/lib/Target/AMDGPU/SILateBranchLowering.cpp
index 4fa8ec711134..47095ae22027 100644
--- a/llvm/lib/Target/AMDGPU/SILateBranchLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SILateBranchLowering.cpp
@@ -72,16 +72,22 @@ static void generateEndPgm(MachineBasicBlock &MBB,
bool IsPS = F.getCallingConv() == CallingConv::AMDGPU_PS;
// Check if hardware has been configured to expect color or depth exports.
- bool HasExports =
- AMDGPU::getHasColorExport(F) || AMDGPU::getHasDepthExport(F);
+ bool HasColorExports = AMDGPU::getHasColorExport(F);
+ bool HasDepthExports = AMDGPU::getHasDepthExport(F);
+ bool HasExports = HasColorExports || HasDepthExports;
// Prior to GFX10, hardware always expects at least one export for PS.
bool MustExport = !AMDGPU::isGFX10Plus(TII->getSubtarget());
if (IsPS && (HasExports || MustExport)) {
// Generate "null export" if hardware is expecting PS to export.
+ const GCNSubtarget &ST = MBB.getParent()->getSubtarget<GCNSubtarget>();
+ int Target =
+ ST.hasNullExportTarget()
+ ? AMDGPU::Exp::ET_NULL
+ : (HasColorExports ? AMDGPU::Exp::ET_MRT0 : AMDGPU::Exp::ET_MRTZ);
BuildMI(MBB, I, DL, TII->get(AMDGPU::EXP_DONE))
- .addImm(AMDGPU::Exp::ET_NULL)
+ .addImm(Target)
.addReg(AMDGPU::VGPR0, RegState::Undef)
.addReg(AMDGPU::VGPR0, RegState::Undef)
.addReg(AMDGPU::VGPR0, RegState::Undef)
diff --git a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
index 44bdbe37dec0..6d4e1d2c898b 100644
--- a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
@@ -79,6 +79,13 @@ enum InstClassEnum {
MIMG,
TBUFFER_LOAD,
TBUFFER_STORE,
+ GLOBAL_LOAD_SADDR,
+ GLOBAL_STORE_SADDR,
+ FLAT_LOAD,
+ FLAT_STORE,
+  GLOBAL_LOAD, // GLOBAL_LOAD/GLOBAL_STORE are never used as the InstClass of
+  GLOBAL_STORE // any CombineInfo; they are only ever returned by
+               // getCommonInstClass.
};
struct AddressRegs {
@@ -86,6 +93,7 @@ struct AddressRegs {
bool SBase = false;
bool SRsrc = false;
bool SOffset = false;
+ bool SAddr = false;
bool VAddr = false;
bool Addr = false;
bool SSamp = false;
@@ -160,6 +168,11 @@ class SILoadStoreOptimizer : public MachineFunctionPass {
}
void setMI(MachineBasicBlock::iterator MI, const SILoadStoreOptimizer &LSO);
+
+ // Compare by pointer order.
+ bool operator<(const CombineInfo& Other) const {
+ return (InstClass == MIMG) ? DMask < Other.DMask : Offset < Other.Offset;
+ }
};
struct BaseRegisters {
@@ -185,6 +198,9 @@ private:
AliasAnalysis *AA = nullptr;
bool OptimizeAgain;
+ bool canSwapInstructions(const DenseSet<Register> &ARegDefs,
+ const DenseSet<Register> &ARegUses,
+ const MachineInstr &A, const MachineInstr &B) const;
static bool dmasksCanBeCombined(const CombineInfo &CI,
const SIInstrInfo &TII,
const CombineInfo &Paired);
@@ -199,38 +215,43 @@ private:
const CombineInfo &Paired);
const TargetRegisterClass *getDataRegClass(const MachineInstr &MI) const;
- bool checkAndPrepareMerge(CombineInfo &CI, CombineInfo &Paired,
- SmallVectorImpl<MachineInstr *> &InstsToMove);
+ CombineInfo *checkAndPrepareMerge(CombineInfo &CI, CombineInfo &Paired);
unsigned read2Opcode(unsigned EltSize) const;
unsigned read2ST64Opcode(unsigned EltSize) const;
- MachineBasicBlock::iterator mergeRead2Pair(CombineInfo &CI,
- CombineInfo &Paired,
- const SmallVectorImpl<MachineInstr *> &InstsToMove);
+ MachineBasicBlock::iterator
+ mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired,
+ MachineBasicBlock::iterator InsertBefore);
unsigned write2Opcode(unsigned EltSize) const;
unsigned write2ST64Opcode(unsigned EltSize) const;
MachineBasicBlock::iterator
mergeWrite2Pair(CombineInfo &CI, CombineInfo &Paired,
- const SmallVectorImpl<MachineInstr *> &InstsToMove);
+ MachineBasicBlock::iterator InsertBefore);
MachineBasicBlock::iterator
mergeImagePair(CombineInfo &CI, CombineInfo &Paired,
- const SmallVectorImpl<MachineInstr *> &InstsToMove);
+ MachineBasicBlock::iterator InsertBefore);
MachineBasicBlock::iterator
mergeSBufferLoadImmPair(CombineInfo &CI, CombineInfo &Paired,
- const SmallVectorImpl<MachineInstr *> &InstsToMove);
+ MachineBasicBlock::iterator InsertBefore);
MachineBasicBlock::iterator
mergeBufferLoadPair(CombineInfo &CI, CombineInfo &Paired,
- const SmallVectorImpl<MachineInstr *> &InstsToMove);
+ MachineBasicBlock::iterator InsertBefore);
MachineBasicBlock::iterator
mergeBufferStorePair(CombineInfo &CI, CombineInfo &Paired,
- const SmallVectorImpl<MachineInstr *> &InstsToMove);
+ MachineBasicBlock::iterator InsertBefore);
MachineBasicBlock::iterator
mergeTBufferLoadPair(CombineInfo &CI, CombineInfo &Paired,
- const SmallVectorImpl<MachineInstr *> &InstsToMove);
+ MachineBasicBlock::iterator InsertBefore);
MachineBasicBlock::iterator
mergeTBufferStorePair(CombineInfo &CI, CombineInfo &Paired,
- const SmallVectorImpl<MachineInstr *> &InstsToMove);
+ MachineBasicBlock::iterator InsertBefore);
+ MachineBasicBlock::iterator
+ mergeFlatLoadPair(CombineInfo &CI, CombineInfo &Paired,
+ MachineBasicBlock::iterator InsertBefore);
+ MachineBasicBlock::iterator
+ mergeFlatStorePair(CombineInfo &CI, CombineInfo &Paired,
+ MachineBasicBlock::iterator InsertBefore);
void updateBaseAndOffset(MachineInstr &I, Register NewBase,
int32_t NewOffset) const;
@@ -252,6 +273,12 @@ private:
MemInfoMap &Visited, SmallPtrSet<MachineInstr *, 4> &AnchorList,
std::list<std::list<CombineInfo>> &MergeableInsts) const;
+ static MachineMemOperand *combineKnownAdjacentMMOs(const CombineInfo &CI,
+ const CombineInfo &Paired);
+
+ static InstClassEnum getCommonInstClass(const CombineInfo &CI,
+ const CombineInfo &Paired);
+
public:
static char ID;
@@ -298,10 +325,35 @@ static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
switch (Opc) {
case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
+ case AMDGPU::GLOBAL_LOAD_DWORD:
+ case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
+ case AMDGPU::GLOBAL_STORE_DWORD:
+ case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
+ case AMDGPU::FLAT_LOAD_DWORD:
+ case AMDGPU::FLAT_STORE_DWORD:
return 1;
case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
+ case AMDGPU::GLOBAL_LOAD_DWORDX2:
+ case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
+ case AMDGPU::GLOBAL_STORE_DWORDX2:
+ case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
+ case AMDGPU::FLAT_LOAD_DWORDX2:
+ case AMDGPU::FLAT_STORE_DWORDX2:
return 2;
+ case AMDGPU::GLOBAL_LOAD_DWORDX3:
+ case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
+ case AMDGPU::GLOBAL_STORE_DWORDX3:
+ case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
+ case AMDGPU::FLAT_LOAD_DWORDX3:
+ case AMDGPU::FLAT_STORE_DWORDX3:
+ return 3;
case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
+ case AMDGPU::GLOBAL_LOAD_DWORDX4:
+ case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
+ case AMDGPU::GLOBAL_STORE_DWORDX4:
+ case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
+ case AMDGPU::FLAT_LOAD_DWORDX4:
+ case AMDGPU::FLAT_STORE_DWORDX4:
return 4;
case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
return 8;
@@ -386,11 +438,40 @@ static InstClassEnum getInstClass(unsigned Opc, const SIInstrInfo &TII) {
case AMDGPU::DS_WRITE_B64:
case AMDGPU::DS_WRITE_B64_gfx9:
return DS_WRITE;
+ case AMDGPU::GLOBAL_LOAD_DWORD:
+ case AMDGPU::GLOBAL_LOAD_DWORDX2:
+ case AMDGPU::GLOBAL_LOAD_DWORDX3:
+ case AMDGPU::GLOBAL_LOAD_DWORDX4:
+ case AMDGPU::FLAT_LOAD_DWORD:
+ case AMDGPU::FLAT_LOAD_DWORDX2:
+ case AMDGPU::FLAT_LOAD_DWORDX3:
+ case AMDGPU::FLAT_LOAD_DWORDX4:
+ return FLAT_LOAD;
+ case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
+ case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
+ case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
+ case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
+ return GLOBAL_LOAD_SADDR;
+ case AMDGPU::GLOBAL_STORE_DWORD:
+ case AMDGPU::GLOBAL_STORE_DWORDX2:
+ case AMDGPU::GLOBAL_STORE_DWORDX3:
+ case AMDGPU::GLOBAL_STORE_DWORDX4:
+ case AMDGPU::FLAT_STORE_DWORD:
+ case AMDGPU::FLAT_STORE_DWORDX2:
+ case AMDGPU::FLAT_STORE_DWORDX3:
+ case AMDGPU::FLAT_STORE_DWORDX4:
+ return FLAT_STORE;
+ case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
+ case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
+ case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
+ case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
+ return GLOBAL_STORE_SADDR;
}
}
/// Determines instruction subclass from opcode. Only instructions
-/// of the same subclass can be merged together.
+/// of the same subclass can be merged together. The merged instruction may have
+/// a different subclass but must have the same class.
static unsigned getInstSubclass(unsigned Opc, const SIInstrInfo &TII) {
switch (Opc) {
default:
@@ -418,9 +499,55 @@ static unsigned getInstSubclass(unsigned Opc, const SIInstrInfo &TII) {
case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
return AMDGPU::S_BUFFER_LOAD_DWORD_IMM;
+ case AMDGPU::GLOBAL_LOAD_DWORD:
+ case AMDGPU::GLOBAL_LOAD_DWORDX2:
+ case AMDGPU::GLOBAL_LOAD_DWORDX3:
+ case AMDGPU::GLOBAL_LOAD_DWORDX4:
+ case AMDGPU::FLAT_LOAD_DWORD:
+ case AMDGPU::FLAT_LOAD_DWORDX2:
+ case AMDGPU::FLAT_LOAD_DWORDX3:
+ case AMDGPU::FLAT_LOAD_DWORDX4:
+ return AMDGPU::FLAT_LOAD_DWORD;
+ case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
+ case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
+ case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
+ case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
+ return AMDGPU::GLOBAL_LOAD_DWORD_SADDR;
+ case AMDGPU::GLOBAL_STORE_DWORD:
+ case AMDGPU::GLOBAL_STORE_DWORDX2:
+ case AMDGPU::GLOBAL_STORE_DWORDX3:
+ case AMDGPU::GLOBAL_STORE_DWORDX4:
+ case AMDGPU::FLAT_STORE_DWORD:
+ case AMDGPU::FLAT_STORE_DWORDX2:
+ case AMDGPU::FLAT_STORE_DWORDX3:
+ case AMDGPU::FLAT_STORE_DWORDX4:
+ return AMDGPU::FLAT_STORE_DWORD;
+ case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
+ case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
+ case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
+ case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
+ return AMDGPU::GLOBAL_STORE_DWORD_SADDR;
}
}
+// GLOBAL loads and stores are initially classified as FLAT. If both combined
+// instructions are FLAT GLOBAL, adjust the class to GLOBAL_LOAD or GLOBAL_STORE.
+// If either or both instructions are non-segment-specific FLAT, the resulting
+// combined operation will be FLAT, potentially promoting one of the GLOBAL
+// operations to FLAT.
+// For other instructions, return the original class unmodified.
+InstClassEnum
+SILoadStoreOptimizer::getCommonInstClass(const CombineInfo &CI,
+ const CombineInfo &Paired) {
+ assert(CI.InstClass == Paired.InstClass);
+
+ if ((CI.InstClass == FLAT_LOAD || CI.InstClass == FLAT_STORE) &&
+ SIInstrInfo::isFLATGlobal(*CI.I) && SIInstrInfo::isFLATGlobal(*Paired.I))
+ return (CI.InstClass == FLAT_STORE) ? GLOBAL_STORE : GLOBAL_LOAD;
+
+ return CI.InstClass;
+}
+
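// Editor's note (illustrative only, not part of this patch): for example, merging
// two GLOBAL_LOAD_DWORDs keeps the GLOBAL class, while merging a GLOBAL_LOAD_DWORD
// with a FLAT_LOAD_DWORD (both share the FLAT_LOAD subclass) yields a FLAT result,
// promoting the GLOBAL access to FLAT as described above.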
static AddressRegs getRegs(unsigned Opc, const SIInstrInfo &TII) {
AddressRegs Result;
@@ -480,6 +607,34 @@ static AddressRegs getRegs(unsigned Opc, const SIInstrInfo &TII) {
case AMDGPU::DS_WRITE_B64_gfx9:
Result.Addr = true;
return Result;
+ case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
+ case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
+ case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
+ case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
+ case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
+ case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
+ case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
+ case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
+ Result.SAddr = true;
+ LLVM_FALLTHROUGH;
+ case AMDGPU::GLOBAL_LOAD_DWORD:
+ case AMDGPU::GLOBAL_LOAD_DWORDX2:
+ case AMDGPU::GLOBAL_LOAD_DWORDX3:
+ case AMDGPU::GLOBAL_LOAD_DWORDX4:
+ case AMDGPU::GLOBAL_STORE_DWORD:
+ case AMDGPU::GLOBAL_STORE_DWORDX2:
+ case AMDGPU::GLOBAL_STORE_DWORDX3:
+ case AMDGPU::GLOBAL_STORE_DWORDX4:
+ case AMDGPU::FLAT_LOAD_DWORD:
+ case AMDGPU::FLAT_LOAD_DWORDX2:
+ case AMDGPU::FLAT_LOAD_DWORDX3:
+ case AMDGPU::FLAT_LOAD_DWORDX4:
+ case AMDGPU::FLAT_STORE_DWORD:
+ case AMDGPU::FLAT_STORE_DWORDX2:
+ case AMDGPU::FLAT_STORE_DWORDX3:
+ case AMDGPU::FLAT_STORE_DWORDX4:
+ Result.VAddr = true;
+ return Result;
}
}
@@ -551,6 +706,9 @@ void SILoadStoreOptimizer::CombineInfo::setMI(MachineBasicBlock::iterator MI,
if (Regs.SOffset)
AddrIdx[NumAddresses++] =
AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::soffset);
+ if (Regs.SAddr)
+ AddrIdx[NumAddresses++] =
+ AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr);
if (Regs.VAddr)
AddrIdx[NumAddresses++] =
AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr);
@@ -579,92 +737,58 @@ FunctionPass *llvm::createSILoadStoreOptimizerPass() {
return new SILoadStoreOptimizer();
}
-static void moveInstsAfter(MachineBasicBlock::iterator I,
- ArrayRef<MachineInstr *> InstsToMove) {
- MachineBasicBlock *MBB = I->getParent();
- ++I;
- for (MachineInstr *MI : InstsToMove) {
- MI->removeFromParent();
- MBB->insert(I, MI);
- }
-}
-
static void addDefsUsesToList(const MachineInstr &MI,
DenseSet<Register> &RegDefs,
- DenseSet<Register> &PhysRegUses) {
- for (const MachineOperand &Op : MI.operands()) {
- if (Op.isReg()) {
- if (Op.isDef())
- RegDefs.insert(Op.getReg());
- else if (Op.readsReg() && Op.getReg().isPhysical())
- PhysRegUses.insert(Op.getReg());
- }
+ DenseSet<Register> &RegUses) {
+ for (const auto &Op : MI.operands()) {
+ if (!Op.isReg())
+ continue;
+ if (Op.isDef())
+ RegDefs.insert(Op.getReg());
+ if (Op.readsReg())
+ RegUses.insert(Op.getReg());
}
}
-static bool memAccessesCanBeReordered(MachineBasicBlock::iterator A,
- MachineBasicBlock::iterator B,
- AliasAnalysis *AA) {
- // RAW or WAR - cannot reorder
- // WAW - cannot reorder
- // RAR - safe to reorder
- return !(A->mayStore() || B->mayStore()) || !A->mayAlias(AA, *B, true);
-}
-
-// Add MI and its defs to the lists if MI reads one of the defs that are
-// already in the list. Returns true in that case.
-static bool addToListsIfDependent(MachineInstr &MI, DenseSet<Register> &RegDefs,
- DenseSet<Register> &PhysRegUses,
- SmallVectorImpl<MachineInstr *> &Insts) {
- for (MachineOperand &Use : MI.operands()) {
- // If one of the defs is read, then there is a use of Def between I and the
- // instruction that I will potentially be merged with. We will need to move
- // this instruction after the merged instructions.
- //
- // Similarly, if there is a def which is read by an instruction that is to
- // be moved for merging, then we need to move the def-instruction as well.
- // This can only happen for physical registers such as M0; virtual
- // registers are in SSA form.
- if (Use.isReg() && ((Use.readsReg() && RegDefs.count(Use.getReg())) ||
- (Use.isDef() && RegDefs.count(Use.getReg())) ||
- (Use.isDef() && Use.getReg().isPhysical() &&
- PhysRegUses.count(Use.getReg())))) {
- Insts.push_back(&MI);
- addDefsUsesToList(MI, RegDefs, PhysRegUses);
- return true;
- }
- }
-
- return false;
-}
-
-static bool canMoveInstsAcrossMemOp(MachineInstr &MemOp,
- ArrayRef<MachineInstr *> InstsToMove,
- AliasAnalysis *AA) {
- assert(MemOp.mayLoadOrStore());
-
- for (MachineInstr *InstToMove : InstsToMove) {
- if (!InstToMove->mayLoadOrStore())
+bool SILoadStoreOptimizer::canSwapInstructions(
+ const DenseSet<Register> &ARegDefs, const DenseSet<Register> &ARegUses,
+ const MachineInstr &A, const MachineInstr &B) const {
+ if (A.mayLoadOrStore() && B.mayLoadOrStore() &&
+ (A.mayStore() || B.mayStore()) && A.mayAlias(AA, B, true))
+ return false;
+ for (const auto &BOp : B.operands()) {
+ if (!BOp.isReg())
continue;
- if (!memAccessesCanBeReordered(MemOp, *InstToMove, AA))
+ if ((BOp.isDef() || BOp.readsReg()) && ARegDefs.contains(BOp.getReg()))
+ return false;
+ if (BOp.isDef() && ARegUses.contains(BOp.getReg()))
return false;
}
return true;
}
-// This function assumes that \p A and \p B have are identical except for
-// size and offset, and they reference adjacent memory.
-static MachineMemOperand *combineKnownAdjacentMMOs(MachineFunction &MF,
- const MachineMemOperand *A,
- const MachineMemOperand *B) {
- unsigned MinOffset = std::min(A->getOffset(), B->getOffset());
- unsigned Size = A->getSize() + B->getSize();
- // This function adds the offset parameter to the existing offset for A,
- // so we pass 0 here as the offset and then manually set it to the correct
- // value after the call.
- MachineMemOperand *MMO = MF.getMachineMemOperand(A, 0, Size);
- MMO->setOffset(MinOffset);
- return MMO;
+// Given that \p CI and \p Paired are adjacent memory operations, produce a new
+// MMO for the combined operation with the combined access size.
+MachineMemOperand *
+SILoadStoreOptimizer::combineKnownAdjacentMMOs(const CombineInfo &CI,
+ const CombineInfo &Paired) {
+ const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
+ const MachineMemOperand *MMOb = *Paired.I->memoperands_begin();
+
+ unsigned Size = MMOa->getSize() + MMOb->getSize();
+
+ // A base pointer for the combined operation is the same as the leading
+ // operation's pointer.
+ if (Paired < CI)
+ std::swap(MMOa, MMOb);
+
+ MachinePointerInfo PtrInfo(MMOa->getPointerInfo());
+ // If merging FLAT and GLOBAL set address space to FLAT.
+ if (MMOb->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS)
+ PtrInfo.AddrSpace = AMDGPUAS::FLAT_ADDRESS;
+
+ MachineFunction *MF = CI.I->getMF();
+ return MF->getMachineMemOperand(MMOa, PtrInfo, Size);
}
bool SILoadStoreOptimizer::dmasksCanBeCombined(const CombineInfo &CI,
@@ -787,8 +911,7 @@ bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI,
if ((CI.InstClass != DS_READ) && (CI.InstClass != DS_WRITE)) {
return (EltOffset0 + CI.Width == EltOffset1 ||
EltOffset1 + Paired.Width == EltOffset0) &&
- CI.CPol == Paired.CPol &&
- (CI.InstClass == S_BUFFER_LOAD_IMM || CI.CPol == Paired.CPol);
+ CI.CPol == Paired.CPol;
}
// If the offset in elements doesn't fit in 8-bits, we might be able to use
@@ -889,111 +1012,59 @@ SILoadStoreOptimizer::getDataRegClass(const MachineInstr &MI) const {
return nullptr;
}
-/// This function assumes that CI comes before Paired in a basic block.
-bool SILoadStoreOptimizer::checkAndPrepareMerge(
- CombineInfo &CI, CombineInfo &Paired,
- SmallVectorImpl<MachineInstr *> &InstsToMove) {
+/// This function assumes that CI comes before Paired in a basic block. Return
+/// an insertion point for the merged instruction or nullptr on failure.
+SILoadStoreOptimizer::CombineInfo *
+SILoadStoreOptimizer::checkAndPrepareMerge(CombineInfo &CI,
+ CombineInfo &Paired) {
+ // If another instruction has already been merged into CI, it may now be a
+ // type that we can't do any further merging into.
+ if (CI.InstClass == UNKNOWN || Paired.InstClass == UNKNOWN)
+ return nullptr;
+ assert(CI.InstClass == Paired.InstClass);
+
+ if (getInstSubclass(CI.I->getOpcode(), *TII) !=
+ getInstSubclass(Paired.I->getOpcode(), *TII))
+ return nullptr;
// Check both offsets (or masks for MIMG) can be combined and fit in the
// reduced range.
- if (CI.InstClass == MIMG && !dmasksCanBeCombined(CI, *TII, Paired))
- return false;
-
- if (CI.InstClass != MIMG &&
- (!widthsFit(*STM, CI, Paired) || !offsetsCanBeCombined(CI, *STM, Paired)))
- return false;
-
- const unsigned Opc = CI.I->getOpcode();
- const InstClassEnum InstClass = getInstClass(Opc, *TII);
-
- if (InstClass == UNKNOWN) {
- return false;
+ if (CI.InstClass == MIMG) {
+ if (!dmasksCanBeCombined(CI, *TII, Paired))
+ return nullptr;
+ } else {
+ if (!widthsFit(*STM, CI, Paired) || !offsetsCanBeCombined(CI, *STM, Paired))
+ return nullptr;
}
- const unsigned InstSubclass = getInstSubclass(Opc, *TII);
-
- DenseSet<Register> RegDefsToMove;
- DenseSet<Register> PhysRegUsesToMove;
- addDefsUsesToList(*CI.I, RegDefsToMove, PhysRegUsesToMove);
-
- MachineBasicBlock::iterator E = std::next(Paired.I);
- MachineBasicBlock::iterator MBBI = std::next(CI.I);
- MachineBasicBlock::iterator MBBE = CI.I->getParent()->end();
- for (; MBBI != E; ++MBBI) {
-
- if (MBBI == MBBE) {
- // CombineInfo::Order is a hint on the instruction ordering within the
- // basic block. This hint suggests that CI precedes Paired, which is
- // true most of the time. However, moveInstsAfter() processing a
- // previous list may have changed this order in a situation when it
- // moves an instruction which exists in some other merge list.
- // In this case it must be dependent.
- return false;
- }
- if ((getInstClass(MBBI->getOpcode(), *TII) != InstClass) ||
- (getInstSubclass(MBBI->getOpcode(), *TII) != InstSubclass)) {
- // This is not a matching instruction, but we can keep looking as
- // long as one of these conditions are met:
- // 1. It is safe to move I down past MBBI.
- // 2. It is safe to move MBBI down past the instruction that I will
- // be merged into.
-
- if (MBBI->mayLoadOrStore() &&
- (!memAccessesCanBeReordered(*CI.I, *MBBI, AA) ||
- !canMoveInstsAcrossMemOp(*MBBI, InstsToMove, AA))) {
- // We fail condition #1, but we may still be able to satisfy condition
- // #2. Add this instruction to the move list and then we will check
- // if condition #2 holds once we have selected the matching instruction.
- InstsToMove.push_back(&*MBBI);
- addDefsUsesToList(*MBBI, RegDefsToMove, PhysRegUsesToMove);
- continue;
- }
-
- // When we match I with another DS instruction we will be moving I down
- // to the location of the matched instruction any uses of I will need to
- // be moved down as well.
- addToListsIfDependent(*MBBI, RegDefsToMove, PhysRegUsesToMove,
- InstsToMove);
- continue;
+ DenseSet<Register> RegDefs;
+ DenseSet<Register> RegUses;
+ CombineInfo *Where;
+ if (CI.I->mayLoad()) {
+ // Try to hoist Paired up to CI.
+ addDefsUsesToList(*Paired.I, RegDefs, RegUses);
+ for (MachineBasicBlock::iterator MBBI = Paired.I; --MBBI != CI.I;) {
+ if (!canSwapInstructions(RegDefs, RegUses, *Paired.I, *MBBI))
+ return nullptr;
}
-
- // Handle a case like
- // DS_WRITE_B32 addr, v, idx0
- // w = DS_READ_B32 addr, idx0
- // DS_WRITE_B32 addr, f(w), idx1
- // where the DS_READ_B32 ends up in InstsToMove and therefore prevents
- // merging of the two writes.
- if (addToListsIfDependent(*MBBI, RegDefsToMove, PhysRegUsesToMove,
- InstsToMove))
- continue;
-
- if (&*MBBI == &*Paired.I) {
- // We need to go through the list of instructions that we plan to
- // move and make sure they are all safe to move down past the merged
- // instruction.
- if (canMoveInstsAcrossMemOp(*MBBI, InstsToMove, AA)) {
-
- // Call offsetsCanBeCombined with modify = true so that the offsets are
- // correct for the new instruction. This should return true, because
- // this function should only be called on CombineInfo objects that
- // have already been confirmed to be mergeable.
- if (CI.InstClass != MIMG)
- offsetsCanBeCombined(CI, *STM, Paired, true);
- return true;
- }
- return false;
+ Where = &CI;
+ } else {
+ // Try to sink CI down to Paired.
+ addDefsUsesToList(*CI.I, RegDefs, RegUses);
+ for (MachineBasicBlock::iterator MBBI = CI.I; ++MBBI != Paired.I;) {
+ if (!canSwapInstructions(RegDefs, RegUses, *CI.I, *MBBI))
+ return nullptr;
}
-
- // We've found a load/store that we couldn't merge for some reason.
- // We could potentially keep looking, but we'd need to make sure that
- // it was safe to move I and also all the instruction in InstsToMove
- // down past this instruction.
- // check if we can move I across MBBI and if we can move all I's users
- if (!memAccessesCanBeReordered(*CI.I, *MBBI, AA) ||
- !canMoveInstsAcrossMemOp(*MBBI, InstsToMove, AA))
- break;
+ Where = &Paired;
}
- return false;
+
+ // Call offsetsCanBeCombined with modify = true so that the offsets are
+ // correct for the new instruction. This should return true, because
+ // this function should only be called on CombineInfo objects that
+ // have already been confirmed to be mergeable.
+ if (CI.InstClass == DS_READ || CI.InstClass == DS_WRITE)
+ offsetsCanBeCombined(CI, *STM, Paired, true);
+ return Where;
}
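
The hoist-versus-sink choice above can be sketched, under invented names (FakeMemInst, chooseInsertPoint) and with indices standing in for block iterators: loads pull Paired up to CI, so every instruction in between must be swappable with Paired; stores push CI down to Paired and run the same scan against CI instead.

#include <functional>
#include <vector>

struct FakeMemInst { bool IsLoad; };

// Returns the index at which the merged instruction would be inserted, or -1
// if some intervening instruction blocks the move.
static int chooseInsertPoint(const std::vector<FakeMemInst> &Block, int CI,
                             int Paired,
                             const std::function<bool(int, int)> &CanSwap) {
  if (Block[CI].IsLoad) {
    for (int I = Paired - 1; I > CI; --I) // hoist Paired up to CI
      if (!CanSwap(Paired, I))
        return -1;
    return CI;                            // merge at CI's position
  }
  for (int I = CI + 1; I < Paired; ++I)   // sink CI down to Paired
    if (!CanSwap(CI, I))
      return -1;
  return Paired;                          // merge at Paired's position
}

int main() {
  std::vector<FakeMemInst> Block = {{true}, {false}, {true}}; // load, other, load
  auto NoConflict = [](int, int) { return true; };
  return chooseInsertPoint(Block, 0, 2, NoConflict) == 0 ? 0 : 1;
}
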
unsigned SILoadStoreOptimizer::read2Opcode(unsigned EltSize) const {
@@ -1012,7 +1083,7 @@ unsigned SILoadStoreOptimizer::read2ST64Opcode(unsigned EltSize) const {
MachineBasicBlock::iterator
SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired,
- const SmallVectorImpl<MachineInstr *> &InstsToMove) {
+ MachineBasicBlock::iterator InsertBefore) {
MachineBasicBlock *MBB = CI.I->getParent();
// Be careful, since the addresses could be subregisters themselves in weird
@@ -1051,13 +1122,13 @@ SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired,
unsigned BaseRegFlags = 0;
if (CI.BaseOff) {
Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
- BuildMI(*MBB, Paired.I, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
+ BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
.addImm(CI.BaseOff);
BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
BaseRegFlags = RegState::Kill;
- TII->getAddNoCarry(*MBB, Paired.I, DL, BaseReg)
+ TII->getAddNoCarry(*MBB, InsertBefore, DL, BaseReg)
.addReg(ImmReg)
.addReg(AddrReg->getReg(), 0, BaseSubReg)
.addImm(0); // clamp bit
@@ -1065,7 +1136,7 @@ SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired,
}
MachineInstrBuilder Read2 =
- BuildMI(*MBB, Paired.I, DL, Read2Desc, DestReg)
+ BuildMI(*MBB, InsertBefore, DL, Read2Desc, DestReg)
.addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr
.addImm(NewOffset0) // offset0
.addImm(NewOffset1) // offset1
@@ -1077,14 +1148,12 @@ SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired,
const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
// Copy to the old destination registers.
- BuildMI(*MBB, Paired.I, DL, CopyDesc)
+ BuildMI(*MBB, InsertBefore, DL, CopyDesc)
.add(*Dest0) // Copy to same destination including flags and sub reg.
.addReg(DestReg, 0, SubRegIdx0);
- MachineInstr *Copy1 = BuildMI(*MBB, Paired.I, DL, CopyDesc)
- .add(*Dest1)
- .addReg(DestReg, RegState::Kill, SubRegIdx1);
-
- moveInstsAfter(Copy1, InstsToMove);
+ BuildMI(*MBB, InsertBefore, DL, CopyDesc)
+ .add(*Dest1)
+ .addReg(DestReg, RegState::Kill, SubRegIdx1);
CI.I->eraseFromParent();
Paired.I->eraseFromParent();
@@ -1109,9 +1178,9 @@ unsigned SILoadStoreOptimizer::write2ST64Opcode(unsigned EltSize) const {
: AMDGPU::DS_WRITE2ST64_B64_gfx9;
}
-MachineBasicBlock::iterator
-SILoadStoreOptimizer::mergeWrite2Pair(CombineInfo &CI, CombineInfo &Paired,
- const SmallVectorImpl<MachineInstr *> &InstsToMove) {
+MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair(
+ CombineInfo &CI, CombineInfo &Paired,
+ MachineBasicBlock::iterator InsertBefore) {
MachineBasicBlock *MBB = CI.I->getParent();
// Be sure to use .addOperand(), and not .addReg() with these. We want to be
@@ -1145,13 +1214,13 @@ SILoadStoreOptimizer::mergeWrite2Pair(CombineInfo &CI, CombineInfo &Paired,
unsigned BaseRegFlags = 0;
if (CI.BaseOff) {
Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
- BuildMI(*MBB, Paired.I, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
+ BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
.addImm(CI.BaseOff);
BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
BaseRegFlags = RegState::Kill;
- TII->getAddNoCarry(*MBB, Paired.I, DL, BaseReg)
+ TII->getAddNoCarry(*MBB, InsertBefore, DL, BaseReg)
.addReg(ImmReg)
.addReg(AddrReg->getReg(), 0, BaseSubReg)
.addImm(0); // clamp bit
@@ -1159,7 +1228,7 @@ SILoadStoreOptimizer::mergeWrite2Pair(CombineInfo &CI, CombineInfo &Paired,
}
MachineInstrBuilder Write2 =
- BuildMI(*MBB, Paired.I, DL, Write2Desc)
+ BuildMI(*MBB, InsertBefore, DL, Write2Desc)
.addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr
.add(*Data0) // data0
.add(*Data1) // data1
@@ -1168,8 +1237,6 @@ SILoadStoreOptimizer::mergeWrite2Pair(CombineInfo &CI, CombineInfo &Paired,
.addImm(0) // gds
.cloneMergedMemRefs({&*CI.I, &*Paired.I});
- moveInstsAfter(Write2, InstsToMove);
-
CI.I->eraseFromParent();
Paired.I->eraseFromParent();
@@ -1179,7 +1246,7 @@ SILoadStoreOptimizer::mergeWrite2Pair(CombineInfo &CI, CombineInfo &Paired,
MachineBasicBlock::iterator
SILoadStoreOptimizer::mergeImagePair(CombineInfo &CI, CombineInfo &Paired,
- const SmallVectorImpl<MachineInstr *> &InstsToMove) {
+ MachineBasicBlock::iterator InsertBefore) {
MachineBasicBlock *MBB = CI.I->getParent();
DebugLoc DL = CI.I->getDebugLoc();
const unsigned Opcode = getNewOpcode(CI, Paired);
@@ -1191,7 +1258,7 @@ SILoadStoreOptimizer::mergeImagePair(CombineInfo &CI, CombineInfo &Paired,
unsigned DMaskIdx =
AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AMDGPU::OpName::dmask);
- auto MIB = BuildMI(*MBB, Paired.I, DL, TII->get(Opcode), DestReg);
+ auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);
for (unsigned I = 1, E = (*CI.I).getNumOperands(); I != E; ++I) {
if (I == DMaskIdx)
MIB.addImm(MergedDMask);
@@ -1204,10 +1271,7 @@ SILoadStoreOptimizer::mergeImagePair(CombineInfo &CI, CombineInfo &Paired,
// will return true if this is the case.
assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
- const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
- const MachineMemOperand *MMOb = *Paired.I->memoperands_begin();
-
- MachineInstr *New = MIB.addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));
+ MachineInstr *New = MIB.addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
unsigned SubRegIdx0, SubRegIdx1;
std::tie(SubRegIdx0, SubRegIdx1) = getSubRegIdxs(CI, Paired);
@@ -1217,14 +1281,12 @@ SILoadStoreOptimizer::mergeImagePair(CombineInfo &CI, CombineInfo &Paired,
const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);
- BuildMI(*MBB, Paired.I, DL, CopyDesc)
+ BuildMI(*MBB, InsertBefore, DL, CopyDesc)
.add(*Dest0) // Copy to same destination including flags and sub reg.
.addReg(DestReg, 0, SubRegIdx0);
- MachineInstr *Copy1 = BuildMI(*MBB, Paired.I, DL, CopyDesc)
- .add(*Dest1)
- .addReg(DestReg, RegState::Kill, SubRegIdx1);
-
- moveInstsAfter(Copy1, InstsToMove);
+ BuildMI(*MBB, InsertBefore, DL, CopyDesc)
+ .add(*Dest1)
+ .addReg(DestReg, RegState::Kill, SubRegIdx1);
CI.I->eraseFromParent();
Paired.I->eraseFromParent();
@@ -1233,7 +1295,7 @@ SILoadStoreOptimizer::mergeImagePair(CombineInfo &CI, CombineInfo &Paired,
MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSBufferLoadImmPair(
CombineInfo &CI, CombineInfo &Paired,
- const SmallVectorImpl<MachineInstr *> &InstsToMove) {
+ MachineBasicBlock::iterator InsertBefore) {
MachineBasicBlock *MBB = CI.I->getParent();
DebugLoc DL = CI.I->getDebugLoc();
const unsigned Opcode = getNewOpcode(CI, Paired);
@@ -1248,15 +1310,12 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSBufferLoadImmPair(
// will return true if this is the case.
assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
- const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
- const MachineMemOperand *MMOb = *Paired.I->memoperands_begin();
-
MachineInstr *New =
- BuildMI(*MBB, Paired.I, DL, TII->get(Opcode), DestReg)
- .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase))
- .addImm(MergedOffset) // offset
- .addImm(CI.CPol) // cpol
- .addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));
+ BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg)
+ .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase))
+ .addImm(MergedOffset) // offset
+ .addImm(CI.CPol) // cpol
+ .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
@@ -1267,14 +1326,12 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSBufferLoadImmPair(
const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::sdst);
const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::sdst);
- BuildMI(*MBB, Paired.I, DL, CopyDesc)
+ BuildMI(*MBB, InsertBefore, DL, CopyDesc)
.add(*Dest0) // Copy to same destination including flags and sub reg.
.addReg(DestReg, 0, SubRegIdx0);
- MachineInstr *Copy1 = BuildMI(*MBB, Paired.I, DL, CopyDesc)
- .add(*Dest1)
- .addReg(DestReg, RegState::Kill, SubRegIdx1);
-
- moveInstsAfter(Copy1, InstsToMove);
+ BuildMI(*MBB, InsertBefore, DL, CopyDesc)
+ .add(*Dest1)
+ .addReg(DestReg, RegState::Kill, SubRegIdx1);
CI.I->eraseFromParent();
Paired.I->eraseFromParent();
@@ -1283,7 +1340,7 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSBufferLoadImmPair(
MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair(
CombineInfo &CI, CombineInfo &Paired,
- const SmallVectorImpl<MachineInstr *> &InstsToMove) {
+ MachineBasicBlock::iterator InsertBefore) {
MachineBasicBlock *MBB = CI.I->getParent();
DebugLoc DL = CI.I->getDebugLoc();
@@ -1295,7 +1352,7 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair(
Register DestReg = MRI->createVirtualRegister(SuperRC);
unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);
- auto MIB = BuildMI(*MBB, Paired.I, DL, TII->get(Opcode), DestReg);
+ auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);
AddressRegs Regs = getRegs(Opcode, *TII);
@@ -1307,9 +1364,6 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair(
// will return true if this is the case.
assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
- const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
- const MachineMemOperand *MMOb = *Paired.I->memoperands_begin();
-
MachineInstr *New =
MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
@@ -1317,7 +1371,7 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair(
.addImm(CI.CPol) // cpol
.addImm(0) // tfe
.addImm(0) // swz
- .addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));
+ .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
@@ -1328,14 +1382,12 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair(
const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);
- BuildMI(*MBB, Paired.I, DL, CopyDesc)
+ BuildMI(*MBB, InsertBefore, DL, CopyDesc)
.add(*Dest0) // Copy to same destination including flags and sub reg.
.addReg(DestReg, 0, SubRegIdx0);
- MachineInstr *Copy1 = BuildMI(*MBB, Paired.I, DL, CopyDesc)
- .add(*Dest1)
- .addReg(DestReg, RegState::Kill, SubRegIdx1);
-
- moveInstsAfter(Copy1, InstsToMove);
+ BuildMI(*MBB, InsertBefore, DL, CopyDesc)
+ .add(*Dest1)
+ .addReg(DestReg, RegState::Kill, SubRegIdx1);
CI.I->eraseFromParent();
Paired.I->eraseFromParent();
@@ -1344,7 +1396,7 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair(
MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferLoadPair(
CombineInfo &CI, CombineInfo &Paired,
- const SmallVectorImpl<MachineInstr *> &InstsToMove) {
+ MachineBasicBlock::iterator InsertBefore) {
MachineBasicBlock *MBB = CI.I->getParent();
DebugLoc DL = CI.I->getDebugLoc();
@@ -1356,7 +1408,7 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferLoadPair(
Register DestReg = MRI->createVirtualRegister(SuperRC);
unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);
- auto MIB = BuildMI(*MBB, Paired.I, DL, TII->get(Opcode), DestReg);
+ auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);
AddressRegs Regs = getRegs(Opcode, *TII);
@@ -1371,9 +1423,6 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferLoadPair(
// will return true if this is the case.
assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
- const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
- const MachineMemOperand *MMOb = *Paired.I->memoperands_begin();
-
MachineInstr *New =
MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
@@ -1382,8 +1431,7 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferLoadPair(
.addImm(CI.CPol) // cpol
.addImm(0) // tfe
.addImm(0) // swz
- .addMemOperand(
- combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));
+ .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
@@ -1394,14 +1442,12 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferLoadPair(
const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);
- BuildMI(*MBB, Paired.I, DL, CopyDesc)
+ BuildMI(*MBB, InsertBefore, DL, CopyDesc)
.add(*Dest0) // Copy to same destination including flags and sub reg.
.addReg(DestReg, 0, SubRegIdx0);
- MachineInstr *Copy1 = BuildMI(*MBB, Paired.I, DL, CopyDesc)
- .add(*Dest1)
- .addReg(DestReg, RegState::Kill, SubRegIdx1);
-
- moveInstsAfter(Copy1, InstsToMove);
+ BuildMI(*MBB, InsertBefore, DL, CopyDesc)
+ .add(*Dest1)
+ .addReg(DestReg, RegState::Kill, SubRegIdx1);
CI.I->eraseFromParent();
Paired.I->eraseFromParent();
@@ -1410,7 +1456,7 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferLoadPair(
MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferStorePair(
CombineInfo &CI, CombineInfo &Paired,
- const SmallVectorImpl<MachineInstr *> &InstsToMove) {
+ MachineBasicBlock::iterator InsertBefore) {
MachineBasicBlock *MBB = CI.I->getParent();
DebugLoc DL = CI.I->getDebugLoc();
@@ -1427,13 +1473,13 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferStorePair(
const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
const auto *Src1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);
- BuildMI(*MBB, Paired.I, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
+ BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
.add(*Src0)
.addImm(SubRegIdx0)
.add(*Src1)
.addImm(SubRegIdx1);
- auto MIB = BuildMI(*MBB, Paired.I, DL, TII->get(Opcode))
+ auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode))
.addReg(SrcReg, RegState::Kill);
AddressRegs Regs = getRegs(Opcode, *TII);
@@ -1449,9 +1495,6 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferStorePair(
// will return true if this is the case.
assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
- const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
- const MachineMemOperand *MMOb = *Paired.I->memoperands_begin();
-
MachineInstr *New =
MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
@@ -1460,10 +1503,92 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferStorePair(
.addImm(CI.CPol) // cpol
.addImm(0) // tfe
.addImm(0) // swz
- .addMemOperand(
- combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));
+ .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
+
+ CI.I->eraseFromParent();
+ Paired.I->eraseFromParent();
+ return New;
+}
+
+MachineBasicBlock::iterator SILoadStoreOptimizer::mergeFlatLoadPair(
+ CombineInfo &CI, CombineInfo &Paired,
+ MachineBasicBlock::iterator InsertBefore) {
+ MachineBasicBlock *MBB = CI.I->getParent();
+ DebugLoc DL = CI.I->getDebugLoc();
- moveInstsAfter(MIB, InstsToMove);
+ const unsigned Opcode = getNewOpcode(CI, Paired);
+
+ const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
+ Register DestReg = MRI->createVirtualRegister(SuperRC);
+
+ auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);
+
+ if (auto *SAddr = TII->getNamedOperand(*CI.I, AMDGPU::OpName::saddr))
+ MIB.add(*SAddr);
+
+ MachineInstr *New =
+ MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr))
+ .addImm(std::min(CI.Offset, Paired.Offset))
+ .addImm(CI.CPol)
+ .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
+
+ std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
+ const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
+ const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
+
+ // Copy to the old destination registers.
+ const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
+ const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdst);
+ const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdst);
+
+ BuildMI(*MBB, InsertBefore, DL, CopyDesc)
+ .add(*Dest0) // Copy to same destination including flags and sub reg.
+ .addReg(DestReg, 0, SubRegIdx0);
+ BuildMI(*MBB, InsertBefore, DL, CopyDesc)
+ .add(*Dest1)
+ .addReg(DestReg, RegState::Kill, SubRegIdx1);
+
+ CI.I->eraseFromParent();
+ Paired.I->eraseFromParent();
+ return New;
+}
+
+MachineBasicBlock::iterator SILoadStoreOptimizer::mergeFlatStorePair(
+ CombineInfo &CI, CombineInfo &Paired,
+ MachineBasicBlock::iterator InsertBefore) {
+ MachineBasicBlock *MBB = CI.I->getParent();
+ DebugLoc DL = CI.I->getDebugLoc();
+
+ const unsigned Opcode = getNewOpcode(CI, Paired);
+
+ std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
+ const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
+ const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
+
+ // Copy to the new source register.
+ const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
+ Register SrcReg = MRI->createVirtualRegister(SuperRC);
+
+ const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
+ const auto *Src1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);
+
+ BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
+ .add(*Src0)
+ .addImm(SubRegIdx0)
+ .add(*Src1)
+ .addImm(SubRegIdx1);
+
+ auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode))
+ .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr))
+ .addReg(SrcReg, RegState::Kill);
+
+ if (auto *SAddr = TII->getNamedOperand(*CI.I, AMDGPU::OpName::saddr))
+ MIB.add(*SAddr);
+
+ MachineInstr *New =
+ MIB.addImm(std::min(CI.Offset, Paired.Offset))
+ .addImm(CI.CPol)
+ .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
CI.I->eraseFromParent();
Paired.I->eraseFromParent();
@@ -1474,7 +1599,7 @@ unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI,
const CombineInfo &Paired) {
const unsigned Width = CI.Width + Paired.Width;
- switch (CI.InstClass) {
+ switch (getCommonInstClass(CI, Paired)) {
default:
assert(CI.InstClass == BUFFER_LOAD || CI.InstClass == BUFFER_STORE);
// FIXME: Handle d16 correctly
@@ -1498,6 +1623,72 @@ unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI,
case 8:
return AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM;
}
+ case GLOBAL_LOAD:
+ switch (Width) {
+ default:
+ return 0;
+ case 2:
+ return AMDGPU::GLOBAL_LOAD_DWORDX2;
+ case 3:
+ return AMDGPU::GLOBAL_LOAD_DWORDX3;
+ case 4:
+ return AMDGPU::GLOBAL_LOAD_DWORDX4;
+ }
+ case GLOBAL_LOAD_SADDR:
+ switch (Width) {
+ default:
+ return 0;
+ case 2:
+ return AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR;
+ case 3:
+ return AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR;
+ case 4:
+ return AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR;
+ }
+ case GLOBAL_STORE:
+ switch (Width) {
+ default:
+ return 0;
+ case 2:
+ return AMDGPU::GLOBAL_STORE_DWORDX2;
+ case 3:
+ return AMDGPU::GLOBAL_STORE_DWORDX3;
+ case 4:
+ return AMDGPU::GLOBAL_STORE_DWORDX4;
+ }
+ case GLOBAL_STORE_SADDR:
+ switch (Width) {
+ default:
+ return 0;
+ case 2:
+ return AMDGPU::GLOBAL_STORE_DWORDX2_SADDR;
+ case 3:
+ return AMDGPU::GLOBAL_STORE_DWORDX3_SADDR;
+ case 4:
+ return AMDGPU::GLOBAL_STORE_DWORDX4_SADDR;
+ }
+ case FLAT_LOAD:
+ switch (Width) {
+ default:
+ return 0;
+ case 2:
+ return AMDGPU::FLAT_LOAD_DWORDX2;
+ case 3:
+ return AMDGPU::FLAT_LOAD_DWORDX3;
+ case 4:
+ return AMDGPU::FLAT_LOAD_DWORDX4;
+ }
+ case FLAT_STORE:
+ switch (Width) {
+ default:
+ return 0;
+ case 2:
+ return AMDGPU::FLAT_STORE_DWORDX2;
+ case 3:
+ return AMDGPU::FLAT_STORE_DWORDX3;
+ case 4:
+ return AMDGPU::FLAT_STORE_DWORDX4;
+ }
case MIMG:
assert((countPopulation(CI.DMask | Paired.DMask) == Width) &&
"No overlaps");
@@ -1508,15 +1699,9 @@ unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI,
std::pair<unsigned, unsigned>
SILoadStoreOptimizer::getSubRegIdxs(const CombineInfo &CI,
const CombineInfo &Paired) {
- bool ReverseOrder;
- if (CI.InstClass == MIMG) {
- assert(
- (countPopulation(CI.DMask | Paired.DMask) == CI.Width + Paired.Width) &&
- "No overlaps");
- ReverseOrder = CI.DMask > Paired.DMask;
- } else {
- ReverseOrder = CI.Offset > Paired.Offset;
- }
+ assert((CI.InstClass != MIMG || (countPopulation(CI.DMask | Paired.DMask) ==
+ CI.Width + Paired.Width)) &&
+ "No overlaps");
unsigned Idx0;
unsigned Idx1;
@@ -1532,7 +1717,7 @@ SILoadStoreOptimizer::getSubRegIdxs(const CombineInfo &CI,
assert(CI.Width >= 1 && CI.Width <= 4);
assert(Paired.Width >= 1 && Paired.Width <= 4);
- if (ReverseOrder) {
+ if (Paired < CI) {
Idx1 = Idxs[0][Paired.Width - 1];
Idx0 = Idxs[Paired.Width][CI.Width - 1];
} else {
@@ -1569,7 +1754,7 @@ SILoadStoreOptimizer::getTargetRegisterClass(const CombineInfo &CI,
MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferStorePair(
CombineInfo &CI, CombineInfo &Paired,
- const SmallVectorImpl<MachineInstr *> &InstsToMove) {
+ MachineBasicBlock::iterator InsertBefore) {
MachineBasicBlock *MBB = CI.I->getParent();
DebugLoc DL = CI.I->getDebugLoc();
@@ -1586,13 +1771,13 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferStorePair(
const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
const auto *Src1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);
- BuildMI(*MBB, Paired.I, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
+ BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
.add(*Src0)
.addImm(SubRegIdx0)
.add(*Src1)
.addImm(SubRegIdx1);
- auto MIB = BuildMI(*MBB, Paired.I, DL, TII->get(Opcode))
+ auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode))
.addReg(SrcReg, RegState::Kill);
AddressRegs Regs = getRegs(Opcode, *TII);
@@ -1606,9 +1791,6 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferStorePair(
// will return true if this is the case.
assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
- const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
- const MachineMemOperand *MMOb = *Paired.I->memoperands_begin();
-
MachineInstr *New =
MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
@@ -1616,9 +1798,7 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferStorePair(
.addImm(CI.CPol) // cpol
.addImm(0) // tfe
.addImm(0) // swz
- .addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));
-
- moveInstsAfter(MIB, InstsToMove);
+ .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
CI.I->eraseFromParent();
Paired.I->eraseFromParent();
@@ -1846,7 +2026,7 @@ bool SILoadStoreOptimizer::promoteConstantOffsetToImm(
// from which (&a + 4096) has 13 bit distance. Both &a + 6144 and &a + 8192
// has 13bit distance from &a + 4096. The heuristic considers &a + 8192
// as the new-base(anchor) because of the maximum distance which can
- // accomodate more intermediate bases presumeably.
+ // accommodate more intermediate bases presumably.
//
// Step3: move (&a + 8192) above load1. Compute and promote offsets from
// (&a + 8192) for load1, load2, load4.
@@ -2098,8 +2278,8 @@ SILoadStoreOptimizer::optimizeInstsWithSameBaseAddr(
CombineInfo &CI = *First;
CombineInfo &Paired = *Second;
- SmallVector<MachineInstr *, 8> InstsToMove;
- if (!checkAndPrepareMerge(CI, Paired, InstsToMove)) {
+ CombineInfo *Where = checkAndPrepareMerge(CI, Paired);
+ if (!Where) {
++I;
continue;
}
@@ -2108,66 +2288,56 @@ SILoadStoreOptimizer::optimizeInstsWithSameBaseAddr(
LLVM_DEBUG(dbgs() << "Merging: " << *CI.I << " with: " << *Paired.I);
+ MachineBasicBlock::iterator NewMI;
switch (CI.InstClass) {
default:
llvm_unreachable("unknown InstClass");
break;
- case DS_READ: {
- MachineBasicBlock::iterator NewMI =
- mergeRead2Pair(CI, Paired, InstsToMove);
- CI.setMI(NewMI, *this);
+ case DS_READ:
+ NewMI = mergeRead2Pair(CI, Paired, Where->I);
break;
- }
- case DS_WRITE: {
- MachineBasicBlock::iterator NewMI =
- mergeWrite2Pair(CI, Paired, InstsToMove);
- CI.setMI(NewMI, *this);
+ case DS_WRITE:
+ NewMI = mergeWrite2Pair(CI, Paired, Where->I);
break;
- }
- case S_BUFFER_LOAD_IMM: {
- MachineBasicBlock::iterator NewMI =
- mergeSBufferLoadImmPair(CI, Paired, InstsToMove);
- CI.setMI(NewMI, *this);
- OptimizeListAgain |= (CI.Width + Paired.Width) < 8;
+ case S_BUFFER_LOAD_IMM:
+ NewMI = mergeSBufferLoadImmPair(CI, Paired, Where->I);
+ OptimizeListAgain |= CI.Width + Paired.Width < 8;
break;
- }
- case BUFFER_LOAD: {
- MachineBasicBlock::iterator NewMI =
- mergeBufferLoadPair(CI, Paired, InstsToMove);
- CI.setMI(NewMI, *this);
- OptimizeListAgain |= (CI.Width + Paired.Width) < 4;
+ case BUFFER_LOAD:
+ NewMI = mergeBufferLoadPair(CI, Paired, Where->I);
+ OptimizeListAgain |= CI.Width + Paired.Width < 4;
break;
- }
- case BUFFER_STORE: {
- MachineBasicBlock::iterator NewMI =
- mergeBufferStorePair(CI, Paired, InstsToMove);
- CI.setMI(NewMI, *this);
- OptimizeListAgain |= (CI.Width + Paired.Width) < 4;
+ case BUFFER_STORE:
+ NewMI = mergeBufferStorePair(CI, Paired, Where->I);
+ OptimizeListAgain |= CI.Width + Paired.Width < 4;
break;
- }
- case MIMG: {
- MachineBasicBlock::iterator NewMI =
- mergeImagePair(CI, Paired, InstsToMove);
- CI.setMI(NewMI, *this);
- OptimizeListAgain |= (CI.Width + Paired.Width) < 4;
+ case MIMG:
+ NewMI = mergeImagePair(CI, Paired, Where->I);
+ OptimizeListAgain |= CI.Width + Paired.Width < 4;
break;
- }
- case TBUFFER_LOAD: {
- MachineBasicBlock::iterator NewMI =
- mergeTBufferLoadPair(CI, Paired, InstsToMove);
- CI.setMI(NewMI, *this);
- OptimizeListAgain |= (CI.Width + Paired.Width) < 4;
+ case TBUFFER_LOAD:
+ NewMI = mergeTBufferLoadPair(CI, Paired, Where->I);
+ OptimizeListAgain |= CI.Width + Paired.Width < 4;
break;
- }
- case TBUFFER_STORE: {
- MachineBasicBlock::iterator NewMI =
- mergeTBufferStorePair(CI, Paired, InstsToMove);
- CI.setMI(NewMI, *this);
- OptimizeListAgain |= (CI.Width + Paired.Width) < 4;
+ case TBUFFER_STORE:
+ NewMI = mergeTBufferStorePair(CI, Paired, Where->I);
+ OptimizeListAgain |= CI.Width + Paired.Width < 4;
+ break;
+ case FLAT_LOAD:
+ case GLOBAL_LOAD:
+ case GLOBAL_LOAD_SADDR:
+ NewMI = mergeFlatLoadPair(CI, Paired, Where->I);
+ OptimizeListAgain |= CI.Width + Paired.Width < 4;
+ break;
+ case FLAT_STORE:
+ case GLOBAL_STORE:
+ case GLOBAL_STORE_SADDR:
+ NewMI = mergeFlatStorePair(CI, Paired, Where->I);
+ OptimizeListAgain |= CI.Width + Paired.Width < 4;
break;
}
- }
- CI.Order = Paired.Order;
+ CI.setMI(NewMI, *this);
+ CI.Order = Where->Order;
if (I == Second)
I = Next;
diff --git a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
index e1018bdfde46..607383ab8cde 100644
--- a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
+++ b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
@@ -509,8 +509,35 @@ MachineBasicBlock *SILowerControlFlow::emitEndCf(MachineInstr &MI) {
BuildMI(MBB, InsPt, DL, TII->get(Opcode), Exec)
.addReg(Exec)
.add(MI.getOperand(0));
- if (LV)
- LV->replaceKillInstruction(MI.getOperand(0).getReg(), MI, *NewMI);
+ if (LV) {
+ LV->replaceKillInstruction(DataReg, MI, *NewMI);
+
+ if (SplitBB != &MBB) {
+ // Track the set of registers defined in the split block so we don't
+ // accidentally add the original block to AliveBlocks.
+ DenseSet<Register> SplitDefs;
+ for (MachineInstr &X : *SplitBB) {
+ for (MachineOperand &Op : X.operands()) {
+ if (Op.isReg() && Op.isDef() && Op.getReg().isVirtual())
+ SplitDefs.insert(Op.getReg());
+ }
+ }
+
+ for (unsigned i = 0, e = MRI->getNumVirtRegs(); i != e; ++i) {
+ Register Reg = Register::index2VirtReg(i);
+ LiveVariables::VarInfo &VI = LV->getVarInfo(Reg);
+
+ if (VI.AliveBlocks.test(MBB.getNumber()))
+ VI.AliveBlocks.set(SplitBB->getNumber());
+ else {
+ for (MachineInstr *Kill : VI.Kills) {
+ if (Kill->getParent() == SplitBB && !SplitDefs.contains(Reg))
+ VI.AliveBlocks.set(MBB.getNumber());
+ }
+ }
+ }
+ }
+ }
LoweredEndCf.insert(NewMI);
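
A schematic restatement of the live-variables fix-up in this hunk, using plain containers and invented names (VarInfo here is not LLVM's LiveVariables::VarInfo, and kills are tracked per block rather than per instruction): when MBB is split into MBB followed by SplitBB, a value live through the original block is live through both halves, and a value killed in SplitBB but not defined there must cross the split point, so MBB becomes a live-through block for it.

#include <cassert>
#include <set>

struct VarInfo {
  std::set<int> AliveBlocks; // blocks the value is live through
  std::set<int> KillBlocks;  // blocks containing a kill of the value
};

static void updateAfterSplit(VarInfo &VI, int MBB, int SplitBB,
                             const std::set<int> &SplitDefs, int Reg) {
  if (VI.AliveBlocks.count(MBB))
    VI.AliveBlocks.insert(SplitBB);          // live through both halves
  else if (VI.KillBlocks.count(SplitBB) && !SplitDefs.count(Reg))
    VI.AliveBlocks.insert(MBB);              // crosses the split point
}

int main() {
  VarInfo VI;
  VI.KillBlocks.insert(/*SplitBB=*/1);       // killed below the split point
  updateAfterSplit(VI, /*MBB=*/0, /*SplitBB=*/1, /*SplitDefs=*/{}, /*Reg=*/42);
  assert(VI.AliveBlocks.count(0));           // now live through the top half
  return 0;
}
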
@@ -540,7 +567,7 @@ void SILowerControlFlow::findMaskOperands(MachineInstr &MI, unsigned OpNo,
return;
// Make sure we do not modify exec between def and use.
- // A copy with implcitly defined exec inserted earlier is an exclusion, it
+ // A copy with implicitly defined exec inserted earlier is an exclusion, it
// does not really modify exec.
for (auto I = Def->getIterator(); I != MI.getIterator(); ++I)
if (I->modifiesRegister(AMDGPU::EXEC, TRI) &&
@@ -573,14 +600,14 @@ void SILowerControlFlow::combineMasks(MachineInstr &MI) {
else return;
Register Reg = MI.getOperand(OpToReplace).getReg();
- MI.RemoveOperand(OpToReplace);
+ MI.removeOperand(OpToReplace);
MI.addOperand(Ops[UniqueOpndIdx]);
if (MRI->use_empty(Reg))
MRI->getUniqueVRegDef(Reg)->eraseFromParent();
}
void SILowerControlFlow::optimizeEndCf() {
- // If the only instruction immediately following this END_CF is an another
+ // If the only instruction immediately following this END_CF is another
// END_CF in the only successor we can avoid emitting exec mask restore here.
if (!EnableOptimizeEndCf)
return;
@@ -865,6 +892,7 @@ bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) {
}
}
+ bool Changed = false;
MachineFunction::iterator NextBB;
for (MachineFunction::iterator BI = MF.begin();
BI != MF.end(); BI = NextBB) {
@@ -886,6 +914,7 @@ bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) {
case AMDGPU::SI_LOOP:
case AMDGPU::SI_END_CF:
SplitMBB = process(MI);
+ Changed = true;
break;
// FIXME: find a better place for this
@@ -894,6 +923,7 @@ bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) {
lowerInitExec(MBB, MI);
if (LIS)
LIS->removeAllRegUnitsForPhysReg(AMDGPU::EXEC);
+ Changed = true;
break;
default:
@@ -913,5 +943,5 @@ bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) {
LoweredIf.clear();
KillBlocks.clear();
- return true;
+ return Changed;
}
diff --git a/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp b/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp
index 672266f0c11e..5fb545b50228 100644
--- a/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp
+++ b/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp
@@ -79,9 +79,9 @@ public:
}
private:
- void lowerCopiesFromI1();
- void lowerPhis();
- void lowerCopiesToI1();
+ bool lowerCopiesFromI1();
+ bool lowerPhis();
+ bool lowerCopiesToI1();
bool isConstantLaneMask(Register Reg, bool &Val) const;
void buildMergeLaneMasks(MachineBasicBlock &MBB,
MachineBasicBlock::iterator I, const DebugLoc &DL,
@@ -473,15 +473,17 @@ bool SILowerI1Copies::runOnMachineFunction(MachineFunction &TheMF) {
OrN2Op = AMDGPU::S_ORN2_B64;
}
- lowerCopiesFromI1();
- lowerPhis();
- lowerCopiesToI1();
+ bool Changed = false;
+ Changed |= lowerCopiesFromI1();
+ Changed |= lowerPhis();
+ Changed |= lowerCopiesToI1();
+ assert(Changed || ConstrainRegs.empty());
for (unsigned Reg : ConstrainRegs)
MRI->constrainRegClass(Reg, &AMDGPU::SReg_1_XEXECRegClass);
ConstrainRegs.clear();
- return true;
+ return Changed;
}
#ifndef NDEBUG
@@ -493,7 +495,8 @@ static bool isVRegCompatibleReg(const SIRegisterInfo &TRI,
}
#endif
-void SILowerI1Copies::lowerCopiesFromI1() {
+bool SILowerI1Copies::lowerCopiesFromI1() {
+ bool Changed = false;
SmallVector<MachineInstr *, 4> DeadCopies;
for (MachineBasicBlock &MBB : *MF) {
@@ -509,6 +512,8 @@ void SILowerI1Copies::lowerCopiesFromI1() {
if (isLaneMaskReg(DstReg) || isVreg1(DstReg))
continue;
+ Changed = true;
+
// Copy into a 32-bit vector register.
LLVM_DEBUG(dbgs() << "Lower copy from i1: " << MI);
DebugLoc DL = MI.getDebugLoc();
@@ -530,9 +535,10 @@ void SILowerI1Copies::lowerCopiesFromI1() {
MI->eraseFromParent();
DeadCopies.clear();
}
+ return Changed;
}
-void SILowerI1Copies::lowerPhis() {
+bool SILowerI1Copies::lowerPhis() {
MachineSSAUpdater SSAUpdater(*MF);
LoopFinder LF(*DT, *PDT);
PhiIncomingAnalysis PIA(*PDT);
@@ -550,6 +556,8 @@ void SILowerI1Copies::lowerPhis() {
Vreg1Phis.push_back(&MI);
}
}
+ if (Vreg1Phis.empty())
+ return false;
MachineBasicBlock *PrevMBB = nullptr;
for (MachineInstr *MI : Vreg1Phis) {
@@ -662,9 +670,11 @@ void SILowerI1Copies::lowerPhis() {
IncomingRegs.clear();
IncomingUpdated.clear();
}
+ return true;
}
-void SILowerI1Copies::lowerCopiesToI1() {
+bool SILowerI1Copies::lowerCopiesToI1() {
+ bool Changed = false;
MachineSSAUpdater SSAUpdater(*MF);
LoopFinder LF(*DT, *PDT);
SmallVector<MachineInstr *, 4> DeadCopies;
@@ -681,6 +691,8 @@ void SILowerI1Copies::lowerCopiesToI1() {
if (!isVreg1(DstReg))
continue;
+ Changed = true;
+
if (MRI->use_empty(DstReg)) {
DeadCopies.push_back(&MI);
continue;
@@ -731,6 +743,7 @@ void SILowerI1Copies::lowerCopiesToI1() {
MI->eraseFromParent();
DeadCopies.clear();
}
+ return Changed;
}
bool SILowerI1Copies::isConstantLaneMask(Register Reg, bool &Val) const {
diff --git a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
index 0fbdbef6fcce..dd881ec42d53 100644
--- a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
+++ b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
@@ -20,6 +20,7 @@
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/LiveIntervals.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/RegisterScavenging.h"
#include "llvm/InitializePasses.h"
@@ -79,6 +80,8 @@ static void insertCSRSaves(MachineBasicBlock &SaveBlock,
const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering();
const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+ const SIRegisterInfo *RI = ST.getRegisterInfo();
MachineBasicBlock::iterator I = SaveBlock.begin();
if (!TFI->spillCalleeSavedRegisters(SaveBlock, I, CSI, TRI)) {
@@ -89,8 +92,8 @@ static void insertCSRSaves(MachineBasicBlock &SaveBlock,
MCRegister Reg = CS.getReg();
MachineInstrSpan MIS(I, &SaveBlock);
- const TargetRegisterClass *RC =
- TRI->getMinimalPhysRegClass(Reg, MVT::i32);
+ const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(
+ Reg, Reg == RI->getReturnAddressReg(MF) ? MVT::i64 : MVT::i32);
// If this value was already livein, we probably have a direct use of the
// incoming register value, so don't kill at the spill point. This happens
@@ -119,7 +122,8 @@ static void insertCSRRestores(MachineBasicBlock &RestoreBlock,
const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering();
const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
-
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+ const SIRegisterInfo *RI = ST.getRegisterInfo();
// Restore all registers immediately before the return and any
// terminators that precede it.
MachineBasicBlock::iterator I = RestoreBlock.getFirstTerminator();
@@ -128,8 +132,8 @@ static void insertCSRRestores(MachineBasicBlock &RestoreBlock,
if (!TFI->restoreCalleeSavedRegisters(RestoreBlock, I, CSI, TRI)) {
for (const CalleeSavedInfo &CI : reverse(CSI)) {
Register Reg = CI.getReg();
- const TargetRegisterClass *RC =
- TRI->getMinimalPhysRegClass(Reg, MVT::i32);
+ const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(
+ Reg, Reg == RI->getReturnAddressReg(MF) ? MVT::i64 : MVT::i32);
TII.loadRegFromStackSlot(RestoreBlock, I, Reg, CI.getFrameIdx(), RC, TRI);
assert(I != RestoreBlock.begin() &&
@@ -321,7 +325,7 @@ bool SILowerSGPRSpills::runOnMachineFunction(MachineFunction &MF) {
// free frame index ids by the later pass(es) like "stack slot coloring"
// which in turn could mess-up with the book keeping of "frame index to VGPR
// lane".
- FuncInfo->removeDeadFrameIndices(MFI);
+ FuncInfo->removeDeadFrameIndices(MFI, /*ResetSGPRSpillStackIDs*/ false);
MadeChange = true;
}
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
index cca8565c9ff9..0504c59ebd9e 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -31,6 +31,9 @@ using namespace llvm;
SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
: AMDGPUMachineFunction(MF),
+ BufferPSV(static_cast<const AMDGPUTargetMachine &>(MF.getTarget())),
+ ImagePSV(static_cast<const AMDGPUTargetMachine &>(MF.getTarget())),
+ GWSResourcePSV(static_cast<const AMDGPUTargetMachine &>(MF.getTarget())),
PrivateSegmentBuffer(false),
DispatchPtr(false),
QueuePtr(false),
@@ -48,8 +51,7 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
ImplicitBufferPtr(false),
ImplicitArgPtr(false),
GITPtrHigh(0xffffffff),
- HighBitsOf32BitAddress(0),
- GDSSize(0) {
+ HighBitsOf32BitAddress(0) {
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
const Function &F = MF.getFunction();
FlatWorkGroupSizes = ST.getFlatWorkGroupSizes(F);
@@ -74,6 +76,8 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
PSInputAddr = AMDGPU::getInitialPSInputAddr(F);
}
+ MayNeedAGPRs = ST.hasMAIInsts();
+
if (!isEntryFunction()) {
if (CC != CallingConv::AMDGPU_Gfx)
ArgInfo = AMDGPUArgumentUsageInfo::FixedABIFunctionInfo;
@@ -97,6 +101,11 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
ImplicitArgPtr = false;
MaxKernArgAlign = std::max(ST.getAlignmentForImplicitArgPtr(),
MaxKernArgAlign);
+
+ if (ST.hasGFX90AInsts() &&
+ ST.getMaxNumVGPRs(F) <= AMDGPU::VGPR_32RegClass.getNumRegs() &&
+ !mayUseAGPRs(MF))
+ MayNeedAGPRs = false; // We will select all MAI with VGPR operands.
}
bool isAmdHsaOrMesa = ST.isAmdHsaOrMesa(F);
@@ -177,9 +186,20 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
if (!S.empty())
S.consumeInteger(0, HighBitsOf32BitAddress);
- S = F.getFnAttribute("amdgpu-gds-size").getValueAsString();
- if (!S.empty())
- S.consumeInteger(0, GDSSize);
+ // On GFX908, in order to guarantee copying between AGPRs, we need a scratch
+ // VGPR available at all times. For now, reserve the highest available VGPR.
+ // After RA, shift it to the lowest available unused VGPR if one exists.
+ if (ST.hasMAIInsts() && !ST.hasGFX90AInsts()) {
+ VGPRForAGPRCopy =
+ AMDGPU::VGPR_32RegClass.getRegister(ST.getMaxNumVGPRs(F) - 1);
+ }
+}
+
+MachineFunctionInfo *SIMachineFunctionInfo::clone(
+ BumpPtrAllocator &Allocator, MachineFunction &DestMF,
+ const DenseMap<MachineBasicBlock *, MachineBasicBlock *> &Src2DstMBB)
+ const {
+ return DestMF.cloneInfo<SIMachineFunctionInfo>(*this);
}
void SIMachineFunctionInfo::limitOccupancy(const MachineFunction &MF) {
@@ -265,7 +285,7 @@ bool SIMachineFunctionInfo::haveFreeLanesForSGPRSpill(const MachineFunction &MF,
/// Reserve a slice of a VGPR to support spilling for FrameIndex \p FI.
bool SIMachineFunctionInfo::allocateSGPRSpillToVGPR(MachineFunction &MF,
int FI) {
- std::vector<SpilledReg> &SpillLanes = SGPRToVGPRSpills[FI];
+ std::vector<SIRegisterInfo::SpilledReg> &SpillLanes = SGPRToVGPRSpills[FI];
// This has already been allocated.
if (!SpillLanes.empty())
@@ -320,7 +340,7 @@ bool SIMachineFunctionInfo::allocateSGPRSpillToVGPR(MachineFunction &MF,
SpillVGPRs.push_back(SGPRSpillVGPR(LaneVGPR, SpillFI));
- // Add this register as live-in to all blocks to avoid machine verifer
+ // Add this register as live-in to all blocks to avoid machine verifier
// complaining about use of an undefined physical register.
for (MachineBasicBlock &BB : MF)
BB.addLiveIn(LaneVGPR);
@@ -328,7 +348,7 @@ bool SIMachineFunctionInfo::allocateSGPRSpillToVGPR(MachineFunction &MF,
LaneVGPR = SpillVGPRs.back().VGPR;
}
- SpillLanes.push_back(SpilledReg(LaneVGPR, VGPRIndex));
+ SpillLanes.push_back(SIRegisterInfo::SpilledReg(LaneVGPR, VGPRIndex));
}
return true;
@@ -402,7 +422,8 @@ bool SIMachineFunctionInfo::allocateVGPRSpillToAGPR(MachineFunction &MF,
return Spill.FullyAllocated;
}
-void SIMachineFunctionInfo::removeDeadFrameIndices(MachineFrameInfo &MFI) {
+bool SIMachineFunctionInfo::removeDeadFrameIndices(
+ MachineFrameInfo &MFI, bool ResetSGPRSpillStackIDs) {
// Remove dead frame indices from function frame, however keep FP & BP since
// spills for them haven't been inserted yet. And also make sure to remove the
// frame indices from `SGPRToVGPRSpills` data structure, otherwise, it could
@@ -415,17 +436,42 @@ void SIMachineFunctionInfo::removeDeadFrameIndices(MachineFrameInfo &MFI) {
}
}
- // All other SPGRs must be allocated on the default stack, so reset the stack
- // ID.
- for (int i = MFI.getObjectIndexBegin(), e = MFI.getObjectIndexEnd(); i != e;
- ++i)
- if (i != FramePointerSaveIndex && i != BasePointerSaveIndex)
- MFI.setStackID(i, TargetStackID::Default);
+ bool HaveSGPRToMemory = false;
+
+ if (ResetSGPRSpillStackIDs) {
+ // All other SGPRs must be allocated on the default stack, so reset the
+ // stack ID.
+ for (int i = MFI.getObjectIndexBegin(), e = MFI.getObjectIndexEnd(); i != e;
+ ++i) {
+ if (i != FramePointerSaveIndex && i != BasePointerSaveIndex) {
+ if (MFI.getStackID(i) == TargetStackID::SGPRSpill) {
+ MFI.setStackID(i, TargetStackID::Default);
+ HaveSGPRToMemory = true;
+ }
+ }
+ }
+ }
for (auto &R : VGPRToAGPRSpills) {
if (R.second.IsDead)
MFI.RemoveStackObject(R.first);
}
+
+ return HaveSGPRToMemory;
+}
+
+void SIMachineFunctionInfo::allocateWWMReservedSpillSlots(
+ MachineFrameInfo &MFI, const SIRegisterInfo &TRI) {
+ assert(WWMReservedFrameIndexes.empty());
+
+ WWMReservedFrameIndexes.resize(WWMReservedRegs.size());
+
+ int I = 0;
+ for (Register VGPR : WWMReservedRegs) {
+ const TargetRegisterClass *RC = TRI.getPhysRegClass(VGPR);
+ WWMReservedFrameIndexes[I++] = MFI.CreateSpillStackObject(
+ TRI.getSpillSize(*RC), TRI.getSpillAlign(*RC));
+ }
}
int SIMachineFunctionInfo::getScavengeFI(MachineFrameInfo &MFI,
@@ -539,6 +585,7 @@ yaml::SIMachineFunctionInfo::SIMachineFunctionInfo(
const llvm::MachineFunction &MF)
: ExplicitKernArgSize(MFI.getExplicitKernArgSize()),
MaxKernArgAlign(MFI.getMaxKernArgAlign()), LDSSize(MFI.getLDSSize()),
+ GDSSize(MFI.getGDSSize()),
DynLDSAlign(MFI.getDynLDSAlign()), IsEntryFunction(MFI.isEntryFunction()),
NoSignedZerosFPMath(MFI.hasNoSignedZerosFPMath()),
MemoryBound(MFI.isMemoryBound()), WaveLimiter(MFI.needsWaveLimiter()),
@@ -549,7 +596,14 @@ yaml::SIMachineFunctionInfo::SIMachineFunctionInfo(
ScratchRSrcReg(regToString(MFI.getScratchRSrcReg(), TRI)),
FrameOffsetReg(regToString(MFI.getFrameOffsetReg(), TRI)),
StackPtrOffsetReg(regToString(MFI.getStackPtrOffsetReg(), TRI)),
+ BytesInStackArgArea(MFI.getBytesInStackArgArea()),
+ ReturnsVoid(MFI.returnsVoid()),
ArgInfo(convertArgumentInfo(MFI.getArgInfo(), TRI)), Mode(MFI.getMode()) {
+ for (Register Reg : MFI.WWMReservedRegs)
+ WWMReservedRegs.push_back(regToString(Reg, TRI));
+
+ if (MFI.getVGPRForAGPRCopy())
+ VGPRForAGPRCopy = regToString(MFI.getVGPRForAGPRCopy(), TRI);
auto SFI = MFI.getOptionalScavengeFI();
if (SFI)
ScavengeFI = yaml::FrameIndex(*SFI, MF.getFrameInfo());
@@ -563,8 +617,9 @@ bool SIMachineFunctionInfo::initializeBaseYamlFields(
const yaml::SIMachineFunctionInfo &YamlMFI, const MachineFunction &MF,
PerFunctionMIParsingState &PFS, SMDiagnostic &Error, SMRange &SourceRange) {
ExplicitKernArgSize = YamlMFI.ExplicitKernArgSize;
- MaxKernArgAlign = assumeAligned(YamlMFI.MaxKernArgAlign);
+ MaxKernArgAlign = YamlMFI.MaxKernArgAlign;
LDSSize = YamlMFI.LDSSize;
+ GDSSize = YamlMFI.GDSSize;
DynLDSAlign = YamlMFI.DynLDSAlign;
HighBitsOf32BitAddress = YamlMFI.HighBitsOf32BitAddress;
Occupancy = YamlMFI.Occupancy;
@@ -574,6 +629,8 @@ bool SIMachineFunctionInfo::initializeBaseYamlFields(
WaveLimiter = YamlMFI.WaveLimiter;
HasSpilledSGPRs = YamlMFI.HasSpilledSGPRs;
HasSpilledVGPRs = YamlMFI.HasSpilledVGPRs;
+ BytesInStackArgArea = YamlMFI.BytesInStackArgArea;
+ ReturnsVoid = YamlMFI.ReturnsVoid;
if (YamlMFI.ScavengeFI) {
auto FIOrErr = YamlMFI.ScavengeFI->getFI(MF.getFrameInfo());
@@ -595,10 +652,47 @@ bool SIMachineFunctionInfo::initializeBaseYamlFields(
return false;
}
+bool SIMachineFunctionInfo::mayUseAGPRs(const MachineFunction &MF) const {
+ for (const BasicBlock &BB : MF.getFunction()) {
+ for (const Instruction &I : BB) {
+ const auto *CB = dyn_cast<CallBase>(&I);
+ if (!CB)
+ continue;
+
+ if (CB->isInlineAsm()) {
+ const InlineAsm *IA = dyn_cast<InlineAsm>(CB->getCalledOperand());
+ for (const auto &CI : IA->ParseConstraints()) {
+ for (StringRef Code : CI.Codes) {
+ Code.consume_front("{");
+ if (Code.startswith("a"))
+ return true;
+ }
+ }
+ continue;
+ }
+
+ const Function *Callee =
+ dyn_cast<Function>(CB->getCalledOperand()->stripPointerCasts());
+ if (!Callee)
+ return true;
+
+ if (Callee->getIntrinsicID() == Intrinsic::not_intrinsic)
+ return true;
+ }
+ }
+
+ return false;
+}
+
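
The inline-asm constraint scan above boils down to a small string test. As a trimmed standalone sketch (constraintNamesAGPR is an invented helper, and std::string_view stands in for StringRef and ParseConstraints), a constraint code names an AGPR if it starts with 'a' once an optional '{' is dropped:

#include <cassert>
#include <string_view>

static bool constraintNamesAGPR(std::string_view Code) {
  if (!Code.empty() && Code.front() == '{')
    Code.remove_prefix(1); // drop the optional brace, as consume_front does
  return !Code.empty() && Code.front() == 'a';
}

int main() {
  assert(constraintNamesAGPR("a"));
  assert(constraintNamesAGPR("{a0}"));
  assert(!constraintNamesAGPR("v"));
  assert(!constraintNamesAGPR("{v[0:1]}"));
  return 0;
}
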
bool SIMachineFunctionInfo::usesAGPRs(const MachineFunction &MF) const {
if (UsesAGPRs)
return *UsesAGPRs;
+ if (!mayNeedAGPRs()) {
+ UsesAGPRs = false;
+ return false;
+ }
+
if (!AMDGPU::isEntryFunctionCC(MF.getFunction().getCallingConv()) ||
MF.getFrameInfo().hasCalls()) {
UsesAGPRs = true;
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
index 8e821274bb77..bebb13cbf09f 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -15,9 +15,10 @@
#include "AMDGPUArgumentUsageInfo.h"
#include "AMDGPUMachineFunction.h"
+#include "AMDGPUTargetMachine.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIInstrInfo.h"
-#include "llvm/ADT/MapVector.h"
+#include "llvm/ADT/SetVector.h"
#include "llvm/CodeGen/MIRYamlMapping.h"
#include "llvm/CodeGen/PseudoSourceValue.h"
#include "llvm/Support/raw_ostream.h"
@@ -39,8 +40,8 @@ public:
};
protected:
- AMDGPUPseudoSourceValue(unsigned Kind, const TargetInstrInfo &TII)
- : PseudoSourceValue(Kind, TII) {}
+ AMDGPUPseudoSourceValue(unsigned Kind, const AMDGPUTargetMachine &TM)
+ : PseudoSourceValue(Kind, TM) {}
public:
bool isConstant(const MachineFrameInfo *) const override {
@@ -60,8 +61,8 @@ public:
class AMDGPUBufferPseudoSourceValue final : public AMDGPUPseudoSourceValue {
public:
- explicit AMDGPUBufferPseudoSourceValue(const TargetInstrInfo &TII)
- : AMDGPUPseudoSourceValue(PSVBuffer, TII) {}
+ explicit AMDGPUBufferPseudoSourceValue(const AMDGPUTargetMachine &TM)
+ : AMDGPUPseudoSourceValue(PSVBuffer, TM) {}
static bool classof(const PseudoSourceValue *V) {
return V->kind() == PSVBuffer;
@@ -73,8 +74,8 @@ public:
class AMDGPUImagePseudoSourceValue final : public AMDGPUPseudoSourceValue {
public:
// TODO: Is the img rsrc useful?
- explicit AMDGPUImagePseudoSourceValue(const TargetInstrInfo &TII)
- : AMDGPUPseudoSourceValue(PSVImage, TII) {}
+ explicit AMDGPUImagePseudoSourceValue(const AMDGPUTargetMachine &TM)
+ : AMDGPUPseudoSourceValue(PSVImage, TM) {}
static bool classof(const PseudoSourceValue *V) {
return V->kind() == PSVImage;
@@ -85,8 +86,8 @@ public:
class AMDGPUGWSResourcePseudoSourceValue final : public AMDGPUPseudoSourceValue {
public:
- explicit AMDGPUGWSResourcePseudoSourceValue(const TargetInstrInfo &TII)
- : AMDGPUPseudoSourceValue(GWSResource, TII) {}
+ explicit AMDGPUGWSResourcePseudoSourceValue(const AMDGPUTargetMachine &TM)
+ : AMDGPUPseudoSourceValue(GWSResource, TM) {}
static bool classof(const PseudoSourceValue *V) {
return V->kind() == GWSResource;
@@ -269,8 +270,9 @@ template <> struct MappingTraits<SIMode> {
struct SIMachineFunctionInfo final : public yaml::MachineFunctionInfo {
uint64_t ExplicitKernArgSize = 0;
- unsigned MaxKernArgAlign = 0;
- unsigned LDSSize = 0;
+ Align MaxKernArgAlign;
+ uint32_t LDSSize = 0;
+ uint32_t GDSSize = 0;
Align DynLDSAlign;
bool IsEntryFunction = false;
bool NoSignedZerosFPMath = false;
@@ -283,13 +285,19 @@ struct SIMachineFunctionInfo final : public yaml::MachineFunctionInfo {
// TODO: 10 may be a better default since it's the maximum.
unsigned Occupancy = 0;
+ SmallVector<StringValue> WWMReservedRegs;
+
StringValue ScratchRSrcReg = "$private_rsrc_reg";
StringValue FrameOffsetReg = "$fp_reg";
StringValue StackPtrOffsetReg = "$sp_reg";
+ unsigned BytesInStackArgArea = 0;
+ bool ReturnsVoid = true;
+
Optional<SIArgumentInfo> ArgInfo;
SIMode Mode;
Optional<FrameIndex> ScavengeFI;
+ StringValue VGPRForAGPRCopy;
SIMachineFunctionInfo() = default;
SIMachineFunctionInfo(const llvm::SIMachineFunctionInfo &,
@@ -304,8 +312,9 @@ template <> struct MappingTraits<SIMachineFunctionInfo> {
static void mapping(IO &YamlIO, SIMachineFunctionInfo &MFI) {
YamlIO.mapOptional("explicitKernArgSize", MFI.ExplicitKernArgSize,
UINT64_C(0));
- YamlIO.mapOptional("maxKernArgAlign", MFI.MaxKernArgAlign, 0u);
+ YamlIO.mapOptional("maxKernArgAlign", MFI.MaxKernArgAlign);
YamlIO.mapOptional("ldsSize", MFI.LDSSize, 0u);
+ YamlIO.mapOptional("gdsSize", MFI.GDSSize, 0u);
YamlIO.mapOptional("dynLDSAlign", MFI.DynLDSAlign, Align());
YamlIO.mapOptional("isEntryFunction", MFI.IsEntryFunction, false);
YamlIO.mapOptional("noSignedZerosFPMath", MFI.NoSignedZerosFPMath, false);
@@ -319,12 +328,17 @@ template <> struct MappingTraits<SIMachineFunctionInfo> {
StringValue("$fp_reg"));
YamlIO.mapOptional("stackPtrOffsetReg", MFI.StackPtrOffsetReg,
StringValue("$sp_reg"));
+ YamlIO.mapOptional("bytesInStackArgArea", MFI.BytesInStackArgArea, 0u);
+ YamlIO.mapOptional("returnsVoid", MFI.ReturnsVoid, true);
YamlIO.mapOptional("argumentInfo", MFI.ArgInfo);
YamlIO.mapOptional("mode", MFI.Mode, SIMode());
YamlIO.mapOptional("highBitsOf32BitAddress",
MFI.HighBitsOf32BitAddress, 0u);
YamlIO.mapOptional("occupancy", MFI.Occupancy, 0);
+ YamlIO.mapOptional("wwmReservedRegs", MFI.WWMReservedRegs);
YamlIO.mapOptional("scavengeFI", MFI.ScavengeFI);
+ YamlIO.mapOptional("vgprForAGPRCopy", MFI.VGPRForAGPRCopy,
+ StringValue()); // Don't print out when it's empty.
}
};
@@ -335,8 +349,6 @@ template <> struct MappingTraits<SIMachineFunctionInfo> {
class SIMachineFunctionInfo final : public AMDGPUMachineFunction {
friend class GCNTargetMachine;
- Register TIDReg = AMDGPU::NoRegister;
-
// Registers that may be reserved for spilling purposes. These may be the same
// as the input registers.
Register ScratchRSrcReg = AMDGPU::PRIVATE_RSRC_REG;
@@ -377,12 +389,11 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction {
// unit. Minimum - first, maximum - second.
std::pair<unsigned, unsigned> WavesPerEU = {0, 0};
- std::unique_ptr<const AMDGPUBufferPseudoSourceValue> BufferPSV;
- std::unique_ptr<const AMDGPUImagePseudoSourceValue> ImagePSV;
- std::unique_ptr<const AMDGPUGWSResourcePseudoSourceValue> GWSResourcePSV;
+ const AMDGPUBufferPseudoSourceValue BufferPSV;
+ const AMDGPUImagePseudoSourceValue ImagePSV;
+ const AMDGPUGWSResourcePseudoSourceValue GWSResourcePSV;
private:
- unsigned LDSWaveSpillSize = 0;
unsigned NumUserSGPRs = 0;
unsigned NumSystemSGPRs = 0;
@@ -422,13 +433,14 @@ private:
// user arguments. This is an offset from the KernargSegmentPtr.
bool ImplicitArgPtr : 1;
+ bool MayNeedAGPRs : 1;
+
// The hard-wired high half of the address of the global information table
// for AMDPAL OS type. 0xffffffff represents no hard-wired high half, since
// current hardware only allows a 16 bit value.
unsigned GITPtrHigh;
unsigned HighBitsOf32BitAddress;
- unsigned GDSSize;
// Current recorded maximum possible occupancy.
unsigned Occupancy;
@@ -440,17 +452,6 @@ private:
MCPhysReg getNextSystemSGPR() const;
public:
- struct SpilledReg {
- Register VGPR;
- int Lane = -1;
-
- SpilledReg() = default;
- SpilledReg(Register R, int L) : VGPR (R), Lane (L) {}
-
- bool hasLane() { return Lane != -1;}
- bool hasReg() { return VGPR != 0;}
- };
-
struct SGPRSpillVGPR {
// VGPR used for SGPR spills
Register VGPR;
@@ -468,14 +469,28 @@ public:
bool IsDead = false;
};
- // Map WWM VGPR to a stack slot that is used to save/restore it in the
- // prolog/epilog.
- MapVector<Register, Optional<int>> WWMReservedRegs;
+ // Track VGPRs reserved for WWM.
+ SmallSetVector<Register, 8> WWMReservedRegs;
+
+ /// Track stack slots used for save/restore of reserved WWM VGPRs in the
+ /// prolog/epilog.
+
+ /// FIXME: This is temporary state only needed in PrologEpilogInserter, and
+ /// doesn't really belong here. It does not require serialization.
+ SmallVector<int, 8> WWMReservedFrameIndexes;
+
+ void allocateWWMReservedSpillSlots(MachineFrameInfo &MFI,
+ const SIRegisterInfo &TRI);
+
+ auto wwmAllocation() const {
+ assert(WWMReservedRegs.size() == WWMReservedFrameIndexes.size());
+ return zip(WWMReservedRegs, WWMReservedFrameIndexes);
+ }
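A minimal sketch of how a consumer such as the prolog/epilog insertion code might walk the (register, frame index) pairs returned by wwmAllocation(); buildWWMSpill below is a hypothetical helper, not an API introduced by this patch.

// Walk each reserved WWM VGPR together with its paired spill slot. The zip is
// only valid once both containers have been filled to the same length, which
// the assertion in the accessor above enforces.
for (auto [Reg, FI] : FuncInfo.wwmAllocation()) {
  // buildWWMSpill is a stand-in for the target's actual spill code.
  buildWWMSpill(MBB, InsertPt, Reg, FI);
}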
private:
// Track VGPR + wave index for each subregister of the SGPR spilled to
// frameindex key.
- DenseMap<int, std::vector<SpilledReg>> SGPRToVGPRSpills;
+ DenseMap<int, std::vector<SIRegisterInfo::SpilledReg>> SGPRToVGPRSpills;
unsigned NumVGPRSpillLanes = 0;
SmallVector<SGPRSpillVGPR, 2> SpillVGPRs;
@@ -491,6 +506,18 @@ private:
// frame, so save it here and add it to the RegScavenger later.
Optional<int> ScavengeFI;
+private:
+ Register VGPRForAGPRCopy;
+
+public:
+ Register getVGPRForAGPRCopy() const {
+ return VGPRForAGPRCopy;
+ }
+
+ void setVGPRForAGPRCopy(Register NewVGPRForAGPRCopy) {
+ VGPRForAGPRCopy = NewVGPRForAGPRCopy;
+ }
+
public: // FIXME
/// If this is set, an SGPR used for save/restore of the register used for the
/// frame pointer.
@@ -506,31 +533,32 @@ public: // FIXME
public:
SIMachineFunctionInfo(const MachineFunction &MF);
+ SIMachineFunctionInfo(const SIMachineFunctionInfo &MFI) = default;
+
+ MachineFunctionInfo *
+ clone(BumpPtrAllocator &Allocator, MachineFunction &DestMF,
+ const DenseMap<MachineBasicBlock *, MachineBasicBlock *> &Src2DstMBB)
+ const override;
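The definition of this override lives in SIMachineFunctionInfo.cpp and is not shown in this hunk; a minimal sketch of the usual pattern for such a clone(), relying on MachineFunction::cloneInfo and the defaulted copy constructor declared above, is given below (the exact body in the patch may differ).

MachineFunctionInfo *SIMachineFunctionInfo::clone(
    BumpPtrAllocator &Allocator, MachineFunction &DestMF,
    const DenseMap<MachineBasicBlock *, MachineBasicBlock *> &Src2DstMBB)
    const {
  // cloneInfo copy-constructs a SIMachineFunctionInfo for DestMF inside its
  // allocator; Src2DstMBB is typically unused when the info holds no
  // basic-block pointers.
  return DestMF.cloneInfo<SIMachineFunctionInfo>(*this);
}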
bool initializeBaseYamlFields(const yaml::SIMachineFunctionInfo &YamlMFI,
const MachineFunction &MF,
PerFunctionMIParsingState &PFS,
SMDiagnostic &Error, SMRange &SourceRange);
- void reserveWWMRegister(Register Reg, Optional<int> FI) {
- WWMReservedRegs.insert(std::make_pair(Reg, FI));
+ void reserveWWMRegister(Register Reg) {
+ WWMReservedRegs.insert(Reg);
}
- ArrayRef<SpilledReg> getSGPRToVGPRSpills(int FrameIndex) const {
+ ArrayRef<SIRegisterInfo::SpilledReg>
+ getSGPRToVGPRSpills(int FrameIndex) const {
auto I = SGPRToVGPRSpills.find(FrameIndex);
- return (I == SGPRToVGPRSpills.end()) ?
- ArrayRef<SpilledReg>() : makeArrayRef(I->second);
+ return (I == SGPRToVGPRSpills.end())
+ ? ArrayRef<SIRegisterInfo::SpilledReg>()
+ : makeArrayRef(I->second);
}
ArrayRef<SGPRSpillVGPR> getSGPRSpillVGPRs() const { return SpillVGPRs; }
- void setSGPRSpillVGPRs(Register NewVGPR, Optional<int> newFI, int Index) {
- SpillVGPRs[Index].VGPR = NewVGPR;
- SpillVGPRs[Index].FI = newFI;
- }
-
- bool removeVGPRForSGPRSpill(Register ReservedVGPR, MachineFunction &MF);
-
ArrayRef<MCPhysReg> getAGPRSpillVGPRs() const {
return SpillAGPR;
}
@@ -555,15 +583,15 @@ public:
unsigned NumLane) const;
bool allocateSGPRSpillToVGPR(MachineFunction &MF, int FI);
bool allocateVGPRSpillToAGPR(MachineFunction &MF, int FI, bool isAGPRtoVGPR);
- void removeDeadFrameIndices(MachineFrameInfo &MFI);
+
+ /// If \p ResetSGPRSpillStackIDs is true, reset the stack ID from sgpr-spill
+ /// to the default stack.
+ bool removeDeadFrameIndices(MachineFrameInfo &MFI,
+ bool ResetSGPRSpillStackIDs);
int getScavengeFI(MachineFrameInfo &MFI, const SIRegisterInfo &TRI);
Optional<int> getOptionalScavengeFI() const { return ScavengeFI; }
- bool hasCalculatedTID() const { return TIDReg != 0; };
- Register getTIDReg() const { return TIDReg; };
- void setTIDReg(Register Reg) { TIDReg = Reg; }
-
unsigned getBytesInStackArgArea() const {
return BytesInStackArgArea;
}
@@ -581,6 +609,13 @@ public:
Register addFlatScratchInit(const SIRegisterInfo &TRI);
Register addImplicitBufferPtr(const SIRegisterInfo &TRI);
+ /// Increment user SGPRs used for padding the argument list only.
+ Register addReservedUserSGPR() {
+ Register Next = getNextUserSGPR();
+ ++NumUserSGPRs;
+ return Next;
+ }
+
// Add system SGPRs.
Register addWorkGroupIDX() {
ArgInfo.WorkGroupIDX = ArgDescriptor::createRegister(getNextSystemSGPR());
@@ -722,10 +757,6 @@ public:
return HighBitsOf32BitAddress;
}
- unsigned getGDSSize() const {
- return GDSSize;
- }
-
unsigned getNumUserSGPRs() const {
return NumUserSGPRs;
}
@@ -903,31 +934,19 @@ public:
llvm_unreachable("unexpected dimension");
}
- unsigned getLDSWaveSpillSize() const {
- return LDSWaveSpillSize;
+ const AMDGPUBufferPseudoSourceValue *
+ getBufferPSV(const AMDGPUTargetMachine &TM) {
+ return &BufferPSV;
}
- const AMDGPUBufferPseudoSourceValue *getBufferPSV(const SIInstrInfo &TII) {
- if (!BufferPSV)
- BufferPSV = std::make_unique<AMDGPUBufferPseudoSourceValue>(TII);
-
- return BufferPSV.get();
- }
-
- const AMDGPUImagePseudoSourceValue *getImagePSV(const SIInstrInfo &TII) {
- if (!ImagePSV)
- ImagePSV = std::make_unique<AMDGPUImagePseudoSourceValue>(TII);
-
- return ImagePSV.get();
+ const AMDGPUImagePseudoSourceValue *
+ getImagePSV(const AMDGPUTargetMachine &TM) {
+ return &ImagePSV;
}
- const AMDGPUGWSResourcePseudoSourceValue *getGWSPSV(const SIInstrInfo &TII) {
- if (!GWSResourcePSV) {
- GWSResourcePSV =
- std::make_unique<AMDGPUGWSResourcePseudoSourceValue>(TII);
- }
-
- return GWSResourcePSV.get();
+ const AMDGPUGWSResourcePseudoSourceValue *
+ getGWSPSV(const AMDGPUTargetMachine &TM) {
+ return &GWSResourcePSV;
}
unsigned getOccupancy() const {
@@ -953,6 +972,14 @@ public:
limitOccupancy(MF);
}
+ bool mayNeedAGPRs() const {
+ return MayNeedAGPRs;
+ }
+
+ // \returns true if a function has a use of AGPRs via inline asm or
+ // has a call which may use them.
+ bool mayUseAGPRs(const MachineFunction &MF) const;
+
// \returns true if a function needs or may need AGPRs.
bool usesAGPRs(const MachineFunction &MF) const;
};
diff --git a/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp b/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp
index 81db66a98ddf..e426e938b856 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp
@@ -64,7 +64,7 @@ using namespace llvm;
// First the instructions are put into blocks.
// We want the blocks to help control register usage and hide high latencies
// later. To help control register usage, we typically want all local
-// computations, when for example you create a result that can be comsummed
+// computations, when for example you create a result that can be consumed
// right away, to be contained in a block. Block inputs and outputs would
// typically be important results that are needed in several locations of
// the shader. Since we do want blocks to help hide high latencies, we want
@@ -90,8 +90,8 @@ using namespace llvm;
// Increasing the number of active wavefronts helps hide the former, but it
// doesn't solve the latter, thus why even if wavefront count is high, we have
// to try to have as many instructions hiding high latencies as possible.
-// The OpenCL doc says for example latency of 400 cycles for a global mem access,
-// which is hidden by 10 instructions if the wavefront count is 10.
+// The OpenCL doc says for example latency of 400 cycles for a global mem
+// access, which is hidden by 10 instructions if the wavefront count is 10.
// Some figures taken from AMD docs:
// Both texture and constant L1 caches are 4-way associative with 64 bytes
@@ -353,7 +353,7 @@ void SIScheduleBlock::initRegPressure(MachineBasicBlock::iterator BeginBlock,
// able to correctly handle 5 vs 6, 2 vs 3.
// (Note: This is not sufficient for RPTracker to avoid mistakes for case 4)
// The RPTracker's LiveOutRegs has 1, 3, (some correct or incorrect)4, 5, 7
- // Comparing to LiveInRegs is not sufficient to differenciate 4 vs 5, 7
+ // Comparing to LiveInRegs is not sufficient to differentiate 4 vs 5, 7
// The use of findDefBetween removes the case 4.
for (const auto &RegMaskPair : RPTracker.getPressure().LiveOutRegs) {
Register Reg = RegMaskPair.RegUnit;
@@ -402,7 +402,7 @@ void SIScheduleBlock::schedule(MachineBasicBlock::iterator BeginBlock,
nodeScheduled(SU);
}
- // TODO: compute InternalAdditionnalPressure.
+ // TODO: compute InternalAdditionalPressure.
InternalAdditionalPressure.resize(TopPressure.MaxSetPressure.size());
// Check everything is right.
@@ -696,7 +696,7 @@ void SIScheduleBlockCreator::colorHighLatenciesGroups() {
bool HasSubGraph;
std::vector<int> SubGraph;
// By construction (topological order), if SU and
- // DAG->SUnits[j] are linked, DAG->SUnits[j] is neccessary
+ // DAG->SUnits[j] are linked, DAG->SUnits[j] is necessary
// in the parent graph of SU.
#ifndef NDEBUG
SubGraph = DAG->GetTopo()->GetSubGraph(SU, DAG->SUnits[j],
@@ -1123,36 +1123,26 @@ void SIScheduleBlockCreator::colorExports() {
for (unsigned SUNum : DAG->TopDownIndex2SU) {
const SUnit &SU = DAG->SUnits[SUNum];
if (SIInstrInfo::isEXP(*SU.getInstr())) {
- // Check the EXP can be added to the group safely,
- // ie without needing any other instruction.
- // The EXP is allowed to depend on other EXP
- // (they will be in the same group).
- for (unsigned j : ExpGroup) {
- bool HasSubGraph;
- std::vector<int> SubGraph;
- // By construction (topological order), if SU and
- // DAG->SUnits[j] are linked, DAG->SUnits[j] is neccessary
- // in the parent graph of SU.
-#ifndef NDEBUG
- SubGraph = DAG->GetTopo()->GetSubGraph(SU, DAG->SUnits[j],
- HasSubGraph);
- assert(!HasSubGraph);
-#endif
- SubGraph = DAG->GetTopo()->GetSubGraph(DAG->SUnits[j], SU,
- HasSubGraph);
- if (!HasSubGraph)
- continue; // No dependencies between each other
+ // SU is an export instruction. Check whether one of its successor
+ // dependencies is a non-export, in which case we skip export grouping.
+ for (const SDep &SuccDep : SU.Succs) {
+ const SUnit *SuccSU = SuccDep.getSUnit();
+ if (SuccDep.isWeak() || SuccSU->NodeNum >= DAG->SUnits.size()) {
+ // Ignore these dependencies.
+ continue;
+ }
+ assert(SuccSU->isInstr() &&
+ "SUnit unexpectedly not representing an instruction!");
- // SubGraph contains all the instructions required
- // between EXP SUnits[j] and EXP SU.
- for (unsigned k : SubGraph) {
- if (!SIInstrInfo::isEXP(*DAG->SUnits[k].getInstr()))
- // Other instructions than EXP would be required in the group.
- // Abort the groupping.
- return;
+ if (!SIInstrInfo::isEXP(*SuccSU->getInstr())) {
+ // A non-export depends on us. Skip export grouping.
+ // Note that this is a bit pessimistic: We could still group all other
+ // exports that are not depended on by non-exports, directly or
+ // indirectly. Simply skipping this particular export but grouping all
+ // others would not account for indirect dependencies.
+ return;
}
}
-
ExpGroup.push_back(SUNum);
}
}
diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
index fff4f6729c99..8a66213931ff 100644
--- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
@@ -19,6 +19,7 @@
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/BitmaskEnum.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/Support/AtomicOrdering.h"
#include "llvm/Support/TargetParser.h"
@@ -63,7 +64,7 @@ enum class SIAtomicScope {
};
/// The distinct address spaces supported by the AMDGPU target for
-/// atomic memory operation. Can be ORed toether.
+/// atomic memory operation. Can be ORed together.
enum class SIAtomicAddrSpace {
NONE = 0u,
GLOBAL = 1u << 0,
@@ -459,6 +460,56 @@ public:
Position Pos) const override;
};
+class SIGfx940CacheControl : public SIGfx90ACacheControl {
+protected:
+
+ /// Sets SC0 bit to "true" if present in \p MI. Returns true if \p MI
+ /// is modified, false otherwise.
+ bool enableSC0Bit(const MachineBasicBlock::iterator &MI) const {
+ return enableNamedBit(MI, AMDGPU::CPol::SC0);
+ }
+
+ /// Sets SC1 bit to "true" if present in \p MI. Returns true if \p MI
+ /// is modified, false otherwise.
+ bool enableSC1Bit(const MachineBasicBlock::iterator &MI) const {
+ return enableNamedBit(MI, AMDGPU::CPol::SC1);
+ }
+
+ /// Sets NT bit to "true" if present in \p MI. Returns true if \p MI
+ /// is modified, false otherwise.
+ bool enableNTBit(const MachineBasicBlock::iterator &MI) const {
+ return enableNamedBit(MI, AMDGPU::CPol::NT);
+ }
+
+public:
+
+ SIGfx940CacheControl(const GCNSubtarget &ST) : SIGfx90ACacheControl(ST) {};
+
+ bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
+ SIAtomicScope Scope,
+ SIAtomicAddrSpace AddrSpace) const override;
+
+ bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
+ SIAtomicScope Scope,
+ SIAtomicAddrSpace AddrSpace) const override;
+
+ bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
+ SIAtomicScope Scope,
+ SIAtomicAddrSpace AddrSpace) const override;
+
+ bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
+ SIAtomicAddrSpace AddrSpace, SIMemOp Op,
+ bool IsVolatile,
+ bool IsNonTemporal) const override;
+
+ bool insertAcquire(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
+ SIAtomicAddrSpace AddrSpace, Position Pos) const override;
+
+ bool insertRelease(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
+ SIAtomicAddrSpace AddrSpace, bool IsCrossAddrSpaceOrdering,
+ Position Pos) const override;
+};
+
class SIGfx10CacheControl : public SIGfx7CacheControl {
protected:
@@ -494,6 +545,20 @@ public:
Position Pos) const override;
};
+class SIGfx11CacheControl : public SIGfx10CacheControl {
+public:
+ SIGfx11CacheControl(const GCNSubtarget &ST) : SIGfx10CacheControl(ST) {}
+
+ bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
+ SIAtomicScope Scope,
+ SIAtomicAddrSpace AddrSpace) const override;
+
+ bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
+ SIAtomicAddrSpace AddrSpace, SIMemOp Op,
+ bool IsVolatile,
+ bool IsNonTemporal) const override;
+};
+
class SIMemoryLegalizer final : public MachineFunctionPass {
private:
@@ -649,7 +714,7 @@ Optional<SIMemOpInfo> SIMemOpAccess::constructFromMIWithMMO(
return None;
}
- SSID = IsSyncScopeInclusion.getValue() ? SSID : MMO->getSyncScopeID();
+ SSID = *IsSyncScopeInclusion ? SSID : MMO->getSyncScopeID();
Ordering = getMergedAtomicOrdering(Ordering, OpOrdering);
assert(MMO->getFailureOrdering() != AtomicOrdering::Release &&
MMO->getFailureOrdering() != AtomicOrdering::AcquireRelease);
@@ -668,7 +733,7 @@ Optional<SIMemOpInfo> SIMemOpAccess::constructFromMIWithMMO(
return None;
}
std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) =
- ScopeOrNone.getValue();
+ *ScopeOrNone;
if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) ||
((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace) ||
((InstrAddrSpace & SIAtomicAddrSpace::ATOMIC) == SIAtomicAddrSpace::NONE)) {
@@ -730,7 +795,7 @@ Optional<SIMemOpInfo> SIMemOpAccess::getAtomicFenceInfo(
SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
bool IsCrossAddressSpaceOrdering = false;
std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) =
- ScopeOrNone.getValue();
+ *ScopeOrNone;
if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) ||
((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace)) {
@@ -775,13 +840,17 @@ bool SICacheControl::enableNamedBit(const MachineBasicBlock::iterator MI,
/* static */
std::unique_ptr<SICacheControl> SICacheControl::create(const GCNSubtarget &ST) {
GCNSubtarget::Generation Generation = ST.getGeneration();
+ if (ST.hasGFX940Insts())
+ return std::make_unique<SIGfx940CacheControl>(ST);
if (ST.hasGFX90AInsts())
return std::make_unique<SIGfx90ACacheControl>(ST);
if (Generation <= AMDGPUSubtarget::SOUTHERN_ISLANDS)
return std::make_unique<SIGfx6CacheControl>(ST);
if (Generation < AMDGPUSubtarget::GFX10)
return std::make_unique<SIGfx7CacheControl>(ST);
- return std::make_unique<SIGfx10CacheControl>(ST);
+ if (Generation < AMDGPUSubtarget::GFX11)
+ return std::make_unique<SIGfx10CacheControl>(ST);
+ return std::make_unique<SIGfx11CacheControl>(ST);
}
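A short usage sketch, assuming the usual setup in SIMemoryLegalizer::runOnMachineFunction: the pass builds one strategy object per function and dispatches through the virtual SICacheControl interface, which is why supporting GFX940 and GFX11 only needs the extra branches above.

// Illustrative only; the real pass keeps CC as a member.
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
std::unique_ptr<SICacheControl> CC = SICacheControl::create(ST);
// Later queries go through the common interface, e.g.
// CC->insertAcquire(MI, Scope, OrderingAddrSpace, Position::AFTER);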
bool SIGfx6CacheControl::enableLoadCacheBypass(
@@ -943,7 +1012,7 @@ bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI,
case SIAtomicScope::WAVEFRONT:
case SIAtomicScope::SINGLETHREAD:
// The LDS keeps all memory operations in order for
- // the same wavesfront.
+ // the same wavefront.
break;
default:
llvm_unreachable("Unsupported synchronization scope");
@@ -1360,7 +1429,9 @@ bool SIGfx90ACacheControl::insertRelease(MachineBasicBlock::iterator &MI,
// to initiate writeback of any dirty cache lines of earlier writes by the
// same wave. A "S_WAITCNT vmcnt(0)" is needed after to ensure the
// writeback has completed.
- BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2));
+ BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2))
+ // Set SC bits to indicate system scope.
+ .addImm(AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1);
// Followed by same as GFX7, which will ensure the necessary "S_WAITCNT
// vmcnt(0)" needed by the "BUFFER_WBL2".
Changed = true;
@@ -1386,6 +1457,308 @@ bool SIGfx90ACacheControl::insertRelease(MachineBasicBlock::iterator &MI,
return Changed;
}
+bool SIGfx940CacheControl::enableLoadCacheBypass(
+ const MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
+ SIAtomicAddrSpace AddrSpace) const {
+ assert(MI->mayLoad() && !MI->mayStore());
+ bool Changed = false;
+
+ if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
+ switch (Scope) {
+ case SIAtomicScope::SYSTEM:
+ // Set SC bits to indicate system scope.
+ Changed |= enableSC0Bit(MI);
+ Changed |= enableSC1Bit(MI);
+ break;
+ case SIAtomicScope::AGENT:
+ // Set SC bits to indicate agent scope.
+ Changed |= enableSC1Bit(MI);
+ break;
+ case SIAtomicScope::WORKGROUP:
+ // In threadgroup split mode the waves of a work-group can be executing on
+ // different CUs. Therefore need to bypass the L1 which is per CU.
+ // Otherwise in non-threadgroup split mode all waves of a work-group are
+ // on the same CU, and so the L1 does not need to be bypassed. Setting SC
+ // bits to indicate work-group scope will do this automatically.
+ Changed |= enableSC0Bit(MI);
+ break;
+ case SIAtomicScope::WAVEFRONT:
+ case SIAtomicScope::SINGLETHREAD:
+ // Leave SC bits unset to indicate wavefront scope.
+ break;
+ default:
+ llvm_unreachable("Unsupported synchronization scope");
+ }
+ }
+
+ /// The scratch address space does not need the global memory caches
+ /// to be bypassed as all memory operations by the same thread are
+ /// sequentially consistent, and no other thread can access scratch
+ /// memory.
+
+ /// Other address spaces do not have a cache.
+
+ return Changed;
+}
+
+bool SIGfx940CacheControl::enableStoreCacheBypass(
+ const MachineBasicBlock::iterator &MI,
+ SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace) const {
+ assert(!MI->mayLoad() && MI->mayStore());
+ bool Changed = false;
+
+ if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
+ switch (Scope) {
+ case SIAtomicScope::SYSTEM:
+ // Set SC bits to indicate system scope.
+ Changed |= enableSC0Bit(MI);
+ Changed |= enableSC1Bit(MI);
+ break;
+ case SIAtomicScope::AGENT:
+ // Set SC bits to indicate agent scope.
+ Changed |= enableSC1Bit(MI);
+ break;
+ case SIAtomicScope::WORKGROUP:
+ // Set SC bits to indicate workgroup scope.
+ Changed |= enableSC0Bit(MI);
+ break;
+ case SIAtomicScope::WAVEFRONT:
+ case SIAtomicScope::SINGLETHREAD:
+ // Leave SC bits unset to indicate wavefront scope.
+ break;
+ default:
+ llvm_unreachable("Unsupported synchronization scope");
+ }
+ }
+
+ /// The scratch address space does not need the global memory caches
+ /// to be bypassed as all memory operations by the same thread are
+ /// sequentially consistent, and no other thread can access scratch
+ /// memory.
+
+ /// Other address spaces do not have a cache.
+
+ return Changed;
+}
+
+bool SIGfx940CacheControl::enableRMWCacheBypass(
+ const MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
+ SIAtomicAddrSpace AddrSpace) const {
+ assert(MI->mayLoad() && MI->mayStore());
+ bool Changed = false;
+
+ if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
+ switch (Scope) {
+ case SIAtomicScope::SYSTEM:
+ // Set SC1 bit to indicate system scope.
+ Changed |= enableSC1Bit(MI);
+ break;
+ case SIAtomicScope::AGENT:
+ case SIAtomicScope::WORKGROUP:
+ case SIAtomicScope::WAVEFRONT:
+ case SIAtomicScope::SINGLETHREAD:
+ // RMW atomic operations implicitly bypass the L1 cache and only use SC1
+ // to indicate system or agent scope. The SC0 bit is used to indicate if
+ // they are return or no-return. Leave SC1 bit unset to indicate agent
+ // scope.
+ break;
+ default:
+ llvm_unreachable("Unsupported synchronization scope");
+ }
+ }
+
+ return Changed;
+}
+
+bool SIGfx940CacheControl::enableVolatileAndOrNonTemporal(
+ MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
+ bool IsVolatile, bool IsNonTemporal) const {
+ // Only handle load and store, not atomic read-modify-write instructions. The
+ // latter use glc to indicate if the atomic returns a result and so must not
+ // be used for cache control.
+ assert(MI->mayLoad() ^ MI->mayStore());
+
+ // Only update load and store, not LLVM IR atomic read-modify-write
+ // instructions. The latter are always marked as volatile, so we cannot
+ // sensibly handle them without pessimizing all atomics. They also do not
+ // support the nontemporal attribute.
+ assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
+
+ bool Changed = false;
+
+ if (IsVolatile) {
+ // Set SC bits to indicate system scope.
+ Changed |= enableSC0Bit(MI);
+ Changed |= enableSC1Bit(MI);
+
+ // Ensure operation has completed at system scope to cause all volatile
+ // operations to be visible outside the program in a global order. Do not
+ // request cross address space as only the global address space can be
+ // observable outside the program, so no need to cause a waitcnt for LDS
+ // address space operations.
+ Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
+ Position::AFTER);
+
+ return Changed;
+ }
+
+ if (IsNonTemporal) {
+ Changed |= enableNTBit(MI);
+ return Changed;
+ }
+
+ return Changed;
+}
+
+bool SIGfx940CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
+ SIAtomicScope Scope,
+ SIAtomicAddrSpace AddrSpace,
+ Position Pos) const {
+ if (!InsertCacheInv)
+ return false;
+
+ bool Changed = false;
+
+ MachineBasicBlock &MBB = *MI->getParent();
+ DebugLoc DL = MI->getDebugLoc();
+
+ if (Pos == Position::AFTER)
+ ++MI;
+
+ if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
+ switch (Scope) {
+ case SIAtomicScope::SYSTEM:
+ // Ensures that following loads will not see stale remote VMEM data or
+ // stale local VMEM data with MTYPE NC. Local VMEM data with MTYPE RW and
+ // CC will never be stale due to the local memory probes.
+ BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV))
+ // Set SC bits to indicate system scope.
+ .addImm(AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1);
+ // Inserting a "S_WAITCNT vmcnt(0)" after is not required because the
+ // hardware does not reorder memory operations by the same wave with
+ // respect to a preceding "BUFFER_INV". The invalidate is guaranteed to
+ // remove any cache lines of earlier writes by the same wave and ensures
+ // later reads by the same wave will refetch the cache lines.
+ Changed = true;
+ break;
+ case SIAtomicScope::AGENT:
+ // Ensures that following loads will not see stale remote data or local
+ // MTYPE NC global data. Local MTYPE RW and CC memory will never be stale
+ // due to the memory probes.
+ BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV))
+ // Set SC bits to indicate agent scope.
+ .addImm(AMDGPU::CPol::SC1);
+ // Inserting "S_WAITCNT vmcnt(0)" is not required because the hardware
+ // does not reorder memory operations with respect to a preceding buffer
+ // invalidate. The invalidate is guaranteed to remove any cache lines of
+ // earlier writes and ensures later reads will refetch the cache lines.
+ Changed = true;
+ break;
+ case SIAtomicScope::WORKGROUP:
+ // In threadgroup split mode the waves of a work-group can be executing on
+ // different CUs. Therefore need to invalidate the L1 which is per CU.
+ // Otherwise in non-threadgroup split mode all waves of a work-group are
+ // on the same CU, and so the L1 does not need to be invalidated.
+ if (ST.isTgSplitEnabled()) {
+ // Ensures L1 is invalidated if in threadgroup split mode. In
+ // non-threadgroup split mode it is a NOP, but there is no point generating
+ // it in that case when we know we are not in that mode.
+ BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV))
+ // Set SC bits to indicate work-group scope.
+ .addImm(AMDGPU::CPol::SC0);
+ // Inserting "S_WAITCNT vmcnt(0)" is not required because the hardware
+ // does not reorder memory operations with respect to a preceding buffer
+ // invalidate. The invalidate is guaranteed to remove any cache lines of
+ // earlier writes and ensures later reads will refetch the cache lines.
+ Changed = true;
+ }
+ break;
+ case SIAtomicScope::WAVEFRONT:
+ case SIAtomicScope::SINGLETHREAD:
+ // Could generate "BUFFER_INV" but it would do nothing as there are no
+ // caches to invalidate.
+ break;
+ default:
+ llvm_unreachable("Unsupported synchronization scope");
+ }
+ }
+
+ /// The scratch address space does not need the global memory cache
+ /// to be flushed as all memory operations by the same thread are
+ /// sequentially consistent, and no other thread can access scratch
+ /// memory.
+
+ /// Other address spaces do not have a cache.
+
+ if (Pos == Position::AFTER)
+ --MI;
+
+ return Changed;
+}
+
+bool SIGfx940CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
+ SIAtomicScope Scope,
+ SIAtomicAddrSpace AddrSpace,
+ bool IsCrossAddrSpaceOrdering,
+ Position Pos) const {
+ bool Changed = false;
+
+ MachineBasicBlock &MBB = *MI->getParent();
+ DebugLoc DL = MI->getDebugLoc();
+
+ if (Pos == Position::AFTER)
+ ++MI;
+
+ if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
+ switch (Scope) {
+ case SIAtomicScope::SYSTEM:
+ // Inserting a "S_WAITCNT vmcnt(0)" before is not required because the
+ // hardware does not reorder memory operations by the same wave with
+ // respect to a following "BUFFER_WBL2". The "BUFFER_WBL2" is guaranteed
+ // to initiate writeback of any dirty cache lines of earlier writes by the
+ // same wave. A "S_WAITCNT vmcnt(0)" is needed after to ensure the
+ // writeback has completed.
+ BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2))
+ // Set SC bits to indicate system scope.
+ .addImm(AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1);
+ // Since AddrSpace contains SIAtomicAddrSpace::GLOBAL and Scope is
+ // SIAtomicScope::SYSTEM, the following insertWait will generate the
+ // required "S_WAITCNT vmcnt(0)" needed by the "BUFFER_WBL2".
+ Changed = true;
+ break;
+ case SIAtomicScope::AGENT:
+ BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2))
+ // Set SC bits to indicate agent scope.
+ .addImm(AMDGPU::CPol::SC1);
+
+ // Since AddrSpace contains SIAtomicAddrSpace::GLOBAL and Scope is
+ // SIAtomicScope::AGENT, the following insertWait will generate the
+ // required "S_WAITCNT vmcnt(0)".
+ Changed = true;
+ break;
+ case SIAtomicScope::WORKGROUP:
+ case SIAtomicScope::WAVEFRONT:
+ case SIAtomicScope::SINGLETHREAD:
+ // Do not generate "BUFFER_WBL2" as there are no caches it would
+ // writeback, and would require an otherwise unnecessary
+ // "S_WAITCNT vmcnt(0)".
+ break;
+ default:
+ llvm_unreachable("Unsupported synchronization scope");
+ }
+ }
+
+ if (Pos == Position::AFTER)
+ --MI;
+
+ // Insert the S_WAITCNT needed by any "BUFFER_WBL2" above, as well as any
+ // other S_WAITCNT that is required.
+ Changed |= insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
+ IsCrossAddrSpaceOrdering, Pos);
+
+ return Changed;
+}
+
bool SIGfx10CacheControl::enableLoadCacheBypass(
const MachineBasicBlock::iterator &MI,
SIAtomicScope Scope,
@@ -1547,7 +1920,7 @@ bool SIGfx10CacheControl::insertWait(MachineBasicBlock::iterator &MI,
case SIAtomicScope::WAVEFRONT:
case SIAtomicScope::SINGLETHREAD:
// The LDS keeps all memory operations in order for
- // the same wavesfront.
+ // the same wavefront.
break;
default:
llvm_unreachable("Unsupported synchronization scope");
@@ -1655,6 +2028,101 @@ bool SIGfx10CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
return Changed;
}
+bool SIGfx11CacheControl::enableLoadCacheBypass(
+ const MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
+ SIAtomicAddrSpace AddrSpace) const {
+ assert(MI->mayLoad() && !MI->mayStore());
+ bool Changed = false;
+
+ if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
+ switch (Scope) {
+ case SIAtomicScope::SYSTEM:
+ case SIAtomicScope::AGENT:
+ // Set the L0 and L1 cache policies to MISS_EVICT.
+ // Note: there is no L2 cache coherent bypass control at the ISA level.
+ Changed |= enableGLCBit(MI);
+ break;
+ case SIAtomicScope::WORKGROUP:
+ // In WGP mode the waves of a work-group can be executing on either CU of
+ // the WGP. Therefore need to bypass the L0 which is per CU. Otherwise in
+ // CU mode all waves of a work-group are on the same CU, and so the L0
+ // does not need to be bypassed.
+ if (!ST.isCuModeEnabled())
+ Changed |= enableGLCBit(MI);
+ break;
+ case SIAtomicScope::WAVEFRONT:
+ case SIAtomicScope::SINGLETHREAD:
+ // No cache to bypass.
+ break;
+ default:
+ llvm_unreachable("Unsupported synchronization scope");
+ }
+ }
+
+ /// The scratch address space does not need the global memory caches
+ /// to be bypassed as all memory operations by the same thread are
+ /// sequentially consistent, and no other thread can access scratch
+ /// memory.
+
+ /// Other address spaces do not have a cache.
+
+ return Changed;
+}
+
+bool SIGfx11CacheControl::enableVolatileAndOrNonTemporal(
+ MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
+ bool IsVolatile, bool IsNonTemporal) const {
+
+ // Only handle load and store, not atomic read-modify-write instructions. The
+ // latter use glc to indicate if the atomic returns a result and so must not
+ // be used for cache control.
+ assert(MI->mayLoad() ^ MI->mayStore());
+
+ // Only update load and store, not LLVM IR atomic read-modify-write
+ // instructions. The latter are always marked as volatile, so we cannot
+ // sensibly handle them without pessimizing all atomics. They also do not
+ // support the nontemporal attribute.
+ assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
+
+ bool Changed = false;
+
+ if (IsVolatile) {
+ // Set L0 and L1 cache policy to be MISS_EVICT for load instructions
+ // and MISS_LRU for store instructions.
+ // Note: there is no L2 cache coherent bypass control at the ISA level.
+ if (Op == SIMemOp::LOAD)
+ Changed |= enableGLCBit(MI);
+
+ // Set MALL NOALLOC for load and store instructions.
+ Changed |= enableDLCBit(MI);
+
+ // Ensure operation has completed at system scope to cause all volatile
+ // operations to be visible outside the program in a global order. Do not
+ // request cross address space as only the global address space can be
+ // observable outside the program, so no need to cause a waitcnt for LDS
+ // address space operations.
+ Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
+ Position::AFTER);
+ return Changed;
+ }
+
+ if (IsNonTemporal) {
+ // For loads setting SLC configures L0 and L1 cache policy to HIT_EVICT
+ // and L2 cache policy to STREAM.
+ // For stores setting both GLC and SLC configures L0 and L1 cache policy
+ // to MISS_EVICT and the L2 cache policy to STREAM.
+ if (Op == SIMemOp::STORE)
+ Changed |= enableGLCBit(MI);
+ Changed |= enableSLCBit(MI);
+
+ // Set MALL NOALLOC for load and store instructions.
+ Changed |= enableDLCBit(MI);
+ return Changed;
+ }
+
+ return Changed;
+}
+
bool SIMemoryLegalizer::removeAtomicPseudoMIs() {
if (AtomicPseudoMIs.empty())
return false;
diff --git a/llvm/lib/Target/AMDGPU/SIModeRegister.cpp b/llvm/lib/Target/AMDGPU/SIModeRegister.cpp
index 24a8879b5684..a5816e2e8c73 100644
--- a/llvm/lib/Target/AMDGPU/SIModeRegister.cpp
+++ b/llvm/lib/Target/AMDGPU/SIModeRegister.cpp
@@ -17,6 +17,7 @@
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
#include <queue>
#define DEBUG_TYPE "si-mode-register"
@@ -162,7 +163,9 @@ FunctionPass *llvm::createSIModeRegisterPass() { return new SIModeRegister(); }
// double precision setting.
Status SIModeRegister::getInstructionMode(MachineInstr &MI,
const SIInstrInfo *TII) {
- if (TII->usesFPDPRounding(MI)) {
+ if (TII->usesFPDPRounding(MI) ||
+ MI.getOpcode() == AMDGPU::FPTRUNC_UPWARD_PSEUDO ||
+ MI.getOpcode() == AMDGPU::FPTRUNC_DOWNWARD_PSEUDO) {
switch (MI.getOpcode()) {
case AMDGPU::V_INTERP_P1LL_F16:
case AMDGPU::V_INTERP_P1LV_F16:
@@ -170,6 +173,18 @@ Status SIModeRegister::getInstructionMode(MachineInstr &MI,
// f16 interpolation instructions need double precision round to zero
return Status(FP_ROUND_MODE_DP(3),
FP_ROUND_MODE_DP(FP_ROUND_ROUND_TO_ZERO));
+ case AMDGPU::FPTRUNC_UPWARD_PSEUDO: {
+ // Replacing the pseudo by a real instruction
+ MI.setDesc(TII->get(AMDGPU::V_CVT_F16_F32_e32));
+ return Status(FP_ROUND_MODE_DP(3),
+ FP_ROUND_MODE_DP(FP_ROUND_ROUND_TO_INF));
+ }
+ case AMDGPU::FPTRUNC_DOWNWARD_PSEUDO: {
+ // Replacing the pseudo by a real instruction
+ MI.setDesc(TII->get(AMDGPU::V_CVT_F16_F32_e32));
+ return Status(FP_ROUND_MODE_DP(3),
+ FP_ROUND_MODE_DP(FP_ROUND_ROUND_TO_NEGINF));
+ }
default:
return DefaultStatus;
}
diff --git a/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp b/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp
index b9c839fe28ba..5215397d5936 100644
--- a/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp
+++ b/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp
@@ -9,6 +9,7 @@
#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "llvm/CodeGen/LivePhysRegs.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/InitializePasses.h"
@@ -292,6 +293,210 @@ static bool isLiveOut(const MachineBasicBlock &MBB, unsigned Reg) {
return false;
}
+// Backwards-iterate from Origin (for n=MaxInstructions iterations) until either
+// the beginning of the BB is reached or Pred evaluates to true - which can be
+// an arbitrary condition based on the current MachineInstr, for instance a
+// target instruction. Breaks prematurely by returning nullptr if one of the
+// registers given in NonModifiableRegs is modified by the current instruction.
+static MachineInstr *
+findInstrBackwards(MachineInstr &Origin,
+ std::function<bool(MachineInstr *)> Pred,
+ ArrayRef<MCRegister> NonModifiableRegs,
+ const SIRegisterInfo *TRI, unsigned MaxInstructions = 20) {
+ MachineBasicBlock::reverse_iterator A = Origin.getReverseIterator(),
+ E = Origin.getParent()->rend();
+ unsigned CurrentIteration = 0;
+
+ for (++A; CurrentIteration < MaxInstructions && A != E; ++A) {
+ if (A->isDebugInstr())
+ continue;
+
+ if (Pred(&*A))
+ return &*A;
+
+ for (MCRegister Reg : NonModifiableRegs) {
+ if (A->modifiesRegister(Reg, TRI))
+ return nullptr;
+ }
+
+ ++CurrentIteration;
+ }
+
+ return nullptr;
+}
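A small usage sketch, under the assumption that the caller already has TRI and the EXEC register at hand: the predicate is an arbitrary callable, and the search aborts if any register in the protected set is clobbered first (this is how findPossibleVCMPVCMPXOptimization uses the helper further down).

// Hypothetical caller: find the closest preceding S_MOV_B32 that is not
// separated from Origin by any write to Exec.
static MachineInstr *findPrecedingMov(MachineInstr &Origin, MCRegister Exec,
                                      const SIRegisterInfo *TRI) {
  return findInstrBackwards(
      Origin,
      [](MachineInstr *MI) { return MI->getOpcode() == AMDGPU::S_MOV_B32; },
      {Exec}, TRI);
}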
+
+
+// Determine if a register Reg is not re-defined and still in use
+// in the range (Stop..Start].
+// It does so by backwards calculating liveness from the end of the BB until
+// either Stop or the beginning of the BB is reached.
+// After liveness is calculated, we can determine if Reg is still in use and not
+// defined in between the instructions.
+static bool isRegisterInUseBetween(MachineInstr &Stop, MachineInstr &Start,
+ MCRegister Reg, const SIRegisterInfo *TRI,
+ MachineRegisterInfo &MRI,
+ bool useLiveOuts = false,
+ bool ignoreStart = false) {
+ LivePhysRegs LR(*TRI);
+ if (useLiveOuts)
+ LR.addLiveOuts(*Stop.getParent());
+
+ MachineBasicBlock::reverse_iterator A(Start);
+ MachineBasicBlock::reverse_iterator E(Stop);
+
+ if (ignoreStart)
+ ++A;
+
+ for (; A != Stop.getParent()->rend() && A != Stop; ++A) {
+ LR.stepBackward(*A);
+ }
+
+ return !LR.available(MRI, Reg);
+}
+
+// Determine if a register Reg is not re-defined and still in use
+// in the range (Stop..BB.end].
+static bool isRegisterInUseAfter(MachineInstr &Stop, MCRegister Reg,
+ const SIRegisterInfo *TRI,
+ MachineRegisterInfo &MRI) {
+ return isRegisterInUseBetween(Stop, *Stop.getParent()->rbegin(), Reg, TRI,
+ MRI, true);
+}
+
+// Tries to find a possibility to optimize a v_cmp ..., s_and_saveexec sequence
+// by looking at an instance of a s_and_saveexec instruction. Returns a pointer
+// to the v_cmp instruction if it is safe to replace the sequence (see the
+// conditions in the function body). This is after register allocation, so some
+// checks on operand dependencies need to be considered.
+static MachineInstr *findPossibleVCMPVCMPXOptimization(
+ MachineInstr &SaveExec, MCRegister Exec, const SIRegisterInfo *TRI,
+ const SIInstrInfo *TII, MachineRegisterInfo &MRI) {
+
+ MachineInstr *VCmp = nullptr;
+
+ Register SaveExecDest = SaveExec.getOperand(0).getReg();
+ if (!TRI->isSGPRReg(MRI, SaveExecDest))
+ return nullptr;
+
+ MachineOperand *SaveExecSrc0 =
+ TII->getNamedOperand(SaveExec, AMDGPU::OpName::src0);
+ if (!SaveExecSrc0->isReg())
+ return nullptr;
+
+ // Try to find the last v_cmp instruction that defs the saveexec input
+ // operand without any write to Exec or the saveexec input operand in between.
+ VCmp = findInstrBackwards(
+ SaveExec,
+ [&](MachineInstr *Check) {
+ return AMDGPU::getVCMPXOpFromVCMP(Check->getOpcode()) != -1 &&
+ Check->modifiesRegister(SaveExecSrc0->getReg(), TRI);
+ },
+ {Exec, SaveExecSrc0->getReg()}, TRI);
+
+ if (!VCmp)
+ return nullptr;
+
+ MachineOperand *VCmpDest = TII->getNamedOperand(*VCmp, AMDGPU::OpName::sdst);
+ assert(VCmpDest && "Should have an sdst operand!");
+
+ // Check if any of the v_cmp source operands is written by the saveexec.
+ MachineOperand *Src0 = TII->getNamedOperand(*VCmp, AMDGPU::OpName::src0);
+ if (Src0->isReg() && TRI->isSGPRReg(MRI, Src0->getReg()) &&
+ SaveExec.modifiesRegister(Src0->getReg(), TRI))
+ return nullptr;
+
+ MachineOperand *Src1 = TII->getNamedOperand(*VCmp, AMDGPU::OpName::src1);
+ if (Src1->isReg() && TRI->isSGPRReg(MRI, Src1->getReg()) &&
+ SaveExec.modifiesRegister(Src1->getReg(), TRI))
+ return nullptr;
+
+ // Don't do the transformation if the destination operand is included in
+ // its MBB live-outs, meaning it is used in any of its successors, leading
+ // to incorrect code if the v_cmp, and therefore the def of the dest
+ // operand, is removed.
+ if (isLiveOut(*VCmp->getParent(), VCmpDest->getReg()))
+ return nullptr;
+
+ // If the v_cmp target is in use between v_cmp and s_and_saveexec or after the
+ // s_and_saveexec, skip the optimization.
+ if (isRegisterInUseBetween(*VCmp, SaveExec, VCmpDest->getReg(), TRI, MRI,
+ false, true) ||
+ isRegisterInUseAfter(SaveExec, VCmpDest->getReg(), TRI, MRI))
+ return nullptr;
+
+ // Try to determine if there is a write to any of the VCmp
+ // operands between the saveexec and the vcmp.
+ // If yes, additional VGPR spilling might need to be inserted. In this case,
+ // it's not worth replacing the instruction sequence.
+ SmallVector<MCRegister, 2> NonDefRegs;
+ if (Src0->isReg())
+ NonDefRegs.push_back(Src0->getReg());
+
+ if (Src1->isReg())
+ NonDefRegs.push_back(Src1->getReg());
+
+ if (!findInstrBackwards(
+ SaveExec, [&](MachineInstr *Check) { return Check == VCmp; },
+ NonDefRegs, TRI))
+ return nullptr;
+
+ return VCmp;
+}
+
+// Inserts the optimized s_mov_b32 / v_cmpx sequence based on the
+// operands extracted from a v_cmp ..., s_and_saveexec pattern.
+static bool optimizeVCMPSaveExecSequence(MachineInstr &SaveExecInstr,
+ MachineInstr &VCmp, MCRegister Exec,
+ const SIInstrInfo *TII,
+ const SIRegisterInfo *TRI,
+ MachineRegisterInfo &MRI) {
+ const int NewOpcode = AMDGPU::getVCMPXOpFromVCMP(VCmp.getOpcode());
+
+ if (NewOpcode == -1)
+ return false;
+
+ MachineOperand *Src0 = TII->getNamedOperand(VCmp, AMDGPU::OpName::src0);
+ MachineOperand *Src1 = TII->getNamedOperand(VCmp, AMDGPU::OpName::src1);
+
+ Register MoveDest = SaveExecInstr.getOperand(0).getReg();
+
+ MachineBasicBlock::instr_iterator InsertPosIt = SaveExecInstr.getIterator();
+ if (!SaveExecInstr.uses().empty()) {
+ bool isSGPR32 = TRI->getRegSizeInBits(MoveDest, MRI) == 32;
+ unsigned MovOpcode = isSGPR32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
+ BuildMI(*SaveExecInstr.getParent(), InsertPosIt,
+ SaveExecInstr.getDebugLoc(), TII->get(MovOpcode), MoveDest)
+ .addReg(Exec);
+ }
+
+ // Omit dst as V_CMPX is implicitly writing to EXEC.
+ // Add dummy src and clamp modifiers, if needed.
+ auto Builder = BuildMI(*VCmp.getParent(), std::next(InsertPosIt),
+ VCmp.getDebugLoc(), TII->get(NewOpcode));
+
+ auto TryAddImmediateValueFromNamedOperand =
+ [&](unsigned OperandName) -> void {
+ if (auto *Mod = TII->getNamedOperand(VCmp, OperandName))
+ Builder.addImm(Mod->getImm());
+ };
+
+ TryAddImmediateValueFromNamedOperand(AMDGPU::OpName::src0_modifiers);
+ Builder.add(*Src0);
+
+ TryAddImmediateValueFromNamedOperand(AMDGPU::OpName::src1_modifiers);
+ Builder.add(*Src1);
+
+ TryAddImmediateValueFromNamedOperand(AMDGPU::OpName::clamp);
+
+ // The kill flags may no longer be correct.
+ if (Src0->isReg())
+ MRI.clearKillFlags(Src0->getReg());
+ if (Src1->isReg())
+ MRI.clearKillFlags(Src1->getReg());
+
+ return true;
+}
+
bool SIOptimizeExecMasking::runOnMachineFunction(MachineFunction &MF) {
if (skipFunction(MF.getFunction()))
return false;
@@ -299,6 +504,7 @@ bool SIOptimizeExecMasking::runOnMachineFunction(MachineFunction &MF) {
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
const SIRegisterInfo *TRI = ST.getRegisterInfo();
const SIInstrInfo *TII = ST.getInstrInfo();
+ MachineRegisterInfo *MRI = &MF.getRegInfo();
MCRegister Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
// Optimize sequences emitted for control flow lowering. They are originally
@@ -312,6 +518,7 @@ bool SIOptimizeExecMasking::runOnMachineFunction(MachineFunction &MF) {
// x = s_<op>_saveexec_b64 y
//
+ bool Changed = false;
for (MachineBasicBlock &MBB : MF) {
MachineBasicBlock::reverse_iterator I = fixTerminators(*TII, MBB);
MachineBasicBlock::reverse_iterator E = MBB.rend();
@@ -351,6 +558,7 @@ bool SIOptimizeExecMasking::runOnMachineFunction(MachineFunction &MF) {
LLVM_DEBUG(dbgs() << "into: " << *PrepareExecInst << '\n');
CopyToExecInst->eraseFromParent();
+ Changed = true;
}
continue;
@@ -456,8 +664,49 @@ bool SIOptimizeExecMasking::runOnMachineFunction(MachineFunction &MF) {
OtherInst->substituteRegister(CopyToExec, Exec,
AMDGPU::NoSubRegister, *TRI);
}
+
+ Changed = true;
}
- return true;
+ // After all s_op_saveexec instructions are inserted,
+ // replace (on GFX10.3 and later)
+ // v_cmp_* SGPR, IMM, VGPR
+ // s_and_saveexec_b32 EXEC_SGPR_DEST, SGPR
+ // with
+ // s_mov_b32 EXEC_SGPR_DEST, exec_lo
+ // v_cmpx_* IMM, VGPR
+ // to reduce pipeline stalls.
+ if (ST.hasGFX10_3Insts()) {
+ DenseMap<MachineInstr *, MachineInstr *> SaveExecVCmpMapping;
+ const unsigned AndSaveExecOpcode =
+ ST.isWave32() ? AMDGPU::S_AND_SAVEEXEC_B32 : AMDGPU::S_AND_SAVEEXEC_B64;
+
+ for (MachineBasicBlock &MBB : MF) {
+ for (MachineInstr &MI : MBB) {
+ // Record relevant v_cmp / s_and_saveexec instruction pairs for
+ // replacement.
+ if (MI.getOpcode() != AndSaveExecOpcode)
+ continue;
+
+ if (MachineInstr *VCmp =
+ findPossibleVCMPVCMPXOptimization(MI, Exec, TRI, TII, *MRI))
+ SaveExecVCmpMapping[&MI] = VCmp;
+ }
+ }
+
+ for (const auto &Entry : SaveExecVCmpMapping) {
+ MachineInstr *SaveExecInstr = Entry.getFirst();
+ MachineInstr *VCmpInstr = Entry.getSecond();
+
+ if (optimizeVCMPSaveExecSequence(*SaveExecInstr, *VCmpInstr, Exec, TII,
+ TRI, *MRI)) {
+ SaveExecInstr->eraseFromParent();
+ VCmpInstr->eraseFromParent();
+
+ Changed = true;
+ }
+ }
+ }
+ return Changed;
}
diff --git a/llvm/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp b/llvm/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp
index 5f89f3826683..e5e65a8dbbf1 100644
--- a/llvm/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp
+++ b/llvm/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp
@@ -39,7 +39,7 @@ private:
MCRegister CondReg;
MCRegister ExecReg;
- Register optimizeVcndVcmpPair(MachineBasicBlock &MBB);
+ bool optimizeVcndVcmpPair(MachineBasicBlock &MBB);
bool optimizeElseBranch(MachineBasicBlock &MBB);
public:
@@ -90,8 +90,8 @@ static bool isDefBetween(const LiveRange &LR, SlotIndex AndIdx,
static bool isDefBetween(const SIRegisterInfo &TRI,
LiveIntervals *LIS, Register Reg,
const MachineInstr &Sel, const MachineInstr &And) {
- SlotIndex AndIdx = LIS->getInstructionIndex(And);
- SlotIndex SelIdx = LIS->getInstructionIndex(Sel);
+ SlotIndex AndIdx = LIS->getInstructionIndex(And).getRegSlot();
+ SlotIndex SelIdx = LIS->getInstructionIndex(Sel).getRegSlot();
if (Reg.isVirtual())
return isDefBetween(LIS->getInterval(Reg), AndIdx, SelIdx);
@@ -119,21 +119,20 @@ static bool isDefBetween(const SIRegisterInfo &TRI,
// required part of the pattern since V_CNDMASK_B32 writes zeroes for inactive
// lanes.
//
-// Returns %cc register on success.
-Register
-SIOptimizeExecMaskingPreRA::optimizeVcndVcmpPair(MachineBasicBlock &MBB) {
+// Returns true on success.
+bool SIOptimizeExecMaskingPreRA::optimizeVcndVcmpPair(MachineBasicBlock &MBB) {
auto I = llvm::find_if(MBB.terminators(), [](const MachineInstr &MI) {
unsigned Opc = MI.getOpcode();
return Opc == AMDGPU::S_CBRANCH_VCCZ ||
Opc == AMDGPU::S_CBRANCH_VCCNZ; });
if (I == MBB.terminators().end())
- return Register();
+ return false;
auto *And =
TRI->findReachingDef(CondReg, AMDGPU::NoSubRegister, *I, *MRI, LIS);
if (!And || And->getOpcode() != AndOpc ||
!And->getOperand(1).isReg() || !And->getOperand(2).isReg())
- return Register();
+ return false;
MachineOperand *AndCC = &And->getOperand(1);
Register CmpReg = AndCC->getReg();
@@ -143,49 +142,49 @@ SIOptimizeExecMaskingPreRA::optimizeVcndVcmpPair(MachineBasicBlock &MBB) {
CmpReg = AndCC->getReg();
CmpSubReg = AndCC->getSubReg();
} else if (And->getOperand(2).getReg() != Register(ExecReg)) {
- return Register();
+ return false;
}
auto *Cmp = TRI->findReachingDef(CmpReg, CmpSubReg, *And, *MRI, LIS);
if (!Cmp || !(Cmp->getOpcode() == AMDGPU::V_CMP_NE_U32_e32 ||
Cmp->getOpcode() == AMDGPU::V_CMP_NE_U32_e64) ||
Cmp->getParent() != And->getParent())
- return Register();
+ return false;
MachineOperand *Op1 = TII->getNamedOperand(*Cmp, AMDGPU::OpName::src0);
MachineOperand *Op2 = TII->getNamedOperand(*Cmp, AMDGPU::OpName::src1);
if (Op1->isImm() && Op2->isReg())
std::swap(Op1, Op2);
if (!Op1->isReg() || !Op2->isImm() || Op2->getImm() != 1)
- return Register();
+ return false;
Register SelReg = Op1->getReg();
auto *Sel = TRI->findReachingDef(SelReg, Op1->getSubReg(), *Cmp, *MRI, LIS);
if (!Sel || Sel->getOpcode() != AMDGPU::V_CNDMASK_B32_e64)
- return Register();
+ return false;
if (TII->hasModifiersSet(*Sel, AMDGPU::OpName::src0_modifiers) ||
TII->hasModifiersSet(*Sel, AMDGPU::OpName::src1_modifiers))
- return Register();
+ return false;
Op1 = TII->getNamedOperand(*Sel, AMDGPU::OpName::src0);
Op2 = TII->getNamedOperand(*Sel, AMDGPU::OpName::src1);
MachineOperand *CC = TII->getNamedOperand(*Sel, AMDGPU::OpName::src2);
if (!Op1->isImm() || !Op2->isImm() || !CC->isReg() ||
Op1->getImm() != 0 || Op2->getImm() != 1)
- return Register();
+ return false;
Register CCReg = CC->getReg();
// If there was a def between the select and the and, we would need to move it
// to fold this.
if (isDefBetween(*TRI, LIS, CCReg, *Sel, *And))
- return Register();
+ return false;
+ // TODO: Guard against implicit def operands?
LLVM_DEBUG(dbgs() << "Folding sequence:\n\t" << *Sel << '\t' << *Cmp << '\t'
<< *And);
- LIS->RemoveMachineInstrFromMaps(*And);
MachineInstr *Andn2 =
BuildMI(MBB, *And, And->getDebugLoc(), TII->get(Andn2Opc),
And->getOperand(0).getReg())
@@ -196,34 +195,92 @@ SIOptimizeExecMaskingPreRA::optimizeVcndVcmpPair(MachineBasicBlock &MBB) {
MachineOperand &Andn2SCC = Andn2->getOperand(3);
assert(Andn2SCC.getReg() == AMDGPU::SCC);
Andn2SCC.setIsDead(AndSCC.isDead());
+
+ SlotIndex AndIdx = LIS->ReplaceMachineInstrInMaps(*And, *Andn2);
And->eraseFromParent();
- LIS->InsertMachineInstrInMaps(*Andn2);
LLVM_DEBUG(dbgs() << "=>\n\t" << *Andn2 << '\n');
+ SlotIndex CmpIdx = LIS->getInstructionIndex(*Cmp);
+ SlotIndex SelIdx = LIS->getInstructionIndex(*Sel);
+
+ LiveInterval *CmpLI =
+ CmpReg.isVirtual() ? &LIS->getInterval(CmpReg) : nullptr;
+ LiveInterval *SelLI =
+ SelReg.isVirtual() ? &LIS->getInterval(SelReg) : nullptr;
+
+ // Update live intervals for CCReg before potentially removing CmpReg/SelReg,
+ // and their associated liveness information.
+ if (CCReg.isVirtual()) {
+ // Note: this ignores that SelLI might have multiple internal values
+ // or splits and simply extends the live range to cover all cases
+ // where the result of the v_cndmask_b32 was live (e.g. loops).
+ // This could yield worse register allocation in rare edge cases.
+ SlotIndex EndIdx = AndIdx.getRegSlot();
+ if (SelLI && SelLI->endIndex() > EndIdx && SelLI->endIndex().isBlock())
+ EndIdx = SelLI->endIndex();
+
+ LiveInterval &CCLI = LIS->getInterval(CCReg);
+ auto CCQ = CCLI.Query(SelIdx.getRegSlot());
+ if (CCQ.valueIn()) {
+ CCLI.addSegment(LiveRange::Segment(SelIdx.getRegSlot(),
+ EndIdx, CCQ.valueIn()));
+ }
+
+ if (CC->getSubReg()) {
+ LaneBitmask Mask = TRI->getSubRegIndexLaneMask(CC->getSubReg());
+ BumpPtrAllocator &Allocator = LIS->getVNInfoAllocator();
+ CCLI.refineSubRanges(
+ Allocator, Mask,
+ [=](LiveInterval::SubRange &SR) {
+ auto CCQS = SR.Query(SelIdx.getRegSlot());
+ if (CCQS.valueIn()) {
+ SR.addSegment(LiveRange::Segment(
+ SelIdx.getRegSlot(), EndIdx, CCQS.valueIn()));
+ }
+ },
+ *LIS->getSlotIndexes(), *TRI);
+ CCLI.removeEmptySubRanges();
+
+ SmallVector<LiveInterval *> SplitLIs;
+ LIS->splitSeparateComponents(CCLI, SplitLIs);
+ }
+ } else
+ LIS->removeAllRegUnitsForPhysReg(CCReg);
+
// Try to remove compare. The cmp value should not be used between the cmp
// and s_and_b64 if it is VCC, or should simply be unused for any other register.
- if ((CmpReg.isVirtual() && MRI->use_nodbg_empty(CmpReg)) ||
+ if ((CmpReg.isVirtual() && CmpLI && CmpLI->Query(AndIdx.getRegSlot()).isKill()) ||
(CmpReg == Register(CondReg) &&
std::none_of(std::next(Cmp->getIterator()), Andn2->getIterator(),
[&](const MachineInstr &MI) {
return MI.readsRegister(CondReg, TRI);
}))) {
LLVM_DEBUG(dbgs() << "Erasing: " << *Cmp << '\n');
-
+ if (CmpLI)
+ LIS->removeVRegDefAt(*CmpLI, CmpIdx.getRegSlot());
LIS->RemoveMachineInstrFromMaps(*Cmp);
Cmp->eraseFromParent();
// Try to remove v_cndmask_b32.
- if (SelReg.isVirtual() && MRI->use_nodbg_empty(SelReg)) {
- LLVM_DEBUG(dbgs() << "Erasing: " << *Sel << '\n');
+ if (SelLI) {
+ bool CanRemoveSel = SelLI->Query(CmpIdx.getRegSlot()).isKill();
+ if (!CanRemoveSel) {
+ // Try to shrink the live interval and check for dead def instead.
+ LIS->shrinkToUses(SelLI, nullptr);
+ CanRemoveSel = SelLI->Query(SelIdx.getRegSlot()).isDeadDef();
+ }
+ if (CanRemoveSel) {
+ LLVM_DEBUG(dbgs() << "Erasing: " << *Sel << '\n');
- LIS->RemoveMachineInstrFromMaps(*Sel);
- Sel->eraseFromParent();
+ LIS->removeVRegDefAt(*SelLI, SelIdx.getRegSlot());
+ LIS->RemoveMachineInstrFromMaps(*Sel);
+ Sel->eraseFromParent();
+ }
}
}
- return CCReg;
+ return true;
}
// Optimize sequence
@@ -330,8 +387,7 @@ bool SIOptimizeExecMaskingPreRA::runOnMachineFunction(MachineFunction &MF) {
Changed = true;
}
- if (Register Reg = optimizeVcndVcmpPair(MBB)) {
- RecalcRegs.insert(Reg);
+ if (optimizeVcndVcmpPair(MBB)) {
RecalcRegs.insert(AMDGPU::VCC_LO);
RecalcRegs.insert(AMDGPU::VCC_HI);
RecalcRegs.insert(AMDGPU::SCC);
@@ -402,7 +458,7 @@ bool SIOptimizeExecMaskingPreRA::runOnMachineFunction(MachineFunction &MF) {
}
// If the only user of a logical operation is move to exec, fold it now
- // to prevent forming of saveexec. I.e:
+ // to prevent forming of saveexec. I.e.:
//
// %0:sreg_64 = COPY $exec
// %1:sreg_64 = S_AND_B64 %0:sreg_64, %2:sreg_64
diff --git a/llvm/lib/Target/AMDGPU/SIOptimizeVGPRLiveRange.cpp b/llvm/lib/Target/AMDGPU/SIOptimizeVGPRLiveRange.cpp
index e13e33ed5457..2ae3157bab49 100644
--- a/llvm/lib/Target/AMDGPU/SIOptimizeVGPRLiveRange.cpp
+++ b/llvm/lib/Target/AMDGPU/SIOptimizeVGPRLiveRange.cpp
@@ -112,8 +112,10 @@ public:
SmallVectorImpl<Register> &CandidateRegs) const;
void collectWaterfallCandidateRegisters(
- MachineBasicBlock *Loop,
- SmallSetVector<Register, 16> &CandidateRegs) const;
+ MachineBasicBlock *LoopHeader, MachineBasicBlock *LoopEnd,
+ SmallSetVector<Register, 16> &CandidateRegs,
+ SmallSetVector<MachineBasicBlock *, 2> &Blocks,
+ SmallVectorImpl<MachineInstr *> &Instructions) const;
void findNonPHIUsesInBlock(Register Reg, MachineBasicBlock *MBB,
SmallVectorImpl<MachineInstr *> &Uses) const;
@@ -131,7 +133,10 @@ public:
MachineBasicBlock *Flow, MachineBasicBlock *Endif,
SmallSetVector<MachineBasicBlock *, 16> &ElseBlocks) const;
- void optimizeWaterfallLiveRange(Register Reg, MachineBasicBlock *If) const;
+ void optimizeWaterfallLiveRange(
+ Register Reg, MachineBasicBlock *LoopHeader,
+ SmallSetVector<MachineBasicBlock *, 2> &LoopBlocks,
+ SmallVectorImpl<MachineInstr *> &Instructions) const;
SIOptimizeVGPRLiveRange() : MachineFunctionPass(ID) {}
@@ -323,12 +328,34 @@ void SIOptimizeVGPRLiveRange::collectCandidateRegisters(
/// Collect the registers used in the waterfall loop block that are defined
/// before.
void SIOptimizeVGPRLiveRange::collectWaterfallCandidateRegisters(
- MachineBasicBlock *Loop,
- SmallSetVector<Register, 16> &CandidateRegs) const {
+ MachineBasicBlock *LoopHeader, MachineBasicBlock *LoopEnd,
+ SmallSetVector<Register, 16> &CandidateRegs,
+ SmallSetVector<MachineBasicBlock *, 2> &Blocks,
+ SmallVectorImpl<MachineInstr *> &Instructions) const {
- for (auto &MI : Loop->instrs()) {
- if (MI.isDebugInstr())
- continue;
+ // Collect loop instructions, potentially spanning multiple blocks
+ auto *MBB = LoopHeader;
+ for (;;) {
+ Blocks.insert(MBB);
+ for (auto &MI : *MBB) {
+ if (MI.isDebugInstr())
+ continue;
+ Instructions.push_back(&MI);
+ }
+ if (MBB == LoopEnd)
+ break;
+
+ if ((MBB != LoopHeader && MBB->pred_size() != 1) ||
+ (MBB == LoopHeader && MBB->pred_size() != 2) || MBB->succ_size() != 1) {
+ LLVM_DEBUG(dbgs() << "Unexpected edges in CFG, ignoring loop\n");
+ return;
+ }
+
+ MBB = *MBB->succ_begin();
+ }
+
+ for (auto *I : Instructions) {
+ auto &MI = *I;
for (auto &MO : MI.operands()) {
if (!MO.isReg() || !MO.getReg() || MO.isDef())
@@ -340,16 +367,17 @@ void SIOptimizeVGPRLiveRange::collectWaterfallCandidateRegisters(
continue;
if (MO.readsReg()) {
- const MachineBasicBlock *DefMBB = MRI->getVRegDef(MOReg)->getParent();
+ MachineBasicBlock *DefMBB = MRI->getVRegDef(MOReg)->getParent();
// Make sure the value is defined before the LOOP block
- if (DefMBB != Loop && !CandidateRegs.contains(MOReg)) {
+ if (!Blocks.contains(DefMBB) && !CandidateRegs.contains(MOReg)) {
// If the variable is used after the loop, the register coalescer will
// merge the newly created register and remove the phi node again.
// Just do nothing in that case.
LiveVariables::VarInfo &OldVarInfo = LV->getVarInfo(MOReg);
bool IsUsed = false;
- for (auto *Succ : Loop->successors()) {
- if (Succ != Loop && OldVarInfo.isLiveIn(*Succ, MOReg, *MRI)) {
+ for (auto *Succ : LoopEnd->successors()) {
+ if (!Blocks.contains(Succ) &&
+ OldVarInfo.isLiveIn(*Succ, MOReg, *MRI)) {
IsUsed = true;
break;
}
@@ -513,7 +541,9 @@ void SIOptimizeVGPRLiveRange::optimizeLiveRange(
}
void SIOptimizeVGPRLiveRange::optimizeWaterfallLiveRange(
- Register Reg, MachineBasicBlock *Loop) const {
+ Register Reg, MachineBasicBlock *LoopHeader,
+ SmallSetVector<MachineBasicBlock *, 2> &Blocks,
+ SmallVectorImpl<MachineInstr *> &Instructions) const {
// Insert a new PHI, marking the value from the last loop iteration undef.
LLVM_DEBUG(dbgs() << "Optimizing " << printReg(Reg, TRI) << '\n');
const auto *RC = MRI->getRegClass(Reg);
@@ -525,15 +555,16 @@ void SIOptimizeVGPRLiveRange::optimizeWaterfallLiveRange(
for (auto &O : make_early_inc_range(MRI->use_operands(Reg))) {
auto *UseMI = O.getParent();
auto *UseBlock = UseMI->getParent();
- // Replace uses in Loop block
- if (UseBlock == Loop)
+ // Replace uses in Loop blocks
+ if (Blocks.contains(UseBlock))
O.setReg(NewReg);
}
- MachineInstrBuilder PHI = BuildMI(*Loop, Loop->getFirstNonPHI(), DebugLoc(),
- TII->get(TargetOpcode::PHI), NewReg);
- for (auto *Pred : Loop->predecessors()) {
- if (Pred == Loop)
+ MachineInstrBuilder PHI =
+ BuildMI(*LoopHeader, LoopHeader->getFirstNonPHI(), DebugLoc(),
+ TII->get(TargetOpcode::PHI), NewReg);
+ for (auto *Pred : LoopHeader->predecessors()) {
+ if (Blocks.contains(Pred))
PHI.addReg(UndefReg, RegState::Undef).addMBB(Pred);
else
PHI.addReg(Reg).addMBB(Pred);
@@ -542,21 +573,36 @@ void SIOptimizeVGPRLiveRange::optimizeWaterfallLiveRange(
LiveVariables::VarInfo &NewVarInfo = LV->getVarInfo(NewReg);
LiveVariables::VarInfo &OldVarInfo = LV->getVarInfo(Reg);
- // collectWaterfallCandidateRegisters only collects registers that are dead
- // after the loop. So we know that the old reg is not live throughout the
- // whole block anymore.
- OldVarInfo.AliveBlocks.reset(Loop->getNumber());
-
- // Mark the last use as kill
- for (auto &MI : reverse(Loop->instrs())) {
- if (MI.readsRegister(NewReg, TRI)) {
- MI.addRegisterKilled(NewReg, TRI);
- NewVarInfo.Kills.push_back(&MI);
+ // Find last use and mark as kill
+ MachineInstr *Kill = nullptr;
+ for (auto *MI : reverse(Instructions)) {
+ if (MI->readsRegister(NewReg, TRI)) {
+ MI->addRegisterKilled(NewReg, TRI);
+ NewVarInfo.Kills.push_back(MI);
+ Kill = MI;
break;
}
}
- assert(!NewVarInfo.Kills.empty() &&
- "Failed to find last usage of register in loop");
+ assert(Kill && "Failed to find last usage of register in loop");
+
+ MachineBasicBlock *KillBlock = Kill->getParent();
+ bool PostKillBlock = false;
+ for (auto *Block : Blocks) {
+ auto BBNum = Block->getNumber();
+
+ // collectWaterfallCandidateRegisters only collects registers that are dead
+ // after the loop. So we know that the old reg is no longer live throughout
+ // the waterfall loop.
+ OldVarInfo.AliveBlocks.reset(BBNum);
+
+ // The new register is live up to (and including) the block that kills it.
+ PostKillBlock |= (Block == KillBlock);
+ if (PostKillBlock) {
+ NewVarInfo.AliveBlocks.reset(BBNum);
+ } else if (Block != LoopHeader) {
+ NewVarInfo.AliveBlocks.set(BBNum);
+ }
+ }
}
char SIOptimizeVGPRLiveRange::ID = 0;
@@ -601,6 +647,10 @@ bool SIOptimizeVGPRLiveRange::runOnMachineFunction(MachineFunction &MF) {
if (!Endif)
continue;
+ // Skip unexpected control flow.
+ if (!MDT->dominates(&MBB, IfTarget) || !MDT->dominates(IfTarget, Endif))
+ continue;
+
SmallSetVector<MachineBasicBlock *, 16> ElseBlocks;
SmallVector<Register> CandidateRegs;
@@ -620,15 +670,22 @@ bool SIOptimizeVGPRLiveRange::runOnMachineFunction(MachineFunction &MF) {
for (auto Reg : CandidateRegs)
optimizeLiveRange(Reg, &MBB, IfTarget, Endif, ElseBlocks);
} else if (MI.getOpcode() == AMDGPU::SI_WATERFALL_LOOP) {
+ auto *LoopHeader = MI.getOperand(0).getMBB();
+ auto *LoopEnd = &MBB;
+
LLVM_DEBUG(dbgs() << "Checking Waterfall loop: "
- << printMBBReference(MBB) << '\n');
+ << printMBBReference(*LoopHeader) << '\n');
SmallSetVector<Register, 16> CandidateRegs;
- collectWaterfallCandidateRegisters(&MBB, CandidateRegs);
+ SmallVector<MachineInstr *, 16> Instructions;
+ SmallSetVector<MachineBasicBlock *, 2> Blocks;
+
+ collectWaterfallCandidateRegisters(LoopHeader, LoopEnd, CandidateRegs,
+ Blocks, Instructions);
MadeChange |= !CandidateRegs.empty();
// Now we are safe to optimize.
for (auto Reg : CandidateRegs)
- optimizeWaterfallLiveRange(Reg, &MBB);
+ optimizeWaterfallLiveRange(Reg, LoopHeader, Blocks, Instructions);
}
}
}
diff --git a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
index da41a5e2478a..e768a2f3e1a5 100644
--- a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
+++ b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
@@ -316,7 +316,7 @@ uint64_t SDWASrcOperand::getSrcMods(const SIInstrInfo *TII,
}
if (Abs || Neg) {
assert(!Sext &&
- "Float and integer src modifiers can't be set simulteniously");
+ "Float and integer src modifiers can't be set simultaneously");
Mods |= Abs ? SISrcMods::ABS : 0u;
Mods ^= Neg ? SISrcMods::NEG : 0u;
} else if (Sext) {
@@ -1131,16 +1131,16 @@ bool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI,
bool Converted = false;
for (auto &Operand : SDWAOperands) {
LLVM_DEBUG(dbgs() << *SDWAInst << "\nOperand: " << *Operand);
- // There should be no intesection between SDWA operands and potential MIs
+ // There should be no intersection between SDWA operands and potential MIs
// e.g.:
// v_and_b32 v0, 0xff, v1 -> src:v1 sel:BYTE_0
// v_and_b32 v2, 0xff, v0 -> src:v0 sel:BYTE_0
// v_add_u32 v3, v4, v2
//
- // In that example it is possible that we would fold 2nd instruction into 3rd
- // (v_add_u32_sdwa) and then try to fold 1st instruction into 2nd (that was
- // already destroyed). So if SDWAOperand is also a potential MI then do not
- // apply it.
+ // In that example it is possible that we would fold 2nd instruction into
+ // 3rd (v_add_u32_sdwa) and then try to fold 1st instruction into 2nd (that
+ // was already destroyed). So if SDWAOperand is also a potential MI then do
+ // not apply it.
if (PotentialMatches.count(Operand->getParentInst()) == 0)
Converted |= Operand->convertToSDWA(*SDWAInst, TII);
}
diff --git a/llvm/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp b/llvm/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp
index c2e2875ed6bf..4fab13bb44b1 100644
--- a/llvm/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp
+++ b/llvm/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp
@@ -18,7 +18,10 @@
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/LiveRegMatrix.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/RegisterClassInfo.h"
+#include "llvm/CodeGen/VirtRegMap.h"
#include "llvm/InitializePasses.h"
using namespace llvm;
@@ -85,9 +88,6 @@ FunctionPass *llvm::createSIPreAllocateWWMRegsPass() {
}
bool SIPreAllocateWWMRegs::processDef(MachineOperand &MO) {
- if (!MO.isReg())
- return false;
-
Register Reg = MO.getReg();
if (Reg.isPhysical())
return false;
@@ -111,7 +111,6 @@ bool SIPreAllocateWWMRegs::processDef(MachineOperand &MO) {
}
llvm_unreachable("physreg not found for WWM expression");
- return false;
}
void SIPreAllocateWWMRegs::rewriteRegs(MachineFunction &MF) {
@@ -142,7 +141,6 @@ void SIPreAllocateWWMRegs::rewriteRegs(MachineFunction &MF) {
}
SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
- MachineFrameInfo &FrameInfo = MF.getFrameInfo();
for (unsigned Reg : RegsToRewrite) {
LIS->removeInterval(Reg);
@@ -150,18 +148,7 @@ void SIPreAllocateWWMRegs::rewriteRegs(MachineFunction &MF) {
const Register PhysReg = VRM->getPhys(Reg);
assert(PhysReg != 0);
- // Check if PhysReg is already reserved
- if (!MFI->WWMReservedRegs.count(PhysReg)) {
- Optional<int> FI;
- if (!MFI->isEntryFunction()) {
- // Create a stack object for a possible spill in the function prologue.
- // Note: Non-CSR VGPR also need this as we may overwrite inactive lanes.
- const TargetRegisterClass *RC = TRI->getPhysRegClass(PhysReg);
- FI = FrameInfo.CreateSpillStackObject(TRI->getSpillSize(*RC),
- TRI->getSpillAlign(*RC));
- }
- MFI->reserveWWMRegister(PhysReg, FI);
- }
+ MFI->reserveWWMRegister(PhysReg);
}
RegsToRewrite.clear();
diff --git a/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp
index b0e45dd3e3e3..8d33b8a1fd4b 100644
--- a/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp
+++ b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp
@@ -74,6 +74,15 @@ bool SIPreEmitPeephole::optimizeVccBranch(MachineInstr &MI) const {
// We end up with this pattern sometimes after basic block placement.
// It happens while combining a block which assigns -1 or 0 to a saved mask
// and another block which consumes that saved mask and then a branch.
+ //
+ // While searching this also performs the following substitution:
+ // vcc = V_CMP
+ // vcc = S_AND exec, vcc
+ // S_CBRANCH_VCC[N]Z
+ // =>
+ // vcc = V_CMP
+ // S_CBRANCH_VCC[N]Z
+
bool Changed = false;
MachineBasicBlock &MBB = *MI.getParent();
const GCNSubtarget &ST = MBB.getParent()->getSubtarget<GCNSubtarget>();
@@ -121,19 +130,32 @@ bool SIPreEmitPeephole::optimizeVccBranch(MachineInstr &MI) const {
SReg = Op2.getReg();
auto M = std::next(A);
bool ReadsSreg = false;
+ bool ModifiesExec = false;
for (; M != E; ++M) {
if (M->definesRegister(SReg, TRI))
break;
if (M->modifiesRegister(SReg, TRI))
return Changed;
ReadsSreg |= M->readsRegister(SReg, TRI);
+ ModifiesExec |= M->modifiesRegister(ExecReg, TRI);
+ }
+ if (M == E)
+ return Changed;
+ // If SReg is VCC and SReg definition is a VALU comparison.
+ // This means S_AND with EXEC is not required.
+ // Erase the S_AND and return.
+ // Note: isVOPC is used instead of isCompare to catch V_CMP_CLASS
+ if (A->getOpcode() == And && SReg == CondReg && !ModifiesExec &&
+ TII->isVOPC(*M)) {
+ A->eraseFromParent();
+ return true;
}
- if (M == E || !M->isMoveImmediate() || !M->getOperand(1).isImm() ||
+ if (!M->isMoveImmediate() || !M->getOperand(1).isImm() ||
(M->getOperand(1).getImm() != -1 && M->getOperand(1).getImm() != 0))
return Changed;
MaskValue = M->getOperand(1).getImm();
// First if sreg is only used in the AND instruction fold the immediate
- // into into the AND.
+ // into the AND.
if (!ReadsSreg && Op2.isKill()) {
A->getOperand(2).ChangeToImmediate(MaskValue);
M->eraseFromParent();
@@ -213,7 +235,7 @@ bool SIPreEmitPeephole::optimizeVccBranch(MachineInstr &MI) const {
TII->get(IsVCCZ ? AMDGPU::S_CBRANCH_EXECZ : AMDGPU::S_CBRANCH_EXECNZ));
}
- MI.RemoveOperand(MI.findRegisterUseOperandIdx(CondReg, false /*Kill*/, TRI));
+ MI.removeOperand(MI.findRegisterUseOperandIdx(CondReg, false /*Kill*/, TRI));
MI.addImplicitDefUseOperands(*MBB.getParent());
return true;
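The justification for erasing the S_AND in the pattern documented above is that a VOPC compare only sets VCC bits for lanes active in EXEC, leaving inactive lanes clear, so masking the result with EXEC again changes nothing as long as EXEC has not been modified in between. A small self-contained sketch of that bit-level identity, using plain 64-bit masks as a hypothetical wave64 model:

#include <cassert>
#include <cstdint>

// Wave64 model: one bit per lane.
// A VOPC compare effectively writes its per-lane result ANDed with EXEC,
// i.e. inactive lanes always contribute 0 to VCC.
static uint64_t vopcResult(uint64_t PerLaneCompare, uint64_t Exec) {
  return PerLaneCompare & Exec;
}

int main() {
  uint64_t Exec = 0x00000000FFFF00FFull; // arbitrary active-lane mask
  uint64_t Cmp  = 0xDEADBEEFDEADBEEFull; // raw per-lane compare results
  uint64_t Vcc  = vopcResult(Cmp, Exec);

  // "vcc = S_AND exec, vcc" is a no-op here, which is why the peephole can
  // drop it when the VCC definition is a VOPC and EXEC is unchanged between
  // the compare and the branch.
  assert((Vcc & Exec) == Vcc);
}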
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
index 21aed4ececb5..ad1455ed20fd 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -19,7 +19,9 @@
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/LiveIntervals.h"
+#include "llvm/CodeGen/LivePhysRegs.h"
#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/RegisterScavenging.h"
using namespace llvm;
@@ -182,6 +184,16 @@ struct SGPRSpillBuilder {
TmpVGPRLive = true;
}
+ if (TmpVGPRLive) {
+ // We need to inform the scavenger that this index is already in use until
+ // we're done with the custom emergency spill.
+ RS->assignRegToScavengingIndex(TmpVGPRIndex, TmpVGPR);
+ }
+
+ // We may end up recursively calling the scavenger, and don't want to re-use
+ // the same register.
+ RS->setRegUsed(TmpVGPR);
+
// Try to scavenge SGPRs to save exec
assert(!SavedExecReg && "Exec is already saved, refuse to save again");
const TargetRegisterClass &RC =
@@ -202,6 +214,12 @@ struct SGPRSpillBuilder {
// Spill needed lanes
TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ false);
} else {
+ // The modify and restore of exec clobber SCC, which we would have to save
+ // and restore. FIXME: We probably would need to reserve a register for
+ // this.
+ if (RS->isRegUsed(AMDGPU::SCC))
+ MI->emitError("unhandled SGPR spill to memory");
+
// Spill active lanes
if (TmpVGPRLive)
TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ false,
@@ -251,6 +269,12 @@ struct SGPRSpillBuilder {
if (TmpVGPRLive)
TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ true);
}
+
+ // Inform the scavenger where we're releasing our custom scavenged register.
+ if (TmpVGPRLive) {
+ MachineBasicBlock::iterator RestorePt = std::prev(MI);
+ RS->assignRegToScavengingIndex(TmpVGPRIndex, TmpVGPR, &*RestorePt);
+ }
}
// Write TmpVGPR to memory or read TmpVGPR from memory.
@@ -265,6 +289,12 @@ struct SGPRSpillBuilder {
// Spill needed lanes
TRI.buildVGPRSpillLoadStore(*this, Index, Offset, IsLoad);
} else {
+ // The modify and restore of exec clobber SCC, which we would have to save
+ // and restore. FIXME: We probably would need to reserve a register for
+ // this.
+ if (RS->isRegUsed(AMDGPU::SCC))
+ MI->emitError("unhandled SGPR spill to memory");
+
// Spill active lanes
TRI.buildVGPRSpillLoadStore(*this, Index, Offset, IsLoad,
/*IsKill*/ false);
@@ -329,7 +359,7 @@ SIRegisterInfo::SIRegisterInfo(const GCNSubtarget &ST)
static auto InitializeSubRegFromChannelTableOnce = [this]() {
for (auto &Row : SubRegFromChannelTable)
Row.fill(AMDGPU::NoSubRegister);
- for (uint16_t Idx = 1; Idx < getNumSubRegIndices(); ++Idx) {
+ for (unsigned Idx = 1; Idx < getNumSubRegIndices(); ++Idx) {
unsigned Width = AMDGPUSubRegIdxRanges[Idx].Size / 32;
unsigned Offset = AMDGPUSubRegIdxRanges[Idx].Offset / 32;
assert(Width < SubRegFromChannelTableWidthMap.size());
@@ -364,13 +394,11 @@ const MCPhysReg *SIRegisterInfo::getCalleeSavedRegs(
case CallingConv::C:
case CallingConv::Fast:
case CallingConv::Cold:
- return MF->getSubtarget<GCNSubtarget>().hasGFX90AInsts()
- ? CSR_AMDGPU_HighRegs_With_AGPRs_SaveList
- : CSR_AMDGPU_HighRegs_SaveList;
+ return ST.hasGFX90AInsts() ? CSR_AMDGPU_GFX90AInsts_SaveList
+ : CSR_AMDGPU_SaveList;
case CallingConv::AMDGPU_Gfx:
- return MF->getSubtarget<GCNSubtarget>().hasGFX90AInsts()
- ? CSR_AMDGPU_SI_Gfx_With_AGPRs_SaveList
- : CSR_AMDGPU_SI_Gfx_SaveList;
+ return ST.hasGFX90AInsts() ? CSR_AMDGPU_SI_Gfx_GFX90AInsts_SaveList
+ : CSR_AMDGPU_SI_Gfx_SaveList;
default: {
// Dummy to not crash RegisterClassInfo.
static const MCPhysReg NoCalleeSavedReg = AMDGPU::NoRegister;
@@ -390,13 +418,11 @@ const uint32_t *SIRegisterInfo::getCallPreservedMask(const MachineFunction &MF,
case CallingConv::C:
case CallingConv::Fast:
case CallingConv::Cold:
- return MF.getSubtarget<GCNSubtarget>().hasGFX90AInsts()
- ? CSR_AMDGPU_HighRegs_With_AGPRs_RegMask
- : CSR_AMDGPU_HighRegs_RegMask;
+ return ST.hasGFX90AInsts() ? CSR_AMDGPU_GFX90AInsts_RegMask
+ : CSR_AMDGPU_RegMask;
case CallingConv::AMDGPU_Gfx:
- return MF.getSubtarget<GCNSubtarget>().hasGFX90AInsts()
- ? CSR_AMDGPU_SI_Gfx_With_AGPRs_RegMask
- : CSR_AMDGPU_SI_Gfx_RegMask;
+ return ST.hasGFX90AInsts() ? CSR_AMDGPU_SI_Gfx_GFX90AInsts_RegMask
+ : CSR_AMDGPU_SI_Gfx_RegMask;
default:
return nullptr;
}
@@ -413,8 +439,7 @@ SIRegisterInfo::getLargestLegalSuperClass(const TargetRegisterClass *RC,
// equivalent AV class. If used one, the verifier will crash after
// RegBankSelect in the GISel flow. The aligned regclasses are not fully given
// until Instruction selection.
- if (MF.getSubtarget<GCNSubtarget>().hasMAIInsts() &&
- (isVGPRClass(RC) || isAGPRClass(RC))) {
+ if (ST.hasMAIInsts() && (isVGPRClass(RC) || isAGPRClass(RC))) {
if (RC == &AMDGPU::VGPR_32RegClass || RC == &AMDGPU::AGPR_32RegClass)
return &AMDGPU::AV_32RegClass;
if (RC == &AMDGPU::VReg_64RegClass || RC == &AMDGPU::AReg_64RegClass)
@@ -463,8 +488,7 @@ SIRegisterInfo::getLargestLegalSuperClass(const TargetRegisterClass *RC,
}
Register SIRegisterInfo::getFrameRegister(const MachineFunction &MF) const {
- const SIFrameLowering *TFI =
- MF.getSubtarget<GCNSubtarget>().getFrameLowering();
+ const SIFrameLowering *TFI = ST.getFrameLowering();
const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
// During ISel lowering we always reserve the stack pointer in entry
// functions, but never actually want to reference it when accessing our own
@@ -487,19 +511,19 @@ bool SIRegisterInfo::hasBasePointer(const MachineFunction &MF) const {
Register SIRegisterInfo::getBaseRegister() const { return AMDGPU::SGPR34; }
const uint32_t *SIRegisterInfo::getAllVGPRRegMask() const {
- return CSR_AMDGPU_AllVGPRs_RegMask;
+ return AMDGPU_AllVGPRs_RegMask;
}
const uint32_t *SIRegisterInfo::getAllAGPRRegMask() const {
- return CSR_AMDGPU_AllAGPRs_RegMask;
+ return AMDGPU_AllAGPRs_RegMask;
}
const uint32_t *SIRegisterInfo::getAllVectorRegMask() const {
- return CSR_AMDGPU_AllVectorRegs_RegMask;
+ return AMDGPU_AllVectorRegs_RegMask;
}
const uint32_t *SIRegisterInfo::getAllAllocatableSRegMask() const {
- return CSR_AMDGPU_AllAllocatableSRegs_RegMask;
+ return AMDGPU_AllAllocatableSRegs_RegMask;
}
unsigned SIRegisterInfo::getSubRegFromChannel(unsigned Channel,
@@ -522,6 +546,10 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
BitVector Reserved(getNumRegs());
Reserved.set(AMDGPU::MODE);
+ const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+
+ // Reserve special purpose registers.
+ //
// EXEC_LO and EXEC_HI could be allocated and used as regular register, but
// this seems likely to result in bugs, so I'm marking them as reserved.
reserveRegisterTuples(Reserved, AMDGPU::EXEC);
@@ -563,7 +591,7 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
reserveRegisterTuples(Reserved, AMDGPU::TTMP14_TTMP15);
// Reserve null register - it shall never be allocated
- reserveRegisterTuples(Reserved, AMDGPU::SGPR_NULL);
+ reserveRegisterTuples(Reserved, AMDGPU::SGPR_NULL64);
// Disallow vcc_hi allocation in wave32. It may be allocated but most likely
// will result in bugs.
@@ -572,6 +600,8 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
Reserved.set(AMDGPU::VCC_HI);
}
+ // Reserve SGPRs.
+ //
unsigned MaxNumSGPRs = ST.getMaxNumSGPRs(MF);
unsigned TotalNumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
for (unsigned i = MaxNumSGPRs; i < TotalNumSGPRs; ++i) {
@@ -579,39 +609,6 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
reserveRegisterTuples(Reserved, Reg);
}
- const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
- unsigned MaxNumVGPRs = ST.getMaxNumVGPRs(MF);
- unsigned MaxNumAGPRs = MaxNumVGPRs;
- unsigned TotalNumVGPRs = AMDGPU::VGPR_32RegClass.getNumRegs();
-
- if (ST.hasGFX90AInsts()) {
- // In an entry function without calls and AGPRs used it is possible to use
- // the whole register budget for VGPRs.
-
- // TODO: it shall be possible to estimate maximum AGPR/VGPR pressure and
- // split register file accordingly.
- if (MFI->usesAGPRs(MF)) {
- MaxNumVGPRs /= 2;
- MaxNumAGPRs = MaxNumVGPRs;
- } else {
- if (MaxNumVGPRs > TotalNumVGPRs) {
- MaxNumAGPRs = MaxNumVGPRs - TotalNumVGPRs;
- MaxNumVGPRs = TotalNumVGPRs;
- } else
- MaxNumAGPRs = 0;
- }
- }
-
- for (unsigned i = MaxNumVGPRs; i < TotalNumVGPRs; ++i) {
- unsigned Reg = AMDGPU::VGPR_32RegClass.getRegister(i);
- reserveRegisterTuples(Reserved, Reg);
- }
-
- for (unsigned i = MaxNumAGPRs; i < TotalNumVGPRs; ++i) {
- unsigned Reg = AMDGPU::AGPR_32RegClass.getRegister(i);
- reserveRegisterTuples(Reserved, Reg);
- }
-
for (auto Reg : AMDGPU::SReg_32RegClass) {
Reserved.set(getSubReg(Reg, AMDGPU::hi16));
Register Low = getSubReg(Reg, AMDGPU::lo16);
@@ -620,22 +617,10 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
Reserved.set(Low);
}
- for (auto Reg : AMDGPU::AGPR_32RegClass) {
- Reserved.set(getSubReg(Reg, AMDGPU::hi16));
- }
-
- // Reserve all the rest AGPRs if there are no instructions to use it.
- if (!ST.hasMAIInsts()) {
- for (unsigned i = 0; i < MaxNumVGPRs; ++i) {
- unsigned Reg = AMDGPU::AGPR_32RegClass.getRegister(i);
- reserveRegisterTuples(Reserved, Reg);
- }
- }
-
Register ScratchRSrcReg = MFI->getScratchRSrcReg();
if (ScratchRSrcReg != AMDGPU::NoRegister) {
- // Reserve 4 SGPRs for the scratch buffer resource descriptor in case we need
- // to spill.
+ // Reserve 4 SGPRs for the scratch buffer resource descriptor in case we
+ // need to spill.
// TODO: May need to reserve a VGPR if doing LDS spilling.
reserveRegisterTuples(Reserved, ScratchRSrcReg);
}
@@ -644,7 +629,6 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
// which is detected after the function is lowered. If we aren't really going
// to need SP, don't bother reserving it.
MCRegister StackPtrReg = MFI->getStackPtrOffsetReg();
-
if (StackPtrReg) {
reserveRegisterTuples(Reserved, StackPtrReg);
assert(!isSubRegister(ScratchRSrcReg, StackPtrReg));
@@ -662,20 +646,63 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
assert(!isSubRegister(ScratchRSrcReg, BasePtrReg));
}
- for (auto Reg : MFI->WWMReservedRegs) {
- reserveRegisterTuples(Reserved, Reg.first);
+ // Reserve VGPRs/AGPRs.
+ //
+ unsigned MaxNumVGPRs = ST.getMaxNumVGPRs(MF);
+ unsigned MaxNumAGPRs = MaxNumVGPRs;
+ unsigned TotalNumVGPRs = AMDGPU::VGPR_32RegClass.getNumRegs();
+
+  // Reserve all the AGPRs if there are no instructions to use them.
+ if (!ST.hasMAIInsts()) {
+ for (unsigned i = 0; i < MaxNumAGPRs; ++i) {
+ unsigned Reg = AMDGPU::AGPR_32RegClass.getRegister(i);
+ reserveRegisterTuples(Reserved, Reg);
+ }
}
- // Reserve VGPRs used for SGPR spilling.
- // Note we treat freezeReservedRegs unusually because we run register
- // allocation in two phases. It's OK to re-freeze with new registers for the
- // second run.
-#if 0
- for (auto &SpilledFI : MFI->sgpr_spill_vgprs()) {
- for (auto &SpilledVGPR : SpilledFI.second)
- reserveRegisterTuples(Reserved, SpilledVGPR.VGPR);
+ for (auto Reg : AMDGPU::AGPR_32RegClass) {
+ Reserved.set(getSubReg(Reg, AMDGPU::hi16));
+ }
+
+ // On GFX90A, the number of VGPRs and AGPRs need not be equal. Theoretically,
+ // a wave may have up to 512 total vector registers combining together both
+ // VGPRs and AGPRs. Hence, in an entry function without calls and without
+ // AGPRs used within it, it is possible to use the whole vector register
+ // budget for VGPRs.
+ //
+ // TODO: it shall be possible to estimate maximum AGPR/VGPR pressure and split
+ // register file accordingly.
+ if (ST.hasGFX90AInsts()) {
+ if (MFI->usesAGPRs(MF)) {
+ MaxNumVGPRs /= 2;
+ MaxNumAGPRs = MaxNumVGPRs;
+ } else {
+ if (MaxNumVGPRs > TotalNumVGPRs) {
+ MaxNumAGPRs = MaxNumVGPRs - TotalNumVGPRs;
+ MaxNumVGPRs = TotalNumVGPRs;
+ } else
+ MaxNumAGPRs = 0;
+ }
}
-#endif
+
+ for (unsigned i = MaxNumVGPRs; i < TotalNumVGPRs; ++i) {
+ unsigned Reg = AMDGPU::VGPR_32RegClass.getRegister(i);
+ reserveRegisterTuples(Reserved, Reg);
+ }
+
+ for (unsigned i = MaxNumAGPRs; i < TotalNumVGPRs; ++i) {
+ unsigned Reg = AMDGPU::AGPR_32RegClass.getRegister(i);
+ reserveRegisterTuples(Reserved, Reg);
+ }
+
+ // On GFX908, in order to guarantee copying between AGPRs, we need a scratch
+ // VGPR available at all times.
+ if (ST.hasMAIInsts() && !ST.hasGFX90AInsts()) {
+ reserveRegisterTuples(Reserved, MFI->getVGPRForAGPRCopy());
+ }
+
+ for (Register Reg : MFI->WWMReservedRegs)
+ reserveRegisterTuples(Reserved, Reg);
// FIXME: Stop using reserved registers for this.
for (MCPhysReg Reg : MFI->getAGPRSpillVGPRs())
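The relocated gfx90a block above splits a single vector-register budget between VGPRs and AGPRs: an even split when the function uses AGPRs, otherwise VGPRs get the whole architectural file and any remaining budget spills over to AGPRs. A standalone sketch of just that arithmetic, assuming a 256-register architectural VGPR file (names are illustrative, not LLVM APIs):

#include <cassert>
#include <utility>

// Split a combined vector register budget the way the gfx90a branch above
// does. TotalNumVGPRs models the 256-entry architectural VGPR file;
// MaxNumVGPRs is the per-wave budget from the subtarget.
static std::pair<unsigned, unsigned>
splitVectorRegBudget(unsigned MaxNumVGPRs, bool UsesAGPRs,
                     unsigned TotalNumVGPRs = 256) {
  unsigned MaxNumAGPRs = MaxNumVGPRs;
  if (UsesAGPRs) {
    // Split the budget evenly between the two files.
    MaxNumVGPRs /= 2;
    MaxNumAGPRs = MaxNumVGPRs;
  } else if (MaxNumVGPRs > TotalNumVGPRs) {
    // No AGPR use: give VGPRs the whole architectural file and let the
    // remainder of the budget flow to AGPRs.
    MaxNumAGPRs = MaxNumVGPRs - TotalNumVGPRs;
    MaxNumVGPRs = TotalNumVGPRs;
  } else {
    MaxNumAGPRs = 0;
  }
  return {MaxNumVGPRs, MaxNumAGPRs};
}

int main() {
  // A function that never touches AGPRs keeps the full budget for VGPRs.
  assert(splitVectorRegBudget(128, /*UsesAGPRs=*/false) ==
         std::make_pair(128u, 0u));
  // Once AGPRs are used the budget is split evenly.
  assert(splitVectorRegBudget(128, /*UsesAGPRs=*/true) ==
         std::make_pair(64u, 64u));
  // A 512-register budget with no AGPR use still yields 256 VGPRs plus 256
  // "overflow" AGPRs.
  assert(splitVectorRegBudget(512, /*UsesAGPRs=*/false) ==
         std::make_pair(256u, 256u));
}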
@@ -690,6 +717,11 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
return Reserved;
}
+bool SIRegisterInfo::isAsmClobberable(const MachineFunction &MF,
+ MCRegister PhysReg) const {
+ return !MF.getRegInfo().isReserved(PhysReg);
+}
+
bool SIRegisterInfo::shouldRealignStack(const MachineFunction &MF) const {
const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
// On entry, the base address is 0, so it can't possibly need any more
@@ -1010,6 +1042,8 @@ static int getOffsetMUBUFStore(unsigned Opc) {
return AMDGPU::BUFFER_STORE_SHORT_OFFSET;
case AMDGPU::BUFFER_STORE_DWORDX2_OFFEN:
return AMDGPU::BUFFER_STORE_DWORDX2_OFFSET;
+ case AMDGPU::BUFFER_STORE_DWORDX3_OFFEN:
+ return AMDGPU::BUFFER_STORE_DWORDX3_OFFSET;
case AMDGPU::BUFFER_STORE_DWORDX4_OFFEN:
return AMDGPU::BUFFER_STORE_DWORDX4_OFFSET;
case AMDGPU::BUFFER_STORE_SHORT_D16_HI_OFFEN:
@@ -1035,6 +1069,8 @@ static int getOffsetMUBUFLoad(unsigned Opc) {
return AMDGPU::BUFFER_LOAD_SSHORT_OFFSET;
case AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN:
return AMDGPU::BUFFER_LOAD_DWORDX2_OFFSET;
+ case AMDGPU::BUFFER_LOAD_DWORDX3_OFFEN:
+ return AMDGPU::BUFFER_LOAD_DWORDX3_OFFSET;
case AMDGPU::BUFFER_LOAD_DWORDX4_OFFEN:
return AMDGPU::BUFFER_LOAD_DWORDX4_OFFSET;
case AMDGPU::BUFFER_LOAD_UBYTE_D16_OFFEN:
@@ -1054,6 +1090,64 @@ static int getOffsetMUBUFLoad(unsigned Opc) {
}
}
+static int getOffenMUBUFStore(unsigned Opc) {
+ switch (Opc) {
+ case AMDGPU::BUFFER_STORE_DWORD_OFFSET:
+ return AMDGPU::BUFFER_STORE_DWORD_OFFEN;
+ case AMDGPU::BUFFER_STORE_BYTE_OFFSET:
+ return AMDGPU::BUFFER_STORE_BYTE_OFFEN;
+ case AMDGPU::BUFFER_STORE_SHORT_OFFSET:
+ return AMDGPU::BUFFER_STORE_SHORT_OFFEN;
+ case AMDGPU::BUFFER_STORE_DWORDX2_OFFSET:
+ return AMDGPU::BUFFER_STORE_DWORDX2_OFFEN;
+ case AMDGPU::BUFFER_STORE_DWORDX3_OFFSET:
+ return AMDGPU::BUFFER_STORE_DWORDX3_OFFEN;
+ case AMDGPU::BUFFER_STORE_DWORDX4_OFFSET:
+ return AMDGPU::BUFFER_STORE_DWORDX4_OFFEN;
+ case AMDGPU::BUFFER_STORE_SHORT_D16_HI_OFFSET:
+ return AMDGPU::BUFFER_STORE_SHORT_D16_HI_OFFEN;
+ case AMDGPU::BUFFER_STORE_BYTE_D16_HI_OFFSET:
+ return AMDGPU::BUFFER_STORE_BYTE_D16_HI_OFFEN;
+ default:
+ return -1;
+ }
+}
+
+static int getOffenMUBUFLoad(unsigned Opc) {
+ switch (Opc) {
+ case AMDGPU::BUFFER_LOAD_DWORD_OFFSET:
+ return AMDGPU::BUFFER_LOAD_DWORD_OFFEN;
+ case AMDGPU::BUFFER_LOAD_UBYTE_OFFSET:
+ return AMDGPU::BUFFER_LOAD_UBYTE_OFFEN;
+ case AMDGPU::BUFFER_LOAD_SBYTE_OFFSET:
+ return AMDGPU::BUFFER_LOAD_SBYTE_OFFEN;
+ case AMDGPU::BUFFER_LOAD_USHORT_OFFSET:
+ return AMDGPU::BUFFER_LOAD_USHORT_OFFEN;
+ case AMDGPU::BUFFER_LOAD_SSHORT_OFFSET:
+ return AMDGPU::BUFFER_LOAD_SSHORT_OFFEN;
+ case AMDGPU::BUFFER_LOAD_DWORDX2_OFFSET:
+ return AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN;
+ case AMDGPU::BUFFER_LOAD_DWORDX3_OFFSET:
+ return AMDGPU::BUFFER_LOAD_DWORDX3_OFFEN;
+ case AMDGPU::BUFFER_LOAD_DWORDX4_OFFSET:
+ return AMDGPU::BUFFER_LOAD_DWORDX4_OFFEN;
+ case AMDGPU::BUFFER_LOAD_UBYTE_D16_OFFSET:
+ return AMDGPU::BUFFER_LOAD_UBYTE_D16_OFFEN;
+ case AMDGPU::BUFFER_LOAD_UBYTE_D16_HI_OFFSET:
+ return AMDGPU::BUFFER_LOAD_UBYTE_D16_HI_OFFEN;
+ case AMDGPU::BUFFER_LOAD_SBYTE_D16_OFFSET:
+ return AMDGPU::BUFFER_LOAD_SBYTE_D16_OFFEN;
+ case AMDGPU::BUFFER_LOAD_SBYTE_D16_HI_OFFSET:
+ return AMDGPU::BUFFER_LOAD_SBYTE_D16_HI_OFFEN;
+ case AMDGPU::BUFFER_LOAD_SHORT_D16_OFFSET:
+ return AMDGPU::BUFFER_LOAD_SHORT_D16_OFFEN;
+ case AMDGPU::BUFFER_LOAD_SHORT_D16_HI_OFFSET:
+ return AMDGPU::BUFFER_LOAD_SHORT_D16_HI_OFFEN;
+ default:
+ return -1;
+ }
+}
+
static MachineInstrBuilder spillVGPRtoAGPR(const GCNSubtarget &ST,
MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI,
@@ -1139,8 +1233,9 @@ static unsigned getFlatScratchSpillOpcode(const SIInstrInfo *TII,
unsigned LoadStoreOp,
unsigned EltSize) {
bool IsStore = TII->get(LoadStoreOp).mayStore();
+ bool HasVAddr = AMDGPU::getNamedOperandIdx(LoadStoreOp, AMDGPU::OpName::vaddr) != -1;
bool UseST =
- AMDGPU::getNamedOperandIdx(LoadStoreOp, AMDGPU::OpName::vaddr) < 0 &&
+ !HasVAddr &&
AMDGPU::getNamedOperandIdx(LoadStoreOp, AMDGPU::OpName::saddr) < 0;
switch (EltSize) {
@@ -1164,7 +1259,9 @@ static unsigned getFlatScratchSpillOpcode(const SIInstrInfo *TII,
llvm_unreachable("Unexpected spill load/store size!");
}
- if (UseST)
+ if (HasVAddr)
+ LoadStoreOp = AMDGPU::getFlatScratchInstSVfromSS(LoadStoreOp);
+ else if (UseST)
LoadStoreOp = AMDGPU::getFlatScratchInstSTfromSS(LoadStoreOp);
return LoadStoreOp;
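getFlatScratchSpillOpcode now distinguishes three scratch addressing forms: SS (SGPR saddr), ST (no address operands at all), and the new SV form selected whenever a VGPR vaddr operand is present. A hypothetical standalone model of that selection (the enum and function names are illustrative, not LLVM APIs):

#include <cassert>

// Flat-scratch instruction flavours, named after their address operands.
enum class ScratchForm {
  SS, // SGPR saddr
  ST, // no address register at all (offset only)
  SV  // VGPR vaddr
};

// Mirror of the selection above: a VGPR address wins, otherwise fall back to
// the ST form when neither address operand is present, and to the plain SS
// form when an SGPR address is available.
static ScratchForm pickScratchForm(bool HasVAddr, bool HasSAddr) {
  if (HasVAddr)
    return ScratchForm::SV;
  if (!HasSAddr)
    return ScratchForm::ST;
  return ScratchForm::SS;
}

int main() {
  assert(pickScratchForm(/*HasVAddr=*/true, /*HasSAddr=*/false) ==
         ScratchForm::SV);
  assert(pickScratchForm(false, false) == ScratchForm::ST);
  assert(pickScratchForm(false, true) == ScratchForm::SS);
}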
@@ -1186,6 +1283,7 @@ void SIRegisterInfo::buildSpillLoadStore(
bool IsStore = Desc->mayStore();
bool IsFlat = TII->isFLATScratch(LoadStoreOp);
+ bool CanClobberSCC = false;
bool Scavenged = false;
MCRegister SOffset = ScratchOffsetReg;
@@ -1202,6 +1300,8 @@ void SIRegisterInfo::buildSpillLoadStore(
unsigned RemSize = RegWidth - Size;
unsigned NumRemSubRegs = RemSize ? 1 : 0;
int64_t Offset = InstOffset + MFI.getObjectOffset(Index);
+ int64_t MaterializedOffset = Offset;
+
int64_t MaxOffset = Offset + Size + RemSize - EltSize;
int64_t ScratchOffsetRegDelta = 0;
@@ -1216,6 +1316,42 @@ void SIRegisterInfo::buildSpillLoadStore(
assert((IsFlat || ((Offset % EltSize) == 0)) &&
"unexpected VGPR spill offset");
+ // Track a VGPR to use for a constant offset we need to materialize.
+ Register TmpOffsetVGPR;
+
+ // Track a VGPR to use as an intermediate value.
+ Register TmpIntermediateVGPR;
+ bool UseVGPROffset = false;
+
+ // Materialize a VGPR offset required for the given SGPR/VGPR/Immediate
+ // combination.
+ auto MaterializeVOffset = [&](Register SGPRBase, Register TmpVGPR,
+ int64_t VOffset) {
+ // We are using a VGPR offset
+ if (IsFlat && SGPRBase) {
+ // We only have 1 VGPR offset, or 1 SGPR offset. We don't have a free
+ // SGPR, so perform the add as vector.
+ // We don't need a base SGPR in the kernel.
+
+ if (ST.getConstantBusLimit(AMDGPU::V_ADD_U32_e64) >= 2) {
+ BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_ADD_U32_e64), TmpVGPR)
+ .addReg(SGPRBase)
+ .addImm(VOffset)
+ .addImm(0); // clamp
+ } else {
+ BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpVGPR)
+ .addReg(SGPRBase);
+ BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_ADD_U32_e32), TmpVGPR)
+ .addImm(VOffset)
+ .addReg(TmpOffsetVGPR);
+ }
+ } else {
+ assert(TmpOffsetVGPR);
+ BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpVGPR)
+ .addImm(VOffset);
+ }
+ };
+
bool IsOffsetLegal =
IsFlat ? TII->isLegalFLATOffset(MaxOffset, AMDGPUAS::PRIVATE_ADDRESS,
SIInstrFlags::FlatScratch)
@@ -1223,17 +1359,17 @@ void SIRegisterInfo::buildSpillLoadStore(
if (!IsOffsetLegal || (IsFlat && !SOffset && !ST.hasFlatScratchSTMode())) {
SOffset = MCRegister();
- // We currently only support spilling VGPRs to EltSize boundaries, meaning
- // we can simplify the adjustment of Offset here to just scale with
- // WavefrontSize.
- if (!IsFlat)
- Offset *= ST.getWavefrontSize();
-
// We don't have access to the register scavenger if this function is called
// during PEI::scavengeFrameVirtualRegs() so use LiveRegs in this case.
+ // TODO: Clobbering SCC is not necessary for scratch instructions in the
+ // entry.
if (RS) {
SOffset = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, MI, 0, false);
+
+      // Piggy back on the liveness scan we just did to see if SCC is dead.
+ CanClobberSCC = !RS->isRegUsed(AMDGPU::SCC);
} else if (LiveRegs) {
+ CanClobberSCC = !LiveRegs->contains(AMDGPU::SCC);
for (MCRegister Reg : AMDGPU::SGPR_32RegClass) {
if (LiveRegs->available(MF->getRegInfo(), Reg)) {
SOffset = Reg;
@@ -1242,7 +1378,26 @@ void SIRegisterInfo::buildSpillLoadStore(
}
}
+ if (ScratchOffsetReg != AMDGPU::NoRegister && !CanClobberSCC)
+ SOffset = Register();
+
if (!SOffset) {
+ UseVGPROffset = true;
+
+ if (RS) {
+ TmpOffsetVGPR = RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0);
+ } else {
+ assert(LiveRegs);
+ for (MCRegister Reg : AMDGPU::VGPR_32RegClass) {
+ if (LiveRegs->available(MF->getRegInfo(), Reg)) {
+ TmpOffsetVGPR = Reg;
+ break;
+ }
+ }
+ }
+
+ assert(TmpOffsetVGPR);
+ } else if (!SOffset && CanClobberSCC) {
// There are no free SGPRs, and since we are in the process of spilling
// VGPRs too. Since we need a VGPR in order to spill SGPRs (this is true
// on SI/CI and on VI it is true until we implement spilling using scalar
@@ -1250,6 +1405,9 @@ void SIRegisterInfo::buildSpillLoadStore(
// add the offset directly to the ScratchOffset or StackPtrOffset
// register, and then subtract the offset after the spill to return the
// register to it's original value.
+
+ // TODO: If we don't have to do an emergency stack slot spill, converting
+ // to use the VGPR offset is fewer instructions.
if (!ScratchOffsetReg)
ScratchOffsetReg = FuncInfo->getStackPtrOffsetReg();
SOffset = ScratchOffsetReg;
@@ -1258,12 +1416,22 @@ void SIRegisterInfo::buildSpillLoadStore(
Scavenged = true;
}
- if (!SOffset)
+ // We currently only support spilling VGPRs to EltSize boundaries, meaning
+ // we can simplify the adjustment of Offset here to just scale with
+ // WavefrontSize.
+ if (!IsFlat && !UseVGPROffset)
+ Offset *= ST.getWavefrontSize();
+
+ if (!UseVGPROffset && !SOffset)
report_fatal_error("could not scavenge SGPR to spill in entry function");
- if (ScratchOffsetReg == AMDGPU::NoRegister) {
+ if (UseVGPROffset) {
+ // We are using a VGPR offset
+ MaterializeVOffset(ScratchOffsetReg, TmpOffsetVGPR, Offset);
+ } else if (ScratchOffsetReg == AMDGPU::NoRegister) {
BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), SOffset).addImm(Offset);
} else {
+ assert(Offset != 0);
auto Add = BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), SOffset)
.addReg(ScratchOffsetReg)
.addImm(Offset);
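When the immediate offset is out of range, the rewritten code above has to materialize it somewhere, and the new UseVGPROffset path adds an alternative to the SGPR soffset: since adding the frame register into a scavenged SGPR uses S_ADD_I32, which clobbers SCC, the spill falls back to materializing the offset in a scavenged VGPR and switching to the OFFEN/SV instruction forms when SCC is live. A deliberately simplified sketch of that preference order, with all names hypothetical:

#include <cassert>

enum class OffsetSource {
  SGPRSOffset, // scavenged SGPR + S_ADD_I32 (clobbers SCC)
  VGPROffset   // scavenged VGPR + MUBUF OFFEN / flat-scratch SV form
};

// Simplified model of the choice made above: an SGPR soffset is only usable
// when no frame register has to be added in, or when SCC may be clobbered.
static OffsetSource pickOffsetSource(bool HaveFreeSGPR, bool NeedFrameRegAdd,
                                     bool SCCIsLive) {
  if (HaveFreeSGPR && !(NeedFrameRegAdd && SCCIsLive))
    return OffsetSource::SGPRSOffset;
  return OffsetSource::VGPROffset;
}

int main() {
  // SCC live across the spill and a frame register to add: use a VGPR offset.
  assert(pickOffsetSource(true, true, true) == OffsetSource::VGPROffset);
  // SCC dead: the cheaper SGPR soffset is still preferred.
  assert(pickOffsetSource(true, true, false) == OffsetSource::SGPRSOffset);
  // No SGPR available at all: only the VGPR path is left.
  assert(pickOffsetSource(false, false, false) == OffsetSource::VGPROffset);
}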
@@ -1277,13 +1445,16 @@ void SIRegisterInfo::buildSpillLoadStore(
assert(AMDGPU::getNamedOperandIdx(LoadStoreOp, AMDGPU::OpName::vaddr) < 0
&& "Unexpected vaddr for flat scratch with a FI operand");
- assert(ST.hasFlatScratchSTMode());
- LoadStoreOp = AMDGPU::getFlatScratchInstSTfromSS(LoadStoreOp);
+ if (UseVGPROffset) {
+ LoadStoreOp = AMDGPU::getFlatScratchInstSVfromSS(LoadStoreOp);
+ } else {
+ assert(ST.hasFlatScratchSTMode());
+ LoadStoreOp = AMDGPU::getFlatScratchInstSTfromSS(LoadStoreOp);
+ }
+
Desc = &TII->get(LoadStoreOp);
}
- Register TmpReg;
-
for (unsigned i = 0, e = NumSubRegs + NumRemSubRegs, RegOffset = 0; i != e;
++i, RegOffset += EltSize) {
if (i == NumSubRegs) {
@@ -1292,6 +1463,22 @@ void SIRegisterInfo::buildSpillLoadStore(
}
Desc = &TII->get(LoadStoreOp);
+ if (!IsFlat && UseVGPROffset) {
+ int NewLoadStoreOp = IsStore ? getOffenMUBUFStore(LoadStoreOp)
+ : getOffenMUBUFLoad(LoadStoreOp);
+ Desc = &TII->get(NewLoadStoreOp);
+ }
+
+ if (UseVGPROffset && TmpOffsetVGPR == TmpIntermediateVGPR) {
+ // If we are spilling an AGPR beyond the range of the memory instruction
+ // offset and need to use a VGPR offset, we ideally have at least 2
+ // scratch VGPRs. If we don't have a second free VGPR without spilling,
+ // recycle the VGPR used for the offset which requires resetting after
+ // each subregister.
+
+ MaterializeVOffset(ScratchOffsetReg, TmpOffsetVGPR, MaterializedOffset);
+ }
+
unsigned NumRegs = EltSize / 4;
Register SubReg = e == 1
? ValueReg
@@ -1300,7 +1487,8 @@ void SIRegisterInfo::buildSpillLoadStore(
unsigned SOffsetRegState = 0;
unsigned SrcDstRegState = getDefRegState(!IsStore);
- if (i + 1 == e) {
+ const bool IsLastSubReg = i + 1 == e;
+ if (IsLastSubReg) {
SOffsetRegState |= getKillRegState(Scavenged);
// The last implicit use carries the "Kill" flag.
SrcDstRegState |= getKillRegState(IsKill);
@@ -1363,21 +1551,26 @@ void SIRegisterInfo::buildSpillLoadStore(
if (IsAGPR) {
assert(EltSize == 4);
- if (!TmpReg) {
- assert(RS && "Needs to have RegScavenger to spill an AGPR!");
- // FIXME: change to scavengeRegisterBackwards()
- TmpReg = RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0);
- RS->setRegUsed(TmpReg);
+ if (!TmpIntermediateVGPR) {
+ TmpIntermediateVGPR = FuncInfo->getVGPRForAGPRCopy();
+ assert(MF->getRegInfo().isReserved(TmpIntermediateVGPR));
}
if (IsStore) {
auto AccRead = BuildMI(MBB, MI, DL,
- TII->get(AMDGPU::V_ACCVGPR_READ_B32_e64), TmpReg)
+ TII->get(AMDGPU::V_ACCVGPR_READ_B32_e64),
+ TmpIntermediateVGPR)
.addReg(SubReg, getKillRegState(IsKill));
if (NeedSuperRegDef)
AccRead.addReg(ValueReg, RegState::ImplicitDefine);
AccRead->setAsmPrinterFlag(MachineInstr::ReloadReuse);
}
- SubReg = TmpReg;
+ SubReg = TmpIntermediateVGPR;
+ } else if (UseVGPROffset) {
+ // FIXME: change to scavengeRegisterBackwards()
+ if (!TmpOffsetVGPR) {
+ TmpOffsetVGPR = RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0);
+ RS->setRegUsed(TmpOffsetVGPR);
+ }
}
MachinePointerInfo PInfo = BasePtrInfo.getWithOffset(RegOffset);
@@ -1388,12 +1581,26 @@ void SIRegisterInfo::buildSpillLoadStore(
auto MIB =
BuildMI(MBB, MI, DL, *Desc)
.addReg(SubReg, getDefRegState(!IsStore) | getKillRegState(IsKill));
+
+ if (UseVGPROffset) {
+ // For an AGPR spill, we reuse the same temp VGPR for the offset and the
+ // intermediate accvgpr_write.
+ MIB.addReg(TmpOffsetVGPR, getKillRegState(IsLastSubReg && !IsAGPR));
+ }
+
if (!IsFlat)
MIB.addReg(FuncInfo->getScratchRSrcReg());
if (SOffset == AMDGPU::NoRegister) {
- if (!IsFlat)
- MIB.addImm(0);
+ if (!IsFlat) {
+ if (UseVGPROffset && ScratchOffsetReg) {
+ assert(!FuncInfo->isEntryFunction());
+ MIB.addReg(ScratchOffsetReg);
+ } else {
+ assert(FuncInfo->isEntryFunction());
+ MIB.addImm(0);
+ }
+ }
} else {
MIB.addReg(SOffset, SOffsetRegState);
}
@@ -1407,10 +1614,10 @@ void SIRegisterInfo::buildSpillLoadStore(
if (!IsAGPR && NeedSuperRegDef)
MIB.addReg(ValueReg, RegState::ImplicitDefine);
- if (!IsStore && TmpReg != AMDGPU::NoRegister) {
+ if (!IsStore && IsAGPR && TmpIntermediateVGPR != AMDGPU::NoRegister) {
MIB = BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64),
FinalReg)
- .addReg(TmpReg, RegState::Kill);
+ .addReg(TmpIntermediateVGPR, RegState::Kill);
MIB->setAsmPrinterFlag(MachineInstr::ReloadReuse);
}
@@ -1466,8 +1673,7 @@ bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI,
bool OnlyToVGPR) const {
SGPRSpillBuilder SB(*this, *ST.getInstrInfo(), isWave32, MI, Index, RS);
- ArrayRef<SIMachineFunctionInfo::SpilledReg> VGPRSpills =
- SB.MFI.getSGPRToVGPRSpills(Index);
+ ArrayRef<SpilledReg> VGPRSpills = SB.MFI.getSGPRToVGPRSpills(Index);
bool SpillToVGPR = !VGPRSpills.empty();
if (OnlyToVGPR && !SpillToVGPR)
return false;
@@ -1485,7 +1691,7 @@ bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI,
SB.NumSubRegs == 1
? SB.SuperReg
: Register(getSubReg(SB.SuperReg, SB.SplitParts[i]));
- SIMachineFunctionInfo::SpilledReg Spill = VGPRSpills[i];
+ SpilledReg Spill = VGPRSpills[i];
bool UseKill = SB.IsKill && i == SB.NumSubRegs - 1;
@@ -1586,8 +1792,7 @@ bool SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI,
bool OnlyToVGPR) const {
SGPRSpillBuilder SB(*this, *ST.getInstrInfo(), isWave32, MI, Index, RS);
- ArrayRef<SIMachineFunctionInfo::SpilledReg> VGPRSpills =
- SB.MFI.getSGPRToVGPRSpills(Index);
+ ArrayRef<SpilledReg> VGPRSpills = SB.MFI.getSGPRToVGPRSpills(Index);
bool SpillToVGPR = !VGPRSpills.empty();
if (OnlyToVGPR && !SpillToVGPR)
return false;
@@ -1599,7 +1804,7 @@ bool SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI,
? SB.SuperReg
: Register(getSubReg(SB.SuperReg, SB.SplitParts[i]));
- SIMachineFunctionInfo::SpilledReg Spill = VGPRSpills[i];
+ SpilledReg Spill = VGPRSpills[i];
auto MIB = BuildMI(*SB.MBB, MI, SB.DL, SB.TII.get(AMDGPU::V_READLANE_B32),
SubReg)
.addReg(Spill.VGPR)
@@ -1937,18 +2142,23 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
Offset = 0;
}
- assert(!TII->getNamedOperand(*MI, AMDGPU::OpName::vaddr) &&
- "Unexpected vaddr for flat scratch with a FI operand");
-
- // On GFX10 we have ST mode to use no registers for an address.
- // Otherwise we need to materialize 0 into an SGPR.
- if (!Offset && ST.hasFlatScratchSTMode()) {
+ if (!Offset) {
unsigned Opc = MI->getOpcode();
- unsigned NewOpc = AMDGPU::getFlatScratchInstSTfromSS(Opc);
- MI->RemoveOperand(
- AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr));
- MI->setDesc(TII->get(NewOpc));
- return;
+ int NewOpc = -1;
+ if (AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr) != -1) {
+ NewOpc = AMDGPU::getFlatScratchInstSVfromSVS(Opc);
+ } else if (ST.hasFlatScratchSTMode()) {
+ // On GFX10 we have ST mode to use no registers for an address.
+ // Otherwise we need to materialize 0 into an SGPR.
+ NewOpc = AMDGPU::getFlatScratchInstSTfromSS(Opc);
+ }
+
+ if (NewOpc != -1) {
+ MI->removeOperand(
+ AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr));
+ MI->setDesc(TII->get(NewOpc));
+ return;
+ }
}
}
@@ -2026,57 +2236,78 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
if (!IsMUBUF && !MFI->isEntryFunction()) {
// Convert to a swizzled stack address by scaling by the wave size.
- //
// In an entry function/kernel the offset is already swizzled.
-
- bool IsCopy = MI->getOpcode() == AMDGPU::V_MOV_B32_e32;
- Register ResultReg =
- IsCopy ? MI->getOperand(0).getReg()
- : RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0);
+ bool IsSALU = isSGPRClass(TII->getOpRegClass(*MI, FIOperandNum));
+ bool LiveSCC = RS->isRegUsed(AMDGPU::SCC);
+ const TargetRegisterClass *RC = IsSALU && !LiveSCC
+ ? &AMDGPU::SReg_32RegClass
+ : &AMDGPU::VGPR_32RegClass;
+ bool IsCopy = MI->getOpcode() == AMDGPU::V_MOV_B32_e32 ||
+ MI->getOpcode() == AMDGPU::V_MOV_B32_e64;
+ Register ResultReg = IsCopy ? MI->getOperand(0).getReg()
+ : RS->scavengeRegister(RC, MI, 0);
int64_t Offset = FrameInfo.getObjectOffset(Index);
if (Offset == 0) {
+ unsigned OpCode = IsSALU && !LiveSCC ? AMDGPU::S_LSHR_B32
+ : AMDGPU::V_LSHRREV_B32_e64;
// XXX - This never happens because of emergency scavenging slot at 0?
- BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64), ResultReg)
- .addImm(ST.getWavefrontSizeLog2())
- .addReg(FrameReg);
+ auto Shift = BuildMI(*MBB, MI, DL, TII->get(OpCode), ResultReg)
+ .addImm(ST.getWavefrontSizeLog2())
+ .addReg(FrameReg);
+ if (IsSALU && !LiveSCC)
+ Shift.getInstr()->getOperand(3).setIsDead(
+ true); // Mark SCC as dead.
+ if (IsSALU && LiveSCC) {
+ Register NewDest =
+ RS->scavengeRegister(&AMDGPU::SReg_32RegClass, Shift, 0);
+ BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
+ NewDest)
+ .addReg(ResultReg);
+ ResultReg = NewDest;
+ }
} else {
- if (auto MIB = TII->getAddNoCarry(*MBB, MI, DL, ResultReg, *RS)) {
- // Reuse ResultReg in intermediate step.
- Register ScaledReg = ResultReg;
+ MachineInstrBuilder MIB;
+ if (!IsSALU) {
+ if ((MIB = TII->getAddNoCarry(*MBB, MI, DL, ResultReg, *RS)) !=
+ nullptr) {
+ // Reuse ResultReg in intermediate step.
+ Register ScaledReg = ResultReg;
- BuildMI(*MBB, *MIB, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64),
- ScaledReg)
- .addImm(ST.getWavefrontSizeLog2())
- .addReg(FrameReg);
+ BuildMI(*MBB, *MIB, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64),
+ ScaledReg)
+ .addImm(ST.getWavefrontSizeLog2())
+ .addReg(FrameReg);
- const bool IsVOP2 = MIB->getOpcode() == AMDGPU::V_ADD_U32_e32;
+ const bool IsVOP2 = MIB->getOpcode() == AMDGPU::V_ADD_U32_e32;
- // TODO: Fold if use instruction is another add of a constant.
- if (IsVOP2 || AMDGPU::isInlinableLiteral32(Offset, ST.hasInv2PiInlineImm())) {
- // FIXME: This can fail
- MIB.addImm(Offset);
- MIB.addReg(ScaledReg, RegState::Kill);
- if (!IsVOP2)
- MIB.addImm(0); // clamp bit
- } else {
- assert(MIB->getOpcode() == AMDGPU::V_ADD_CO_U32_e64 &&
- "Need to reuse carry out register");
+ // TODO: Fold if use instruction is another add of a constant.
+ if (IsVOP2 || AMDGPU::isInlinableLiteral32(Offset, ST.hasInv2PiInlineImm())) {
+ // FIXME: This can fail
+ MIB.addImm(Offset);
+ MIB.addReg(ScaledReg, RegState::Kill);
+ if (!IsVOP2)
+ MIB.addImm(0); // clamp bit
+ } else {
+ assert(MIB->getOpcode() == AMDGPU::V_ADD_CO_U32_e64 &&
+ "Need to reuse carry out register");
- // Use scavenged unused carry out as offset register.
- Register ConstOffsetReg;
- if (!isWave32)
- ConstOffsetReg = getSubReg(MIB.getReg(1), AMDGPU::sub0);
- else
- ConstOffsetReg = MIB.getReg(1);
+ // Use scavenged unused carry out as offset register.
+ Register ConstOffsetReg;
+ if (!isWave32)
+ ConstOffsetReg = getSubReg(MIB.getReg(1), AMDGPU::sub0);
+ else
+ ConstOffsetReg = MIB.getReg(1);
- BuildMI(*MBB, *MIB, DL, TII->get(AMDGPU::S_MOV_B32), ConstOffsetReg)
- .addImm(Offset);
- MIB.addReg(ConstOffsetReg, RegState::Kill);
- MIB.addReg(ScaledReg, RegState::Kill);
- MIB.addImm(0); // clamp bit
+ BuildMI(*MBB, *MIB, DL, TII->get(AMDGPU::S_MOV_B32), ConstOffsetReg)
+ .addImm(Offset);
+ MIB.addReg(ConstOffsetReg, RegState::Kill);
+ MIB.addReg(ScaledReg, RegState::Kill);
+ MIB.addImm(0); // clamp bit
+ }
}
- } else {
+ }
+ if (!MIB || IsSALU) {
// We have to produce a carry out, and there isn't a free SGPR pair
// for it. We can keep the whole computation on the SALU to avoid
// clobbering an additional register at the cost of an extra mov.
@@ -2084,7 +2315,7 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
// We may have 1 free scratch SGPR even though a carry out is
// unavailable. Only one additional mov is needed.
Register TmpScaledReg =
- RS->scavengeRegister(&AMDGPU::SReg_32_XM0RegClass, MI, 0, false);
+ RS->scavengeRegister(&AMDGPU::SReg_32_XM0RegClass, MI, 0, false);
Register ScaledReg = TmpScaledReg.isValid() ? TmpScaledReg : FrameReg;
BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_LSHR_B32), ScaledReg)
@@ -2093,14 +2324,17 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), ScaledReg)
.addReg(ScaledReg, RegState::Kill)
.addImm(Offset);
- BuildMI(*MBB, MI, DL, TII->get(AMDGPU::COPY), ResultReg)
- .addReg(ScaledReg, RegState::Kill);
+ if (!IsSALU)
+ BuildMI(*MBB, MI, DL, TII->get(AMDGPU::COPY), ResultReg)
+ .addReg(ScaledReg, RegState::Kill);
+ else
+ ResultReg = ScaledReg;
// If there were truly no free SGPRs, we need to undo everything.
if (!TmpScaledReg.isValid()) {
BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), ScaledReg)
- .addReg(ScaledReg, RegState::Kill)
- .addImm(-Offset);
+ .addReg(ScaledReg, RegState::Kill)
+ .addImm(-Offset);
BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_LSHL_B32), ScaledReg)
.addReg(FrameReg)
.addImm(ST.getWavefrontSizeLog2());
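For non-MUBUF users in a non-entry function the frame register holds an unswizzled, per-wave byte offset, so the materialization above shifts it right by log2(wavefront size) and adds the frame object's offset; the new twist is doing that on the SALU (S_LSHR_B32 + S_ADD_I32) when the result feeds an SGPR operand and SCC is free, instead of always using V_LSHRREV/V_ADD. The arithmetic being emitted is just, as a minimal sketch:

#include <cassert>
#include <cstdint>

// Swizzled stack address: scale the unswizzled frame-register value down by
// the wavefront size and add the frame object's offset. This is what the
// V_LSHRREV_B32 + V_ADD (or S_LSHR_B32 + S_ADD_I32) sequence computes.
static uint32_t swizzledAddress(uint32_t FrameRegValue, int32_t ObjectOffset,
                                unsigned WavefrontSizeLog2) {
  return (FrameRegValue >> WavefrontSizeLog2) + ObjectOffset;
}

int main() {
  // Hypothetical wave64 example: a 0x8000-byte unswizzled frame pointer and a
  // frame object at offset 16 lands at swizzled address 0x200 + 16.
  assert(swizzledAddress(0x8000, 16, /*WavefrontSizeLog2=*/6) == 0x210);
}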
@@ -2665,8 +2899,7 @@ MCRegister SIRegisterInfo::getReturnAddressReg(const MachineFunction &MF) const
const TargetRegisterClass *
SIRegisterInfo::getRegClassForSizeOnBank(unsigned Size,
- const RegisterBank &RB,
- const MachineRegisterInfo &MRI) const {
+ const RegisterBank &RB) const {
switch (RB.getID()) {
case AMDGPU::VGPRRegBankID:
return getVGPRClassForBitWidth(std::max(32u, Size));
@@ -2688,7 +2921,7 @@ SIRegisterInfo::getConstrainedRegClassForOperand(const MachineOperand &MO,
const MachineRegisterInfo &MRI) const {
const RegClassOrRegBank &RCOrRB = MRI.getRegClassOrRegBank(MO.getReg());
if (const RegisterBank *RB = RCOrRB.dyn_cast<const RegisterBank*>())
- return getRegClassForTypeOnBank(MRI.getType(MO.getReg()), *RB, MRI);
+ return getRegClassForTypeOnBank(MRI.getType(MO.getReg()), *RB);
if (const auto *RC = RCOrRB.dyn_cast<const TargetRegisterClass *>())
return getAllocatableClass(RC);
@@ -2808,9 +3041,29 @@ bool SIRegisterInfo::isProperlyAlignedRC(const TargetRegisterClass &RC) const {
return true;
}
+const TargetRegisterClass *
+SIRegisterInfo::getProperlyAlignedRC(const TargetRegisterClass *RC) const {
+ if (!RC || !ST.needsAlignedVGPRs())
+ return RC;
+
+ unsigned Size = getRegSizeInBits(*RC);
+ if (Size <= 32)
+ return RC;
+
+ if (isVGPRClass(RC))
+ return getAlignedVGPRClassForBitWidth(Size);
+ if (isAGPRClass(RC))
+ return getAlignedAGPRClassForBitWidth(Size);
+ if (isVectorSuperClass(RC))
+ return getAlignedVectorSuperClassForBitWidth(Size);
+
+ return RC;
+}
+
bool SIRegisterInfo::isConstantPhysReg(MCRegister PhysReg) const {
switch (PhysReg) {
case AMDGPU::SGPR_NULL:
+ case AMDGPU::SGPR_NULL64:
case AMDGPU::SRC_SHARED_BASE:
case AMDGPU::SRC_PRIVATE_BASE:
case AMDGPU::SRC_SHARED_LIMIT:
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
index f1fe0a1d9329..9bfbc253410b 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
@@ -51,6 +51,17 @@ private:
public:
SIRegisterInfo(const GCNSubtarget &ST);
+ struct SpilledReg {
+ Register VGPR;
+ int Lane = -1;
+
+ SpilledReg() = default;
+ SpilledReg(Register R, int L) : VGPR(R), Lane(L) {}
+
+ bool hasLane() { return Lane != -1; }
+ bool hasReg() { return VGPR != 0; }
+ };
+
/// \returns the sub reg enum value for the given \p Channel
/// (e.g. getSubRegFromChannel(0) -> AMDGPU::sub0)
static unsigned getSubRegFromChannel(unsigned Channel, unsigned NumRegs = 1);
@@ -64,6 +75,8 @@ public:
MCRegister reservedPrivateSegmentBufferReg(const MachineFunction &MF) const;
BitVector getReservedRegs(const MachineFunction &MF) const override;
+ bool isAsmClobberable(const MachineFunction &MF,
+ MCRegister PhysReg) const override;
const MCPhysReg *getCalleeSavedRegs(const MachineFunction *MF) const override;
const MCPhysReg *getCalleeSavedRegsViaCopy(const MachineFunction *MF) const;
@@ -304,15 +317,11 @@ public:
MCRegister getReturnAddressReg(const MachineFunction &MF) const;
const TargetRegisterClass *
- getRegClassForSizeOnBank(unsigned Size,
- const RegisterBank &Bank,
- const MachineRegisterInfo &MRI) const;
+ getRegClassForSizeOnBank(unsigned Size, const RegisterBank &Bank) const;
const TargetRegisterClass *
- getRegClassForTypeOnBank(LLT Ty,
- const RegisterBank &Bank,
- const MachineRegisterInfo &MRI) const {
- return getRegClassForSizeOnBank(Ty.getSizeInBits(), Bank, MRI);
+ getRegClassForTypeOnBank(LLT Ty, const RegisterBank &Bank) const {
+ return getRegClassForSizeOnBank(Ty.getSizeInBits(), Bank);
}
const TargetRegisterClass *
@@ -377,6 +386,11 @@ public:
// the subtarget.
bool isProperlyAlignedRC(const TargetRegisterClass &RC) const;
+ // Given \p RC returns corresponding aligned register class if required
+ // by the subtarget.
+ const TargetRegisterClass *
+ getProperlyAlignedRC(const TargetRegisterClass *RC) const;
+
/// Return all SGPR128 which satisfy the waves per execution unit requirement
/// of the subtarget.
ArrayRef<MCPhysReg> getAllSGPR128(const MachineFunction &MF) const;
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
index eb9452f4b85e..ffe8dce79816 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
@@ -97,7 +97,7 @@ class RegSeqNames<int last_reg, int stride, int size, string prefix,
[]);
}
-// Generates list of dags for register tupless.
+// Generates list of dags for register tuples.
class RegSeqDags<RegisterClass RC, int last_reg, int stride, int size,
int start = 0> {
dag trunc_rc = (trunc RC,
@@ -189,7 +189,7 @@ def PC_REG : SIReg<"pc", 0>, DwarfRegNum<[16, 16]> {
def VCC : RegisterWithSubRegs<"vcc", [VCC_LO, VCC_HI]> {
let Namespace = "AMDGPU";
let SubRegIndices = [sub0, sub1];
- let HWEncoding = 106;
+ let HWEncoding = VCC_LO.HWEncoding;
}
defm EXEC_LO : SIRegLoHi16<"exec_lo", 126>, DwarfRegNum<[1, 1]>;
@@ -198,7 +198,7 @@ defm EXEC_HI : SIRegLoHi16<"exec_hi", 127>;
def EXEC : RegisterWithSubRegs<"exec", [EXEC_LO, EXEC_HI]>, DwarfRegNum<[17, 1]> {
let Namespace = "AMDGPU";
let SubRegIndices = [sub0, sub1];
- let HWEncoding = 126;
+ let HWEncoding = EXEC_LO.HWEncoding;
}
// 32-bit real registers, for MC only.
@@ -211,8 +211,23 @@ defm SRC_SCC : SIRegLoHi16<"src_scc", 253>;
// Should never be emitted.
def SCC : SIReg<"scc">;
-defm M0 : SIRegLoHi16 <"m0", 124>;
-defm SGPR_NULL : SIRegLoHi16 <"null", 125>;
+// Encoding changes between subtarget generations.
+// See also Utils/AMDGPUBaseInfo.cpp MAP_REG2REG.
+defm M0_gfxpre11 : SIRegLoHi16 <"m0", 124>;
+defm M0_gfx11plus : SIRegLoHi16 <"m0", 125>;
+defm M0 : SIRegLoHi16 <"m0", 0>;
+
+defm SGPR_NULL_gfxpre11 : SIRegLoHi16 <"null", 125>;
+defm SGPR_NULL_gfx11plus : SIRegLoHi16 <"null", 124>;
+defm SGPR_NULL : SIRegLoHi16 <"null", 0>;
+defm SGPR_NULL_HI : SIRegLoHi16 <"", 0>;
+
+def SGPR_NULL64 :
+ RegisterWithSubRegs<"null", [SGPR_NULL, SGPR_NULL_HI]> {
+ let Namespace = "AMDGPU";
+ let SubRegIndices = [sub0, sub1];
+ let HWEncoding = SGPR_NULL.HWEncoding;
+}
defm SRC_SHARED_BASE : SIRegLoHi16<"src_shared_base", 235>;
defm SRC_SHARED_LIMIT : SIRegLoHi16<"src_shared_limit", 236>;
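The *_gfxpre11 / *_gfx11plus pairs above exist because GFX11 swaps the hardware encodings of m0 and the null register (124 and 125), while the generic M0/SGPR_NULL definitions keep encoding 0 and are remapped per subtarget (see MAP_REG2REG in Utils/AMDGPUBaseInfo.cpp, as the comment notes). A small hypothetical helper showing the remap:

#include <cassert>

// Hypothetical stand-ins for the register definitions above; only the
// encodings matter here.
enum class SReg { M0, Null };

// Pre-GFX11 uses m0=124, null=125; GFX11+ swaps them.
static unsigned hwEncoding(SReg R, bool IsGFX11Plus) {
  switch (R) {
  case SReg::M0:
    return IsGFX11Plus ? 125 : 124;
  case SReg::Null:
    return IsGFX11Plus ? 124 : 125;
  }
  return 0; // unreachable
}

int main() {
  assert(hwEncoding(SReg::M0, /*IsGFX11Plus=*/false) == 124);
  assert(hwEncoding(SReg::Null, /*IsGFX11Plus=*/false) == 125);
  assert(hwEncoding(SReg::M0, true) == 125);
  assert(hwEncoding(SReg::Null, true) == 124);
}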
@@ -237,7 +252,7 @@ def XNACK_MASK :
RegisterWithSubRegs<"xnack_mask", [XNACK_MASK_LO, XNACK_MASK_HI]> {
let Namespace = "AMDGPU";
let SubRegIndices = [sub0, sub1];
- let HWEncoding = 104;
+ let HWEncoding = XNACK_MASK_LO.HWEncoding;
}
// Trap handler registers
@@ -247,7 +262,7 @@ defm TBA_HI : SIRegLoHi16<"tba_hi", 109>;
def TBA : RegisterWithSubRegs<"tba", [TBA_LO, TBA_HI]> {
let Namespace = "AMDGPU";
let SubRegIndices = [sub0, sub1];
- let HWEncoding = 108;
+ let HWEncoding = TBA_LO.HWEncoding;
}
defm TMA_LO : SIRegLoHi16<"tma_lo", 110>;
@@ -256,7 +271,7 @@ defm TMA_HI : SIRegLoHi16<"tma_hi", 111>;
def TMA : RegisterWithSubRegs<"tma", [TMA_LO, TMA_HI]> {
let Namespace = "AMDGPU";
let SubRegIndices = [sub0, sub1];
- let HWEncoding = 110;
+ let HWEncoding = TMA_LO.HWEncoding;
}
foreach Index = 0...15 in {
@@ -635,16 +650,16 @@ let GeneratePressureSet = 0, HasSGPR = 1 in {
// See comments in SIInstructions.td for more info.
def SReg_32_XM0_XEXEC : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16, i1], 32,
(add SGPR_32, VCC_LO, VCC_HI, FLAT_SCR_LO, FLAT_SCR_HI, XNACK_MASK_LO, XNACK_MASK_HI,
- SGPR_NULL, TTMP_32, TMA_LO, TMA_HI, TBA_LO, TBA_HI, SRC_SHARED_BASE, SRC_SHARED_LIMIT,
- SRC_PRIVATE_BASE, SRC_PRIVATE_LIMIT, SRC_POPS_EXITING_WAVE_ID,
+ SGPR_NULL, SGPR_NULL_HI, TTMP_32, TMA_LO, TMA_HI, TBA_LO, TBA_HI, SRC_SHARED_BASE,
+ SRC_SHARED_LIMIT, SRC_PRIVATE_BASE, SRC_PRIVATE_LIMIT, SRC_POPS_EXITING_WAVE_ID,
SRC_VCCZ, SRC_EXECZ, SRC_SCC)> {
let AllocationPriority = 10;
}
def SReg_LO16_XM0_XEXEC : SIRegisterClass<"AMDGPU", [i16, f16], 16,
(add SGPR_LO16, VCC_LO_LO16, VCC_HI_LO16, FLAT_SCR_LO_LO16, FLAT_SCR_HI_LO16,
- XNACK_MASK_LO_LO16, XNACK_MASK_HI_LO16, SGPR_NULL_LO16, TTMP_LO16, TMA_LO_LO16,
- TMA_HI_LO16, TBA_LO_LO16, TBA_HI_LO16, SRC_SHARED_BASE_LO16,
+ XNACK_MASK_LO_LO16, XNACK_MASK_HI_LO16, SGPR_NULL_LO16, SGPR_NULL_HI_LO16, TTMP_LO16,
+ TMA_LO_LO16, TMA_HI_LO16, TBA_LO_LO16, TBA_HI_LO16, SRC_SHARED_BASE_LO16,
SRC_SHARED_LIMIT_LO16, SRC_PRIVATE_BASE_LO16, SRC_PRIVATE_LIMIT_LO16,
SRC_POPS_EXITING_WAVE_ID_LO16, SRC_VCCZ_LO16, SRC_EXECZ_LO16, SRC_SCC_LO16)> {
let Size = 16;
@@ -701,23 +716,6 @@ def SGPR_64 : SIRegisterClass<"AMDGPU", [v2i32, i64, v2f32, f64, v4i16, v4f16],
let HasSGPR = 1;
}
-// CCR (call clobbered registers) SGPR 64-bit registers
-def CCR_SGPR_64 : SIRegisterClass<"AMDGPU", SGPR_64.RegTypes, 32,
- (add (trunc SGPR_64, 16))> {
- let CopyCost = SGPR_64.CopyCost;
- let AllocationPriority = SGPR_64.AllocationPriority;
- let HasSGPR = 1;
-}
-
-// Call clobbered 64-bit SGPRs for AMDGPU_Gfx CC
-def Gfx_CCR_SGPR_64 : SIRegisterClass<"AMDGPU", SGPR_64.RegTypes, 32,
- (add (trunc (shl SGPR_64, 15), 1), // s[30:31]
- (trunc (shl SGPR_64, 18), 14))> { // s[36:37]-s[s62:63]
- let CopyCost = SGPR_64.CopyCost;
- let AllocationPriority = SGPR_64.AllocationPriority;
- let HasSGPR = 1;
-}
-
def TTMP_64 : SIRegisterClass<"AMDGPU", [v2i32, i64, f64, v4i16, v4f16], 32,
(add TTMP_64Regs)> {
let isAllocatable = 0;
@@ -725,7 +723,7 @@ def TTMP_64 : SIRegisterClass<"AMDGPU", [v2i32, i64, f64, v4i16, v4f16], 32,
}
def SReg_64_XEXEC : SIRegisterClass<"AMDGPU", [v2i32, i64, v2f32, f64, i1, v4i16, v4f16], 32,
- (add SGPR_64, VCC, FLAT_SCR, XNACK_MASK, TTMP_64, TBA, TMA)> {
+ (add SGPR_64, VCC, FLAT_SCR, XNACK_MASK, SGPR_NULL64, TTMP_64, TBA, TMA)> {
let CopyCost = 1;
let AllocationPriority = 13;
let HasSGPR = 1;
@@ -788,7 +786,7 @@ defm "" : SRegClass<4, 15, [v4i32, v4f32, v2i64, v2f64, v8i16, v8f16], SGPR_128R
defm "" : SRegClass<5, 16, [v5i32, v5f32], SGPR_160Regs, TTMP_160Regs>;
defm "" : SRegClass<6, 17, [v6i32, v6f32, v3i64, v3f64], SGPR_192Regs, TTMP_192Regs>;
defm "" : SRegClass<7, 18, [v7i32, v7f32], SGPR_224Regs, TTMP_224Regs>;
-defm "" : SRegClass<8, 19, [v8i32, v8f32, v4i64, v4f64], SGPR_256Regs, TTMP_256Regs>;
+defm "" : SRegClass<8, 19, [v8i32, v8f32, v4i64, v4f64, v16i16, v16f16], SGPR_256Regs, TTMP_256Regs>;
defm "" : SRegClass<16, 20, [v16i32, v16f32, v8i64, v8f64], SGPR_512Regs, TTMP_512Regs>;
defm "" : SRegClass<32, 21, [v32i32, v32f32, v16i64, v16f64], SGPR_1024Regs>;
@@ -829,7 +827,7 @@ defm VReg_160 : VRegClass<5, [v5i32, v5f32], (add VGPR_160)>;
defm VReg_192 : VRegClass<6, [v6i32, v6f32, v3i64, v3f64], (add VGPR_192)>;
defm VReg_224 : VRegClass<7, [v7i32, v7f32], (add VGPR_224)>;
-defm VReg_256 : VRegClass<8, [v8i32, v8f32, v4i64, v4f64], (add VGPR_256)>;
+defm VReg_256 : VRegClass<8, [v8i32, v8f32, v4i64, v4f64, v16i16, v16f16], (add VGPR_256)>;
defm VReg_512 : VRegClass<16, [v16i32, v16f32, v8i64, v8f64], (add VGPR_512)>;
defm VReg_1024 : VRegClass<32, [v32i32, v32f32, v16i64, v16f64], (add VGPR_1024)>;
@@ -856,21 +854,12 @@ defm AReg_1024 : ARegClass<32, [v32i32, v32f32, v16i64, v16f64], (add AGPR_1024)
} // End GeneratePressureSet = 0
-// This is not a real register. This is just to have a register to add
-// to VReg_1 that does not alias any real register that would
-// introduce inferred register classes.
-def ARTIFICIAL_VGPR : SIReg <"invalid vgpr", 0> {
- let isArtificial = 1;
-}
-
let GeneratePressureSet = 0 in {
-// FIXME: Should specify an empty set for this. No register should
-// ever be allocated using VReg_1. This is a hack for SelectionDAG
-// that should always be lowered by SILowerI1Copies. TableGen crashes
-// on an empty register set, but also sorts register classes based on
-// the number of registerss in them. Add only one register so this is
+// No register should ever be allocated using VReg_1. This is a hack for
+// SelectionDAG that should always be lowered by SILowerI1Copies. TableGen
+// sorts register classes based on the number of registers in them so this is
// sorted to the end and not preferred over VGPR_32.
-def VReg_1 : SIRegisterClass<"AMDGPU", [i1], 32, (add ARTIFICIAL_VGPR)> {
+def VReg_1 : SIRegisterClass<"AMDGPU", [i1], 32, (add)> {
let Size = 1;
let HasVGPR = 1;
}
@@ -913,11 +902,11 @@ defm AV_64 : AVRegClass<2, VReg_64.RegTypes, (add VGPR_64), (add AGPR_64)>;
defm AV_96 : AVRegClass<3, VReg_96.RegTypes, (add VGPR_96), (add AGPR_96)>;
defm AV_128 : AVRegClass<4, VReg_128.RegTypes, (add VGPR_128), (add AGPR_128)>;
defm AV_160 : AVRegClass<5, VReg_160.RegTypes, (add VGPR_160), (add AGPR_160)>;
-defm AV_192 : AVRegClass<6, VReg_160.RegTypes, (add VGPR_192), (add AGPR_192)>;
-defm AV_224 : AVRegClass<7, VReg_160.RegTypes, (add VGPR_224), (add AGPR_224)>;
-defm AV_256 : AVRegClass<8, VReg_160.RegTypes, (add VGPR_256), (add AGPR_256)>;
-defm AV_512 : AVRegClass<16, VReg_160.RegTypes, (add VGPR_512), (add AGPR_512)>;
-defm AV_1024 : AVRegClass<32, VReg_160.RegTypes, (add VGPR_1024), (add AGPR_1024)>;
+defm AV_192 : AVRegClass<6, VReg_192.RegTypes, (add VGPR_192), (add AGPR_192)>;
+defm AV_224 : AVRegClass<7, VReg_224.RegTypes, (add VGPR_224), (add AGPR_224)>;
+defm AV_256 : AVRegClass<8, VReg_256.RegTypes, (add VGPR_256), (add AGPR_256)>;
+defm AV_512 : AVRegClass<16, VReg_512.RegTypes, (add VGPR_512), (add AGPR_512)>;
+defm AV_1024 : AVRegClass<32, VReg_1024.RegTypes, (add VGPR_1024), (add AGPR_1024)>;
//===----------------------------------------------------------------------===//
// Register operands
@@ -1087,6 +1076,27 @@ def VRegSrc_32 : RegisterOperand<VGPR_32> {
let DecoderMethod = "DecodeVS_32RegisterClass";
}
+def VRegSrc_64 : RegisterOperand<VReg_64> {
+ let DecoderMethod = "decodeOperand_VReg_64";
+}
+
+def VRegSrc_128 : RegisterOperand<VReg_128> {
+ let DecoderMethod = "decodeOperand_VReg_128";
+}
+
+def VRegSrc_256 : RegisterOperand<VReg_256> {
+ let DecoderMethod = "decodeOperand_VReg_256";
+}
+
+//===----------------------------------------------------------------------===//
+// VGPRSrc_*
+//===----------------------------------------------------------------------===//
+
+// A RegisterOperand wrapper for a VGPR with an 8-bit operand encoding
+def VGPRSrc_32 : RegisterOperand<VGPR_32> {
+ let DecoderMethod = "DecodeVGPR_32RegisterClass";
+}
+
//===----------------------------------------------------------------------===//
// ASrc_* Operands with an AccVGPR
//===----------------------------------------------------------------------===//
@@ -1116,7 +1126,7 @@ defm VISrc_512 : RegInlineOperandAC<"VReg", "VISrc_512", "_512">;
defm VISrc_1024 : RegInlineOperandAC<"VReg", "VISrc_1024", "_1024">;
//===----------------------------------------------------------------------===//
-// AVSrc_* Operands with an AGPR or VGPR
+// AVSrc_*, AVDst_*, AVLdSt_* Operands with an AGPR or VGPR
//===----------------------------------------------------------------------===//
def AVSrc_32 : RegisterOperand<AV_32> {
@@ -1129,6 +1139,21 @@ def AVSrc_64 : RegisterOperand<AV_64> {
let EncoderMethod = "getAVOperandEncoding";
}
+def AVSrc_128 : RegisterOperand<AV_128> {
+ let DecoderMethod = "DecodeAV_128RegisterClass";
+ let EncoderMethod = "getAVOperandEncoding";
+}
+
+def AVDst_128 : RegisterOperand<AV_128> {
+ let DecoderMethod = "DecodeAVDst_128RegisterClass";
+ let EncoderMethod = "getAVOperandEncoding";
+}
+
+def AVDst_512 : RegisterOperand<AV_512> {
+ let DecoderMethod = "DecodeAVDst_512RegisterClass";
+ let EncoderMethod = "getAVOperandEncoding";
+}
+
def AVLdSt_32 : RegisterOperand<AV_32> {
let DecoderMethod = "DecodeAVLdSt_32RegisterClass";
let EncoderMethod = "getAVOperandEncoding";
diff --git a/llvm/lib/Target/AMDGPU/SISchedule.td b/llvm/lib/Target/AMDGPU/SISchedule.td
index 18d424a3bc9f..53441b5a4ced 100644
--- a/llvm/lib/Target/AMDGPU/SISchedule.td
+++ b/llvm/lib/Target/AMDGPU/SISchedule.td
@@ -59,6 +59,7 @@ def WriteIntMul : SchedWrite;
// mAI multipass instructions.
def Write2PassMAI : SchedWrite;
+def Write4PassMAI : SchedWrite;
def Write8PassMAI : SchedWrite;
def Write16PassMAI : SchedWrite;
def Write4PassDGEMM : SchedWrite;
@@ -86,7 +87,9 @@ class SISchedMachineModel : SchedMachineModel {
def SIFullSpeedModel : SISchedMachineModel;
def SIQuarterSpeedModel : SISchedMachineModel;
def SIDPFullSpeedModel : SISchedMachineModel;
+def SIDPGFX940FullSpeedModel : SISchedMachineModel;
def GFX10SpeedModel : SISchedMachineModel;
+def GFX11SpeedModel : SISchedMachineModel;
// XXX: Are the resource counts correct?
def HWBranch : ProcResource<1> {
@@ -156,6 +159,8 @@ multiclass SICommonWriteRes {
let ResourceCycles = [2] in
def : HWWriteRes<Write2PassMAI, [HWXDL], 2>;
+ let ResourceCycles = [4] in
+ def : HWWriteRes<Write4PassMAI, [HWXDL], 4>;
let ResourceCycles = [8] in
def : HWWriteRes<Write8PassMAI, [HWXDL], 8>;
let ResourceCycles = [16] in
@@ -244,6 +249,40 @@ def : InstRW<[Write8PassDGEMM, MIMFMARead], (instregex "^V_MFMA_.64_16X16X")>;
} // End SchedModel = SIDPFullSpeedModel
+let SchedModel = SIDPGFX940FullSpeedModel in {
+
+defm : SICommonWriteRes;
+
+def : HWVALUWriteRes<WriteFloatFMA, 1>;
+def : HWVALUWriteRes<WriteDouble, 1>;
+def : HWVALUWriteRes<WriteDoubleAdd, 1>;
+def : HWVALUWriteRes<WriteDoubleCvt, 1>;
+def : HWVALUWriteRes<WriteTrans64, 4>;
+def : HWVALUWriteRes<WriteIntMul, 1>;
+def : HWVALUWriteRes<Write64Bit, 1>;
+
+def : InstRW<[WriteCopy], (instrs COPY)>;
+def : InstRW<[Write64Bit], (instregex "^V_ACCVGPR_WRITE_B32_e64$")>;
+def : InstRW<[Write2PassMAI, MIMFMARead], (instregex "^V_MFMA_.32_4X4X")>;
+
+def : InstRW<[Write4PassMAI, MIMFMARead], (instregex "^V_MFMA_.32_16X16X8X")>;
+def : InstRW<[Write4PassMAI, MIMFMARead], (instregex "^V_MFMA_.32_16X16X16")>;
+def : InstRW<[Write4PassMAI, MIMFMARead], (instregex "^V_MFMA_.32_16X16X32")>;
+def : InstRW<[Write8PassMAI, MIMFMARead], (instregex "^V_MFMA_.32_16X16X[14][FBI]")>;
+
+def : InstRW<[Write8PassMAI, MIMFMARead], (instregex "^V_MFMA_.32_32X32X4XF")>;
+def : InstRW<[Write8PassMAI, MIMFMARead], (instregex "^V_MFMA_.32_32X32X8")>;
+def : InstRW<[Write8PassMAI, MIMFMARead], (instregex "^V_MFMA_.32_32X32X16")>;
+def : InstRW<[Write16PassMAI, MIMFMARead], (instregex "^V_MFMA_.32_32X32X[124][FBI]")>;
+
+def : InstRW<[Write4PassDGEMM, MIMFMARead], (instregex "^V_MFMA_.64_4X4X")>;
+def : InstRW<[Write8PassDGEMM, MIMFMARead], (instregex "^V_MFMA_.64_16X16X")>;
+
+def : InstRW<[Write4PassMAI, MIMFMARead], (instregex "^V_SMFMAC_.32_16X16X")>;
+def : InstRW<[Write8PassMAI, MIMFMARead], (instregex "^V_SMFMAC_.32_32X32X")>;
+
+} // End SchedModel = SIDPGFX940FullSpeedModel
+
let SchedModel = GFX10SpeedModel in {
// The latency values are 1 / (operations / cycle).
@@ -273,3 +312,29 @@ def : HWWriteRes<WriteBarrier, [HWBranch], 2000>;
def : InstRW<[WriteCopy], (instrs COPY)>;
} // End SchedModel = GFX10SpeedModel
+
+let SchedModel = GFX11SpeedModel in {
+
+def : HWWriteRes<Write32Bit, [HWVALU, HWRC], 5>;
+def : HWWriteRes<WriteFloatCvt, [HWVALU, HWRC], 5>;
+def : HWWriteRes<Write64Bit, [HWVALU, HWRC], 6>;
+def : HWWriteRes<WriteTrans32, [HWVALU, HWRC], 10>;
+def : HWWriteRes<WriteQuarterRate32, [HWVALU, HWRC], 8>;
+def : HWWriteRes<WriteFloatFMA, [HWVALU, HWRC], 5>;
+def : HWWriteRes<WriteDouble, [HWVALU, HWRC], 38>;
+def : HWWriteRes<WriteDoubleAdd, [HWVALU, HWRC], 38>;
+def : HWWriteRes<WriteDoubleCvt, [HWVALU, HWRC], 38>;
+def : HWWriteRes<WriteIntMul, [HWVALU, HWRC], 8>;
+def : HWWriteRes<WriteTrans64, [HWVALU, HWRC], 40>;
+
+def : HWWriteRes<WriteBranch, [HWBranch], 32>;
+def : HWWriteRes<WriteExport, [HWExport, HWRC], 16>;
+def : HWWriteRes<WriteLDS, [HWLGKM, HWRC], 20>;
+def : HWWriteRes<WriteSALU, [HWSALU, HWRC], 2>;
+def : HWWriteRes<WriteSMEM, [HWLGKM, HWRC], 20>;
+def : HWWriteRes<WriteVMEM, [HWVMEM, HWRC], 320>;
+def : HWWriteRes<WriteBarrier, [HWBranch], 2000>;
+
+def : InstRW<[WriteCopy], (instrs COPY)>;
+
+} // End SchedModel = GFX11SpeedModel
diff --git a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
index c8f1daf26de9..05d2dd000162 100644
--- a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
+++ b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
@@ -26,15 +26,40 @@ using namespace llvm;
namespace {
class SIShrinkInstructions : public MachineFunctionPass {
+ MachineRegisterInfo *MRI;
+ const GCNSubtarget *ST;
+ const SIInstrInfo *TII;
+ const SIRegisterInfo *TRI;
+
public:
static char ID;
- void shrinkMIMG(MachineInstr &MI);
-
public:
SIShrinkInstructions() : MachineFunctionPass(ID) {
}
+ bool foldImmediates(MachineInstr &MI, bool TryToCommute = true) const;
+ bool isKImmOperand(const MachineOperand &Src) const;
+ bool isKUImmOperand(const MachineOperand &Src) const;
+ bool isKImmOrKUImmOperand(const MachineOperand &Src, bool &IsUnsigned) const;
+ bool isReverseInlineImm(const MachineOperand &Src, int32_t &ReverseImm) const;
+ void copyExtraImplicitOps(MachineInstr &NewMI, MachineInstr &MI) const;
+ void shrinkScalarCompare(MachineInstr &MI) const;
+ void shrinkMIMG(MachineInstr &MI) const;
+ void shrinkMadFma(MachineInstr &MI) const;
+ bool shrinkScalarLogicOp(MachineInstr &MI) const;
+ bool tryReplaceDeadSDST(MachineInstr &MI) const;
+ bool instAccessReg(iterator_range<MachineInstr::const_mop_iterator> &&R,
+ Register Reg, unsigned SubReg) const;
+ bool instReadsReg(const MachineInstr *MI, unsigned Reg,
+ unsigned SubReg) const;
+ bool instModifiesReg(const MachineInstr *MI, unsigned Reg,
+ unsigned SubReg) const;
+ TargetInstrInfo::RegSubRegPair getSubRegForIndex(Register Reg, unsigned Sub,
+ unsigned I) const;
+ void dropInstructionKeepingImpDefs(MachineInstr &MI) const;
+ MachineInstr *matchSwap(MachineInstr &MovT) const;
+
bool runOnMachineFunction(MachineFunction &MF) override;
StringRef getPassName() const override { return "SI Shrink Instructions"; }
@@ -59,8 +84,8 @@ FunctionPass *llvm::createSIShrinkInstructionsPass() {
/// This function checks \p MI for operands defined by a move immediate
/// instruction and then folds the literal constant into the instruction if it
/// can. This function assumes that \p MI is a VOP1, VOP2, or VOPC instructions.
-static bool foldImmediates(MachineInstr &MI, const SIInstrInfo *TII,
- MachineRegisterInfo &MRI, bool TryToCommute = true) {
+bool SIShrinkInstructions::foldImmediates(MachineInstr &MI,
+ bool TryToCommute) const {
assert(TII->isVOP1(MI) || TII->isVOP2(MI) || TII->isVOPC(MI));
int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0);
@@ -69,8 +94,8 @@ static bool foldImmediates(MachineInstr &MI, const SIInstrInfo *TII,
MachineOperand &Src0 = MI.getOperand(Src0Idx);
if (Src0.isReg()) {
Register Reg = Src0.getReg();
- if (Reg.isVirtual() && MRI.hasOneUse(Reg)) {
- MachineInstr *Def = MRI.getUniqueVRegDef(Reg);
+ if (Reg.isVirtual()) {
+ MachineInstr *Def = MRI->getUniqueVRegDef(Reg);
if (Def && Def->isMoveImmediate()) {
MachineOperand &MovSrc = Def->getOperand(1);
bool ConstantFolded = false;
@@ -91,8 +116,8 @@ static bool foldImmediates(MachineInstr &MI, const SIInstrInfo *TII,
}
if (ConstantFolded) {
- assert(MRI.use_empty(Reg));
- Def->eraseFromParent();
+ if (MRI->use_nodbg_empty(Reg))
+ Def->eraseFromParent();
++NumLiteralConstantsFolded;
return true;
}
@@ -103,7 +128,7 @@ static bool foldImmediates(MachineInstr &MI, const SIInstrInfo *TII,
// We have failed to fold src0, so commute the instruction and try again.
if (TryToCommute && MI.isCommutable()) {
if (TII->commuteInstruction(MI)) {
- if (foldImmediates(MI, TII, MRI, false))
+ if (foldImmediates(MI, false))
return true;
// Commute back.
@@ -114,21 +139,20 @@ static bool foldImmediates(MachineInstr &MI, const SIInstrInfo *TII,
return false;
}
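
A minimal standalone sketch of the folding idea in foldImmediates above: when a
source operand comes from a move-immediate, copy the literal into the user and
erase the move once it has no remaining (non-debug) uses. The struct and helper
below are hypothetical, not LLVM's MachineInstr/MachineRegisterInfo.

// fold_imm_sketch.cpp - illustrative only, mirrors the shape of the rewrite.
#include <cassert>
#include <cstdint>
#include <optional>
#include <vector>

struct Inst {
  bool isMoveImm = false;
  std::optional<uint32_t> literal;   // present on move-immediates
  int def = -1;                      // register defined
  std::vector<int> uses;             // registers read
};

// Fold defMI's literal into `use` if defMI is a move-immediate defining `reg`.
// Erase the move (here: reset it) when this was the register's only use.
static bool foldImmediate(Inst &use, Inst &defMI, int reg, int usesOfReg) {
  if (!defMI.isMoveImm || defMI.def != reg)
    return false;
  for (int &u : use.uses) {
    if (u != reg)
      continue;
    u = -1;                          // the operand becomes a literal slot
    use.literal = defMI.literal;
    if (usesOfReg == 1)              // the move is now dead
      defMI = Inst{};
    return true;
  }
  return false;
}

int main() {
  Inst mov{true, 0x3f800000u, /*def=*/5, {}};
  Inst add{false, std::nullopt, /*def=*/6, {5, 7}};
  bool folded = foldImmediate(add, mov, /*reg=*/5, /*usesOfReg=*/1);
  assert(folded && add.literal && *add.literal == 0x3f800000u);
  return 0;
}
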
-static bool isKImmOperand(const SIInstrInfo *TII, const MachineOperand &Src) {
+bool SIShrinkInstructions::isKImmOperand(const MachineOperand &Src) const {
return isInt<16>(Src.getImm()) &&
!TII->isInlineConstant(*Src.getParent(),
Src.getParent()->getOperandNo(&Src));
}
-static bool isKUImmOperand(const SIInstrInfo *TII, const MachineOperand &Src) {
+bool SIShrinkInstructions::isKUImmOperand(const MachineOperand &Src) const {
return isUInt<16>(Src.getImm()) &&
!TII->isInlineConstant(*Src.getParent(),
Src.getParent()->getOperandNo(&Src));
}
-static bool isKImmOrKUImmOperand(const SIInstrInfo *TII,
- const MachineOperand &Src,
- bool &IsUnsigned) {
+bool SIShrinkInstructions::isKImmOrKUImmOperand(const MachineOperand &Src,
+ bool &IsUnsigned) const {
if (isInt<16>(Src.getImm())) {
IsUnsigned = false;
return !TII->isInlineConstant(Src);
@@ -144,9 +168,8 @@ static bool isKImmOrKUImmOperand(const SIInstrInfo *TII,
/// \returns true if the constant in \p Src should be replaced with a bitreverse
/// of an inline immediate.
-static bool isReverseInlineImm(const SIInstrInfo *TII,
- const MachineOperand &Src,
- int32_t &ReverseImm) {
+bool SIShrinkInstructions::isReverseInlineImm(const MachineOperand &Src,
+ int32_t &ReverseImm) const {
if (!isInt<32>(Src.getImm()) || TII->isInlineConstant(Src))
return false;
@@ -156,8 +179,9 @@ static bool isReverseInlineImm(const SIInstrInfo *TII,
/// Copy implicit register operands from specified instruction to this
/// instruction that are not part of the instruction definition.
-static void copyExtraImplicitOps(MachineInstr &NewMI, MachineFunction &MF,
- const MachineInstr &MI) {
+void SIShrinkInstructions::copyExtraImplicitOps(MachineInstr &NewMI,
+ MachineInstr &MI) const {
+ MachineFunction &MF = *MI.getMF();
for (unsigned i = MI.getDesc().getNumOperands() +
MI.getDesc().getNumImplicitUses() +
MI.getDesc().getNumImplicitDefs(), e = MI.getNumOperands();
@@ -168,7 +192,7 @@ static void copyExtraImplicitOps(MachineInstr &NewMI, MachineFunction &MF,
}
}
-static void shrinkScalarCompare(const SIInstrInfo *TII, MachineInstr &MI) {
+void SIShrinkInstructions::shrinkScalarCompare(MachineInstr &MI) const {
// cmpk instructions do scc = dst <cc op> imm16, so commute the instruction to
// get constants on the RHS.
if (!MI.getOperand(0).isReg())
@@ -191,7 +215,7 @@ static void shrinkScalarCompare(const SIInstrInfo *TII, MachineInstr &MI) {
// and initially selected to the unsigned versions.
if (SOPKOpc == AMDGPU::S_CMPK_EQ_U32 || SOPKOpc == AMDGPU::S_CMPK_LG_U32) {
bool HasUImm;
- if (isKImmOrKUImmOperand(TII, Src1, HasUImm)) {
+ if (isKImmOrKUImmOperand(Src1, HasUImm)) {
if (!HasUImm) {
SOPKOpc = (SOPKOpc == AMDGPU::S_CMPK_EQ_U32) ?
AMDGPU::S_CMPK_EQ_I32 : AMDGPU::S_CMPK_LG_I32;
@@ -205,22 +229,30 @@ static void shrinkScalarCompare(const SIInstrInfo *TII, MachineInstr &MI) {
const MCInstrDesc &NewDesc = TII->get(SOPKOpc);
- if ((TII->sopkIsZext(SOPKOpc) && isKUImmOperand(TII, Src1)) ||
- (!TII->sopkIsZext(SOPKOpc) && isKImmOperand(TII, Src1))) {
+ if ((TII->sopkIsZext(SOPKOpc) && isKUImmOperand(Src1)) ||
+ (!TII->sopkIsZext(SOPKOpc) && isKImmOperand(Src1))) {
MI.setDesc(NewDesc);
}
}
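
A standalone illustration of the immediate classification that drives the
s_cmp -> s_cmpk shrink above. pickCmpK and the fits* helpers are made-up names
used only for this sketch; the opcode strings are the real instructions.

#include <cstdint>
#include <iostream>

static bool fitsSImm16(int64_t v) { return v >= INT16_MIN && v <= INT16_MAX; }
static bool fitsUImm16(int64_t v) { return v >= 0 && v <= UINT16_MAX; }

// For an equality-style compare the signed and unsigned forms are
// interchangeable, so prefer whichever 16-bit encoding the literal fits.
static const char *pickCmpK(int64_t imm) {
  if (fitsSImm16(imm))
    return "s_cmpk_eq_i32";   // sign-extended 16-bit immediate
  if (fitsUImm16(imm))
    return "s_cmpk_eq_u32";   // zero-extended 16-bit immediate
  return "s_cmp_eq_u32";      // keep the 32-bit literal form
}

int main() {
  std::cout << pickCmpK(-7) << '\n';      // fits simm16 -> s_cmpk_eq_i32
  std::cout << pickCmpK(40000) << '\n';   // only uimm16 -> s_cmpk_eq_u32
  std::cout << pickCmpK(1 << 20) << '\n'; // neither     -> s_cmp_eq_u32
  return 0;
}
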
// Shrink NSA encoded instructions with contiguous VGPRs to non-NSA encoding.
-void SIShrinkInstructions::shrinkMIMG(MachineInstr &MI) {
+void SIShrinkInstructions::shrinkMIMG(MachineInstr &MI) const {
const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(MI.getOpcode());
- if (!Info || Info->MIMGEncoding != AMDGPU::MIMGEncGfx10NSA)
+ if (!Info)
return;
- MachineFunction *MF = MI.getParent()->getParent();
- const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
- const SIInstrInfo *TII = ST.getInstrInfo();
- const SIRegisterInfo &TRI = TII->getRegisterInfo();
+ uint8_t NewEncoding;
+ switch (Info->MIMGEncoding) {
+ case AMDGPU::MIMGEncGfx10NSA:
+ NewEncoding = AMDGPU::MIMGEncGfx10Default;
+ break;
+ case AMDGPU::MIMGEncGfx11NSA:
+ NewEncoding = AMDGPU::MIMGEncGfx11Default;
+ break;
+ default:
+ return;
+ }
+
int VAddr0Idx =
AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vaddr0);
unsigned NewAddrDwords = Info->VAddrDwords;
@@ -246,16 +278,23 @@ void SIShrinkInstructions::shrinkMIMG(MachineInstr &MI) {
}
unsigned VgprBase = 0;
+ unsigned NextVgpr = 0;
bool IsUndef = true;
bool IsKill = NewAddrDwords == Info->VAddrDwords;
- for (unsigned i = 0; i < Info->VAddrDwords; ++i) {
- const MachineOperand &Op = MI.getOperand(VAddr0Idx + i);
- unsigned Vgpr = TRI.getHWRegIndex(Op.getReg());
+ for (unsigned Idx = 0; Idx < Info->VAddrOperands; ++Idx) {
+ const MachineOperand &Op = MI.getOperand(VAddr0Idx + Idx);
+ unsigned Vgpr = TRI->getHWRegIndex(Op.getReg());
+ unsigned Dwords = TRI->getRegSizeInBits(Op.getReg(), *MRI) / 32;
+ assert(Dwords > 0 && "Un-implemented for less than 32 bit regs");
- if (i == 0) {
+ if (Idx == 0) {
VgprBase = Vgpr;
- } else if (VgprBase + i != Vgpr)
+ NextVgpr = Vgpr + Dwords;
+ } else if (Vgpr == NextVgpr) {
+ NextVgpr = Vgpr + Dwords;
+ } else {
return;
+ }
if (!Op.isUndef())
IsUndef = false;
@@ -288,21 +327,108 @@ void SIShrinkInstructions::shrinkMIMG(MachineInstr &MI) {
}
}
- unsigned NewOpcode =
- AMDGPU::getMIMGOpcode(Info->BaseOpcode, AMDGPU::MIMGEncGfx10Default,
- Info->VDataDwords, NewAddrDwords);
+ unsigned NewOpcode = AMDGPU::getMIMGOpcode(Info->BaseOpcode, NewEncoding,
+ Info->VDataDwords, NewAddrDwords);
MI.setDesc(TII->get(NewOpcode));
MI.getOperand(VAddr0Idx).setReg(RC->getRegister(VgprBase));
MI.getOperand(VAddr0Idx).setIsUndef(IsUndef);
MI.getOperand(VAddr0Idx).setIsKill(IsKill);
- for (unsigned i = 1; i < Info->VAddrDwords; ++i)
- MI.RemoveOperand(VAddr0Idx + 1);
+ for (int i = 1; i < Info->VAddrOperands; ++i)
+ MI.removeOperand(VAddr0Idx + 1);
if (ToUntie >= 0) {
MI.tieOperands(
AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata),
- ToUntie - (Info->VAddrDwords - 1));
+ ToUntie - (Info->VAddrOperands - 1));
+ }
+}
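
The VgprBase/NextVgpr loop above boils down to a contiguity check over the
address operands: each one must start exactly where the previous one ended. A
self-contained sketch of that check, with operands reduced to (hardware VGPR
index, size in dwords) pairs; names are illustrative only.

#include <cstdio>
#include <utility>
#include <vector>

static bool isContiguous(const std::vector<std::pair<unsigned, unsigned>> &Ops,
                         unsigned &Base) {
  unsigned Next = 0;
  for (size_t I = 0; I < Ops.size(); ++I) {
    if (I == 0)
      Base = Ops[I].first;
    else if (Ops[I].first != Next)
      return false;               // gap or overlap: keep the NSA form
    Next = Ops[I].first + Ops[I].second;
  }
  return true;
}

int main() {
  unsigned Base = 0;
  // v4, v[5:6], v7 -> contiguous, can collapse to v[4:7] in the packed form.
  std::vector<std::pair<unsigned, unsigned>> A{{4, 1}, {5, 2}, {7, 1}};
  bool OK = isContiguous(A, Base);
  std::printf("%d base v%u\n", OK, Base);
  // v4, v6 -> not contiguous, must stay NSA.
  std::vector<std::pair<unsigned, unsigned>> B{{4, 1}, {6, 1}};
  std::printf("%d\n", isContiguous(B, Base));
  return 0;
}
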
+
+// Shrink MAD to MADAK/MADMK and FMA to FMAAK/FMAMK.
+void SIShrinkInstructions::shrinkMadFma(MachineInstr &MI) const {
+ if (!ST->hasVOP3Literal())
+ return;
+
+ if (TII->hasAnyModifiersSet(MI))
+ return;
+
+ const unsigned Opcode = MI.getOpcode();
+ MachineOperand &Src0 = *TII->getNamedOperand(MI, AMDGPU::OpName::src0);
+ MachineOperand &Src1 = *TII->getNamedOperand(MI, AMDGPU::OpName::src1);
+ MachineOperand &Src2 = *TII->getNamedOperand(MI, AMDGPU::OpName::src2);
+ unsigned NewOpcode = AMDGPU::INSTRUCTION_LIST_END;
+
+ bool Swap;
+
+ // Detect "Dst = VSrc * VGPR + Imm" and convert to AK form.
+ if (Src2.isImm() && !TII->isInlineConstant(Src2)) {
+ if (Src1.isReg() && TRI->isVGPR(*MRI, Src1.getReg()))
+ Swap = false;
+ else if (Src0.isReg() && TRI->isVGPR(*MRI, Src0.getReg()))
+ Swap = true;
+ else
+ return;
+
+ switch (Opcode) {
+ default:
+ llvm_unreachable("Unexpected mad/fma opcode!");
+ case AMDGPU::V_MAD_F32_e64:
+ NewOpcode = AMDGPU::V_MADAK_F32;
+ break;
+ case AMDGPU::V_FMA_F32_e64:
+ NewOpcode = AMDGPU::V_FMAAK_F32;
+ break;
+ case AMDGPU::V_MAD_F16_e64:
+ NewOpcode = AMDGPU::V_MADAK_F16;
+ break;
+ case AMDGPU::V_FMA_F16_e64:
+ NewOpcode = AMDGPU::V_FMAAK_F16;
+ break;
+ }
+ }
+
+ // Detect "Dst = VSrc * Imm + VGPR" and convert to MK form.
+ if (Src2.isReg() && TRI->isVGPR(*MRI, Src2.getReg())) {
+ if (Src1.isImm() && !TII->isInlineConstant(Src1))
+ Swap = false;
+ else if (Src0.isImm() && !TII->isInlineConstant(Src0))
+ Swap = true;
+ else
+ return;
+
+ switch (Opcode) {
+ default:
+ llvm_unreachable("Unexpected mad/fma opcode!");
+ case AMDGPU::V_MAD_F32_e64:
+ NewOpcode = AMDGPU::V_MADMK_F32;
+ break;
+ case AMDGPU::V_FMA_F32_e64:
+ NewOpcode = AMDGPU::V_FMAMK_F32;
+ break;
+ case AMDGPU::V_MAD_F16_e64:
+ NewOpcode = AMDGPU::V_MADMK_F16;
+ break;
+ case AMDGPU::V_FMA_F16_e64:
+ NewOpcode = AMDGPU::V_FMAMK_F16;
+ break;
+ }
+ }
+
+ if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END)
+ return;
+
+ if (Swap) {
+ // Swap Src0 and Src1 by building a new instruction.
+ BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), TII->get(NewOpcode),
+ MI.getOperand(0).getReg())
+ .add(Src1)
+ .add(Src0)
+ .add(Src2)
+ .setMIFlags(MI.getFlags());
+ MI.eraseFromParent();
+ } else {
+ TII->removeModOperands(MI);
+ MI.setDesc(TII->get(NewOpcode));
}
}
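
A compact standalone sketch of the operand-shape decision shrinkMadFma makes.
The struct and function below are illustrative only; the instruction names are
real. The VOP2 forms need the remaining VGPR (or the literal) in a fixed source
slot, which is why src0 and src1 are sometimes swapped:
  fmaak: d = s0 * s1 + K   (literal in src2, the VGPR must end up in src1)
  fmamk: d = s0 * K  + s1  (literal takes src1's place, src2 must be a VGPR)

#include <iostream>
#include <string>

struct Operand { bool isVGPR; bool isLiteral; };

static std::string shrinkFma(Operand s0, Operand s1, Operand s2, bool &swap01) {
  swap01 = false;
  if (s2.isLiteral) {                       // d = a * b + K  ->  fmaak
    if (s1.isVGPR)       swap01 = false;
    else if (s0.isVGPR)  swap01 = true;     // move the VGPR into src1
    else                 return "keep v_fma_f32_e64";
    return "v_fmaak_f32";
  }
  if (s2.isVGPR) {                          // d = a * K + c  ->  fmamk
    if (s1.isLiteral)      swap01 = false;
    else if (s0.isLiteral) swap01 = true;   // move the literal into src1
    else                   return "keep v_fma_f32_e64";
    return "v_fmamk_f32";
  }
  return "keep v_fma_f32_e64";
}

int main() {
  bool swap;
  // v_fma_f32 v0, s1, v2, lit  ->  v_fmaak_f32 v0, s1, v2, lit
  std::cout << shrinkFma({false, false}, {true, false}, {false, true}, swap)
            << " swap=" << swap << '\n';
  // v_fma_f32 v0, lit, v2, v3  ->  v_fmamk_f32 v0, v2, lit, v3 (after swap)
  std::cout << shrinkFma({false, true}, {true, false}, {true, false}, swap)
            << " swap=" << swap << '\n';
  return 0;
}
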
@@ -311,10 +437,7 @@ void SIShrinkInstructions::shrinkMIMG(MachineInstr &MI) {
/// If the inverse of the immediate is legal, use ANDN2, ORN2 or
/// XNOR (as a ^ b == ~(a ^ ~b)).
/// \returns true if the caller should continue the machine function iterator
-static bool shrinkScalarLogicOp(const GCNSubtarget &ST,
- MachineRegisterInfo &MRI,
- const SIInstrInfo *TII,
- MachineInstr &MI) {
+bool SIShrinkInstructions::shrinkScalarLogicOp(MachineInstr &MI) const {
unsigned Opc = MI.getOpcode();
const MachineOperand *Dest = &MI.getOperand(0);
MachineOperand *Src0 = &MI.getOperand(1);
@@ -323,7 +446,7 @@ static bool shrinkScalarLogicOp(const GCNSubtarget &ST,
MachineOperand *SrcImm = Src1;
if (!SrcImm->isImm() ||
- AMDGPU::isInlinableLiteral32(SrcImm->getImm(), ST.hasInv2PiInlineImm()))
+ AMDGPU::isInlinableLiteral32(SrcImm->getImm(), ST->hasInv2PiInlineImm()))
return false;
uint32_t Imm = static_cast<uint32_t>(SrcImm->getImm());
@@ -333,7 +456,7 @@ static bool shrinkScalarLogicOp(const GCNSubtarget &ST,
if (isPowerOf2_32(~Imm)) {
NewImm = countTrailingOnes(Imm);
Opc = AMDGPU::S_BITSET0_B32;
- } else if (AMDGPU::isInlinableLiteral32(~Imm, ST.hasInv2PiInlineImm())) {
+ } else if (AMDGPU::isInlinableLiteral32(~Imm, ST->hasInv2PiInlineImm())) {
NewImm = ~Imm;
Opc = AMDGPU::S_ANDN2_B32;
}
@@ -341,12 +464,12 @@ static bool shrinkScalarLogicOp(const GCNSubtarget &ST,
if (isPowerOf2_32(Imm)) {
NewImm = countTrailingZeros(Imm);
Opc = AMDGPU::S_BITSET1_B32;
- } else if (AMDGPU::isInlinableLiteral32(~Imm, ST.hasInv2PiInlineImm())) {
+ } else if (AMDGPU::isInlinableLiteral32(~Imm, ST->hasInv2PiInlineImm())) {
NewImm = ~Imm;
Opc = AMDGPU::S_ORN2_B32;
}
} else if (Opc == AMDGPU::S_XOR_B32) {
- if (AMDGPU::isInlinableLiteral32(~Imm, ST.hasInv2PiInlineImm())) {
+ if (AMDGPU::isInlinableLiteral32(~Imm, ST->hasInv2PiInlineImm())) {
NewImm = ~Imm;
Opc = AMDGPU::S_XNOR_B32;
}
@@ -354,16 +477,10 @@ static bool shrinkScalarLogicOp(const GCNSubtarget &ST,
llvm_unreachable("unexpected opcode");
}
- if ((Opc == AMDGPU::S_ANDN2_B32 || Opc == AMDGPU::S_ORN2_B32) &&
- SrcImm == Src0) {
- if (!TII->commuteInstruction(MI, false, 1, 2))
- NewImm = 0;
- }
-
if (NewImm != 0) {
if (Dest->getReg().isVirtual() && SrcReg->isReg()) {
- MRI.setRegAllocationHint(Dest->getReg(), 0, SrcReg->getReg());
- MRI.setRegAllocationHint(SrcReg->getReg(), 0, Dest->getReg());
+ MRI->setRegAllocationHint(Dest->getReg(), 0, SrcReg->getReg());
+ MRI->setRegAllocationHint(SrcReg->getReg(), 0, Dest->getReg());
return true;
}
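
The rewrites chosen by shrinkScalarLogicOp rest on a few bit identities; this
small self-contained program checks them numerically. The helper names are made
up, and the s_* mnemonics in the comments are the instructions the shrink
targets.

#include <cassert>
#include <cstdint>

static uint32_t clearBit(uint32_t x, unsigned k) { return x & ~(1u << k); }
static uint32_t setBit(uint32_t x, unsigned k) { return x | (1u << k); }

int main() {
  uint32_t x = 0xDEADBEEFu;

  uint32_t andImm = ~(1u << 9);              // single zero bit -> s_bitset0_b32
  assert((x & andImm) == clearBit(x, 9));

  uint32_t orImm = 1u << 21;                 // single one bit  -> s_bitset1_b32
  assert((x | orImm) == setBit(x, 21));

  uint32_t imm = 0xFFFFFFC0u;                // ~imm == 63, an inlineable value
  assert((x & imm) == (x & ~uint32_t(63)));  // s_andn2_b32 x, x, 63
  assert((x | imm) == (x | ~uint32_t(63)));  // s_orn2_b32  x, x, 63
  assert((x ^ imm) == ~(x ^ uint32_t(63)));  // s_xnor_b32  x, x, 63
  return 0;
}
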
@@ -390,19 +507,19 @@ static bool shrinkScalarLogicOp(const GCNSubtarget &ST,
// This is the same as MachineInstr::readsRegister/modifiesRegister except
// it takes subregs into account.
-static bool instAccessReg(iterator_range<MachineInstr::const_mop_iterator> &&R,
- Register Reg, unsigned SubReg,
- const SIRegisterInfo &TRI) {
+bool SIShrinkInstructions::instAccessReg(
+ iterator_range<MachineInstr::const_mop_iterator> &&R, Register Reg,
+ unsigned SubReg) const {
for (const MachineOperand &MO : R) {
if (!MO.isReg())
continue;
if (Reg.isPhysical() && MO.getReg().isPhysical()) {
- if (TRI.regsOverlap(Reg, MO.getReg()))
+ if (TRI->regsOverlap(Reg, MO.getReg()))
return true;
} else if (MO.getReg() == Reg && Reg.isVirtual()) {
- LaneBitmask Overlap = TRI.getSubRegIndexLaneMask(SubReg) &
- TRI.getSubRegIndexLaneMask(MO.getSubReg());
+ LaneBitmask Overlap = TRI->getSubRegIndexLaneMask(SubReg) &
+ TRI->getSubRegIndexLaneMask(MO.getSubReg());
if (Overlap.any())
return true;
}
@@ -410,33 +527,31 @@ static bool instAccessReg(iterator_range<MachineInstr::const_mop_iterator> &&R,
return false;
}
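
For reference, the subregister-aware aliasing test above reduces to a lane-mask
intersection. A toy version with made-up mask values (LLVM derives the real
lane masks from TableGen; these constants are only for illustration):

#include <cassert>
#include <cstdint>

constexpr uint64_t LanesSub0    = 0x3;   // low half of a 64-bit vreg
constexpr uint64_t LanesSub1    = 0xC;   // high half
constexpr uint64_t LanesSub0_63 = 0xF;   // the whole 64-bit register

static bool lanesOverlap(uint64_t A, uint64_t B) { return (A & B) != 0; }

int main() {
  assert(!lanesOverlap(LanesSub0, LanesSub1));     // disjoint halves
  assert(lanesOverlap(LanesSub0, LanesSub0_63));   // a half vs. the full reg
  assert(lanesOverlap(LanesSub1, LanesSub0_63));
  return 0;
}
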
-static bool instReadsReg(const MachineInstr *MI,
- unsigned Reg, unsigned SubReg,
- const SIRegisterInfo &TRI) {
- return instAccessReg(MI->uses(), Reg, SubReg, TRI);
+bool SIShrinkInstructions::instReadsReg(const MachineInstr *MI, unsigned Reg,
+ unsigned SubReg) const {
+ return instAccessReg(MI->uses(), Reg, SubReg);
}
-static bool instModifiesReg(const MachineInstr *MI,
- unsigned Reg, unsigned SubReg,
- const SIRegisterInfo &TRI) {
- return instAccessReg(MI->defs(), Reg, SubReg, TRI);
+bool SIShrinkInstructions::instModifiesReg(const MachineInstr *MI, unsigned Reg,
+ unsigned SubReg) const {
+ return instAccessReg(MI->defs(), Reg, SubReg);
}
-static TargetInstrInfo::RegSubRegPair
-getSubRegForIndex(Register Reg, unsigned Sub, unsigned I,
- const SIRegisterInfo &TRI, const MachineRegisterInfo &MRI) {
- if (TRI.getRegSizeInBits(Reg, MRI) != 32) {
+TargetInstrInfo::RegSubRegPair
+SIShrinkInstructions::getSubRegForIndex(Register Reg, unsigned Sub,
+ unsigned I) const {
+ if (TRI->getRegSizeInBits(Reg, *MRI) != 32) {
if (Reg.isPhysical()) {
- Reg = TRI.getSubReg(Reg, TRI.getSubRegFromChannel(I));
+ Reg = TRI->getSubReg(Reg, TRI->getSubRegFromChannel(I));
} else {
- Sub = TRI.getSubRegFromChannel(I + TRI.getChannelFromSubReg(Sub));
+ Sub = TRI->getSubRegFromChannel(I + TRI->getChannelFromSubReg(Sub));
}
}
return TargetInstrInfo::RegSubRegPair(Reg, Sub);
}
-static void dropInstructionKeepingImpDefs(MachineInstr &MI,
- const SIInstrInfo *TII) {
+void SIShrinkInstructions::dropInstructionKeepingImpDefs(
+ MachineInstr &MI) const {
for (unsigned i = MI.getDesc().getNumOperands() +
MI.getDesc().getNumImplicitUses() +
MI.getDesc().getNumImplicitDefs(), e = MI.getNumOperands();
@@ -464,14 +579,13 @@ static void dropInstructionKeepingImpDefs(MachineInstr &MI,
// Returns next valid instruction pointer if was able to create v_swap_b32.
//
// This shall not be done too early not to prevent possible folding which may
-// remove matched moves, and this should prefereably be done before RA to
+// remove matched moves, and this should preferably be done before RA to
// release saved registers and also possibly after RA which can insert copies
// too.
//
-// This is really just a generic peephole that is not a canocical shrinking,
+// This is really just a generic peephole that is not a canonical shrinking,
// although requirements match the pass placement and it reduces code size too.
-static MachineInstr* matchSwap(MachineInstr &MovT, MachineRegisterInfo &MRI,
- const SIInstrInfo *TII) {
+MachineInstr *SIShrinkInstructions::matchSwap(MachineInstr &MovT) const {
assert(MovT.getOpcode() == AMDGPU::V_MOV_B32_e32 ||
MovT.getOpcode() == AMDGPU::COPY);
@@ -486,8 +600,7 @@ static MachineInstr* matchSwap(MachineInstr &MovT, MachineRegisterInfo &MRI,
unsigned Size = TII->getOpSize(MovT, 0) / 4;
- const SIRegisterInfo &TRI = TII->getRegisterInfo();
- if (!TRI.isVGPR(MRI, X))
+ if (!TRI->isVGPR(*MRI, X))
return nullptr;
if (MovT.hasRegisterImplicitUseOperand(AMDGPU::M0))
@@ -501,7 +614,7 @@ static MachineInstr* matchSwap(MachineInstr &MovT, MachineRegisterInfo &MRI,
Iter != E && Count < SearchLimit && !KilledT; ++Iter, ++Count) {
MachineInstr *MovY = &*Iter;
- KilledT = MovY->killsRegister(T, &TRI);
+ KilledT = MovY->killsRegister(T, TRI);
if ((MovY->getOpcode() != AMDGPU::V_MOV_B32_e32 &&
MovY->getOpcode() != AMDGPU::COPY) ||
@@ -514,21 +627,20 @@ static MachineInstr* matchSwap(MachineInstr &MovT, MachineRegisterInfo &MRI,
Register Y = MovY->getOperand(0).getReg();
unsigned Ysub = MovY->getOperand(0).getSubReg();
- if (!TRI.isVGPR(MRI, Y))
+ if (!TRI->isVGPR(*MRI, Y))
continue;
MachineInstr *MovX = nullptr;
for (auto IY = MovY->getIterator(), I = std::next(MovT.getIterator());
I != IY; ++I) {
- if (instReadsReg(&*I, X, Xsub, TRI) ||
- instModifiesReg(&*I, Y, Ysub, TRI) ||
- instModifiesReg(&*I, T, Tsub, TRI) ||
- (MovX && instModifiesReg(&*I, X, Xsub, TRI))) {
+ if (instReadsReg(&*I, X, Xsub) || instModifiesReg(&*I, Y, Ysub) ||
+ instModifiesReg(&*I, T, Tsub) ||
+ (MovX && instModifiesReg(&*I, X, Xsub))) {
MovX = nullptr;
break;
}
- if (!instReadsReg(&*I, Y, Ysub, TRI)) {
- if (!MovX && instModifiesReg(&*I, X, Xsub, TRI)) {
+ if (!instReadsReg(&*I, Y, Ysub)) {
+ if (!MovX && instModifiesReg(&*I, X, Xsub)) {
MovX = nullptr;
break;
}
@@ -559,8 +671,8 @@ static MachineInstr* matchSwap(MachineInstr &MovT, MachineRegisterInfo &MRI,
for (unsigned I = 0; I < Size; ++I) {
TargetInstrInfo::RegSubRegPair X1, Y1;
- X1 = getSubRegForIndex(X, Xsub, I, TRI, MRI);
- Y1 = getSubRegForIndex(Y, Ysub, I, TRI, MRI);
+ X1 = getSubRegForIndex(X, Xsub, I);
+ Y1 = getSubRegForIndex(Y, Ysub, I);
MachineBasicBlock &MBB = *MovT.getParent();
auto MIB = BuildMI(MBB, MovX->getIterator(), MovT.getDebugLoc(),
TII->get(AMDGPU::V_SWAP_B32))
@@ -570,23 +682,23 @@ static MachineInstr* matchSwap(MachineInstr &MovT, MachineRegisterInfo &MRI,
.addReg(X1.Reg, 0, X1.SubReg).getInstr();
if (MovX->hasRegisterImplicitUseOperand(AMDGPU::EXEC)) {
// Drop implicit EXEC.
- MIB->RemoveOperand(MIB->getNumExplicitOperands());
+ MIB->removeOperand(MIB->getNumExplicitOperands());
MIB->copyImplicitOps(*MBB.getParent(), *MovX);
}
}
MovX->eraseFromParent();
- dropInstructionKeepingImpDefs(*MovY, TII);
+ dropInstructionKeepingImpDefs(*MovY);
MachineInstr *Next = &*std::next(MovT.getIterator());
- if (T.isVirtual() && MRI.use_nodbg_empty(T)) {
- dropInstructionKeepingImpDefs(MovT, TII);
+ if (T.isVirtual() && MRI->use_nodbg_empty(T)) {
+ dropInstructionKeepingImpDefs(MovT);
} else {
Xop.setIsKill(false);
for (int I = MovT.getNumImplicitOperands() - 1; I >= 0; --I ) {
unsigned OpNo = MovT.getNumExplicitOperands() + I;
const MachineOperand &Op = MovT.getOperand(OpNo);
- if (Op.isKill() && TRI.regsOverlap(X, Op.getReg()))
- MovT.RemoveOperand(OpNo);
+ if (Op.isKill() && TRI->regsOverlap(X, Op.getReg()))
+ MovT.removeOperand(OpNo);
}
}
@@ -596,14 +708,32 @@ static MachineInstr* matchSwap(MachineInstr &MovT, MachineRegisterInfo &MRI,
return nullptr;
}
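
The pattern matchSwap recognizes is the classic three-move swap through a
temporary. A standalone toy version of just the match and rewrite follows; it
deliberately omits the interference and implicit-operand checks the real code
performs, and every name in it is hypothetical.

#include <cassert>
#include <string>
#include <vector>

struct Mov { std::string dst, src; };

// t = x; x = y; y = t  at index I collapses to a single swap marker.
static bool matchSwapAt(std::vector<Mov> &Prog, size_t I) {
  if (I + 2 >= Prog.size())
    return false;
  const Mov &A = Prog[I], &B = Prog[I + 1], &C = Prog[I + 2];
  bool isSwap = A.src == B.dst && B.src == C.dst && C.src == A.dst;
  if (!isSwap)
    return false;
  Prog[I] = {B.dst + "<->" + B.src, "v_swap_b32"};
  Prog.erase(Prog.begin() + I + 1, Prog.begin() + I + 3);
  return true;
}

int main() {
  std::vector<Mov> Prog{{"t", "x"}, {"x", "y"}, {"y", "t"}};
  assert(matchSwapAt(Prog, 0) && Prog.size() == 1);
  return 0;
}
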
+// If an instruction has a dead sdst, replace it with the NULL register on gfx1030+.
+bool SIShrinkInstructions::tryReplaceDeadSDST(MachineInstr &MI) const {
+ if (!ST->hasGFX10_3Insts())
+ return false;
+
+ MachineOperand *Op = TII->getNamedOperand(MI, AMDGPU::OpName::sdst);
+ if (!Op)
+ return false;
+ Register SDstReg = Op->getReg();
+ if (SDstReg.isPhysical() || !MRI->use_nodbg_empty(SDstReg))
+ return false;
+
+ Op->setReg(ST->isWave32() ? AMDGPU::SGPR_NULL : AMDGPU::SGPR_NULL64);
+ return true;
+}
+
bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
if (skipFunction(MF.getFunction()))
return false;
- MachineRegisterInfo &MRI = MF.getRegInfo();
- const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
- const SIInstrInfo *TII = ST.getInstrInfo();
- unsigned VCCReg = ST.isWave32() ? AMDGPU::VCC_LO : AMDGPU::VCC;
+ MRI = &MF.getRegInfo();
+ ST = &MF.getSubtarget<GCNSubtarget>();
+ TII = ST->getInstrInfo();
+ TRI = &TII->getRegisterInfo();
+
+ unsigned VCCReg = ST->isWave32() ? AMDGPU::VCC_LO : AMDGPU::VCC;
std::vector<unsigned> I1Defs;
@@ -628,7 +758,7 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
MachineOperand &Src = MI.getOperand(1);
if (Src.isImm() && MI.getOperand(0).getReg().isPhysical()) {
int32_t ReverseImm;
- if (isReverseInlineImm(TII, Src, ReverseImm)) {
+ if (isReverseInlineImm(Src, ReverseImm)) {
MI.setDesc(TII->get(AMDGPU::V_BFREV_B32_e32));
Src.setImm(ReverseImm);
continue;
@@ -636,19 +766,15 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
}
}
- if (ST.hasSwap() && (MI.getOpcode() == AMDGPU::V_MOV_B32_e32 ||
- MI.getOpcode() == AMDGPU::COPY)) {
- if (auto *NextMI = matchSwap(MI, MRI, TII)) {
+ if (ST->hasSwap() && (MI.getOpcode() == AMDGPU::V_MOV_B32_e32 ||
+ MI.getOpcode() == AMDGPU::COPY)) {
+ if (auto *NextMI = matchSwap(MI)) {
Next = NextMI->getIterator();
continue;
}
}
- // FIXME: We also need to consider movs of constant operands since
- // immediate operands are not folded if they have more than one use, and
- // the operand folding pass is unaware if the immediate will be free since
- // it won't know if the src == dest constraint will end up being
- // satisfied.
+ // Try to use S_ADDK_I32 and S_MULK_I32.
if (MI.getOpcode() == AMDGPU::S_ADD_I32 ||
MI.getOpcode() == AMDGPU::S_MUL_I32) {
const MachineOperand *Dest = &MI.getOperand(0);
@@ -664,13 +790,13 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
// we have a vector add of a constant, we usually don't get the correct
// allocation due to the subregister usage.
if (Dest->getReg().isVirtual() && Src0->isReg()) {
- MRI.setRegAllocationHint(Dest->getReg(), 0, Src0->getReg());
- MRI.setRegAllocationHint(Src0->getReg(), 0, Dest->getReg());
+ MRI->setRegAllocationHint(Dest->getReg(), 0, Src0->getReg());
+ MRI->setRegAllocationHint(Src0->getReg(), 0, Dest->getReg());
continue;
}
if (Src0->isReg() && Src0->getReg() == Dest->getReg()) {
- if (Src1->isImm() && isKImmOperand(TII, *Src1)) {
+ if (Src1->isImm() && isKImmOperand(*Src1)) {
unsigned Opc = (MI.getOpcode() == AMDGPU::S_ADD_I32) ?
AMDGPU::S_ADDK_I32 : AMDGPU::S_MULK_I32;
@@ -682,7 +808,7 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
// Try to use s_cmpk_*
if (MI.isCompare() && TII->isSOPC(MI)) {
- shrinkScalarCompare(TII, MI);
+ shrinkScalarCompare(MI);
continue;
}
@@ -693,9 +819,9 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
if (Src.isImm() && Dst.getReg().isPhysical()) {
int32_t ReverseImm;
- if (isKImmOperand(TII, Src))
+ if (isKImmOperand(Src))
MI.setDesc(TII->get(AMDGPU::S_MOVK_I32));
- else if (isReverseInlineImm(TII, Src, ReverseImm)) {
+ else if (isReverseInlineImm(Src, ReverseImm)) {
MI.setDesc(TII->get(AMDGPU::S_BREV_B32));
Src.setImm(ReverseImm);
}
@@ -708,47 +834,70 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
if (MI.getOpcode() == AMDGPU::S_AND_B32 ||
MI.getOpcode() == AMDGPU::S_OR_B32 ||
MI.getOpcode() == AMDGPU::S_XOR_B32) {
- if (shrinkScalarLogicOp(ST, MRI, TII, MI))
+ if (shrinkScalarLogicOp(MI))
continue;
}
if (TII->isMIMG(MI.getOpcode()) &&
- ST.getGeneration() >= AMDGPUSubtarget::GFX10 &&
+ ST->getGeneration() >= AMDGPUSubtarget::GFX10 &&
MF.getProperties().hasProperty(
MachineFunctionProperties::Property::NoVRegs)) {
shrinkMIMG(MI);
continue;
}
- if (!TII->hasVALU32BitEncoding(MI.getOpcode()))
+ if (!TII->isVOP3(MI))
+ continue;
+
+ if (MI.getOpcode() == AMDGPU::V_MAD_F32_e64 ||
+ MI.getOpcode() == AMDGPU::V_FMA_F32_e64 ||
+ MI.getOpcode() == AMDGPU::V_MAD_F16_e64 ||
+ MI.getOpcode() == AMDGPU::V_FMA_F16_e64) {
+ shrinkMadFma(MI);
continue;
+ }
+
+ if (!TII->hasVALU32BitEncoding(MI.getOpcode())) {
+      // If there is no chance we will shrink it and use VCC as sdst to get
+      // a 32 bit form, try to replace a dead sdst with NULL.
+ tryReplaceDeadSDST(MI);
+ continue;
+ }
- if (!TII->canShrink(MI, MRI)) {
+ if (!TII->canShrink(MI, *MRI)) {
// Try commuting the instruction and see if that enables us to shrink
// it.
if (!MI.isCommutable() || !TII->commuteInstruction(MI) ||
- !TII->canShrink(MI, MRI))
+ !TII->canShrink(MI, *MRI)) {
+ tryReplaceDeadSDST(MI);
continue;
+ }
}
int Op32 = AMDGPU::getVOPe32(MI.getOpcode());
if (TII->isVOPC(Op32)) {
- Register DstReg = MI.getOperand(0).getReg();
- if (DstReg.isVirtual()) {
- // VOPC instructions can only write to the VCC register. We can't
- // force them to use VCC here, because this is only one register and
- // cannot deal with sequences which would require multiple copies of
- // VCC, e.g. S_AND_B64 (vcc = V_CMP_...), (vcc = V_CMP_...)
- //
- // So, instead of forcing the instruction to write to VCC, we provide
- // a hint to the register allocator to use VCC and then we will run
- // this pass again after RA and shrink it if it outputs to VCC.
- MRI.setRegAllocationHint(MI.getOperand(0).getReg(), 0, VCCReg);
- continue;
+ MachineOperand &Op0 = MI.getOperand(0);
+ if (Op0.isReg()) {
+ // Exclude VOPCX instructions as these don't explicitly write a
+ // dst.
+ Register DstReg = Op0.getReg();
+ if (DstReg.isVirtual()) {
+ // VOPC instructions can only write to the VCC register. We can't
+ // force them to use VCC here, because this is only one register and
+ // cannot deal with sequences which would require multiple copies of
+ // VCC, e.g. S_AND_B64 (vcc = V_CMP_...), (vcc = V_CMP_...)
+ //
+ // So, instead of forcing the instruction to write to VCC, we
+ // provide a hint to the register allocator to use VCC and then we
+ // will run this pass again after RA and shrink it if it outputs to
+ // VCC.
+ MRI->setRegAllocationHint(DstReg, 0, VCCReg);
+ continue;
+ }
+ if (DstReg != VCCReg)
+ continue;
}
- if (DstReg != VCCReg)
- continue;
}
if (Op32 == AMDGPU::V_CNDMASK_B32_e32) {
@@ -760,7 +909,7 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
continue;
Register SReg = Src2->getReg();
if (SReg.isVirtual()) {
- MRI.setRegAllocationHint(SReg, 0, VCCReg);
+ MRI->setRegAllocationHint(SReg, 0, VCCReg);
continue;
}
if (SReg != VCCReg)
@@ -776,7 +925,7 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
if (SDst->getReg() != VCCReg) {
if (SDst->getReg().isVirtual())
- MRI.setRegAllocationHint(SDst->getReg(), 0, VCCReg);
+ MRI->setRegAllocationHint(SDst->getReg(), 0, VCCReg);
Next = true;
}
@@ -786,7 +935,7 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
AMDGPU::OpName::src2);
if (Src2 && Src2->getReg() != VCCReg) {
if (Src2->getReg().isVirtual())
- MRI.setRegAllocationHint(Src2->getReg(), 0, VCCReg);
+ MRI->setRegAllocationHint(Src2->getReg(), 0, VCCReg);
Next = true;
}
@@ -801,14 +950,14 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
++NumInstructionsShrunk;
// Copy extra operands not present in the instruction definition.
- copyExtraImplicitOps(*Inst32, MF, MI);
+ copyExtraImplicitOps(*Inst32, MI);
// Copy deadness from the old explicit vcc def to the new implicit def.
if (SDst && SDst->isDead())
Inst32->findRegisterDefOperand(VCCReg)->setIsDead();
MI.eraseFromParent();
- foldImmediates(*Inst32, TII, MRI);
+ foldImmediates(*Inst32);
LLVM_DEBUG(dbgs() << "e32 MI = " << *Inst32 << '\n');
}
diff --git a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
index 46efb3c605c6..a5798afab595 100644
--- a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
+++ b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
@@ -349,8 +349,7 @@ void SIWholeQuadMode::markDefs(const MachineInstr &UseMI, LiveRange &LR,
const VNInfo *NextValue = nullptr;
const VisitKey Key(Value, DefinedLanes);
- if (!Visited.count(Key)) {
- Visited.insert(Key);
+ if (Visited.insert(Key).second) {
// On first visit to a phi then start processing first predecessor
NextPredIdx = 0;
}
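
The change above is the usual count()-then-insert() to insert().second cleanup,
doing one lookup instead of two. A trivial standalone illustration:

#include <cassert>
#include <set>
#include <utility>

int main() {
  std::set<std::pair<int, unsigned>> Visited;
  std::pair<int, unsigned> Key{42, 0x3u};
  bool FirstVisit = Visited.insert(Key).second;   // true only on first insert
  assert(FirstVisit);
  assert(!Visited.insert(Key).second);            // already present
  return 0;
}
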
@@ -535,13 +534,36 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
GlobalFlags |= StateStrictWWM;
LowerToMovInstrs.push_back(&MI);
continue;
- } else if (Opcode == AMDGPU::STRICT_WQM) {
+ } else if (Opcode == AMDGPU::STRICT_WQM ||
+ TII->isDualSourceBlendEXP(MI)) {
// STRICT_WQM is similar to STRICTWWM, but instead of enabling all
// threads of the wave like STRICTWWM, STRICT_WQM enables all threads in
// quads that have at least one active thread.
markInstructionUses(MI, StateStrictWQM, Worklist);
GlobalFlags |= StateStrictWQM;
- LowerToMovInstrs.push_back(&MI);
+
+ if (Opcode == AMDGPU::STRICT_WQM) {
+ LowerToMovInstrs.push_back(&MI);
+ } else {
+        // Dual source blend export acts as implicit strict-wqm; its sources
+        // need to be shuffled in strict wqm, but the export itself needs to
+        // run in exact mode.
+ BBI.Needs |= StateExact;
+ if (!(BBI.InNeeds & StateExact)) {
+ BBI.InNeeds |= StateExact;
+ Worklist.push_back(MBB);
+ }
+ GlobalFlags |= StateExact;
+ III.Disabled = StateWQM | StateStrict;
+ }
+ continue;
+ } else if (Opcode == AMDGPU::LDS_PARAM_LOAD ||
+ Opcode == AMDGPU::LDS_DIRECT_LOAD) {
+ // Mark these STRICTWQM, but only for the instruction, not its operands.
+      // This avoids unnecessarily marking M0 as requiring WQM.
+ InstrInfo &II = Instructions[&MI];
+ II.Needs |= StateStrictWQM;
+ GlobalFlags |= StateStrictWQM;
continue;
} else if (Opcode == AMDGPU::V_SET_INACTIVE_B32 ||
Opcode == AMDGPU::V_SET_INACTIVE_B64) {
@@ -969,7 +991,7 @@ MachineInstr *SIWholeQuadMode::lowerKillI1(MachineBasicBlock &MBB,
MachineInstr *WQMMaskMI = nullptr;
Register LiveMaskWQM;
if (IsDemote) {
- // Demotes deactive quads with only helper lanes
+ // Demote - deactivate quads with only helper lanes
LiveMaskWQM = MRI->createVirtualRegister(TRI->getBoolRC());
WQMMaskMI =
BuildMI(MBB, MI, DL, TII->get(WQMOpc), LiveMaskWQM).addReg(LiveMaskReg);
@@ -977,7 +999,7 @@ MachineInstr *SIWholeQuadMode::lowerKillI1(MachineBasicBlock &MBB,
.addReg(Exec)
.addReg(LiveMaskWQM);
} else {
- // Kills deactivate lanes
+ // Kill - deactivate lanes no longer in live mask
if (Op.isImm()) {
unsigned MovOpc = ST->isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
NewTerm = BuildMI(MBB, &MI, DL, TII->get(MovOpc), Exec).addImm(0);
@@ -1453,7 +1475,7 @@ void SIWholeQuadMode::lowerCopyInstrs() {
}
int Index = MI->findRegisterUseOperandIdx(AMDGPU::EXEC);
while (Index >= 0) {
- MI->RemoveOperand(Index);
+ MI->removeOperand(Index);
Index = MI->findRegisterUseOperandIdx(AMDGPU::EXEC);
}
MI->setDesc(TII->get(AMDGPU::COPY));
@@ -1468,7 +1490,7 @@ void SIWholeQuadMode::lowerCopyInstrs() {
// an undef input so it is being replaced by a simple copy.
// There should be a second undef source that we should remove.
assert(MI->getOperand(2).isUndef());
- MI->RemoveOperand(2);
+ MI->removeOperand(2);
MI->untieRegOperand(1);
} else {
assert(MI->getNumExplicitOperands() == 2);
@@ -1588,11 +1610,11 @@ bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
// Physical registers like SCC aren't tracked by default anyway, so just
// removing the ranges we computed is the simplest option for maintaining
// the analysis results.
- LIS->removeRegUnit(*MCRegUnitIterator(MCRegister::from(AMDGPU::SCC), TRI));
+ LIS->removeAllRegUnitsForPhysReg(AMDGPU::SCC);
// If we performed any kills then recompute EXEC
if (!KillInstrs.empty())
- LIS->removeRegUnit(*MCRegUnitIterator(AMDGPU::EXEC, TRI));
+ LIS->removeAllRegUnitsForPhysReg(AMDGPU::EXEC);
return true;
}
diff --git a/llvm/lib/Target/AMDGPU/SMInstructions.td b/llvm/lib/Target/AMDGPU/SMInstructions.td
index 184c871db775..882d13402a19 100644
--- a/llvm/lib/Target/AMDGPU/SMInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SMInstructions.td
@@ -11,13 +11,19 @@ def smrd_offset_8 : NamedOperandU32<"SMRDOffset8",
let OperandType = "OPERAND_IMMEDIATE";
}
-def smem_offset : NamedOperandU32<"SMEMOffset",
- NamedMatchClass<"SMEMOffset">> {
+class SMEMOffset : NamedOperandU32<"SMEMOffset",
+ NamedMatchClass<"SMEMOffset">> {
let OperandType = "OPERAND_IMMEDIATE";
let EncoderMethod = "getSMEMOffsetEncoding";
let DecoderMethod = "decodeSMEMOffset";
}
+def smem_offset : SMEMOffset;
+
+def smem_offset_mod : SMEMOffset {
+ let PrintMethod = "printSMEMOffsetMod";
+}
+
//===----------------------------------------------------------------------===//
// Scalar Memory classes
//===----------------------------------------------------------------------===//
@@ -43,13 +49,13 @@ class SM_Pseudo <string opName, dag outs, dag ins, string asmOps, list<dag> patt
bits<1> has_sdst = 1;
bit has_glc = 0;
bit has_dlc = 0;
- bits<1> has_offset = 1;
- bits<1> offset_is_imm = 0;
+ bit has_offset = 0;
+ bit has_soffset = 0;
bit is_buffer = 0;
}
-class SM_Real <SM_Pseudo ps>
- : InstSI<ps.OutOperandList, ps.InOperandList, ps.Mnemonic # ps.AsmOperands, []> {
+class SM_Real <SM_Pseudo ps, string opName = ps.Mnemonic>
+ : InstSI<ps.OutOperandList, ps.InOperandList, opName # ps.AsmOperands> {
let isPseudo = 0;
let isCodeGenOnly = 0;
@@ -77,20 +83,40 @@ class SM_Real <SM_Pseudo ps>
bits<7> sbase;
bits<7> sdst;
bits<32> offset;
- bits<1> imm = !if(ps.has_offset, ps.offset_is_imm, 0);
+ bits<8> soffset;
bits<5> cpol;
}
-class SM_Probe_Pseudo <string opName, dag ins, bit isImm>
- : SM_Pseudo<opName, (outs), ins, " $sdata, $sbase, $offset"> {
+class OffsetMode<bit hasOffset, bit hasSOffset, string variant,
+ dag ins, string asm> {
+ bit HasOffset = hasOffset;
+ bit HasSOffset = hasSOffset;
+ string Variant = variant;
+ dag Ins = ins;
+ string Asm = asm;
+}
+
+def IMM_Offset : OffsetMode<1, 0, "_IMM", (ins smem_offset:$offset), "$offset">;
+def SGPR_Offset : OffsetMode<0, 1, "_SGPR", (ins SReg_32:$soffset), "$soffset">;
+def SGPR_IMM_Offset : OffsetMode<1, 1, "_SGPR_IMM",
+ (ins SReg_32:$soffset, smem_offset_mod:$offset),
+ "$soffset$offset">;
+
+class SM_Probe_Pseudo <string opName, string variant, RegisterClass baseClass,
+ dag offsets, string asmOffsets,
+ bit hasOffset, bit hasSOffset>
+ : SM_Pseudo<opName, (outs),
+ !con((ins i8imm:$sdata, baseClass:$sbase), offsets),
+ " $sdata, $sbase, " # asmOffsets> {
let mayLoad = 0;
let mayStore = 0;
let has_glc = 0;
let LGKM_CNT = 0;
let ScalarStore = 0;
let hasSideEffects = 1;
- let offset_is_imm = isImm;
- let PseudoInstr = opName # !if(isImm, "_IMM", "_SGPR");
+ let has_offset = hasOffset;
+ let has_soffset = hasSOffset;
+ let PseudoInstr = opName # variant;
}
class SM_Load_Pseudo <string opName, dag outs, dag ins, string asmOps, list<dag> pattern=[]>
@@ -102,10 +128,11 @@ class SM_Load_Pseudo <string opName, dag outs, dag ins, string asmOps, list<dag>
let has_dlc = 1;
}
-class SM_Store_Pseudo <string opName, dag ins, string asmOps, list<dag> pattern = []>
- : SM_Pseudo<opName, (outs), ins, asmOps, pattern> {
- RegisterClass BaseClass;
- RegisterClass SrcClass;
+class SM_Store_Pseudo <string opName, RegisterClass baseClass,
+ RegisterClass srcClass, dag ins, string asmOps>
+ : SM_Pseudo<opName, (outs), ins, asmOps, []> {
+ RegisterClass BaseClass = baseClass;
+ RegisterClass SrcClass = srcClass;
let mayLoad = 0;
let mayStore = 1;
let has_glc = 1;
@@ -113,16 +140,19 @@ class SM_Store_Pseudo <string opName, dag ins, string asmOps, list<dag> pattern
let ScalarStore = 1;
}
-class SM_Discard_Pseudo <string opName, dag ins, bit isImm>
- : SM_Pseudo<opName, (outs), ins, " $sbase, $offset"> {
+class SM_Discard_Pseudo <string opName, string variant, dag offsets,
+ string asmOffsets, bit hasOffset, bit hasSOffset>
+ : SM_Pseudo<opName, (outs), !con((ins SReg_64:$sbase), offsets),
+ " $sbase, " # asmOffsets> {
let mayLoad = 0;
let mayStore = 0;
let has_glc = 0;
let has_sdst = 0;
let ScalarStore = 0;
let hasSideEffects = 1;
- let offset_is_imm = isImm;
- let PseudoInstr = opName # !if(isImm, "_IMM", "_SGPR");
+ let has_offset = hasOffset;
+ let has_soffset = hasSOffset;
+ let PseudoInstr = opName # variant;
}
multiclass SM_Pseudo_Loads<string opName,
@@ -132,7 +162,7 @@ multiclass SM_Pseudo_Loads<string opName,
(outs dstClass:$sdst),
(ins baseClass:$sbase, i32imm:$offset, CPol:$cpol),
" $sdst, $sbase, $offset$cpol", []> {
- let offset_is_imm = 1;
+ let has_offset = 1;
let BaseClass = baseClass;
let PseudoInstr = opName # "_IMM";
let has_glc = 1;
@@ -141,39 +171,63 @@ multiclass SM_Pseudo_Loads<string opName,
def _SGPR : SM_Load_Pseudo <opName,
(outs dstClass:$sdst),
- (ins baseClass:$sbase, SReg_32:$soff, CPol:$cpol),
- " $sdst, $sbase, $offset$cpol", []> {
+ (ins baseClass:$sbase, SReg_32:$soffset, CPol:$cpol),
+ " $sdst, $sbase, $soffset$cpol", []> {
+ let has_soffset = 1;
let BaseClass = baseClass;
let PseudoInstr = opName # "_SGPR";
let has_glc = 1;
let has_dlc = 1;
}
+
+ def _SGPR_IMM : SM_Load_Pseudo <opName,
+ (outs dstClass:$sdst),
+ (ins baseClass:$sbase, SReg_32:$soffset,
+ i32imm:$offset, CPol:$cpol),
+ " $sdst, $sbase, $soffset$offset$cpol", []> {
+ let has_offset = 1;
+ let has_soffset = 1;
+ let BaseClass = baseClass;
+ let PseudoInstr = opName # "_SGPR_IMM";
+ let has_glc = 1;
+ let has_dlc = 1;
+ }
}
multiclass SM_Pseudo_Stores<string opName,
RegisterClass baseClass,
RegisterClass srcClass> {
- def _IMM : SM_Store_Pseudo <opName,
+ def _IMM : SM_Store_Pseudo <opName, baseClass, srcClass,
(ins srcClass:$sdata, baseClass:$sbase, i32imm:$offset, CPol:$cpol),
- " $sdata, $sbase, $offset$cpol", []> {
- let offset_is_imm = 1;
- let BaseClass = baseClass;
- let SrcClass = srcClass;
+ " $sdata, $sbase, $offset$cpol"> {
+ let has_offset = 1;
let PseudoInstr = opName # "_IMM";
}
- def _SGPR : SM_Store_Pseudo <opName,
- (ins srcClass:$sdata, baseClass:$sbase, SReg_32:$soff, CPol:$cpol),
- " $sdata, $sbase, $offset$cpol", []> {
- let BaseClass = baseClass;
- let SrcClass = srcClass;
+ def _SGPR : SM_Store_Pseudo <opName, baseClass, srcClass,
+ (ins srcClass:$sdata, baseClass:$sbase, SReg_32:$soffset, CPol:$cpol),
+ " $sdata, $sbase, $soffset$cpol"> {
+ let has_soffset = 1;
let PseudoInstr = opName # "_SGPR";
}
+
+ def _SGPR_IMM : SM_Store_Pseudo <opName, baseClass, srcClass,
+ (ins srcClass:$sdata, baseClass:$sbase, SReg_32:$soffset, i32imm:$offset,
+ CPol:$cpol),
+ " $sdata, $sbase, $soffset$offset$cpol"> {
+ let has_offset = 1;
+ let has_soffset = 1;
+ let PseudoInstr = opName # "_SGPR_IMM";
+ }
}
multiclass SM_Pseudo_Discards<string opName> {
- def _IMM : SM_Discard_Pseudo <opName, (ins SReg_64:$sbase, smem_offset:$offset), 1>;
- def _SGPR : SM_Discard_Pseudo <opName, (ins SReg_64:$sbase, SReg_32:$offset), 0>;
+ def _IMM : SM_Discard_Pseudo <opName, "_IMM",
+ (ins smem_offset:$offset), "$offset", 1, 0>;
+ def _SGPR : SM_Discard_Pseudo <opName, "_SGPR",
+ (ins SReg_32:$soffset), "$soffset", 0, 1>;
+ def _SGPR_IMM : SM_Discard_Pseudo <opName, "_SGPR_IMM",
+ (ins SReg_32:$soffset, smem_offset_mod:$offset), "$soffset$offset", 1, 1>;
}
class SM_Time_Pseudo<string opName, SDPatternOperator node = null_frag> : SM_Pseudo<
@@ -184,21 +238,24 @@ class SM_Time_Pseudo<string opName, SDPatternOperator node = null_frag> : SM_Pse
let mayStore = 0;
let mayLoad = 0;
let has_sbase = 0;
- let has_offset = 0;
}
class SM_Inval_Pseudo <string opName, SDPatternOperator node = null_frag> : SM_Pseudo<
opName, (outs), (ins), "", [(node)]> {
let hasSideEffects = 1;
+ let mayLoad = 0;
let mayStore = 0;
let has_sdst = 0;
let has_sbase = 0;
- let has_offset = 0;
}
multiclass SM_Pseudo_Probe<string opName, RegisterClass baseClass> {
- def _IMM : SM_Probe_Pseudo <opName, (ins i8imm:$sdata, baseClass:$sbase, smem_offset:$offset), 1>;
- def _SGPR : SM_Probe_Pseudo <opName, (ins i8imm:$sdata, baseClass:$sbase, SReg_32:$offset), 0>;
+ def _IMM : SM_Probe_Pseudo <opName, "_IMM", baseClass,
+ (ins smem_offset:$offset), "$offset", 1, 0>;
+ def _SGPR : SM_Probe_Pseudo <opName, "_SGPR", baseClass,
+ (ins SReg_32:$soffset), "$soffset", 0, 1>;
+ def _SGPR_IMM : SM_Probe_Pseudo <opName, "_SGPR_IMM", baseClass,
+ (ins SReg_32:$soffset, smem_offset_mod:$offset), "$soffset$offset", 1, 1>;
}
class SM_WaveId_Pseudo<string opName, SDPatternOperator node> : SM_Pseudo<
@@ -206,9 +263,8 @@ class SM_WaveId_Pseudo<string opName, SDPatternOperator node> : SM_Pseudo<
" $sdst", [(set i32:$sdst, (node))]> {
let hasSideEffects = 1;
let mayStore = 0;
- let mayLoad = 1;
+ let mayLoad = 0;
let has_sbase = 0;
- let has_offset = 0;
}
//===----------------------------------------------------------------------===//
@@ -225,6 +281,7 @@ class SM_Atomic_Pseudo <string opName,
let mayStore = 1;
let has_glc = 1;
let has_dlc = 1;
+ let has_soffset = 1;
// Should these be set?
let ScalarStore = 1;
@@ -240,21 +297,21 @@ class SM_Atomic_Pseudo <string opName,
class SM_Pseudo_Atomic<string opName,
RegisterClass baseClass,
RegisterClass dataClass,
- bit isImm,
+ OffsetMode offsets,
bit isRet,
- string opNameWithSuffix = opName # !if(isImm,
- !if(isRet, "_IMM_RTN", "_IMM"),
- !if(isRet, "_SGPR_RTN", "_SGPR")),
+ string opNameWithSuffix =
+ opName # offsets.Variant # !if(isRet, "_RTN", ""),
Operand CPolTy = !if(isRet, CPol_GLC1, CPol)> :
SM_Atomic_Pseudo<opName,
!if(isRet, (outs dataClass:$sdst), (outs)),
- !if(isImm,
- (ins dataClass:$sdata, baseClass:$sbase, smem_offset:$offset, CPolTy:$cpol),
- (ins dataClass:$sdata, baseClass:$sbase, SReg_32:$offset, CPolTy:$cpol)),
- !if(isRet, " $sdst", " $sdata") # ", $sbase, $offset$cpol",
+ !con((ins dataClass:$sdata, baseClass:$sbase), offsets.Ins,
+ (ins CPolTy:$cpol)),
+ !if(isRet, " $sdst", " $sdata") #
+ ", $sbase, " # offsets.Asm # "$cpol",
isRet>,
AtomicNoRet <opNameWithSuffix, isRet> {
- let offset_is_imm = isImm;
+ let has_offset = offsets.HasOffset;
+ let has_soffset = offsets.HasSOffset;
let PseudoInstr = opNameWithSuffix;
let Constraints = !if(isRet, "$sdst = $sdata", "");
@@ -264,10 +321,12 @@ class SM_Pseudo_Atomic<string opName,
multiclass SM_Pseudo_Atomics<string opName,
RegisterClass baseClass,
RegisterClass dataClass> {
- def _IMM : SM_Pseudo_Atomic <opName, baseClass, dataClass, 1, 0>;
- def _SGPR : SM_Pseudo_Atomic <opName, baseClass, dataClass, 0, 0>;
- def _IMM_RTN : SM_Pseudo_Atomic <opName, baseClass, dataClass, 1, 1>;
- def _SGPR_RTN : SM_Pseudo_Atomic <opName, baseClass, dataClass, 0, 1>;
+ def _IMM : SM_Pseudo_Atomic <opName, baseClass, dataClass, IMM_Offset, 0>;
+ def _SGPR : SM_Pseudo_Atomic <opName, baseClass, dataClass, SGPR_Offset, 0>;
+ def _SGPR_IMM : SM_Pseudo_Atomic <opName, baseClass, dataClass, SGPR_IMM_Offset, 0>;
+ def _IMM_RTN : SM_Pseudo_Atomic <opName, baseClass, dataClass, IMM_Offset, 1>;
+ def _SGPR_RTN : SM_Pseudo_Atomic <opName, baseClass, dataClass, SGPR_Offset, 1>;
+ def _SGPR_IMM_RTN : SM_Pseudo_Atomic <opName, baseClass, dataClass, SGPR_IMM_Offset, 1>;
}
//===----------------------------------------------------------------------===//
@@ -452,16 +511,14 @@ class SMRD_Real_si <bits<5> op, SM_Pseudo ps>
let AssemblerPredicate = isGFX6GFX7;
let DecoderNamespace = "GFX6GFX7";
- let Inst{7-0} = !if(ps.has_offset, offset{7-0}, ?);
- let Inst{8} = imm;
+ let Inst{7-0} = !if(ps.has_offset, offset{7-0}, !if(ps.has_soffset, soffset, ?));
+ let Inst{8} = ps.has_offset;
let Inst{14-9} = !if(ps.has_sbase, sbase{6-1}, ?);
let Inst{21-15} = !if(ps.has_sdst, sdst{6-0}, ?);
let Inst{26-22} = op;
let Inst{31-27} = 0x18; //encoding
}
-// FIXME: Assembler should reject trying to use glc on SMRD
-// instructions on SI.
multiclass SM_Real_Loads_si<bits<5> op, string ps,
SM_Load_Pseudo immPs = !cast<SM_Load_Pseudo>(ps#_IMM),
SM_Load_Pseudo sgprPs = !cast<SM_Load_Pseudo>(ps#_SGPR)> {
@@ -470,10 +527,8 @@ multiclass SM_Real_Loads_si<bits<5> op, string ps,
let InOperandList = (ins immPs.BaseClass:$sbase, smrd_offset_8:$offset, CPol:$cpol);
}
- // FIXME: The operand name $offset is inconsistent with $soff used
- // in the pseudo
def _SGPR_si : SMRD_Real_si <op, sgprPs> {
- let InOperandList = (ins sgprPs.BaseClass:$sbase, SReg_32:$offset, CPol:$cpol);
+ let InOperandList = (ins sgprPs.BaseClass:$sbase, SReg_32:$soffset, CPol:$cpol);
}
}
@@ -494,42 +549,82 @@ def S_DCACHE_INV_si : SMRD_Real_si <0x1f, S_DCACHE_INV>;
//===----------------------------------------------------------------------===//
-// VI
+// VI and GFX9.
//===----------------------------------------------------------------------===//
class SMEM_Real_vi <bits<8> op, SM_Pseudo ps>
: SM_Real<ps>
, SIMCInstr<ps.PseudoInstr, SIEncodingFamily.VI>
, Enc64 {
- let AssemblerPredicate = isGFX8GFX9;
+ field bit IsGFX9SpecificEncoding = false;
+ let AssemblerPredicate = !if(IsGFX9SpecificEncoding, isGFX9Only, isGFX8GFX9);
let DecoderNamespace = "GFX8";
let Inst{5-0} = !if(ps.has_sbase, sbase{6-1}, ?);
let Inst{12-6} = !if(ps.has_sdst, sdst{6-0}, ?);
+ // Note that for GFX9 instructions with immediate offsets, soffset_en
+ // must be defined, whereas in GFX8 it's undefined in all cases,
+ // meaning GFX9 is not perfectly backward-compatible with GFX8, despite
+ // documentation suggesting otherwise.
+ field bit SOffsetEn = !if(IsGFX9SpecificEncoding,
+ !if(ps.has_offset, ps.has_soffset, !if(ps.has_soffset, 0, ?)),
+ ?);
+ let Inst{14} = SOffsetEn;
+
let Inst{16} = !if(ps.has_glc, cpol{CPolBit.GLC}, ?);
- let Inst{17} = imm;
+
+ // imm
+  // TODO: Shall not be defined if the instruction has neither offset nor
+  // soffset.
+ let Inst{17} = ps.has_offset;
+
let Inst{25-18} = op;
let Inst{31-26} = 0x30; //encoding
// VI supports 20-bit unsigned offsets while GFX9+ supports 21-bit signed.
// Offset value is corrected accordingly when offset is encoded/decoded.
- let Inst{38-32} = !if(ps.has_offset, offset{6-0}, ?);
- let Inst{52-39} = !if(ps.has_offset, !if(imm, offset{20-7}, ?), ?);
+ // TODO: Forbid non-M0 register offsets for GFX8 stores and atomics.
+ field bits<21> Offset;
+ let Offset{6-0} = !if(ps.has_offset, offset{6-0},
+ !if(ps.has_soffset, soffset{6-0}, ?));
+ let Offset{20-7} = !if(ps.has_offset, offset{20-7}, ?);
+ let Inst{52-32} = Offset;
+
+ // soffset
+ let Inst{63-57} = !if(!and(IsGFX9SpecificEncoding, ps.has_soffset),
+ soffset{6-0}, ?);
}
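
The 20-bit unsigned vs. 21-bit signed remark above amounts to a different
extension rule when the offset field is read back. A standalone sketch of that
arithmetic (decodeSMEMOffset is a made-up name, not the real decoder helper):

#include <cassert>
#include <cstdint>

static int64_t decodeSMEMOffset(uint32_t field21, bool isGFX9Plus) {
  if (!isGFX9Plus)
    return field21 & 0xFFFFF;                 // 20-bit, zero-extended
  // 21-bit two's-complement sign extension.
  uint32_t v = field21 & 0x1FFFFF;
  return (v & 0x100000) ? int64_t(v) - (1 << 21) : int64_t(v);
}

int main() {
  assert(decodeSMEMOffset(0x000FFFFF, false) == 0xFFFFF);   // max on GFX8
  assert(decodeSMEMOffset(0x1FFFFF, true) == -1);           // signed on GFX9+
  assert(decodeSMEMOffset(0x0FFFFF, true) == 0xFFFFF);      // positive fits
  return 0;
}
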
-multiclass SM_Real_Loads_vi<bits<8> op, string ps,
- SM_Load_Pseudo immPs = !cast<SM_Load_Pseudo>(ps#_IMM),
- SM_Load_Pseudo sgprPs = !cast<SM_Load_Pseudo>(ps#_SGPR)> {
- def _IMM_vi : SMEM_Real_vi <op, immPs> {
- let InOperandList = (ins immPs.BaseClass:$sbase, smem_offset:$offset, CPol:$cpol);
- }
- def _SGPR_vi : SMEM_Real_vi <op, sgprPs> {
- let InOperandList = (ins sgprPs.BaseClass:$sbase, SReg_32:$offset, CPol:$cpol);
- }
+class SMEM_Real_Load_vi<bits<8> op, string ps, dag offsets>
+ : SMEM_Real_vi<op, !cast<SM_Pseudo>(ps)> {
+ RegisterClass BaseClass = !cast<SM_Load_Pseudo>(ps).BaseClass;
+ let InOperandList = !con((ins BaseClass:$sbase), offsets, (ins CPol:$cpol));
}
-class SMEM_Real_Store_vi <bits<8> op, SM_Pseudo ps> : SMEM_Real_vi <op, ps> {
+// The alternative GFX9 SGPR encoding using soffset to encode the
+// offset register. Not available in assembler and goes to the GFX9
+// encoding family to avoid conflicts with the primary SGPR variant.
+class SMEM_Real_SGPR_alt_gfx9 {
+ bit IsGFX9SpecificEncoding = true;
+ bit SOffsetEn = 1;
+ bit Offset = ?;
+ int Subtarget = SIEncodingFamily.GFX9;
+ string AsmVariantName = "NonParsable";
+}
+
+multiclass SM_Real_Loads_vi<bits<8> op, string ps> {
+ def _IMM_vi : SMEM_Real_Load_vi <op, ps#"_IMM", (ins smem_offset:$offset)>;
+ def _SGPR_vi : SMEM_Real_Load_vi <op, ps#"_SGPR", (ins SReg_32:$soffset)>;
+ def _SGPR_alt_gfx9 : SMEM_Real_Load_vi <op, ps#"_SGPR",
+ (ins SReg_32:$soffset)>,
+ SMEM_Real_SGPR_alt_gfx9;
+ let IsGFX9SpecificEncoding = true in
+ def _SGPR_IMM_gfx9 : SMEM_Real_Load_vi <
+ op, ps#"_SGPR_IMM", (ins SReg_32:$soffset, smem_offset_mod:$offset)>;
+}
+
+class SMEM_Real_Store_Base_vi <bits<8> op, SM_Pseudo ps> : SMEM_Real_vi <op, ps> {
// encoding
bits<7> sdata;
@@ -537,23 +632,34 @@ class SMEM_Real_Store_vi <bits<8> op, SM_Pseudo ps> : SMEM_Real_vi <op, ps> {
let Inst{12-6} = !if(ps.has_sdst, sdata{6-0}, ?);
}
-multiclass SM_Real_Stores_vi<bits<8> op, string ps,
- SM_Store_Pseudo immPs = !cast<SM_Store_Pseudo>(ps#_IMM),
- SM_Store_Pseudo sgprPs = !cast<SM_Store_Pseudo>(ps#_SGPR)> {
- // FIXME: The operand name $offset is inconsistent with $soff used
- // in the pseudo
- def _IMM_vi : SMEM_Real_Store_vi <op, immPs> {
- let InOperandList = (ins immPs.SrcClass:$sdata, immPs.BaseClass:$sbase, smem_offset:$offset, CPol:$cpol);
- }
+class SMEM_Real_Store_vi <bits<8> op, string ps, dag offsets>
+ : SMEM_Real_Store_Base_vi <op, !cast<SM_Pseudo>(ps)> {
+ RegisterClass SrcClass = !cast<SM_Store_Pseudo>(ps).SrcClass;
+ RegisterClass BaseClass = !cast<SM_Store_Pseudo>(ps).BaseClass;
+ let InOperandList = !con((ins SrcClass:$sdata, BaseClass:$sbase),
+ offsets, (ins CPol:$cpol));
+}
- def _SGPR_vi : SMEM_Real_Store_vi <op, sgprPs> {
- let InOperandList = (ins sgprPs.SrcClass:$sdata, sgprPs.BaseClass:$sbase, SReg_32:$offset, CPol:$cpol);
- }
+multiclass SM_Real_Stores_vi<bits<8> op, string ps> {
+ def _IMM_vi : SMEM_Real_Store_vi <op, ps#_IMM, (ins smem_offset:$offset)>;
+ def _SGPR_vi : SMEM_Real_Store_vi <op, ps#_SGPR, (ins SReg_32:$soffset)>;
+ def _SGPR_alt_gfx9 : SMEM_Real_Store_vi <op, ps#"_SGPR",
+ (ins SReg_32:$soffset)>,
+ SMEM_Real_SGPR_alt_gfx9;
+ let IsGFX9SpecificEncoding = true in
+ def _SGPR_IMM_gfx9 : SMEM_Real_Store_vi <
+ op, ps#"_SGPR_IMM", (ins SReg_32:$soffset, smem_offset_mod:$offset)>;
}
multiclass SM_Real_Probe_vi<bits<8> op, string ps> {
- def _IMM_vi : SMEM_Real_Store_vi <op, !cast<SM_Probe_Pseudo>(ps#_IMM)>;
- def _SGPR_vi : SMEM_Real_Store_vi <op, !cast<SM_Probe_Pseudo>(ps#_SGPR)>;
+ def _IMM_vi : SMEM_Real_Store_Base_vi <op, !cast<SM_Probe_Pseudo>(ps#_IMM)>;
+ def _SGPR_vi : SMEM_Real_Store_Base_vi <op, !cast<SM_Probe_Pseudo>(ps#_SGPR)>;
+ def _SGPR_alt_gfx9
+ : SMEM_Real_Store_Base_vi <op, !cast<SM_Probe_Pseudo>(ps#_SGPR)>,
+ SMEM_Real_SGPR_alt_gfx9;
+ let IsGFX9SpecificEncoding = true in
+ def _SGPR_IMM_gfx9
+ : SMEM_Real_Store_Base_vi <op, !cast<SM_Probe_Pseudo>(ps#_SGPR_IMM)>;
}
defm S_LOAD_DWORD : SM_Real_Loads_vi <0x00, "S_LOAD_DWORD">;
@@ -614,8 +720,20 @@ class SMEM_Atomic_Real_vi <bits<8> op, SM_Atomic_Pseudo ps>
multiclass SM_Real_Atomics_vi<bits<8> op, string ps> {
def _IMM_vi : SMEM_Atomic_Real_vi <op, !cast<SM_Atomic_Pseudo>(ps#_IMM)>;
def _SGPR_vi : SMEM_Atomic_Real_vi <op, !cast<SM_Atomic_Pseudo>(ps#_SGPR)>;
+ def _SGPR_alt_gfx9
+ : SMEM_Atomic_Real_vi <op, !cast<SM_Atomic_Pseudo>(ps#_SGPR)>,
+ SMEM_Real_SGPR_alt_gfx9;
+ let IsGFX9SpecificEncoding = true in
+ def _SGPR_IMM_gfx9
+ : SMEM_Atomic_Real_vi <op, !cast<SM_Atomic_Pseudo>(ps#_SGPR_IMM)>;
def _IMM_RTN_vi : SMEM_Atomic_Real_vi <op, !cast<SM_Atomic_Pseudo>(ps#_IMM_RTN)>;
def _SGPR_RTN_vi : SMEM_Atomic_Real_vi <op, !cast<SM_Atomic_Pseudo>(ps#_SGPR_RTN)>;
+ def _SGPR_RTN_alt_gfx9
+ : SMEM_Atomic_Real_vi <op, !cast<SM_Atomic_Pseudo>(ps#_SGPR_RTN)>,
+ SMEM_Real_SGPR_alt_gfx9;
+ let IsGFX9SpecificEncoding = true in
+ def _SGPR_IMM_RTN_gfx9
+ : SMEM_Atomic_Real_vi <op, !cast<SM_Atomic_Pseudo>(ps#_SGPR_IMM_RTN)>;
}
defm S_BUFFER_ATOMIC_SWAP : SM_Real_Atomics_vi <0x40, "S_BUFFER_ATOMIC_SWAP">;
@@ -677,6 +795,10 @@ defm S_ATOMIC_DEC_X2 : SM_Real_Atomics_vi <0xac, "S_ATOMIC_DEC_X2">
multiclass SM_Real_Discard_vi<bits<8> op, string ps> {
def _IMM_vi : SMEM_Real_vi <op, !cast<SM_Discard_Pseudo>(ps#_IMM)>;
def _SGPR_vi : SMEM_Real_vi <op, !cast<SM_Discard_Pseudo>(ps#_SGPR)>;
+ def _SGPR_alt_gfx9 : SMEM_Real_vi <op, !cast<SM_Discard_Pseudo>(ps#_SGPR)>,
+ SMEM_Real_SGPR_alt_gfx9;
+ let IsGFX9SpecificEncoding = true in
+ def _SGPR_IMM_gfx9 : SMEM_Real_vi <op, !cast<SM_Discard_Pseudo>(ps#_SGPR_IMM)>;
}
defm S_DCACHE_DISCARD : SM_Real_Discard_vi <0x28, "S_DCACHE_DISCARD">;
@@ -727,8 +849,8 @@ class SMRD_Real_ci <bits<5> op, SM_Pseudo ps>
let AssemblerPredicate = isGFX7Only;
let DecoderNamespace = "GFX7";
- let Inst{7-0} = !if(ps.has_offset, offset{7-0}, ?);
- let Inst{8} = imm;
+ let Inst{7-0} = !if(ps.has_offset, offset{7-0}, !if(ps.has_soffset, soffset, ?));
+ let Inst{8} = ps.has_offset;
let Inst{14-9} = !if(ps.has_sbase, sbase{6-1}, ?);
let Inst{21-15} = !if(ps.has_sdst, sdst{6-0}, ?);
let Inst{26-22} = op;
@@ -876,20 +998,27 @@ def : GCNPat <
// GFX10.
//===----------------------------------------------------------------------===//
-class SMEM_Real_gfx10<bits<8> op, SM_Pseudo ps> :
- SM_Real<ps>, SIMCInstr<ps.PseudoInstr, SIEncodingFamily.GFX10>, Enc64 {
- let AssemblerPredicate = isGFX10Plus;
- let DecoderNamespace = "GFX10";
-
+class SMEM_Real_10Plus_common<bits<8> op, SM_Pseudo ps, string opName,
+ int subtarget, RegisterWithSubRegs sgpr_null> :
+ SM_Real<ps, opName>, SIMCInstr<ps.PseudoInstr, subtarget>, Enc64 {
let Inst{5-0} = !if(ps.has_sbase, sbase{6-1}, ?);
let Inst{12-6} = !if(ps.has_sdst, sdst{6-0}, ?);
- let Inst{14} = !if(ps.has_dlc, cpol{CPolBit.DLC}, ?);
- let Inst{16} = !if(ps.has_glc, cpol{CPolBit.GLC}, ?);
let Inst{25-18} = op;
let Inst{31-26} = 0x3d;
- let Inst{52-32} = !if(ps.offset_is_imm, !if(ps.has_offset, offset{20-0}, ?), ?);
- let Inst{63-57} = !if(ps.offset_is_imm, !cast<int>(SGPR_NULL.HWEncoding),
- !if(ps.has_offset, offset{6-0}, ?));
+  // Some SMEM instructions use none of the offset fields; for those the
+  // fields must remain undefined.
+ let Inst{52-32} = !if(ps.has_offset, offset{20-0}, !if(ps.has_soffset, 0, ?));
+ let Inst{63-57} = !if(ps.has_soffset, soffset{6-0},
+ !if(ps.has_offset, sgpr_null.HWEncoding{6-0}, ?));
+}
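
A brief sketch of the soffset-field selection encoded just above (standalone C++ with hypothetical names, not the generated encoder): on GFX10+ a register offset is encoded in Inst{63-57}, an immediate-only form puts the subtarget's null-register encoding there instead, and an instruction with neither operand leaves the field undefined.

#include <cstdint>
#include <optional>

// Returns the 7-bit value for Inst{63-57}, or nullopt to leave it undefined.
// NullRegEncoding stands in for the subtarget's SGPR_NULL hardware encoding,
// which differs between GFX10 and GFX11 (hence the sgpr_null class parameter
// in the record above).
static std::optional<uint8_t>
selectSOffsetField(std::optional<uint8_t> SOffsetReg, bool HasImmOffset,
                   uint8_t NullRegEncoding) {
  if (SOffsetReg)      // register offset: encode the SGPR number
    return static_cast<uint8_t>(*SOffsetReg & 0x7F);
  if (HasImmOffset)    // immediate only: point soffset at the null register
    return static_cast<uint8_t>(NullRegEncoding & 0x7F);
  return std::nullopt; // no offset operands at all: keep the field undefined
}

int main() {
  // Immediate-only form on a hypothetical subtarget whose null register
  // happens to encode as 124.
  auto Field = selectSOffsetField(std::nullopt, /*HasImmOffset=*/true, 124);
  return (Field && *Field == 124) ? 0 : 1;
}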
+
+class SMEM_Real_gfx10<bits<8> op, SM_Pseudo ps>
+ : SMEM_Real_10Plus_common<op, ps, ps.Mnemonic, SIEncodingFamily.GFX10,
+ SGPR_NULL_gfxpre11> {
+ let AssemblerPredicate = isGFX10Only;
+ let DecoderNamespace = "GFX10";
+ let Inst{14} = !if(ps.has_dlc, cpol{CPolBit.DLC}, ?);
+ let Inst{16} = !if(ps.has_glc, cpol{CPolBit.GLC}, ?);
}
multiclass SM_Real_Loads_gfx10<bits<8> op, string ps,
@@ -899,7 +1028,11 @@ multiclass SM_Real_Loads_gfx10<bits<8> op, string ps,
let InOperandList = (ins immPs.BaseClass:$sbase, smem_offset:$offset, CPol:$cpol);
}
def _SGPR_gfx10 : SMEM_Real_gfx10<op, sgprPs> {
- let InOperandList = (ins sgprPs.BaseClass:$sbase, SReg_32:$offset, CPol:$cpol);
+ let InOperandList = (ins sgprPs.BaseClass:$sbase, SReg_32:$soffset, CPol:$cpol);
+ }
+ def _SGPR_IMM_gfx10 : SMEM_Real_gfx10<op, !cast<SM_Load_Pseudo>(ps#_SGPR_IMM)> {
+ let InOperandList = (ins sgprPs.BaseClass:$sbase, SReg_32:$soffset,
+ smem_offset_mod:$offset, CPol:$cpol);
}
}
@@ -913,14 +1046,17 @@ class SMEM_Real_Store_gfx10<bits<8> op, SM_Pseudo ps> : SMEM_Real_gfx10<op, ps>
multiclass SM_Real_Stores_gfx10<bits<8> op, string ps,
SM_Store_Pseudo immPs = !cast<SM_Store_Pseudo>(ps#_IMM),
SM_Store_Pseudo sgprPs = !cast<SM_Store_Pseudo>(ps#_SGPR)> {
- // FIXME: The operand name $offset is inconsistent with $soff used
- // in the pseudo
def _IMM_gfx10 : SMEM_Real_Store_gfx10 <op, immPs> {
let InOperandList = (ins immPs.SrcClass:$sdata, immPs.BaseClass:$sbase, smem_offset:$offset, CPol:$cpol);
}
def _SGPR_gfx10 : SMEM_Real_Store_gfx10 <op, sgprPs> {
- let InOperandList = (ins sgprPs.SrcClass:$sdata, sgprPs.BaseClass:$sbase, SReg_32:$offset, CPol:$cpol);
+ let InOperandList = (ins sgprPs.SrcClass:$sdata, sgprPs.BaseClass:$sbase, SReg_32:$soffset, CPol:$cpol);
+ }
+
+ def _SGPR_IMM_gfx10 : SMEM_Real_Store_gfx10 <op, !cast<SM_Store_Pseudo>(ps#_SGPR_IMM)> {
+ let InOperandList = (ins sgprPs.SrcClass:$sdata, sgprPs.BaseClass:$sbase,
+ SReg_32:$soffset, smem_offset_mod:$offset, CPol:$cpol);
}
}
@@ -969,6 +1105,8 @@ def S_DCACHE_WB_gfx10 : SMEM_Real_gfx10<0x021, S_DCACHE_WB>;
multiclass SM_Real_Probe_gfx10<bits<8> op, string ps> {
def _IMM_gfx10 : SMEM_Real_Store_gfx10 <op, !cast<SM_Pseudo>(ps#_IMM)>;
def _SGPR_gfx10 : SMEM_Real_Store_gfx10 <op, !cast<SM_Pseudo>(ps#_SGPR)>;
+ def _SGPR_IMM_gfx10
+ : SMEM_Real_Store_gfx10 <op, !cast<SM_Pseudo>(ps#_SGPR_IMM)>;
}
defm S_ATC_PROBE : SM_Real_Probe_gfx10 <0x26, "S_ATC_PROBE">;
@@ -992,8 +1130,10 @@ class SMEM_Atomic_Real_gfx10 <bits<8> op, SM_Atomic_Pseudo ps>
multiclass SM_Real_Atomics_gfx10<bits<8> op, string ps> {
def _IMM_gfx10 : SMEM_Atomic_Real_gfx10 <op, !cast<SM_Atomic_Pseudo>(ps#_IMM)>;
def _SGPR_gfx10 : SMEM_Atomic_Real_gfx10 <op, !cast<SM_Atomic_Pseudo>(ps#_SGPR)>;
+ def _SGPR_IMM_gfx10 : SMEM_Atomic_Real_gfx10 <op, !cast<SM_Atomic_Pseudo>(ps#_SGPR_IMM)>;
def _IMM_RTN_gfx10 : SMEM_Atomic_Real_gfx10 <op, !cast<SM_Atomic_Pseudo>(ps#_IMM_RTN)>;
def _SGPR_RTN_gfx10 : SMEM_Atomic_Real_gfx10 <op, !cast<SM_Atomic_Pseudo>(ps#_SGPR_RTN)>;
+ def _SGPR_IMM_RTN_gfx10 : SMEM_Atomic_Real_gfx10 <op, !cast<SM_Atomic_Pseudo>(ps#_SGPR_IMM_RTN)>;
}
let SubtargetPredicate = HasScalarAtomics in {
@@ -1057,6 +1197,7 @@ defm S_ATOMIC_DEC_X2 : SM_Real_Atomics_gfx10 <0xac, "S_ATOMIC_DEC_X
multiclass SM_Real_Discard_gfx10<bits<8> op, string ps> {
def _IMM_gfx10 : SMEM_Real_gfx10 <op, !cast<SM_Pseudo>(ps#_IMM)>;
def _SGPR_gfx10 : SMEM_Real_gfx10 <op, !cast<SM_Pseudo>(ps#_SGPR)>;
+ def _SGPR_IMM_gfx10 : SMEM_Real_gfx10 <op, !cast<SM_Pseudo>(ps#_SGPR_IMM)>;
}
defm S_DCACHE_DISCARD : SM_Real_Discard_gfx10 <0x28, "S_DCACHE_DISCARD">;
@@ -1072,3 +1213,64 @@ def SMInfoTable : GenericTable {
let PrimaryKey = ["Opcode"];
let PrimaryKeyName = "getSMEMOpcodeHelper";
}
+
+//===----------------------------------------------------------------------===//
+// GFX11.
+//===----------------------------------------------------------------------===//
+
+class SMEM_Real_gfx11<bits<8> op, SM_Pseudo ps, string opName = ps.Mnemonic> :
+ SMEM_Real_10Plus_common<op, ps, opName, SIEncodingFamily.GFX11,
+ SGPR_NULL_gfx11plus> {
+ let AssemblerPredicate = isGFX11Plus;
+ let DecoderNamespace = "GFX11";
+ let Inst{13} = !if(ps.has_dlc, cpol{CPolBit.DLC}, 0);
+ let Inst{14} = !if(ps.has_glc, cpol{CPolBit.GLC}, 0);
+}
+
+class SMEM_Real_Load_gfx11<bits<8> op, string ps, string opName, dag offsets> :
+ SMEM_Real_gfx11<op, !cast<SM_Pseudo>(ps), opName> {
+ RegisterClass BaseClass = !cast<SM_Load_Pseudo>(ps).BaseClass;
+ let InOperandList = !con((ins BaseClass:$sbase), offsets, (ins CPol:$cpol));
+}
+
+multiclass SM_Real_Loads_gfx11<bits<8> op, string ps, string opName> {
+ def _IMM_gfx11 : SMEM_Real_Load_gfx11<op, ps#"_IMM", opName, (ins smem_offset:$offset)>;
+ def _SGPR_gfx11 : SMEM_Real_Load_gfx11<op, ps#"_SGPR", opName, (ins SReg_32:$soffset)>;
+ def _SGPR_IMM_gfx11 : SMEM_Real_Load_gfx11<
+ op, ps#"_SGPR_IMM", opName, (ins SReg_32:$soffset, smem_offset_mod:$offset)>;
+ def : MnemonicAlias<!cast<SM_Pseudo>(ps#"_IMM").Mnemonic, opName>,
+ Requires<[isGFX11Plus]>;
+}
+
+defm S_LOAD_B32 : SM_Real_Loads_gfx11<0x000, "S_LOAD_DWORD", "s_load_b32">;
+defm S_LOAD_B64 : SM_Real_Loads_gfx11<0x001, "S_LOAD_DWORDX2", "s_load_b64">;
+defm S_LOAD_B128 : SM_Real_Loads_gfx11<0x002, "S_LOAD_DWORDX4", "s_load_b128">;
+defm S_LOAD_B256 : SM_Real_Loads_gfx11<0x003, "S_LOAD_DWORDX8", "s_load_b256">;
+defm S_LOAD_B512 : SM_Real_Loads_gfx11<0x004, "S_LOAD_DWORDX16", "s_load_b512">;
+
+defm S_BUFFER_LOAD_B32 : SM_Real_Loads_gfx11<0x008, "S_BUFFER_LOAD_DWORD", "s_buffer_load_b32">;
+defm S_BUFFER_LOAD_B64 : SM_Real_Loads_gfx11<0x009, "S_BUFFER_LOAD_DWORDX2", "s_buffer_load_b64">;
+defm S_BUFFER_LOAD_B128 : SM_Real_Loads_gfx11<0x00a, "S_BUFFER_LOAD_DWORDX4", "s_buffer_load_b128">;
+defm S_BUFFER_LOAD_B256 : SM_Real_Loads_gfx11<0x00b, "S_BUFFER_LOAD_DWORDX8", "s_buffer_load_b256">;
+defm S_BUFFER_LOAD_B512 : SM_Real_Loads_gfx11<0x00c, "S_BUFFER_LOAD_DWORDX16", "s_buffer_load_b512">;
+
+def S_GL1_INV_gfx11 : SMEM_Real_gfx11<0x020, S_GL1_INV>;
+def S_DCACHE_INV_gfx11 : SMEM_Real_gfx11<0x021, S_DCACHE_INV>;
+
+class SMEM_Real_Store_gfx11 <bits<8> op, SM_Pseudo ps> : SMEM_Real_gfx11<op, ps> {
+ // encoding
+ bits<7> sdata;
+
+ let sdst = ?;
+ let Inst{12-6} = !if(ps.has_sdst, sdata{6-0}, ?);
+}
+
+multiclass SM_Real_Probe_gfx11<bits<8> op, string ps> {
+ def _IMM_gfx11 : SMEM_Real_Store_gfx11 <op, !cast<SM_Probe_Pseudo>(ps#_IMM)>;
+ def _SGPR_gfx11 : SMEM_Real_Store_gfx11 <op, !cast<SM_Probe_Pseudo>(ps#_SGPR)>;
+ def _SGPR_IMM_gfx11
+ : SMEM_Real_Store_gfx11 <op, !cast<SM_Probe_Pseudo>(ps#_SGPR_IMM)>;
+}
+
+defm S_ATC_PROBE : SM_Real_Probe_gfx11 <0x22, "S_ATC_PROBE">;
+defm S_ATC_PROBE_BUFFER : SM_Real_Probe_gfx11 <0x23, "S_ATC_PROBE_BUFFER">;
diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td
index 3f7837f7dbf1..37d20045adb5 100644
--- a/llvm/lib/Target/AMDGPU/SOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td
@@ -152,8 +152,8 @@ class SOP1_64_0 <string opName, list<dag> pattern=[]> : SOP1_Pseudo <
}
// 64-bit input, no output
-class SOP1_1 <string opName, RegisterClass rc = SReg_64, list<dag> pattern=[]> : SOP1_Pseudo <
- opName, (outs), (ins rc:$src0), "$src0", pattern> {
+class SOP1_1 <string opName, list<dag> pattern=[]> : SOP1_Pseudo <
+ opName, (outs), (ins SReg_64:$src0), "$src0", pattern> {
let has_sdst = 0;
}
@@ -235,10 +235,10 @@ def : GCNPat <
let isReMaterializable = 1, isAsCheapAsAMove = 1 in {
def S_BREV_B32 : SOP1_32 <"s_brev_b32",
- [(set i32:$sdst, (bitreverse i32:$src0))]
+ [(set i32:$sdst, (UniformUnaryFrag<bitreverse> i32:$src0))]
>;
def S_BREV_B64 : SOP1_64 <"s_brev_b64",
- [(set i64:$sdst, (bitreverse i64:$src0))]
+ [(set i64:$sdst, (UniformUnaryFrag<bitreverse> i64:$src0))]
>;
} // End isReMaterializable = 1, isAsCheapAsAMove = 1
@@ -276,10 +276,10 @@ def S_FLBIT_I32 : SOP1_32 <"s_flbit_i32",
>;
def S_FLBIT_I32_I64 : SOP1_32_64 <"s_flbit_i32_i64">;
def S_SEXT_I32_I8 : SOP1_32 <"s_sext_i32_i8",
- [(set i32:$sdst, (sext_inreg i32:$src0, i8))]
+ [(set i32:$sdst, (UniformSextInreg<i8> i32:$src0))]
>;
def S_SEXT_I32_I16 : SOP1_32 <"s_sext_i32_i16",
- [(set i32:$sdst, (sext_inreg i32:$src0, i16))]
+ [(set i32:$sdst, (UniformSextInreg<i16> i32:$src0))]
>;
} // End isReMaterializable = 1
@@ -300,8 +300,7 @@ def S_SETPC_B64 : SOP1_1 <"s_setpc_b64">;
let isReturn = 1 in {
// Define variant marked as return rather than branch.
-def S_SETPC_B64_return : SOP1_1<"", CCR_SGPR_64, [(AMDGPUret_flag i64:$src0)]>;
-def S_SETPC_B64_return_gfx : SOP1_1<"", Gfx_CCR_SGPR_64, [(AMDGPUret_gfx_flag i64:$src0)]>;
+def S_SETPC_B64_return : SOP1_1<"">;
}
} // End isTerminator = 1, isBarrier = 1
@@ -341,7 +340,7 @@ def S_CBRANCH_JOIN : SOP1_0_32R <"s_cbranch_join">;
let Defs = [SCC] in {
def S_ABS_I32 : SOP1_32 <"s_abs_i32",
- [(set i32:$sdst, (abs i32:$src0))]
+ [(set i32:$sdst, (UniformUnaryFrag<abs> i32:$src0))]
>;
} // End Defs = [SCC]
@@ -385,6 +384,21 @@ let SubtargetPredicate = isGFX10Plus in {
} // End Uses = [M0]
} // End SubtargetPredicate = isGFX10Plus
+let SubtargetPredicate = isGFX11Plus in {
+ let hasSideEffects = 1 in {
+ // For s_sendmsg_rtn_* the src0 field encodes the message type directly; it
+ // is not an SGPR number.
+ def S_SENDMSG_RTN_B32 : SOP1_Pseudo<
+ "s_sendmsg_rtn_b32", (outs SReg_32:$sdst), (ins SendMsgImm:$src0),
+ "$sdst, $src0", [(set i32:$sdst, (int_amdgcn_s_sendmsg_rtn timm:$src0))]
+ >;
+ def S_SENDMSG_RTN_B64 : SOP1_Pseudo<
+ "s_sendmsg_rtn_b64", (outs SReg_64:$sdst), (ins SendMsgImm:$src0),
+ "$sdst, $src0", [(set i64:$sdst, (int_amdgcn_s_sendmsg_rtn timm:$src0))]
+ >;
+ }
+} // End SubtargetPredicate = isGFX11Plus
+
//===----------------------------------------------------------------------===//
// SOP2 Instructions
//===----------------------------------------------------------------------===//
@@ -690,6 +704,10 @@ let SubtargetPredicate = isGFX9Plus in {
} // End isCommutable = 1, isReMaterializable = 1
} // End SubtargetPredicate = isGFX9Plus
+let SubtargetPredicate = isGFX11Plus in {
+ def S_PACK_HL_B32_B16 : SOP2_32<"s_pack_hl_b32_b16">;
+} // End SubtargetPredicate = isGFX11Plus
+
//===----------------------------------------------------------------------===//
// SOPK Instructions
//===----------------------------------------------------------------------===//
@@ -855,9 +873,7 @@ def S_CBRANCH_I_FORK : SOPK_Pseudo <
"$sdst, $simm16"
>;
-let mayLoad = 1 in {
-// s_getreg_b32 should use hasSideEffects = 1 for tablegen to allow
-// its use in the readcyclecounter selection.
+// This has hasSideEffects = 1 so that it can be used in readcyclecounter selection.
// FIXME: Need to truncate immediate to 16-bits.
def S_GETREG_B32 : SOPK_Pseudo <
"s_getreg_b32",
@@ -867,7 +883,6 @@ def S_GETREG_B32 : SOPK_Pseudo <
let SOPKZext = 1;
let hasSideEffects = 1;
}
-} // End mayLoad = 1
let Defs = [MODE], Uses = [MODE] in {
@@ -1169,12 +1184,12 @@ def S_ENDPGM_SAVED : SOPP_Pseudo<"s_endpgm_saved", (ins)> {
let isReturn = 1;
}
-let SubtargetPredicate = isGFX9Plus in {
+let SubtargetPredicate = isGFX9GFX10 in {
let isBarrier = 1, isReturn = 1, simm16 = 0, fixed_imm = 1 in {
def S_ENDPGM_ORDERED_PS_DONE :
SOPP_Pseudo<"s_endpgm_ordered_ps_done", (ins)>;
} // End isBarrier = 1, isReturn = 1, simm16 = 0, fixed_imm = 1
-} // End SubtargetPredicate = isGFX9Plus
+} // End SubtargetPredicate = isGFX9GFX10
let SubtargetPredicate = isGFX10Plus in {
let isBarrier = 1, isReturn = 1, simm16 = 0, fixed_imm = 1 in {
@@ -1279,15 +1294,21 @@ def S_SLEEP : SOPP_Pseudo <"s_sleep", (ins i32imm:$simm16),
let hasSideEffects = 1;
}
-def S_SETPRIO : SOPP_Pseudo <"s_setprio" , (ins i16imm:$simm16), "$simm16">;
+def S_SETPRIO : SOPP_Pseudo <"s_setprio", (ins i16imm:$simm16), "$simm16",
+ [(int_amdgcn_s_setprio timm:$simm16)]> {
+ let hasSideEffects = 1;
+}
let Uses = [EXEC, M0] in {
-// FIXME: Should this be mayLoad+mayStore?
def S_SENDMSG : SOPP_Pseudo <"s_sendmsg" , (ins SendMsgImm:$simm16), "$simm16",
- [(int_amdgcn_s_sendmsg (i32 timm:$simm16), M0)]>;
+ [(int_amdgcn_s_sendmsg (i32 timm:$simm16), M0)]> {
+ let hasSideEffects = 1;
+}
def S_SENDMSGHALT : SOPP_Pseudo <"s_sendmsghalt" , (ins SendMsgImm:$simm16), "$simm16",
- [(int_amdgcn_s_sendmsghalt (i32 timm:$simm16), M0)]>;
+ [(int_amdgcn_s_sendmsghalt (i32 timm:$simm16), M0)]> {
+ let hasSideEffects = 1;
+}
} // End Uses = [EXEC, M0]
@@ -1341,7 +1362,7 @@ let SubtargetPredicate = isGFX10Plus in {
let fixed_imm = 1;
}
def S_WAITCNT_DEPCTR :
- SOPP_Pseudo <"s_waitcnt_depctr" , (ins s16imm:$simm16), "$simm16">;
+ SOPP_Pseudo <"s_waitcnt_depctr" , (ins DepCtrImm:$simm16), "$simm16">;
let hasSideEffects = 0, Uses = [MODE], Defs = [MODE] in {
def S_ROUND_MODE :
@@ -1355,6 +1376,13 @@ let SubtargetPredicate = isGFX10Plus in {
SOPP_Pseudo<"s_ttracedata_imm", (ins s16imm:$simm16), "$simm16">;
} // End SubtargetPredicate = isGFX10Plus
+let SubtargetPredicate = isGFX11Plus in {
+ def S_WAIT_EVENT : SOPP_Pseudo<"s_wait_event", (ins s16imm:$simm16),
+ "$simm16">;
+ def S_DELAY_ALU : SOPP_Pseudo<"s_delay_alu", (ins DELAY_FLAG:$simm16),
+ "$simm16">;
+} // End SubtargetPredicate = isGFX11Plus
+
//===----------------------------------------------------------------------===//
// SOP1 Patterns
//===----------------------------------------------------------------------===//
@@ -1377,7 +1405,7 @@ def : GCNPat <
>;
def : GCNPat <
- (i32 (smax i32:$x, (i32 (ineg i32:$x)))),
+ (i32 (UniformBinFrag<smax> i32:$x, (i32 (ineg i32:$x)))),
(S_ABS_I32 SReg_32:$x)
>;
@@ -1408,7 +1436,7 @@ def : GCNPat <
// REG_SEQUENCE patterns don't support instructions with multiple
// outputs.
def : GCNPat<
- (i64 (zext i16:$src)),
+ (i64 (UniformUnaryFrag<zext> i16:$src)),
(REG_SEQUENCE SReg_64,
(i32 (COPY_TO_REGCLASS (S_AND_B32 $src, (S_MOV_B32 (i32 0xffff))), SGPR_32)), sub0,
(S_MOV_B32 (i32 0)), sub1)
@@ -1421,7 +1449,7 @@ def : GCNPat <
>;
def : GCNPat<
- (i32 (zext i16:$src)),
+ (i32 (UniformUnaryFrag<zext> i16:$src)),
(S_AND_B32 (S_MOV_B32 (i32 0xffff)), $src)
>;
@@ -1448,8 +1476,13 @@ def : ScalarNot2Pat<S_ORN2_B64, or, v2i32>;
// Target-specific instruction encodings.
//===----------------------------------------------------------------------===//
+class Select_gfx11<string opName> : SIMCInstr<opName, SIEncodingFamily.GFX11> {
+ Predicate AssemblerPredicate = isGFX11Only;
+ string DecoderNamespace = "GFX11";
+}
+
class Select_gfx10<string opName> : SIMCInstr<opName, SIEncodingFamily.GFX10> {
- Predicate AssemblerPredicate = isGFX10Plus;
+ Predicate AssemblerPredicate = isGFX10Only;
string DecoderNamespace = "GFX10";
}
@@ -1464,6 +1497,87 @@ class Select_gfx6_gfx7<string opName> : SIMCInstr<opName, SIEncodingFamily.SI> {
}
//===----------------------------------------------------------------------===//
+// GFX11.
+//===----------------------------------------------------------------------===//
+
+multiclass SOP1_Real_gfx11<bits<8> op> {
+ def _gfx11 : SOP1_Real<op, !cast<SOP1_Pseudo>(NAME)>,
+ Select_gfx11<!cast<SOP1_Pseudo>(NAME).Mnemonic>;
+}
+
+multiclass SOP1_Real_Renamed_gfx11<bits<8> op, SOP1_Pseudo backing_pseudo, string real_name> {
+ def _gfx11 : SOP1_Real<op, backing_pseudo, real_name>,
+ Select_gfx11<backing_pseudo.Mnemonic>,
+ MnemonicAlias<backing_pseudo.Mnemonic, real_name>, Requires<[isGFX11Plus]>;
+}
+
+defm S_MOV_B32 : SOP1_Real_gfx11<0x000>;
+defm S_MOV_B64 : SOP1_Real_gfx11<0x001>;
+defm S_CMOV_B32 : SOP1_Real_gfx11<0x002>;
+defm S_CMOV_B64 : SOP1_Real_gfx11<0x003>;
+defm S_BREV_B32 : SOP1_Real_gfx11<0x004>;
+defm S_BREV_B64 : SOP1_Real_gfx11<0x005>;
+defm S_CTZ_I32_B32 : SOP1_Real_Renamed_gfx11<0x008, S_FF1_I32_B32, "s_ctz_i32_b32">;
+defm S_CTZ_I32_B64 : SOP1_Real_Renamed_gfx11<0x009, S_FF1_I32_B64, "s_ctz_i32_b64">;
+defm S_CLZ_I32_U32 : SOP1_Real_Renamed_gfx11<0x00a, S_FLBIT_I32_B32, "s_clz_i32_u32">;
+defm S_CLZ_I32_U64 : SOP1_Real_Renamed_gfx11<0x00b, S_FLBIT_I32_B64, "s_clz_i32_u64">;
+defm S_CLS_I32 : SOP1_Real_Renamed_gfx11<0x00c, S_FLBIT_I32, "s_cls_i32">;
+defm S_CLS_I32_I64 : SOP1_Real_Renamed_gfx11<0x00d, S_FLBIT_I32_I64, "s_cls_i32_i64">;
+defm S_SEXT_I32_I8 : SOP1_Real_gfx11<0x00e>;
+defm S_SEXT_I32_I16 : SOP1_Real_gfx11<0x00f>;
+defm S_BITSET0_B32 : SOP1_Real_gfx11<0x010>;
+defm S_BITSET0_B64 : SOP1_Real_gfx11<0x011>;
+defm S_BITSET1_B32 : SOP1_Real_gfx11<0x012>;
+defm S_BITSET1_B64 : SOP1_Real_gfx11<0x013>;
+defm S_BITREPLICATE_B64_B32 : SOP1_Real_gfx11<0x014>;
+defm S_ABS_I32 : SOP1_Real_gfx11<0x015>;
+defm S_BCNT0_I32_B32 : SOP1_Real_gfx11<0x016>;
+defm S_BCNT0_I32_B64 : SOP1_Real_gfx11<0x017>;
+defm S_BCNT1_I32_B32 : SOP1_Real_gfx11<0x018>;
+defm S_BCNT1_I32_B64 : SOP1_Real_gfx11<0x019>;
+defm S_QUADMASK_B32 : SOP1_Real_gfx11<0x01a>;
+defm S_QUADMASK_B64 : SOP1_Real_gfx11<0x01b>;
+defm S_WQM_B32 : SOP1_Real_gfx11<0x01c>;
+defm S_WQM_B64 : SOP1_Real_gfx11<0x01d>;
+defm S_NOT_B32 : SOP1_Real_gfx11<0x01e>;
+defm S_NOT_B64 : SOP1_Real_gfx11<0x01f>;
+defm S_AND_SAVEEXEC_B32 : SOP1_Real_gfx11<0x020>;
+defm S_AND_SAVEEXEC_B64 : SOP1_Real_gfx11<0x021>;
+defm S_OR_SAVEEXEC_B32 : SOP1_Real_gfx11<0x022>;
+defm S_OR_SAVEEXEC_B64 : SOP1_Real_gfx11<0x023>;
+defm S_XOR_SAVEEXEC_B32 : SOP1_Real_gfx11<0x024>;
+defm S_XOR_SAVEEXEC_B64 : SOP1_Real_gfx11<0x025>;
+defm S_NAND_SAVEEXEC_B32 : SOP1_Real_gfx11<0x026>;
+defm S_NAND_SAVEEXEC_B64 : SOP1_Real_gfx11<0x027>;
+defm S_NOR_SAVEEXEC_B32 : SOP1_Real_gfx11<0x028>;
+defm S_NOR_SAVEEXEC_B64 : SOP1_Real_gfx11<0x029>;
+defm S_XNOR_SAVEEXEC_B32 : SOP1_Real_gfx11<0x02a>;
+/*defm S_XNOR_SAVEEXEC_B64 : SOP1_Real_gfx11<0x02b>; //same as older arch, handled there*/
+defm S_AND_NOT0_SAVEEXEC_B32 : SOP1_Real_Renamed_gfx11<0x02c, S_ANDN1_SAVEEXEC_B32, "s_and_not0_saveexec_b32">;
+defm S_AND_NOT0_SAVEEXEC_B64 : SOP1_Real_Renamed_gfx11<0x02d, S_ANDN1_SAVEEXEC_B64, "s_and_not0_saveexec_b64">;
+defm S_OR_NOT0_SAVEEXEC_B32 : SOP1_Real_Renamed_gfx11<0x02e, S_ORN1_SAVEEXEC_B32, "s_or_not0_saveexec_b32">;
+defm S_OR_NOT0_SAVEEXEC_B64 : SOP1_Real_Renamed_gfx11<0x02f, S_ORN1_SAVEEXEC_B64, "s_or_not0_saveexec_b64">;
+defm S_AND_NOT1_SAVEEXEC_B32 : SOP1_Real_Renamed_gfx11<0x030, S_ANDN2_SAVEEXEC_B32, "s_and_not1_saveexec_b32">;
+defm S_AND_NOT1_SAVEEXEC_B64 : SOP1_Real_Renamed_gfx11<0x031, S_ANDN2_SAVEEXEC_B64, "s_and_not1_saveexec_b64">;
+defm S_OR_NOT1_SAVEEXEC_B32 : SOP1_Real_Renamed_gfx11<0x032, S_ORN2_SAVEEXEC_B32, "s_or_not1_saveexec_b32">;
+defm S_OR_NOT1_SAVEEXEC_B64 : SOP1_Real_Renamed_gfx11<0x033, S_ORN2_SAVEEXEC_B64, "s_or_not1_saveexec_b64">;
+defm S_AND_NOT0_WREXEC_B32 : SOP1_Real_Renamed_gfx11<0x034, S_ANDN1_WREXEC_B32, "s_and_not0_wrexec_b32">;
+defm S_AND_NOT0_WREXEC_B64 : SOP1_Real_Renamed_gfx11<0x035, S_ANDN1_WREXEC_B64, "s_and_not0_wrexec_b64">;
+defm S_AND_NOT1_WREXEC_B32 : SOP1_Real_Renamed_gfx11<0x036, S_ANDN2_WREXEC_B32, "s_and_not1_wrexec_b32">;
+defm S_AND_NOT1_WREXEC_B64 : SOP1_Real_Renamed_gfx11<0x037, S_ANDN2_WREXEC_B64, "s_and_not1_wrexec_b64">;
+defm S_MOVRELS_B32 : SOP1_Real_gfx11<0x040>;
+defm S_MOVRELS_B64 : SOP1_Real_gfx11<0x041>;
+defm S_MOVRELD_B32 : SOP1_Real_gfx11<0x042>;
+defm S_MOVRELD_B64 : SOP1_Real_gfx11<0x043>;
+defm S_MOVRELSD_2_B32 : SOP1_Real_gfx11<0x044>;
+defm S_GETPC_B64 : SOP1_Real_gfx11<0x047>;
+defm S_SETPC_B64 : SOP1_Real_gfx11<0x048>;
+defm S_SWAPPC_B64 : SOP1_Real_gfx11<0x049>;
+defm S_RFE_B64 : SOP1_Real_gfx11<0x04a>;
+defm S_SENDMSG_RTN_B32 : SOP1_Real_gfx11<0x04c>;
+defm S_SENDMSG_RTN_B64 : SOP1_Real_gfx11<0x04d>;
+
+//===----------------------------------------------------------------------===//
// SOP1 - GFX10.
//===----------------------------------------------------------------------===//
@@ -1473,6 +1587,9 @@ multiclass SOP1_Real_gfx10<bits<8> op> {
Select_gfx10<ps.Mnemonic>;
}
+multiclass SOP1_Real_gfx10_gfx11<bits<8> op> :
+ SOP1_Real_gfx10<op>, SOP1_Real_gfx11<op>;
+
defm S_ANDN1_SAVEEXEC_B64 : SOP1_Real_gfx10<0x037>;
defm S_ORN1_SAVEEXEC_B64 : SOP1_Real_gfx10<0x038>;
defm S_ANDN1_WREXEC_B64 : SOP1_Real_gfx10<0x039>;
@@ -1493,7 +1610,7 @@ defm S_ANDN2_WREXEC_B32 : SOP1_Real_gfx10<0x047>;
defm S_MOVRELSD_2_B32 : SOP1_Real_gfx10<0x049>;
//===----------------------------------------------------------------------===//
-// SOP1 - GFX6, GFX7.
+// SOP1 - GFX6, GFX7, GFX10, GFX11.
//===----------------------------------------------------------------------===//
@@ -1506,6 +1623,9 @@ multiclass SOP1_Real_gfx6_gfx7<bits<8> op> {
multiclass SOP1_Real_gfx6_gfx7_gfx10<bits<8> op> :
SOP1_Real_gfx6_gfx7<op>, SOP1_Real_gfx10<op>;
+multiclass SOP1_Real_gfx6_gfx7_gfx10_gfx11<bits<8> op> :
+ SOP1_Real_gfx6_gfx7<op>, SOP1_Real_gfx10_gfx11<op>;
+
defm S_CBRANCH_JOIN : SOP1_Real_gfx6_gfx7<0x032>;
defm S_MOV_B32 : SOP1_Real_gfx6_gfx7_gfx10<0x003>;
@@ -1547,7 +1667,7 @@ defm S_ANDN2_SAVEEXEC_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x027>;
defm S_ORN2_SAVEEXEC_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x028>;
defm S_NAND_SAVEEXEC_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x029>;
defm S_NOR_SAVEEXEC_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x02a>;
-defm S_XNOR_SAVEEXEC_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x02b>;
+defm S_XNOR_SAVEEXEC_B64 : SOP1_Real_gfx6_gfx7_gfx10_gfx11<0x02b>;
defm S_QUADMASK_B32 : SOP1_Real_gfx6_gfx7_gfx10<0x02c>;
defm S_QUADMASK_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x02d>;
defm S_MOVRELS_B32 : SOP1_Real_gfx6_gfx7_gfx10<0x02e>;
@@ -1557,6 +1677,65 @@ defm S_MOVRELD_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x031>;
defm S_ABS_I32 : SOP1_Real_gfx6_gfx7_gfx10<0x034>;
//===----------------------------------------------------------------------===//
+// SOP2 - GFX11.
+//===----------------------------------------------------------------------===//
+
+multiclass SOP2_Real_gfx11<bits<7> op> {
+ def _gfx11 : SOP2_Real<op, !cast<SOP2_Pseudo>(NAME)>,
+ Select_gfx11<!cast<SOP2_Pseudo>(NAME).Mnemonic>;
+}
+
+multiclass SOP2_Real_Renamed_gfx11<bits<7> op, SOP2_Pseudo backing_pseudo, string real_name> {
+ def _gfx11 : SOP2_Real<op, backing_pseudo, real_name>,
+ Select_gfx11<backing_pseudo.Mnemonic>,
+ MnemonicAlias<backing_pseudo.Mnemonic, real_name>, Requires<[isGFX11Plus]>;
+}
+
+defm S_ABSDIFF_I32 : SOP2_Real_gfx11<0x006>;
+defm S_LSHL_B32 : SOP2_Real_gfx11<0x008>;
+defm S_LSHL_B64 : SOP2_Real_gfx11<0x009>;
+defm S_LSHR_B32 : SOP2_Real_gfx11<0x00a>;
+defm S_LSHR_B64 : SOP2_Real_gfx11<0x00b>;
+defm S_ASHR_I32 : SOP2_Real_gfx11<0x00c>;
+defm S_ASHR_I64 : SOP2_Real_gfx11<0x00d>;
+defm S_LSHL1_ADD_U32 : SOP2_Real_gfx11<0x00e>;
+defm S_LSHL2_ADD_U32 : SOP2_Real_gfx11<0x00f>;
+defm S_LSHL3_ADD_U32 : SOP2_Real_gfx11<0x010>;
+defm S_LSHL4_ADD_U32 : SOP2_Real_gfx11<0x011>;
+defm S_MIN_I32 : SOP2_Real_gfx11<0x012>;
+defm S_MIN_U32 : SOP2_Real_gfx11<0x013>;
+defm S_MAX_I32 : SOP2_Real_gfx11<0x014>;
+defm S_MAX_U32 : SOP2_Real_gfx11<0x015>;
+defm S_AND_B32 : SOP2_Real_gfx11<0x016>;
+defm S_AND_B64 : SOP2_Real_gfx11<0x017>;
+defm S_OR_B32 : SOP2_Real_gfx11<0x018>;
+defm S_OR_B64 : SOP2_Real_gfx11<0x019>;
+defm S_XOR_B32 : SOP2_Real_gfx11<0x01a>;
+defm S_XOR_B64 : SOP2_Real_gfx11<0x01b>;
+defm S_NAND_B32 : SOP2_Real_gfx11<0x01c>;
+defm S_NAND_B64 : SOP2_Real_gfx11<0x01d>;
+defm S_NOR_B32 : SOP2_Real_gfx11<0x01e>;
+defm S_NOR_B64 : SOP2_Real_gfx11<0x01f>;
+defm S_XNOR_B32 : SOP2_Real_gfx11<0x020>;
+defm S_XNOR_B64 : SOP2_Real_gfx11<0x021>;
+defm S_AND_NOT1_B32 : SOP2_Real_Renamed_gfx11<0x022, S_ANDN2_B32, "s_and_not1_b32">;
+defm S_AND_NOT1_B64 : SOP2_Real_Renamed_gfx11<0x023, S_ANDN2_B64, "s_and_not1_b64">;
+defm S_OR_NOT1_B32 : SOP2_Real_Renamed_gfx11<0x024, S_ORN2_B32, "s_or_not1_b32">;
+defm S_OR_NOT1_B64 : SOP2_Real_Renamed_gfx11<0x025, S_ORN2_B64, "s_or_not1_b64">;
+defm S_BFE_U32 : SOP2_Real_gfx11<0x026>;
+defm S_BFE_I32 : SOP2_Real_gfx11<0x027>;
+defm S_BFE_U64 : SOP2_Real_gfx11<0x028>;
+defm S_BFE_I64 : SOP2_Real_gfx11<0x029>;
+defm S_BFM_B32 : SOP2_Real_gfx11<0x02a>;
+defm S_BFM_B64 : SOP2_Real_gfx11<0x02b>;
+defm S_MUL_I32 : SOP2_Real_gfx11<0x02c>;
+defm S_MUL_HI_U32 : SOP2_Real_gfx11<0x02d>;
+defm S_MUL_HI_I32 : SOP2_Real_gfx11<0x02e>;
+defm S_CSELECT_B32 : SOP2_Real_gfx11<0x030>;
+defm S_CSELECT_B64 : SOP2_Real_gfx11<0x031>;
+defm S_PACK_HL_B32_B16 : SOP2_Real_gfx11<0x035>;
+
+//===----------------------------------------------------------------------===//
// SOP2 - GFX10.
//===----------------------------------------------------------------------===//
@@ -1566,13 +1745,16 @@ multiclass SOP2_Real_gfx10<bits<7> op> {
Select_gfx10<ps.Mnemonic>;
}
+multiclass SOP2_Real_gfx10_gfx11<bits<7> op> :
+ SOP2_Real_gfx10<op>, SOP2_Real_gfx11<op>;
+
defm S_LSHL1_ADD_U32 : SOP2_Real_gfx10<0x02e>;
defm S_LSHL2_ADD_U32 : SOP2_Real_gfx10<0x02f>;
defm S_LSHL3_ADD_U32 : SOP2_Real_gfx10<0x030>;
defm S_LSHL4_ADD_U32 : SOP2_Real_gfx10<0x031>;
-defm S_PACK_LL_B32_B16 : SOP2_Real_gfx10<0x032>;
-defm S_PACK_LH_B32_B16 : SOP2_Real_gfx10<0x033>;
-defm S_PACK_HH_B32_B16 : SOP2_Real_gfx10<0x034>;
+defm S_PACK_LL_B32_B16 : SOP2_Real_gfx10_gfx11<0x032>;
+defm S_PACK_LH_B32_B16 : SOP2_Real_gfx10_gfx11<0x033>;
+defm S_PACK_HH_B32_B16 : SOP2_Real_gfx10_gfx11<0x034>;
defm S_MUL_HI_U32 : SOP2_Real_gfx10<0x035>;
defm S_MUL_HI_I32 : SOP2_Real_gfx10<0x036>;
@@ -1589,14 +1771,17 @@ multiclass SOP2_Real_gfx6_gfx7<bits<7> op> {
multiclass SOP2_Real_gfx6_gfx7_gfx10<bits<7> op> :
SOP2_Real_gfx6_gfx7<op>, SOP2_Real_gfx10<op>;
+multiclass SOP2_Real_gfx6_gfx7_gfx10_gfx11<bits<7> op> :
+ SOP2_Real_gfx6_gfx7<op>, SOP2_Real_gfx10_gfx11<op>;
+
defm S_CBRANCH_G_FORK : SOP2_Real_gfx6_gfx7<0x02b>;
-defm S_ADD_U32 : SOP2_Real_gfx6_gfx7_gfx10<0x000>;
-defm S_SUB_U32 : SOP2_Real_gfx6_gfx7_gfx10<0x001>;
-defm S_ADD_I32 : SOP2_Real_gfx6_gfx7_gfx10<0x002>;
-defm S_SUB_I32 : SOP2_Real_gfx6_gfx7_gfx10<0x003>;
-defm S_ADDC_U32 : SOP2_Real_gfx6_gfx7_gfx10<0x004>;
-defm S_SUBB_U32 : SOP2_Real_gfx6_gfx7_gfx10<0x005>;
+defm S_ADD_U32 : SOP2_Real_gfx6_gfx7_gfx10_gfx11<0x000>;
+defm S_SUB_U32 : SOP2_Real_gfx6_gfx7_gfx10_gfx11<0x001>;
+defm S_ADD_I32 : SOP2_Real_gfx6_gfx7_gfx10_gfx11<0x002>;
+defm S_SUB_I32 : SOP2_Real_gfx6_gfx7_gfx10_gfx11<0x003>;
+defm S_ADDC_U32 : SOP2_Real_gfx6_gfx7_gfx10_gfx11<0x004>;
+defm S_SUBB_U32 : SOP2_Real_gfx6_gfx7_gfx10_gfx11<0x005>;
defm S_MIN_I32 : SOP2_Real_gfx6_gfx7_gfx10<0x006>;
defm S_MIN_U32 : SOP2_Real_gfx6_gfx7_gfx10<0x007>;
defm S_MAX_I32 : SOP2_Real_gfx6_gfx7_gfx10<0x008>;
@@ -1635,6 +1820,31 @@ defm S_BFE_I64 : SOP2_Real_gfx6_gfx7_gfx10<0x02a>;
defm S_ABSDIFF_I32 : SOP2_Real_gfx6_gfx7_gfx10<0x02c>;
//===----------------------------------------------------------------------===//
+// SOPK - GFX11.
+//===----------------------------------------------------------------------===//
+
+multiclass SOPK_Real32_gfx11<bits<5> op> {
+ def _gfx11 : SOPK_Real32<op, !cast<SOPK_Pseudo>(NAME)>,
+ Select_gfx11<!cast<SOPK_Pseudo>(NAME).Mnemonic>;
+}
+
+multiclass SOPK_Real64_gfx11<bits<5> op> {
+ def _gfx11 : SOPK_Real64<op, !cast<SOPK_Pseudo>(NAME)>,
+ Select_gfx11<!cast<SOPK_Pseudo>(NAME).Mnemonic>;
+}
+
+defm S_GETREG_B32 : SOPK_Real32_gfx11<0x011>;
+defm S_SETREG_B32 : SOPK_Real32_gfx11<0x012>;
+defm S_SETREG_IMM32_B32 : SOPK_Real64_gfx11<0x013>;
+defm S_CALL_B64 : SOPK_Real32_gfx11<0x014>;
+defm S_SUBVECTOR_LOOP_BEGIN : SOPK_Real32_gfx11<0x016>;
+defm S_SUBVECTOR_LOOP_END : SOPK_Real32_gfx11<0x017>;
+defm S_WAITCNT_VSCNT : SOPK_Real32_gfx11<0x018>;
+defm S_WAITCNT_VMCNT : SOPK_Real32_gfx11<0x019>;
+defm S_WAITCNT_EXPCNT : SOPK_Real32_gfx11<0x01a>;
+defm S_WAITCNT_LGKMCNT : SOPK_Real32_gfx11<0x01b>;
+
+//===----------------------------------------------------------------------===//
// SOPK - GFX10.
//===----------------------------------------------------------------------===//
@@ -1650,7 +1860,10 @@ multiclass SOPK_Real64_gfx10<bits<5> op> {
Select_gfx10<ps.Mnemonic>;
}
-defm S_VERSION : SOPK_Real32_gfx10<0x001>;
+multiclass SOPK_Real32_gfx10_gfx11<bits<5> op> :
+ SOPK_Real32_gfx10<op>, SOPK_Real32_gfx11<op>;
+
+defm S_VERSION : SOPK_Real32_gfx10_gfx11<0x001>;
defm S_CALL_B64 : SOPK_Real32_gfx10<0x016>;
defm S_WAITCNT_VSCNT : SOPK_Real32_gfx10<0x017>;
defm S_WAITCNT_VMCNT : SOPK_Real32_gfx10<0x018>;
@@ -1681,29 +1894,96 @@ multiclass SOPK_Real32_gfx6_gfx7_gfx10<bits<5> op> :
multiclass SOPK_Real64_gfx6_gfx7_gfx10<bits<5> op> :
SOPK_Real64_gfx6_gfx7<op>, SOPK_Real64_gfx10<op>;
+multiclass SOPK_Real32_gfx6_gfx7_gfx10_gfx11<bits<5> op> :
+ SOPK_Real32_gfx6_gfx7<op>, SOPK_Real32_gfx10_gfx11<op>;
+
defm S_CBRANCH_I_FORK : SOPK_Real32_gfx6_gfx7<0x011>;
-defm S_MOVK_I32 : SOPK_Real32_gfx6_gfx7_gfx10<0x000>;
-defm S_CMOVK_I32 : SOPK_Real32_gfx6_gfx7_gfx10<0x002>;
-defm S_CMPK_EQ_I32 : SOPK_Real32_gfx6_gfx7_gfx10<0x003>;
-defm S_CMPK_LG_I32 : SOPK_Real32_gfx6_gfx7_gfx10<0x004>;
-defm S_CMPK_GT_I32 : SOPK_Real32_gfx6_gfx7_gfx10<0x005>;
-defm S_CMPK_GE_I32 : SOPK_Real32_gfx6_gfx7_gfx10<0x006>;
-defm S_CMPK_LT_I32 : SOPK_Real32_gfx6_gfx7_gfx10<0x007>;
-defm S_CMPK_LE_I32 : SOPK_Real32_gfx6_gfx7_gfx10<0x008>;
-defm S_CMPK_EQ_U32 : SOPK_Real32_gfx6_gfx7_gfx10<0x009>;
-defm S_CMPK_LG_U32 : SOPK_Real32_gfx6_gfx7_gfx10<0x00a>;
-defm S_CMPK_GT_U32 : SOPK_Real32_gfx6_gfx7_gfx10<0x00b>;
-defm S_CMPK_GE_U32 : SOPK_Real32_gfx6_gfx7_gfx10<0x00c>;
-defm S_CMPK_LT_U32 : SOPK_Real32_gfx6_gfx7_gfx10<0x00d>;
-defm S_CMPK_LE_U32 : SOPK_Real32_gfx6_gfx7_gfx10<0x00e>;
-defm S_ADDK_I32 : SOPK_Real32_gfx6_gfx7_gfx10<0x00f>;
-defm S_MULK_I32 : SOPK_Real32_gfx6_gfx7_gfx10<0x010>;
+defm S_MOVK_I32 : SOPK_Real32_gfx6_gfx7_gfx10_gfx11<0x000>;
+defm S_CMOVK_I32 : SOPK_Real32_gfx6_gfx7_gfx10_gfx11<0x002>;
+defm S_CMPK_EQ_I32 : SOPK_Real32_gfx6_gfx7_gfx10_gfx11<0x003>;
+defm S_CMPK_LG_I32 : SOPK_Real32_gfx6_gfx7_gfx10_gfx11<0x004>;
+defm S_CMPK_GT_I32 : SOPK_Real32_gfx6_gfx7_gfx10_gfx11<0x005>;
+defm S_CMPK_GE_I32 : SOPK_Real32_gfx6_gfx7_gfx10_gfx11<0x006>;
+defm S_CMPK_LT_I32 : SOPK_Real32_gfx6_gfx7_gfx10_gfx11<0x007>;
+defm S_CMPK_LE_I32 : SOPK_Real32_gfx6_gfx7_gfx10_gfx11<0x008>;
+defm S_CMPK_EQ_U32 : SOPK_Real32_gfx6_gfx7_gfx10_gfx11<0x009>;
+defm S_CMPK_LG_U32 : SOPK_Real32_gfx6_gfx7_gfx10_gfx11<0x00a>;
+defm S_CMPK_GT_U32 : SOPK_Real32_gfx6_gfx7_gfx10_gfx11<0x00b>;
+defm S_CMPK_GE_U32 : SOPK_Real32_gfx6_gfx7_gfx10_gfx11<0x00c>;
+defm S_CMPK_LT_U32 : SOPK_Real32_gfx6_gfx7_gfx10_gfx11<0x00d>;
+defm S_CMPK_LE_U32 : SOPK_Real32_gfx6_gfx7_gfx10_gfx11<0x00e>;
+defm S_ADDK_I32 : SOPK_Real32_gfx6_gfx7_gfx10_gfx11<0x00f>;
+defm S_MULK_I32 : SOPK_Real32_gfx6_gfx7_gfx10_gfx11<0x010>;
defm S_GETREG_B32 : SOPK_Real32_gfx6_gfx7_gfx10<0x012>;
defm S_SETREG_B32 : SOPK_Real32_gfx6_gfx7_gfx10<0x013>;
defm S_SETREG_IMM32_B32 : SOPK_Real64_gfx6_gfx7_gfx10<0x015>;
//===----------------------------------------------------------------------===//
+// SOPP - GFX11
+//===----------------------------------------------------------------------===//
+
+multiclass SOPP_Real_32_gfx11<bits<7> op, string real_name = !cast<SOPP_Pseudo>(NAME).Mnemonic # " "> {
+ def _gfx11 : SOPP_Real_32<op, !cast<SOPP_Pseudo>(NAME), real_name>,
+ Select_gfx11<!cast<SOPP_Pseudo>(NAME).Mnemonic>,
+ SOPPRelaxTable<0, !cast<SOPP_Pseudo>(NAME).KeyName, "_gfx11">;
+}
+
+multiclass SOPP_Real_64_gfx11<bits<7> op, string real_name = !cast<SOPP_Pseudo>(NAME).Mnemonic # " "> {
+ def _gfx11 : SOPP_Real_64<op, !cast<SOPP_Pseudo>(NAME), real_name>,
+ Select_gfx11<!cast<SOPP_Pseudo>(NAME).Mnemonic>,
+ SOPPRelaxTable<1, !cast<SOPP_Pseudo>(NAME).KeyName, "_gfx11">;
+}
+
+multiclass SOPP_Real_32_Renamed_gfx11<bits<7> op, SOPP_Pseudo backing_pseudo, string real_name> {
+ def _gfx11 : SOPP_Real_32<op, backing_pseudo, real_name # " ">,
+ Select_gfx11<backing_pseudo.Mnemonic>,
+ MnemonicAlias<backing_pseudo.Mnemonic, real_name>, Requires<[isGFX11Plus]>;
+}
+
+multiclass SOPP_Real_With_Relaxation_gfx11<bits<7> op> {
+ defm "" : SOPP_Real_32_gfx11<op>;
+ defm _pad_s_nop : SOPP_Real_64_gfx11<op>;
+}
+
+defm S_SETKILL : SOPP_Real_32_gfx11<0x001>;
+defm S_SETHALT : SOPP_Real_32_gfx11<0x002>;
+defm S_SLEEP : SOPP_Real_32_gfx11<0x003>;
+defm S_SET_INST_PREFETCH_DISTANCE : SOPP_Real_32_Renamed_gfx11<0x004, S_INST_PREFETCH, "s_set_inst_prefetch_distance">;
+defm S_CLAUSE : SOPP_Real_32_gfx11<0x005>;
+defm S_DELAY_ALU : SOPP_Real_32_gfx11<0x007>;
+defm S_WAITCNT_DEPCTR : SOPP_Real_32_gfx11<0x008>;
+defm S_WAITCNT : SOPP_Real_32_gfx11<0x009>;
+defm S_WAIT_IDLE : SOPP_Real_32_gfx11<0x00a>;
+defm S_WAIT_EVENT : SOPP_Real_32_gfx11<0x00b>;
+defm S_TRAP : SOPP_Real_32_gfx11<0x010>;
+defm S_ROUND_MODE : SOPP_Real_32_gfx11<0x011>;
+defm S_DENORM_MODE : SOPP_Real_32_gfx11<0x012>;
+defm S_BRANCH : SOPP_Real_With_Relaxation_gfx11<0x020>;
+defm S_CBRANCH_SCC0 : SOPP_Real_With_Relaxation_gfx11<0x021>;
+defm S_CBRANCH_SCC1 : SOPP_Real_With_Relaxation_gfx11<0x022>;
+defm S_CBRANCH_VCCZ : SOPP_Real_With_Relaxation_gfx11<0x023>;
+defm S_CBRANCH_VCCNZ : SOPP_Real_With_Relaxation_gfx11<0x024>;
+defm S_CBRANCH_EXECZ : SOPP_Real_With_Relaxation_gfx11<0x025>;
+defm S_CBRANCH_EXECNZ : SOPP_Real_With_Relaxation_gfx11<0x026>;
+defm S_CBRANCH_CDBGSYS : SOPP_Real_With_Relaxation_gfx11<0x027>;
+defm S_CBRANCH_CDBGUSER : SOPP_Real_With_Relaxation_gfx11<0x028>;
+defm S_CBRANCH_CDBGSYS_OR_USER : SOPP_Real_With_Relaxation_gfx11<0x029>;
+defm S_CBRANCH_CDBGSYS_AND_USER : SOPP_Real_With_Relaxation_gfx11<0x02a>;
+defm S_ENDPGM : SOPP_Real_32_gfx11<0x030, "s_endpgm">;
+defm S_ENDPGM_SAVED : SOPP_Real_32_gfx11<0x031>;
+defm S_WAKEUP : SOPP_Real_32_gfx11<0x034>;
+defm S_SETPRIO : SOPP_Real_32_gfx11<0x035>;
+defm S_SENDMSG : SOPP_Real_32_gfx11<0x036>;
+defm S_SENDMSGHALT : SOPP_Real_32_gfx11<0x037>;
+defm S_INCPERFLEVEL : SOPP_Real_32_gfx11<0x038>;
+defm S_DECPERFLEVEL : SOPP_Real_32_gfx11<0x039>;
+defm S_TTRACEDATA : SOPP_Real_32_gfx11<0x03a>;
+defm S_TTRACEDATA_IMM : SOPP_Real_32_gfx11<0x03b>;
+defm S_ICACHE_INV : SOPP_Real_32_gfx11<0x03c>;
+defm S_BARRIER : SOPP_Real_32_gfx11<0x03d>;
+
+//===----------------------------------------------------------------------===//
// SOPP - GFX6, GFX7, GFX8, GFX9, GFX10
//===----------------------------------------------------------------------===//
@@ -1737,6 +2017,12 @@ multiclass SOPP_Real_32_gfx6_gfx7_gfx8_gfx9<bits<7> op, string real_name = !cast
multiclass SOPP_Real_32_gfx6_gfx7_gfx8_gfx9_gfx10<bits<7> op, string real_name = !cast<SOPP_Pseudo>(NAME).Mnemonic # " "> :
SOPP_Real_32_gfx6_gfx7_gfx8_gfx9<op, real_name>, SOPP_Real_32_gfx10<op, real_name>;
+multiclass SOPP_Real_32_gfx6_gfx7_gfx8_gfx9_gfx10_gfx11<bits<7> op, string real_name = !cast<SOPP_Pseudo>(NAME).Mnemonic # " "> :
+ SOPP_Real_32_gfx6_gfx7_gfx8_gfx9_gfx10<op, real_name>, SOPP_Real_32_gfx11<op, real_name>;
+
+multiclass SOPP_Real_32_gfx10_gfx11<bits<7> op, string real_name = !cast<SOPP_Pseudo>(NAME).Mnemonic # " "> :
+ SOPP_Real_32_gfx10<op, real_name>, SOPP_Real_32_gfx11<op, real_name>;
+
//64 bit encodings, for Relaxation
multiclass SOPP_Real_64_gfx6_gfx7<bits<7> op, string real_name = !cast<SOPP_Pseudo>(NAME).Mnemonic # " "> {
defvar ps = !cast<SOPP_Pseudo>(NAME);
@@ -1768,13 +2054,16 @@ multiclass SOPP_Real_64_gfx6_gfx7_gfx8_gfx9<bits<7> op, string real_name = !cast
multiclass SOPP_Real_64_gfx6_gfx7_gfx8_gfx9_gfx10<bits<7> op, string real_name = !cast<SOPP_Pseudo>(NAME).Mnemonic # " "> :
SOPP_Real_64_gfx6_gfx7_gfx8_gfx9<op, real_name>, SOPP_Real_64_gfx10<op, real_name>;
+multiclass SOPP_Real_64_gfx6_gfx7_gfx8_gfx9_gfx10_gfx11<bits<7> op, string real_name = !cast<SOPP_Pseudo>(NAME).Mnemonic # " "> :
+ SOPP_Real_64_gfx6_gfx7_gfx8_gfx9_gfx10<op, real_name>, SOPP_Real_64_gfx11<op, real_name>;
+
// Relaxation for instructions with no operands is not implemented.
multiclass SOPP_Real_With_Relaxation_gfx6_gfx7_gfx8_gfx9_gfx10<bits<7> op> {
defm "" : SOPP_Real_32_gfx6_gfx7_gfx8_gfx9_gfx10<op>;
defm _pad_s_nop : SOPP_Real_64_gfx6_gfx7_gfx8_gfx9_gfx10<op>;
}
-defm S_NOP : SOPP_Real_32_gfx6_gfx7_gfx8_gfx9_gfx10<0x000>;
+defm S_NOP : SOPP_Real_32_gfx6_gfx7_gfx8_gfx9_gfx10_gfx11<0x000>;
defm S_ENDPGM : SOPP_Real_32_gfx6_gfx7_gfx8_gfx9_gfx10<0x001, "s_endpgm">;
defm S_WAKEUP : SOPP_Real_32_gfx8_gfx9_gfx10<0x003>;
defm S_BARRIER : SOPP_Real_32_gfx6_gfx7_gfx8_gfx9_gfx10<0x00a>;
@@ -1794,7 +2083,7 @@ defm S_ENDPGM_SAVED : SOPP_Real_32_gfx6_gfx7_gfx8_gfx9_gfx10<0x01B>;
defm S_SET_GPR_IDX_OFF : SOPP_Real_32_gfx8_gfx9<0x01c>;
defm S_SET_GPR_IDX_MODE : SOPP_Real_32_gfx8_gfx9<0x01d>;
defm S_ENDPGM_ORDERED_PS_DONE : SOPP_Real_32_gfx8_gfx9_gfx10<0x01e>;
-defm S_CODE_END : SOPP_Real_32_gfx10<0x01f>;
+defm S_CODE_END : SOPP_Real_32_gfx10_gfx11<0x01f>;
defm S_INST_PREFETCH : SOPP_Real_32_gfx10<0x020>;
defm S_CLAUSE : SOPP_Real_32_gfx10<0x021>;
defm S_WAIT_IDLE : SOPP_Real_32_gfx10<0x022>;
@@ -1818,6 +2107,34 @@ defm S_CBRANCH_CDBGSYS_AND_USER : SOPP_Real_With_Relaxation_gfx6_gfx7_gfx8_gfx9_
}
//===----------------------------------------------------------------------===//
+// SOPC - GFX11
+//===----------------------------------------------------------------------===//
+
+multiclass SOPC_Real_gfx11<bits<7> op> {
+ def _gfx11 : SOPC_Real<op, !cast<SOPC_Pseudo>(NAME)>,
+ Select_gfx11<!cast<SOPC_Pseudo>(NAME).Mnemonic>;
+}
+
+defm S_CMP_EQ_I32 : SOPC_Real_gfx11<0x00>;
+defm S_CMP_LG_I32 : SOPC_Real_gfx11<0x01>;
+defm S_CMP_GT_I32 : SOPC_Real_gfx11<0x02>;
+defm S_CMP_GE_I32 : SOPC_Real_gfx11<0x03>;
+defm S_CMP_LT_I32 : SOPC_Real_gfx11<0x04>;
+defm S_CMP_LE_I32 : SOPC_Real_gfx11<0x05>;
+defm S_CMP_EQ_U32 : SOPC_Real_gfx11<0x06>;
+defm S_CMP_LG_U32 : SOPC_Real_gfx11<0x07>;
+defm S_CMP_GT_U32 : SOPC_Real_gfx11<0x08>;
+defm S_CMP_GE_U32 : SOPC_Real_gfx11<0x09>;
+defm S_CMP_LT_U32 : SOPC_Real_gfx11<0x0a>;
+defm S_CMP_LE_U32 : SOPC_Real_gfx11<0x0b>;
+defm S_BITCMP0_B32 : SOPC_Real_gfx11<0x0c>;
+defm S_BITCMP1_B32 : SOPC_Real_gfx11<0x0d>;
+defm S_BITCMP0_B64 : SOPC_Real_gfx11<0x0e>;
+defm S_BITCMP1_B64 : SOPC_Real_gfx11<0x0f>;
+defm S_CMP_EQ_U64 : SOPC_Real_gfx11<0x10>;
+defm S_CMP_LG_U64 : SOPC_Real_gfx11<0x11>;
+
+//===----------------------------------------------------------------------===//
// SOPC - GFX6, GFX7, GFX8, GFX9, GFX10
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp
index 18c348d1cf89..c0fd5bc69325 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp
@@ -6,33 +6,64 @@
//
//===----------------------------------------------------------------------===//
#include "AMDGPUAsmUtils.h"
+#include "AMDGPUBaseInfo.h"
#include "SIDefines.h"
-#include "llvm/ADT/StringRef.h"
-
namespace llvm {
namespace AMDGPU {
+
+namespace DepCtr {
+
+// NOLINTBEGIN
+const CustomOperandVal DepCtrInfo[] = {
+ // Name max dflt offset width constraint
+ {{"depctr_hold_cnt"}, 1, 1, 7, 1, isGFX10_BEncoding},
+ {{"depctr_sa_sdst"}, 1, 1, 0, 1},
+ {{"depctr_va_vdst"}, 15, 15, 12, 4},
+ {{"depctr_va_sdst"}, 7, 7, 9, 3},
+ {{"depctr_va_ssrc"}, 1, 1, 8, 1},
+ {{"depctr_va_vcc"}, 1, 1, 1, 1},
+ {{"depctr_vm_vsrc"}, 7, 7, 2, 3},
+};
+// NOLINTEND
+
+const int DEP_CTR_SIZE =
+ static_cast<int>(sizeof(DepCtrInfo) / sizeof(CustomOperandVal));
+
+} // namespace DepCtr
+
namespace SendMsg {
-// This must be in sync with llvm::AMDGPU::SendMsg::Id enum members, see SIDefines.h.
-const char *const IdSymbolic[ID_GAPS_LAST_] = {
- nullptr,
- "MSG_INTERRUPT",
- "MSG_GS",
- "MSG_GS_DONE",
- "MSG_SAVEWAVE",
- "MSG_STALL_WAVE_GEN",
- "MSG_HALT_WAVES",
- "MSG_ORDERED_PS_DONE",
- "MSG_EARLY_PRIM_DEALLOC",
- "MSG_GS_ALLOC_REQ",
- "MSG_GET_DOORBELL",
- "MSG_GET_DDID",
- nullptr,
- nullptr,
- nullptr,
- "MSG_SYSMSG"
+// Disable lint checking for this block; its suggested fixes would make the
+// table unreadable.
+// NOLINTBEGIN
+const CustomOperand<const MCSubtargetInfo &> Msg[] = {
+ {{""}},
+ {{"MSG_INTERRUPT"}, ID_INTERRUPT},
+ {{"MSG_GS"}, ID_GS_PreGFX11, isNotGFX11Plus},
+ {{"MSG_GS_DONE"}, ID_GS_DONE_PreGFX11, isNotGFX11Plus},
+ {{"MSG_SAVEWAVE"}, ID_SAVEWAVE, isGFX8_GFX9_GFX10},
+ {{"MSG_STALL_WAVE_GEN"}, ID_STALL_WAVE_GEN, isGFX9Plus},
+ {{"MSG_HALT_WAVES"}, ID_HALT_WAVES, isGFX9Plus},
+ {{"MSG_ORDERED_PS_DONE"}, ID_ORDERED_PS_DONE, isGFX9Plus},
+ {{"MSG_EARLY_PRIM_DEALLOC"}, ID_EARLY_PRIM_DEALLOC, isGFX9_GFX10},
+ {{"MSG_GS_ALLOC_REQ"}, ID_GS_ALLOC_REQ, isGFX9Plus},
+ {{"MSG_GET_DOORBELL"}, ID_GET_DOORBELL, isGFX9_GFX10},
+ {{"MSG_GET_DDID"}, ID_GET_DDID, isGFX10},
+ {{"MSG_HS_TESSFACTOR"}, ID_HS_TESSFACTOR_GFX11Plus, isGFX11Plus},
+ {{"MSG_DEALLOC_VGPRS"}, ID_DEALLOC_VGPRS_GFX11Plus, isGFX11Plus},
+ {{""}},
+ {{"MSG_SYSMSG"}, ID_SYSMSG},
+ {{"MSG_RTN_GET_DOORBELL"}, ID_RTN_GET_DOORBELL, isGFX11Plus},
+ {{"MSG_RTN_GET_DDID"}, ID_RTN_GET_DDID, isGFX11Plus},
+ {{"MSG_RTN_GET_TMA"}, ID_RTN_GET_TMA, isGFX11Plus},
+ {{"MSG_RTN_GET_REALTIME"}, ID_RTN_GET_REALTIME, isGFX11Plus},
+ {{"MSG_RTN_SAVE_WAVE"}, ID_RTN_SAVE_WAVE, isGFX11Plus},
+ {{"MSG_RTN_GET_TBA"}, ID_RTN_GET_TBA, isGFX11Plus},
};
+// NOLINTEND
+
+const int MSG_SIZE = static_cast<int>(
+ sizeof(Msg) / sizeof(CustomOperand<const MCSubtargetInfo &>));
// These two must be in sync with llvm::AMDGPU::SendMsg::Op enum members, see SIDefines.h.
const char *const OpSysSymbolic[OP_SYS_LAST_] = {
@@ -54,39 +85,54 @@ const char *const OpGsSymbolic[OP_GS_LAST_] = {
namespace Hwreg {
-// This must be in sync with llvm::AMDGPU::Hwreg::ID_SYMBOLIC_FIRST_/LAST_, see SIDefines.h.
-const char* const IdSymbolic[] = {
- nullptr,
- "HW_REG_MODE",
- "HW_REG_STATUS",
- "HW_REG_TRAPSTS",
- "HW_REG_HW_ID",
- "HW_REG_GPR_ALLOC",
- "HW_REG_LDS_ALLOC",
- "HW_REG_IB_STS",
- nullptr,
- nullptr,
- nullptr,
- nullptr,
- nullptr,
- nullptr,
- nullptr,
- "HW_REG_SH_MEM_BASES",
- "HW_REG_TBA_LO",
- "HW_REG_TBA_HI",
- "HW_REG_TMA_LO",
- "HW_REG_TMA_HI",
- "HW_REG_FLAT_SCR_LO",
- "HW_REG_FLAT_SCR_HI",
- "HW_REG_XNACK_MASK",
- "HW_REG_HW_ID1",
- "HW_REG_HW_ID2",
- "HW_REG_POPS_PACKER",
- nullptr,
- nullptr,
- nullptr,
- "HW_REG_SHADER_CYCLES"
+// Disable lint checking for this block; its suggested fixes would make the
+// table unreadable.
+// NOLINTBEGIN
+const CustomOperand<const MCSubtargetInfo &> Opr[] = {
+ {{""}},
+ {{"HW_REG_MODE"}, ID_MODE},
+ {{"HW_REG_STATUS"}, ID_STATUS},
+ {{"HW_REG_TRAPSTS"}, ID_TRAPSTS},
+ {{"HW_REG_HW_ID"}, ID_HW_ID, isNotGFX10Plus},
+ {{"HW_REG_GPR_ALLOC"}, ID_GPR_ALLOC},
+ {{"HW_REG_LDS_ALLOC"}, ID_LDS_ALLOC},
+ {{"HW_REG_IB_STS"}, ID_IB_STS},
+ {{""}},
+ {{""}},
+ {{""}},
+ {{""}},
+ {{""}},
+ {{""}},
+ {{""}},
+ {{"HW_REG_SH_MEM_BASES"}, ID_MEM_BASES, isGFX9Plus},
+ {{"HW_REG_TBA_LO"}, ID_TBA_LO, isGFX9_GFX10},
+ {{"HW_REG_TBA_HI"}, ID_TBA_HI, isGFX9_GFX10},
+ {{"HW_REG_TMA_LO"}, ID_TMA_LO, isGFX9_GFX10},
+ {{"HW_REG_TMA_HI"}, ID_TMA_HI, isGFX9_GFX10},
+ {{"HW_REG_FLAT_SCR_LO"}, ID_FLAT_SCR_LO, isGFX10Plus},
+ {{"HW_REG_FLAT_SCR_HI"}, ID_FLAT_SCR_HI, isGFX10Plus},
+ {{"HW_REG_XNACK_MASK"}, ID_XNACK_MASK, isGFX10Before1030},
+ {{"HW_REG_HW_ID1"}, ID_HW_ID1, isGFX10Plus},
+ {{"HW_REG_HW_ID2"}, ID_HW_ID2, isGFX10Plus},
+ {{"HW_REG_POPS_PACKER"}, ID_POPS_PACKER, isGFX10},
+ {{""}},
+ {{""}},
+ {{""}},
+ {{"HW_REG_SHADER_CYCLES"}, ID_SHADER_CYCLES, isGFX10_BEncoding},
+
+ // GFX940 specific registers
+ {{"HW_REG_XCC_ID"}, ID_XCC_ID, isGFX940},
+ {{"HW_REG_SQ_PERF_SNAPSHOT_DATA"}, ID_SQ_PERF_SNAPSHOT_DATA, isGFX940},
+ {{"HW_REG_SQ_PERF_SNAPSHOT_DATA1"}, ID_SQ_PERF_SNAPSHOT_DATA1, isGFX940},
+ {{"HW_REG_SQ_PERF_SNAPSHOT_PC_LO"}, ID_SQ_PERF_SNAPSHOT_PC_LO, isGFX940},
+ {{"HW_REG_SQ_PERF_SNAPSHOT_PC_HI"}, ID_SQ_PERF_SNAPSHOT_PC_HI, isGFX940},
+
+ // Aliases
+ {{"HW_REG_HW_ID"}, ID_HW_ID1, isGFX10},
};
+// NOLINTEND
+
+const int OPR_SIZE = static_cast<int>(
+ sizeof(Opr) / sizeof(CustomOperand<const MCSubtargetInfo &>));
} // namespace Hwreg
@@ -144,7 +190,7 @@ StringLiteral const NfmtSymbolicVI[] = { // VI and GFX9
"BUF_NUM_FORMAT_FLOAT"
};
-StringLiteral const UfmtSymbolic[] = {
+StringLiteral const UfmtSymbolicGFX10[] = {
"BUF_FMT_INVALID",
"BUF_FMT_8_UNORM",
@@ -238,7 +284,7 @@ StringLiteral const UfmtSymbolic[] = {
"BUF_FMT_32_32_32_32_FLOAT"
};
-unsigned const DfmtNfmt2UFmt[] = {
+unsigned const DfmtNfmt2UFmtGFX10[] = {
DFMT_INVALID | (NFMT_UNORM << NFMT_SHIFT),
DFMT_8 | (NFMT_UNORM << NFMT_SHIFT),
@@ -332,6 +378,166 @@ unsigned const DfmtNfmt2UFmt[] = {
DFMT_32_32_32_32 | (NFMT_FLOAT << NFMT_SHIFT)
};
+StringLiteral const UfmtSymbolicGFX11[] = {
+ "BUF_FMT_INVALID",
+
+ "BUF_FMT_8_UNORM",
+ "BUF_FMT_8_SNORM",
+ "BUF_FMT_8_USCALED",
+ "BUF_FMT_8_SSCALED",
+ "BUF_FMT_8_UINT",
+ "BUF_FMT_8_SINT",
+
+ "BUF_FMT_16_UNORM",
+ "BUF_FMT_16_SNORM",
+ "BUF_FMT_16_USCALED",
+ "BUF_FMT_16_SSCALED",
+ "BUF_FMT_16_UINT",
+ "BUF_FMT_16_SINT",
+ "BUF_FMT_16_FLOAT",
+
+ "BUF_FMT_8_8_UNORM",
+ "BUF_FMT_8_8_SNORM",
+ "BUF_FMT_8_8_USCALED",
+ "BUF_FMT_8_8_SSCALED",
+ "BUF_FMT_8_8_UINT",
+ "BUF_FMT_8_8_SINT",
+
+ "BUF_FMT_32_UINT",
+ "BUF_FMT_32_SINT",
+ "BUF_FMT_32_FLOAT",
+
+ "BUF_FMT_16_16_UNORM",
+ "BUF_FMT_16_16_SNORM",
+ "BUF_FMT_16_16_USCALED",
+ "BUF_FMT_16_16_SSCALED",
+ "BUF_FMT_16_16_UINT",
+ "BUF_FMT_16_16_SINT",
+ "BUF_FMT_16_16_FLOAT",
+
+ "BUF_FMT_10_11_11_FLOAT",
+
+ "BUF_FMT_11_11_10_FLOAT",
+
+ "BUF_FMT_10_10_10_2_UNORM",
+ "BUF_FMT_10_10_10_2_SNORM",
+ "BUF_FMT_10_10_10_2_UINT",
+ "BUF_FMT_10_10_10_2_SINT",
+
+ "BUF_FMT_2_10_10_10_UNORM",
+ "BUF_FMT_2_10_10_10_SNORM",
+ "BUF_FMT_2_10_10_10_USCALED",
+ "BUF_FMT_2_10_10_10_SSCALED",
+ "BUF_FMT_2_10_10_10_UINT",
+ "BUF_FMT_2_10_10_10_SINT",
+
+ "BUF_FMT_8_8_8_8_UNORM",
+ "BUF_FMT_8_8_8_8_SNORM",
+ "BUF_FMT_8_8_8_8_USCALED",
+ "BUF_FMT_8_8_8_8_SSCALED",
+ "BUF_FMT_8_8_8_8_UINT",
+ "BUF_FMT_8_8_8_8_SINT",
+
+ "BUF_FMT_32_32_UINT",
+ "BUF_FMT_32_32_SINT",
+ "BUF_FMT_32_32_FLOAT",
+
+ "BUF_FMT_16_16_16_16_UNORM",
+ "BUF_FMT_16_16_16_16_SNORM",
+ "BUF_FMT_16_16_16_16_USCALED",
+ "BUF_FMT_16_16_16_16_SSCALED",
+ "BUF_FMT_16_16_16_16_UINT",
+ "BUF_FMT_16_16_16_16_SINT",
+ "BUF_FMT_16_16_16_16_FLOAT",
+
+ "BUF_FMT_32_32_32_UINT",
+ "BUF_FMT_32_32_32_SINT",
+ "BUF_FMT_32_32_32_FLOAT",
+ "BUF_FMT_32_32_32_32_UINT",
+ "BUF_FMT_32_32_32_32_SINT",
+ "BUF_FMT_32_32_32_32_FLOAT"
+};
+
+unsigned const DfmtNfmt2UFmtGFX11[] = {
+ DFMT_INVALID | (NFMT_UNORM << NFMT_SHIFT),
+
+ DFMT_8 | (NFMT_UNORM << NFMT_SHIFT),
+ DFMT_8 | (NFMT_SNORM << NFMT_SHIFT),
+ DFMT_8 | (NFMT_USCALED << NFMT_SHIFT),
+ DFMT_8 | (NFMT_SSCALED << NFMT_SHIFT),
+ DFMT_8 | (NFMT_UINT << NFMT_SHIFT),
+ DFMT_8 | (NFMT_SINT << NFMT_SHIFT),
+
+ DFMT_16 | (NFMT_UNORM << NFMT_SHIFT),
+ DFMT_16 | (NFMT_SNORM << NFMT_SHIFT),
+ DFMT_16 | (NFMT_USCALED << NFMT_SHIFT),
+ DFMT_16 | (NFMT_SSCALED << NFMT_SHIFT),
+ DFMT_16 | (NFMT_UINT << NFMT_SHIFT),
+ DFMT_16 | (NFMT_SINT << NFMT_SHIFT),
+ DFMT_16 | (NFMT_FLOAT << NFMT_SHIFT),
+
+ DFMT_8_8 | (NFMT_UNORM << NFMT_SHIFT),
+ DFMT_8_8 | (NFMT_SNORM << NFMT_SHIFT),
+ DFMT_8_8 | (NFMT_USCALED << NFMT_SHIFT),
+ DFMT_8_8 | (NFMT_SSCALED << NFMT_SHIFT),
+ DFMT_8_8 | (NFMT_UINT << NFMT_SHIFT),
+ DFMT_8_8 | (NFMT_SINT << NFMT_SHIFT),
+
+ DFMT_32 | (NFMT_UINT << NFMT_SHIFT),
+ DFMT_32 | (NFMT_SINT << NFMT_SHIFT),
+ DFMT_32 | (NFMT_FLOAT << NFMT_SHIFT),
+
+ DFMT_16_16 | (NFMT_UNORM << NFMT_SHIFT),
+ DFMT_16_16 | (NFMT_SNORM << NFMT_SHIFT),
+ DFMT_16_16 | (NFMT_USCALED << NFMT_SHIFT),
+ DFMT_16_16 | (NFMT_SSCALED << NFMT_SHIFT),
+ DFMT_16_16 | (NFMT_UINT << NFMT_SHIFT),
+ DFMT_16_16 | (NFMT_SINT << NFMT_SHIFT),
+ DFMT_16_16 | (NFMT_FLOAT << NFMT_SHIFT),
+
+ DFMT_10_11_11 | (NFMT_FLOAT << NFMT_SHIFT),
+
+ DFMT_11_11_10 | (NFMT_FLOAT << NFMT_SHIFT),
+
+ DFMT_10_10_10_2 | (NFMT_UNORM << NFMT_SHIFT),
+ DFMT_10_10_10_2 | (NFMT_SNORM << NFMT_SHIFT),
+ DFMT_10_10_10_2 | (NFMT_UINT << NFMT_SHIFT),
+ DFMT_10_10_10_2 | (NFMT_SINT << NFMT_SHIFT),
+
+ DFMT_2_10_10_10 | (NFMT_UNORM << NFMT_SHIFT),
+ DFMT_2_10_10_10 | (NFMT_SNORM << NFMT_SHIFT),
+ DFMT_2_10_10_10 | (NFMT_USCALED << NFMT_SHIFT),
+ DFMT_2_10_10_10 | (NFMT_SSCALED << NFMT_SHIFT),
+ DFMT_2_10_10_10 | (NFMT_UINT << NFMT_SHIFT),
+ DFMT_2_10_10_10 | (NFMT_SINT << NFMT_SHIFT),
+
+ DFMT_8_8_8_8 | (NFMT_UNORM << NFMT_SHIFT),
+ DFMT_8_8_8_8 | (NFMT_SNORM << NFMT_SHIFT),
+ DFMT_8_8_8_8 | (NFMT_USCALED << NFMT_SHIFT),
+ DFMT_8_8_8_8 | (NFMT_SSCALED << NFMT_SHIFT),
+ DFMT_8_8_8_8 | (NFMT_UINT << NFMT_SHIFT),
+ DFMT_8_8_8_8 | (NFMT_SINT << NFMT_SHIFT),
+
+ DFMT_32_32 | (NFMT_UINT << NFMT_SHIFT),
+ DFMT_32_32 | (NFMT_SINT << NFMT_SHIFT),
+ DFMT_32_32 | (NFMT_FLOAT << NFMT_SHIFT),
+
+ DFMT_16_16_16_16 | (NFMT_UNORM << NFMT_SHIFT),
+ DFMT_16_16_16_16 | (NFMT_SNORM << NFMT_SHIFT),
+ DFMT_16_16_16_16 | (NFMT_USCALED << NFMT_SHIFT),
+ DFMT_16_16_16_16 | (NFMT_SSCALED << NFMT_SHIFT),
+ DFMT_16_16_16_16 | (NFMT_UINT << NFMT_SHIFT),
+ DFMT_16_16_16_16 | (NFMT_SINT << NFMT_SHIFT),
+ DFMT_16_16_16_16 | (NFMT_FLOAT << NFMT_SHIFT),
+
+ DFMT_32_32_32 | (NFMT_UINT << NFMT_SHIFT),
+ DFMT_32_32_32 | (NFMT_SINT << NFMT_SHIFT),
+ DFMT_32_32_32 | (NFMT_FLOAT << NFMT_SHIFT),
+ DFMT_32_32_32_32 | (NFMT_UINT << NFMT_SHIFT),
+ DFMT_32_32_32_32 | (NFMT_SINT << NFMT_SHIFT),
+ DFMT_32_32_32_32 | (NFMT_FLOAT << NFMT_SHIFT)
+};
+
} // namespace MTBUFFormat
namespace Swizzle {
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.h
index d1deb570a938..054e35e90f2f 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.h
@@ -11,15 +11,60 @@
#include "SIDefines.h"
+#include "llvm/ADT/StringRef.h"
+
namespace llvm {
class StringLiteral;
+class MCSubtargetInfo;
namespace AMDGPU {
+const int OPR_ID_UNKNOWN = -1;
+const int OPR_ID_UNSUPPORTED = -2;
+const int OPR_ID_DUPLICATE = -3;
+const int OPR_VAL_INVALID = -4;
+
+template <class T> struct CustomOperand {
+ StringLiteral Name;
+ int Encoding = 0;
+ bool (*Cond)(T Context) = nullptr;
+};
+
+struct CustomOperandVal {
+ StringLiteral Name;
+ unsigned Max;
+ unsigned Default;
+ unsigned Shift;
+ unsigned Width;
+ bool (*Cond)(const MCSubtargetInfo &STI) = nullptr;
+ unsigned Mask = (1 << Width) - 1;
+
+ unsigned decode(unsigned Code) const { return (Code >> Shift) & Mask; }
+
+ unsigned encode(unsigned Val) const { return (Val & Mask) << Shift; }
+
+ unsigned getMask() const { return Mask << Shift; }
+
+ bool isValid(unsigned Val) const { return Val <= Max; }
+
+ bool isSupported(const MCSubtargetInfo &STI) const {
+ return !Cond || Cond(STI);
+ }
+};
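
To make the encode/decode helpers concrete, here is a standalone sketch (a trimmed copy of the struct above, with a main and values chosen purely for illustration) that sets the depctr_va_vdst field of an s_waitcnt_depctr immediate using the shift and width from the DepCtrInfo table earlier in this patch (offset 12, width 4).

#include <cassert>

struct FieldVal { // trimmed copy of CustomOperandVal, for illustration only
  unsigned Max;
  unsigned Default;
  unsigned Shift;
  unsigned Width;
  unsigned Mask = (1 << Width) - 1;

  unsigned decode(unsigned Code) const { return (Code >> Shift) & Mask; }
  unsigned encode(unsigned Val) const { return (Val & Mask) << Shift; }
  unsigned getMask() const { return Mask << Shift; }
  bool isValid(unsigned Val) const { return Val <= Max; }
};

int main() {
  // depctr_va_vdst: max 15, default 15, offset (shift) 12, width 4.
  FieldVal VaVdst{/*Max=*/15, /*Default=*/15, /*Shift=*/12, /*Width=*/4};
  unsigned Imm = 0xF9FF; // an arbitrary starting immediate
  assert(VaVdst.isValid(3));
  Imm = (Imm & ~VaVdst.getMask()) | VaVdst.encode(3); // set va_vdst = 3
  assert(VaVdst.decode(Imm) == 3);
  return 0;
}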
+
+namespace DepCtr {
+
+extern const CustomOperandVal DepCtrInfo[];
+extern const int DEP_CTR_SIZE;
+
+} // namespace DepCtr
+
namespace SendMsg { // Symbolic names for the sendmsg(...) syntax.
-extern const char *const IdSymbolic[ID_GAPS_LAST_];
+extern const CustomOperand<const MCSubtargetInfo &> Msg[];
+extern const int MSG_SIZE;
+
extern const char *const OpSysSymbolic[OP_SYS_LAST_];
extern const char *const OpGsSymbolic[OP_GS_LAST_];
@@ -27,7 +72,8 @@ extern const char *const OpGsSymbolic[OP_GS_LAST_];
namespace Hwreg { // Symbolic names for the hwreg(...) syntax.
-extern const char* const IdSymbolic[];
+extern const CustomOperand<const MCSubtargetInfo &> Opr[];
+extern const int OPR_SIZE;
} // namespace Hwreg
@@ -37,8 +83,10 @@ extern StringLiteral const DfmtSymbolic[];
extern StringLiteral const NfmtSymbolicGFX10[];
extern StringLiteral const NfmtSymbolicSICI[];
extern StringLiteral const NfmtSymbolicVI[];
-extern StringLiteral const UfmtSymbolic[];
-extern unsigned const DfmtNfmt2UFmt[];
+extern StringLiteral const UfmtSymbolicGFX10[];
+extern StringLiteral const UfmtSymbolicGFX11[];
+extern unsigned const DfmtNfmt2UFmtGFX10[];
+extern unsigned const DfmtNfmt2UFmtGFX11[];
} // namespace MTBUFFormat
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index 683be871ff82..e4ab72f1095b 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -28,10 +28,15 @@
#define GET_INSTRMAP_INFO
#include "AMDGPUGenInstrInfo.inc"
-static llvm::cl::opt<unsigned> AmdhsaCodeObjectVersion(
- "amdhsa-code-object-version", llvm::cl::Hidden,
- llvm::cl::desc("AMDHSA Code Object Version"), llvm::cl::init(4),
- llvm::cl::ZeroOrMore);
+static llvm::cl::opt<unsigned>
+ AmdhsaCodeObjectVersion("amdhsa-code-object-version", llvm::cl::Hidden,
+ llvm::cl::desc("AMDHSA Code Object Version"),
+ llvm::cl::init(4));
+
+// TODO-GFX11: Remove this when full 16-bit codegen is implemented.
+static llvm::cl::opt<bool>
+ LimitTo128VGPRs("amdgpu-limit-to-128-vgprs", llvm::cl::Hidden,
+ llvm::cl::desc("Never use more than 128 VGPRs"));
namespace {
@@ -44,9 +49,8 @@ unsigned getBitMask(unsigned Shift, unsigned Width) {
///
/// \returns Packed \p Dst.
unsigned packBits(unsigned Src, unsigned Dst, unsigned Shift, unsigned Width) {
- Dst &= ~(1 << Shift) & ~getBitMask(Shift, Width);
- Dst |= (Src << Shift) & getBitMask(Shift, Width);
- return Dst;
+ unsigned Mask = getBitMask(Shift, Width);
+ return ((Src << Shift) & Mask) | (Dst & ~Mask);
}
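For clarity: the rewritten packBits overwrites only the selected bit field of Dst and leaves every other bit alone. A minimal standalone sketch of the same arithmetic (illustrative copy, not part of the patch):

    #include <cassert>

    // Standalone copies of getBitMask/packBits, for illustration only.
    static unsigned getBitMask(unsigned Shift, unsigned Width) {
      return ((1u << Width) - 1) << Shift;
    }
    static unsigned packBits(unsigned Src, unsigned Dst, unsigned Shift,
                             unsigned Width) {
      unsigned Mask = getBitMask(Shift, Width);
      return ((Src << Shift) & Mask) | (Dst & ~Mask);
    }

    int main() {
      // Write the 3-bit value 0b101 into bits [6:4] of 0xFFFF; only that
      // field changes: 0xFFFF -> 0xFFDF.
      assert(packBits(0b101u, 0xFFFFu, 4, 3) == 0xFFDFu);
      return 0;
    }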
/// Unpacks bits from \p Src for given bit \p Shift and bit \p Width.
@@ -57,30 +61,40 @@ unsigned unpackBits(unsigned Src, unsigned Shift, unsigned Width) {
}
/// \returns Vmcnt bit shift (lower bits).
-unsigned getVmcntBitShiftLo() { return 0; }
+unsigned getVmcntBitShiftLo(unsigned VersionMajor) {
+ return VersionMajor >= 11 ? 10 : 0;
+}
/// \returns Vmcnt bit width (lower bits).
-unsigned getVmcntBitWidthLo() { return 4; }
+unsigned getVmcntBitWidthLo(unsigned VersionMajor) {
+ return VersionMajor >= 11 ? 6 : 4;
+}
/// \returns Expcnt bit shift.
-unsigned getExpcntBitShift() { return 4; }
+unsigned getExpcntBitShift(unsigned VersionMajor) {
+ return VersionMajor >= 11 ? 0 : 4;
+}
/// \returns Expcnt bit width.
-unsigned getExpcntBitWidth() { return 3; }
+unsigned getExpcntBitWidth(unsigned VersionMajor) { return 3; }
/// \returns Lgkmcnt bit shift.
-unsigned getLgkmcntBitShift() { return 8; }
+unsigned getLgkmcntBitShift(unsigned VersionMajor) {
+ return VersionMajor >= 11 ? 4 : 8;
+}
/// \returns Lgkmcnt bit width.
unsigned getLgkmcntBitWidth(unsigned VersionMajor) {
- return (VersionMajor >= 10) ? 6 : 4;
+ return VersionMajor >= 10 ? 6 : 4;
}
/// \returns Vmcnt bit shift (higher bits).
-unsigned getVmcntBitShiftHi() { return 14; }
+unsigned getVmcntBitShiftHi(unsigned VersionMajor) { return 14; }
/// \returns Vmcnt bit width (higher bits).
-unsigned getVmcntBitWidthHi() { return 2; }
+unsigned getVmcntBitWidthHi(unsigned VersionMajor) {
+ return (VersionMajor == 9 || VersionMajor == 10) ? 2 : 0;
+}
} // end namespace anonymous
@@ -136,6 +150,41 @@ bool isHsaAbiVersion3AndAbove(const MCSubtargetInfo *STI) {
isHsaAbiVersion5(STI);
}
+unsigned getAmdhsaCodeObjectVersion() {
+ return AmdhsaCodeObjectVersion;
+}
+
+unsigned getMultigridSyncArgImplicitArgPosition() {
+ switch (AmdhsaCodeObjectVersion) {
+ case 2:
+ case 3:
+ case 4:
+ return 48;
+ case 5:
+ return AMDGPU::ImplicitArg::MULTIGRID_SYNC_ARG_OFFSET;
+ default:
+ llvm_unreachable("Unexpected code object version");
+ return 0;
+ }
+}
+
+// FIXME: All such magic numbers about the ABI should be in a
+// central TD file.
+unsigned getHostcallImplicitArgPosition() {
+ switch (AmdhsaCodeObjectVersion) {
+ case 2:
+ case 3:
+ case 4:
+ return 24;
+ case 5:
+ return AMDGPU::ImplicitArg::HOSTCALL_PTR_OFFSET;
+ default:
+ llvm_unreachable("Unexpected code object version");
+ return 0;
+ }
+}
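Both getters just map a code object version to a byte offset into the kernel's implicit-argument block; consumers add that offset to implicitarg_ptr. A hedged usage sketch with a fake in-memory block (the 24-byte offset matches the COv2-COv4 case above; everything else is made up):

    #include <cassert>
    #include <cstdint>
    #include <cstring>

    // Hedged sketch of how a consumer conceptually uses the offset: the
    // hostcall buffer pointer lives 'Offset' bytes into the implicit-argument
    // block. The layout below is a stand-in, not the real runtime structure.
    int main() {
      uint8_t ImplicitArgs[64] = {};  // pretend implicit-argument block
      uint64_t HostcallBuf = 0x1234;  // pretend device pointer
      unsigned Offset = 24;           // hostcall offset for COv2-COv4
      std::memcpy(ImplicitArgs + Offset, &HostcallBuf, sizeof(HostcallBuf));

      uint64_t Loaded;
      std::memcpy(&Loaded, ImplicitArgs + Offset, sizeof(Loaded));
      assert(Loaded == HostcallBuf);
      return 0;
    }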
+
#define GET_MIMGBaseOpcodesTable_IMPL
#define GET_MIMGDimInfoTable_IMPL
#define GET_MIMGInfoTable_IMPL
@@ -144,6 +193,7 @@ bool isHsaAbiVersion3AndAbove(const MCSubtargetInfo *STI) {
#define GET_MIMGBiasMappingTable_IMPL
#define GET_MIMGOffsetMappingTable_IMPL
#define GET_MIMGG16MappingTable_IMPL
+#define GET_MAIInstInfoTable_IMPL
#include "AMDGPUGenSearchableTables.inc"
int getMIMGOpcode(unsigned BaseOpcode, unsigned MIMGEncoding,
@@ -223,6 +273,10 @@ struct VOPInfo {
bool IsSingle;
};
+struct VOPC64DPPInfo {
+ uint16_t Opcode;
+};
+
#define GET_MTBUFInfoTable_DECL
#define GET_MTBUFInfoTable_IMPL
#define GET_MUBUFInfoTable_DECL
@@ -235,6 +289,14 @@ struct VOPInfo {
#define GET_VOP2InfoTable_IMPL
#define GET_VOP3InfoTable_DECL
#define GET_VOP3InfoTable_IMPL
+#define GET_VOPC64DPPTable_DECL
+#define GET_VOPC64DPPTable_IMPL
+#define GET_VOPC64DPP8Table_DECL
+#define GET_VOPC64DPP8Table_IMPL
+#define GET_WMMAOpcode2AddrMappingTable_DECL
+#define GET_WMMAOpcode2AddrMappingTable_IMPL
+#define GET_WMMAOpcode3AddrMappingTable_DECL
+#define GET_WMMAOpcode3AddrMappingTable_IMPL
#include "AMDGPUGenSearchableTables.inc"
int getMTBUFBaseOpcode(unsigned Opc) {
@@ -322,6 +384,30 @@ bool getVOP3IsSingle(unsigned Opc) {
return Info ? Info->IsSingle : false;
}
+bool isVOPC64DPP(unsigned Opc) {
+ return isVOPC64DPPOpcodeHelper(Opc) || isVOPC64DPP8OpcodeHelper(Opc);
+}
+
+bool getMAIIsDGEMM(unsigned Opc) {
+ const MAIInstInfo *Info = getMAIInstInfoHelper(Opc);
+ return Info ? Info->is_dgemm : false;
+}
+
+bool getMAIIsGFX940XDL(unsigned Opc) {
+ const MAIInstInfo *Info = getMAIInstInfoHelper(Opc);
+ return Info ? Info->is_gfx940_xdl : false;
+}
+
+unsigned mapWMMA2AddrTo3AddrOpcode(unsigned Opc) {
+ const WMMAOpcodeMappingInfo *Info = getWMMAMappingInfoFrom2AddrOpcode(Opc);
+ return Info ? Info->Opcode3Addr : ~0u;
+}
+
+unsigned mapWMMA3AddrTo2AddrOpcode(unsigned Opc) {
+ const WMMAOpcodeMappingInfo *Info = getWMMAMappingInfoFrom3AddrOpcode(Opc);
+ return Info ? Info->Opcode2Addr : ~0u;
+}
+
// Wrapper for Tablegen'd function. enum Subtarget is not defined in any
// header files, so we need to wrap it in a function that takes unsigned
// instead.
@@ -740,6 +826,15 @@ unsigned getTotalNumVGPRs(const MCSubtargetInfo *STI) {
}
unsigned getAddressableNumVGPRs(const MCSubtargetInfo *STI) {
+ if (LimitTo128VGPRs.getNumOccurrences() ? LimitTo128VGPRs
+ : isGFX11Plus(*STI)) {
+ // GFX11 changes the encoding of 16-bit operands in VOP1/2/C instructions
+ // such that values 128..255 no longer mean v128..v255; they mean
+ // v0.hi..v127.hi instead. Until the compiler understands this, it is not
+ // safe to use v128..v255.
+ // TODO-GFX11: Remove this when full 16-bit codegen is implemented.
+ return 128;
+ }

if (STI->getFeatureBits().test(FeatureGFX90AInsts))
return 512;
return 256;
@@ -904,16 +999,13 @@ std::pair<int, int> getIntegerPairAttribute(const Function &F,
}
unsigned getVmcntBitMask(const IsaVersion &Version) {
- unsigned VmcntLo = (1 << getVmcntBitWidthLo()) - 1;
- if (Version.Major < 9)
- return VmcntLo;
-
- unsigned VmcntHi = ((1 << getVmcntBitWidthHi()) - 1) << getVmcntBitWidthLo();
- return VmcntLo | VmcntHi;
+ return (1 << (getVmcntBitWidthLo(Version.Major) +
+ getVmcntBitWidthHi(Version.Major))) -
+ 1;
}
unsigned getExpcntBitMask(const IsaVersion &Version) {
- return (1 << getExpcntBitWidth()) - 1;
+ return (1 << getExpcntBitWidth(Version.Major)) - 1;
}
unsigned getLgkmcntBitMask(const IsaVersion &Version) {
@@ -921,36 +1013,32 @@ unsigned getLgkmcntBitMask(const IsaVersion &Version) {
}
unsigned getWaitcntBitMask(const IsaVersion &Version) {
- unsigned VmcntLo = getBitMask(getVmcntBitShiftLo(), getVmcntBitWidthLo());
- unsigned Expcnt = getBitMask(getExpcntBitShift(), getExpcntBitWidth());
- unsigned Lgkmcnt = getBitMask(getLgkmcntBitShift(),
+ unsigned VmcntLo = getBitMask(getVmcntBitShiftLo(Version.Major),
+ getVmcntBitWidthLo(Version.Major));
+ unsigned Expcnt = getBitMask(getExpcntBitShift(Version.Major),
+ getExpcntBitWidth(Version.Major));
+ unsigned Lgkmcnt = getBitMask(getLgkmcntBitShift(Version.Major),
getLgkmcntBitWidth(Version.Major));
- unsigned Waitcnt = VmcntLo | Expcnt | Lgkmcnt;
- if (Version.Major < 9)
- return Waitcnt;
-
- unsigned VmcntHi = getBitMask(getVmcntBitShiftHi(), getVmcntBitWidthHi());
- return Waitcnt | VmcntHi;
+ unsigned VmcntHi = getBitMask(getVmcntBitShiftHi(Version.Major),
+ getVmcntBitWidthHi(Version.Major));
+ return VmcntLo | Expcnt | Lgkmcnt | VmcntHi;
}
unsigned decodeVmcnt(const IsaVersion &Version, unsigned Waitcnt) {
- unsigned VmcntLo =
- unpackBits(Waitcnt, getVmcntBitShiftLo(), getVmcntBitWidthLo());
- if (Version.Major < 9)
- return VmcntLo;
-
- unsigned VmcntHi =
- unpackBits(Waitcnt, getVmcntBitShiftHi(), getVmcntBitWidthHi());
- VmcntHi <<= getVmcntBitWidthLo();
- return VmcntLo | VmcntHi;
+ unsigned VmcntLo = unpackBits(Waitcnt, getVmcntBitShiftLo(Version.Major),
+ getVmcntBitWidthLo(Version.Major));
+ unsigned VmcntHi = unpackBits(Waitcnt, getVmcntBitShiftHi(Version.Major),
+ getVmcntBitWidthHi(Version.Major));
+ return VmcntLo | VmcntHi << getVmcntBitWidthLo(Version.Major);
}
unsigned decodeExpcnt(const IsaVersion &Version, unsigned Waitcnt) {
- return unpackBits(Waitcnt, getExpcntBitShift(), getExpcntBitWidth());
+ return unpackBits(Waitcnt, getExpcntBitShift(Version.Major),
+ getExpcntBitWidth(Version.Major));
}
unsigned decodeLgkmcnt(const IsaVersion &Version, unsigned Waitcnt) {
- return unpackBits(Waitcnt, getLgkmcntBitShift(),
+ return unpackBits(Waitcnt, getLgkmcntBitShift(Version.Major),
getLgkmcntBitWidth(Version.Major));
}
@@ -971,24 +1059,23 @@ Waitcnt decodeWaitcnt(const IsaVersion &Version, unsigned Encoded) {
unsigned encodeVmcnt(const IsaVersion &Version, unsigned Waitcnt,
unsigned Vmcnt) {
- Waitcnt =
- packBits(Vmcnt, Waitcnt, getVmcntBitShiftLo(), getVmcntBitWidthLo());
- if (Version.Major < 9)
- return Waitcnt;
-
- Vmcnt >>= getVmcntBitWidthLo();
- return packBits(Vmcnt, Waitcnt, getVmcntBitShiftHi(), getVmcntBitWidthHi());
+ Waitcnt = packBits(Vmcnt, Waitcnt, getVmcntBitShiftLo(Version.Major),
+ getVmcntBitWidthLo(Version.Major));
+ return packBits(Vmcnt >> getVmcntBitWidthLo(Version.Major), Waitcnt,
+ getVmcntBitShiftHi(Version.Major),
+ getVmcntBitWidthHi(Version.Major));
}
unsigned encodeExpcnt(const IsaVersion &Version, unsigned Waitcnt,
unsigned Expcnt) {
- return packBits(Expcnt, Waitcnt, getExpcntBitShift(), getExpcntBitWidth());
+ return packBits(Expcnt, Waitcnt, getExpcntBitShift(Version.Major),
+ getExpcntBitWidth(Version.Major));
}
unsigned encodeLgkmcnt(const IsaVersion &Version, unsigned Waitcnt,
unsigned Lgkmcnt) {
- return packBits(Lgkmcnt, Waitcnt, getLgkmcntBitShift(),
- getLgkmcntBitWidth(Version.Major));
+ return packBits(Lgkmcnt, Waitcnt, getLgkmcntBitShift(Version.Major),
+ getLgkmcntBitWidth(Version.Major));
}
unsigned encodeWaitcnt(const IsaVersion &Version,
@@ -1005,43 +1092,184 @@ unsigned encodeWaitcnt(const IsaVersion &Version, const Waitcnt &Decoded) {
}
//===----------------------------------------------------------------------===//
-// hwreg
+// Custom Operands.
+//
+// A table of custom operands shall describe "primary" operand names
+// first, followed by aliases, if any. It is not required but recommended
+// to arrange operands so that operand encodings match operand positions
+// in the table. This will make disassembly a bit more efficient.
+// Unused slots in the table shall have an empty name.
+//
//===----------------------------------------------------------------------===//
-namespace Hwreg {
+template <class T>
+static bool isValidOpr(int Idx, const CustomOperand<T> OpInfo[], int OpInfoSize,
+ T Context) {
+ return 0 <= Idx && Idx < OpInfoSize && !OpInfo[Idx].Name.empty() &&
+ (!OpInfo[Idx].Cond || OpInfo[Idx].Cond(Context));
+}
-int64_t getHwregId(const StringRef Name) {
- for (int Id = ID_SYMBOLIC_FIRST_; Id < ID_SYMBOLIC_LAST_; ++Id) {
- if (IdSymbolic[Id] && Name == IdSymbolic[Id])
- return Id;
+template <class T>
+static int getOprIdx(std::function<bool(const CustomOperand<T> &)> Test,
+ const CustomOperand<T> OpInfo[], int OpInfoSize,
+ T Context) {
+ int InvalidIdx = OPR_ID_UNKNOWN;
+ for (int Idx = 0; Idx < OpInfoSize; ++Idx) {
+ if (Test(OpInfo[Idx])) {
+ if (!OpInfo[Idx].Cond || OpInfo[Idx].Cond(Context))
+ return Idx;
+ InvalidIdx = OPR_ID_UNSUPPORTED;
+ }
}
- return ID_UNKNOWN_;
+ return InvalidIdx;
}
-static unsigned getLastSymbolicHwreg(const MCSubtargetInfo &STI) {
- if (isSI(STI) || isCI(STI) || isVI(STI))
- return ID_SYMBOLIC_FIRST_GFX9_;
- else if (isGFX9(STI))
- return ID_SYMBOLIC_FIRST_GFX10_;
- else if (isGFX10(STI) && !isGFX10_BEncoding(STI))
- return ID_SYMBOLIC_FIRST_GFX1030_;
- else
- return ID_SYMBOLIC_LAST_;
+template <class T>
+static int getOprIdx(const StringRef Name, const CustomOperand<T> OpInfo[],
+ int OpInfoSize, T Context) {
+ auto Test = [=](const CustomOperand<T> &Op) { return Op.Name == Name; };
+ return getOprIdx<T>(Test, OpInfo, OpInfoSize, Context);
}
-bool isValidHwreg(int64_t Id, const MCSubtargetInfo &STI) {
- switch (Id) {
- case ID_HW_ID:
- return isSI(STI) || isCI(STI) || isVI(STI) || isGFX9(STI);
- case ID_HW_ID1:
- case ID_HW_ID2:
- return isGFX10Plus(STI);
- case ID_XNACK_MASK:
- return isGFX10(STI) && !AMDGPU::isGFX10_BEncoding(STI);
- default:
- return ID_SYMBOLIC_FIRST_ <= Id && Id < getLastSymbolicHwreg(STI) &&
- IdSymbolic[Id];
+template <class T>
+static int getOprIdx(int Id, const CustomOperand<T> OpInfo[], int OpInfoSize,
+ T Context, bool QuickCheck = true) {
+ auto Test = [=](const CustomOperand<T> &Op) {
+ return Op.Encoding == Id && !Op.Name.empty();
+ };
+ // This is an optimization that should work in most cases.
+ // As a side effect, it may cause selection of an alias
+ // instead of a primary operand name in case of sparse tables.
+ if (QuickCheck && isValidOpr<T>(Id, OpInfo, OpInfoSize, Context) &&
+ OpInfo[Id].Encoding == Id) {
+ return Id;
+ }
+ return getOprIdx<T>(Test, OpInfo, OpInfoSize, Context);
+}
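The lookup helpers above resolve a symbolic operand by name or by encoding, distinguishing names that are unknown from names that exist but are not available on the current subtarget; the quick check exploits tables where the encoding equals the table index. A compact standalone sketch with made-up entries (not the real AMDGPU tables):

    #include <cassert>
    #include <string>

    // Made-up stand-in for CustomOperand<...>; Cond models target availability.
    struct MiniOperand {
      const char *Name;
      int Encoding;
      bool (*Cond)(int TargetGen);
    };

    static const MiniOperand Table[] = {
        {"op_a", 0, nullptr},
        {"op_b", 1, [](int Gen) { return Gen >= 11; }},
    };

    static int lookupByName(const std::string &Name, int Gen) {
      int Result = -1; // OPR_ID_UNKNOWN
      for (int I = 0; I < 2; ++I) {
        if (Name == Table[I].Name) {
          if (!Table[I].Cond || Table[I].Cond(Gen))
            return I;
          Result = -2; // OPR_ID_UNSUPPORTED: known name, wrong target
        }
      }
      return Result;
    }

    int main() {
      assert(lookupByName("op_a", 10) == 0);
      assert(lookupByName("op_b", 10) == -2);
      assert(lookupByName("op_b", 11) == 1);
      assert(lookupByName("op_c", 11) == -1);
      return 0;
    }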
+
+//===----------------------------------------------------------------------===//
+// Custom Operand Values
+//===----------------------------------------------------------------------===//
+
+static unsigned getDefaultCustomOperandEncoding(const CustomOperandVal *Opr,
+ int Size,
+ const MCSubtargetInfo &STI) {
+ unsigned Enc = 0;
+ for (int Idx = 0; Idx < Size; ++Idx) {
+ const auto &Op = Opr[Idx];
+ if (Op.isSupported(STI))
+ Enc |= Op.encode(Op.Default);
+ }
+ return Enc;
+}
+
+static bool isSymbolicCustomOperandEncoding(const CustomOperandVal *Opr,
+ int Size, unsigned Code,
+ bool &HasNonDefaultVal,
+ const MCSubtargetInfo &STI) {
+ unsigned UsedOprMask = 0;
+ HasNonDefaultVal = false;
+ for (int Idx = 0; Idx < Size; ++Idx) {
+ const auto &Op = Opr[Idx];
+ if (!Op.isSupported(STI))
+ continue;
+ UsedOprMask |= Op.getMask();
+ unsigned Val = Op.decode(Code);
+ if (!Op.isValid(Val))
+ return false;
+ HasNonDefaultVal |= (Val != Op.Default);
}
+ return (Code & ~UsedOprMask) == 0;
+}
+
+static bool decodeCustomOperand(const CustomOperandVal *Opr, int Size,
+ unsigned Code, int &Idx, StringRef &Name,
+ unsigned &Val, bool &IsDefault,
+ const MCSubtargetInfo &STI) {
+ while (Idx < Size) {
+ const auto &Op = Opr[Idx++];
+ if (Op.isSupported(STI)) {
+ Name = Op.Name;
+ Val = Op.decode(Code);
+ IsDefault = (Val == Op.Default);
+ return true;
+ }
+ }
+
+ return false;
+}
+
+static int encodeCustomOperandVal(const CustomOperandVal &Op,
+ int64_t InputVal) {
+ if (InputVal < 0 || InputVal > Op.Max)
+ return OPR_VAL_INVALID;
+ return Op.encode(InputVal);
+}
+
+static int encodeCustomOperand(const CustomOperandVal *Opr, int Size,
+ const StringRef Name, int64_t InputVal,
+ unsigned &UsedOprMask,
+ const MCSubtargetInfo &STI) {
+ int InvalidId = OPR_ID_UNKNOWN;
+ for (int Idx = 0; Idx < Size; ++Idx) {
+ const auto &Op = Opr[Idx];
+ if (Op.Name == Name) {
+ if (!Op.isSupported(STI)) {
+ InvalidId = OPR_ID_UNSUPPORTED;
+ continue;
+ }
+ auto OprMask = Op.getMask();
+ if (OprMask & UsedOprMask)
+ return OPR_ID_DUPLICATE;
+ UsedOprMask |= OprMask;
+ return encodeCustomOperandVal(Op, InputVal);
+ }
+ }
+ return InvalidId;
+}
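Taken together, encodeCustomOperandVal and encodeCustomOperand fold several named fields into a single immediate, with UsedOprMask rejecting a field that is specified twice. A condensed sketch using two hypothetical 3-bit fields (names, widths, and limits invented for the example):

    #include <cassert>
    #include <string>

    // Two hypothetical fields packed into one immediate: fld_lo in bits [2:0],
    // fld_hi in bits [5:3].
    struct MiniField {
      const char *Name;
      unsigned Max, Shift, Width;
      unsigned mask() const { return ((1u << Width) - 1) << Shift; }
    };

    static const MiniField Fields[] = {{"fld_lo", 7, 0, 3}, {"fld_hi", 7, 3, 3}};

    static int encodeField(const std::string &Name, unsigned Val,
                           unsigned &UsedMask) {
      for (const MiniField &F : Fields) {
        if (Name != F.Name)
          continue;
        if (Val > F.Max)
          return -4; // OPR_VAL_INVALID
        if (F.mask() & UsedMask)
          return -3; // OPR_ID_DUPLICATE
        UsedMask |= F.mask();
        return int(Val << F.Shift);
      }
      return -1; // OPR_ID_UNKNOWN
    }

    int main() {
      unsigned Used = 0;
      int Lo = encodeField("fld_lo", 5, Used);
      int Hi = encodeField("fld_hi", 2, Used);
      assert(Lo == 5 && Hi == (2 << 3));
      assert(encodeField("fld_lo", 1, Used) == -3); // duplicate field rejected
      return 0;
    }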
+
+//===----------------------------------------------------------------------===//
+// DepCtr
+//===----------------------------------------------------------------------===//
+
+namespace DepCtr {
+
+int getDefaultDepCtrEncoding(const MCSubtargetInfo &STI) {
+ static int Default = -1;
+ if (Default == -1)
+ Default = getDefaultCustomOperandEncoding(DepCtrInfo, DEP_CTR_SIZE, STI);
+ return Default;
+}
+
+bool isSymbolicDepCtrEncoding(unsigned Code, bool &HasNonDefaultVal,
+ const MCSubtargetInfo &STI) {
+ return isSymbolicCustomOperandEncoding(DepCtrInfo, DEP_CTR_SIZE, Code,
+ HasNonDefaultVal, STI);
+}
+
+bool decodeDepCtr(unsigned Code, int &Id, StringRef &Name, unsigned &Val,
+ bool &IsDefault, const MCSubtargetInfo &STI) {
+ return decodeCustomOperand(DepCtrInfo, DEP_CTR_SIZE, Code, Id, Name, Val,
+ IsDefault, STI);
+}
+
+int encodeDepCtr(const StringRef Name, int64_t Val, unsigned &UsedOprMask,
+ const MCSubtargetInfo &STI) {
+ return encodeCustomOperand(DepCtrInfo, DEP_CTR_SIZE, Name, Val, UsedOprMask,
+ STI);
+}
+
+} // namespace DepCtr
+
+//===----------------------------------------------------------------------===//
+// hwreg
+//===----------------------------------------------------------------------===//
+
+namespace Hwreg {
+
+int64_t getHwregId(const StringRef Name, const MCSubtargetInfo &STI) {
+ int Idx = getOprIdx<const MCSubtargetInfo &>(Name, Opr, OPR_SIZE, STI);
+ return (Idx < 0) ? Idx : Opr[Idx].Encoding;
}
bool isValidHwreg(int64_t Id) {
@@ -1063,7 +1291,8 @@ uint64_t encodeHwreg(uint64_t Id, uint64_t Offset, uint64_t Width) {
}
StringRef getHwreg(unsigned Id, const MCSubtargetInfo &STI) {
- return isValidHwreg(Id, STI) ? IdSymbolic[Id] : "";
+ int Idx = getOprIdx<const MCSubtargetInfo &>(Id, Opr, OPR_SIZE, STI);
+ return (Idx < 0) ? "" : Opr[Idx].Name;
}
void decodeHwreg(unsigned Val, unsigned &Id, unsigned &Offset, unsigned &Width) {
@@ -1087,12 +1316,13 @@ struct ExpTgt {
};
static constexpr ExpTgt ExpTgtInfo[] = {
- {{"null"}, ET_NULL, ET_NULL_MAX_IDX},
- {{"mrtz"}, ET_MRTZ, ET_MRTZ_MAX_IDX},
- {{"prim"}, ET_PRIM, ET_PRIM_MAX_IDX},
- {{"mrt"}, ET_MRT0, ET_MRT_MAX_IDX},
- {{"pos"}, ET_POS0, ET_POS_MAX_IDX},
- {{"param"}, ET_PARAM0, ET_PARAM_MAX_IDX},
+ {{"null"}, ET_NULL, ET_NULL_MAX_IDX},
+ {{"mrtz"}, ET_MRTZ, ET_MRTZ_MAX_IDX},
+ {{"prim"}, ET_PRIM, ET_PRIM_MAX_IDX},
+ {{"mrt"}, ET_MRT0, ET_MRT_MAX_IDX},
+ {{"pos"}, ET_POS0, ET_POS_MAX_IDX},
+ {{"dual_src_blend"}, ET_DUAL_SRC_BLEND0, ET_DUAL_SRC_BLEND_MAX_IDX},
+ {{"param"}, ET_PARAM0, ET_PARAM_MAX_IDX},
};
bool getTgtName(unsigned Id, StringRef &Name, int &Index) {
@@ -1130,7 +1360,20 @@ unsigned getTgtId(const StringRef Name) {
}
bool isSupportedTgtId(unsigned Id, const MCSubtargetInfo &STI) {
- return (Id != ET_POS4 && Id != ET_PRIM) || isGFX10Plus(STI);
+ switch (Id) {
+ case ET_NULL:
+ return !isGFX11Plus(STI);
+ case ET_POS4:
+ case ET_PRIM:
+ return isGFX10Plus(STI);
+ case ET_DUAL_SRC_BLEND0:
+ case ET_DUAL_SRC_BLEND1:
+ return isGFX11Plus(STI);
+ default:
+ if (Id >= ET_PARAM0 && Id <= ET_PARAM31)
+ return !isGFX11Plus(STI);
+ return true;
+ }
}
} // namespace Exp
@@ -1196,27 +1439,44 @@ void decodeDfmtNfmt(unsigned Format, unsigned &Dfmt, unsigned &Nfmt) {
Nfmt = (Format >> NFMT_SHIFT) & NFMT_MASK;
}
-int64_t getUnifiedFormat(const StringRef Name) {
- for (int Id = UFMT_FIRST; Id <= UFMT_LAST; ++Id) {
- if (Name == UfmtSymbolic[Id])
- return Id;
+int64_t getUnifiedFormat(const StringRef Name, const MCSubtargetInfo &STI) {
+ if (isGFX11Plus(STI)) {
+ for (int Id = UfmtGFX11::UFMT_FIRST; Id <= UfmtGFX11::UFMT_LAST; ++Id) {
+ if (Name == UfmtSymbolicGFX11[Id])
+ return Id;
+ }
+ } else {
+ for (int Id = UfmtGFX10::UFMT_FIRST; Id <= UfmtGFX10::UFMT_LAST; ++Id) {
+ if (Name == UfmtSymbolicGFX10[Id])
+ return Id;
+ }
}
return UFMT_UNDEF;
}
-StringRef getUnifiedFormatName(unsigned Id) {
- return isValidUnifiedFormat(Id) ? UfmtSymbolic[Id] : "";
+StringRef getUnifiedFormatName(unsigned Id, const MCSubtargetInfo &STI) {
+ if (isValidUnifiedFormat(Id, STI))
+ return isGFX10(STI) ? UfmtSymbolicGFX10[Id] : UfmtSymbolicGFX11[Id];
+ return "";
}
-bool isValidUnifiedFormat(unsigned Id) {
- return Id <= UFMT_LAST;
+bool isValidUnifiedFormat(unsigned Id, const MCSubtargetInfo &STI) {
+ return isGFX10(STI) ? Id <= UfmtGFX10::UFMT_LAST : Id <= UfmtGFX11::UFMT_LAST;
}
-int64_t convertDfmtNfmt2Ufmt(unsigned Dfmt, unsigned Nfmt) {
+int64_t convertDfmtNfmt2Ufmt(unsigned Dfmt, unsigned Nfmt,
+ const MCSubtargetInfo &STI) {
int64_t Fmt = encodeDfmtNfmt(Dfmt, Nfmt);
- for (int Id = UFMT_FIRST; Id <= UFMT_LAST; ++Id) {
- if (Fmt == DfmtNfmt2UFmt[Id])
- return Id;
+ if (isGFX11Plus(STI)) {
+ for (int Id = UfmtGFX11::UFMT_FIRST; Id <= UfmtGFX11::UFMT_LAST; ++Id) {
+ if (Fmt == DfmtNfmt2UFmtGFX11[Id])
+ return Id;
+ }
+ } else {
+ for (int Id = UfmtGFX10::UFMT_FIRST; Id <= UfmtGFX10::UFMT_LAST; ++Id) {
+ if (Fmt == DfmtNfmt2UFmtGFX10[Id])
+ return Id;
+ }
}
return UFMT_UNDEF;
}
@@ -1239,40 +1499,22 @@ unsigned getDefaultFormatEncoding(const MCSubtargetInfo &STI) {
namespace SendMsg {
-int64_t getMsgId(const StringRef Name) {
- for (int i = ID_GAPS_FIRST_; i < ID_GAPS_LAST_; ++i) {
- if (IdSymbolic[i] && Name == IdSymbolic[i])
- return i;
- }
- return ID_UNKNOWN_;
+static uint64_t getMsgIdMask(const MCSubtargetInfo &STI) {
+ return isGFX11Plus(STI) ? ID_MASK_GFX11Plus_ : ID_MASK_PreGFX11_;
}
-bool isValidMsgId(int64_t MsgId, const MCSubtargetInfo &STI, bool Strict) {
- if (Strict) {
- switch (MsgId) {
- case ID_SAVEWAVE:
- return isVI(STI) || isGFX9Plus(STI);
- case ID_STALL_WAVE_GEN:
- case ID_HALT_WAVES:
- case ID_ORDERED_PS_DONE:
- case ID_GS_ALLOC_REQ:
- case ID_GET_DOORBELL:
- return isGFX9Plus(STI);
- case ID_EARLY_PRIM_DEALLOC:
- return isGFX9(STI);
- case ID_GET_DDID:
- return isGFX10Plus(STI);
- default:
- return 0 <= MsgId && MsgId < ID_GAPS_LAST_ && IdSymbolic[MsgId];
- }
- } else {
- return 0 <= MsgId && isUInt<ID_WIDTH_>(MsgId);
- }
+int64_t getMsgId(const StringRef Name, const MCSubtargetInfo &STI) {
+ int Idx = getOprIdx<const MCSubtargetInfo &>(Name, Msg, MSG_SIZE, STI);
+ return (Idx < 0) ? Idx : Msg[Idx].Encoding;
+}
+
+bool isValidMsgId(int64_t MsgId, const MCSubtargetInfo &STI) {
+ return (MsgId & ~(getMsgIdMask(STI))) == 0;
}
-StringRef getMsgName(int64_t MsgId) {
- assert(0 <= MsgId && MsgId < ID_GAPS_LAST_);
- return IdSymbolic[MsgId];
+StringRef getMsgName(int64_t MsgId, const MCSubtargetInfo &STI) {
+ int Idx = getOprIdx<const MCSubtargetInfo &>(MsgId, Msg, MSG_SIZE, STI);
+ return (Idx < 0) ? "" : Msg[Idx].Name;
}
int64_t getMsgOpId(int64_t MsgId, const StringRef Name) {
@@ -1289,26 +1531,27 @@ int64_t getMsgOpId(int64_t MsgId, const StringRef Name) {
bool isValidMsgOp(int64_t MsgId, int64_t OpId, const MCSubtargetInfo &STI,
bool Strict) {
- assert(isValidMsgId(MsgId, STI, Strict));
+ assert(isValidMsgId(MsgId, STI));
if (!Strict)
return 0 <= OpId && isUInt<OP_WIDTH_>(OpId);
- switch(MsgId)
- {
- case ID_GS:
- return (OP_GS_FIRST_ <= OpId && OpId < OP_GS_LAST_) && OpId != OP_GS_NOP;
- case ID_GS_DONE:
- return OP_GS_FIRST_ <= OpId && OpId < OP_GS_LAST_;
- case ID_SYSMSG:
+ if (MsgId == ID_SYSMSG)
return OP_SYS_FIRST_ <= OpId && OpId < OP_SYS_LAST_;
- default:
- return OpId == OP_NONE_;
+ if (!isGFX11Plus(STI)) {
+ switch (MsgId) {
+ case ID_GS_PreGFX11:
+ return (OP_GS_FIRST_ <= OpId && OpId < OP_GS_LAST_) && OpId != OP_GS_NOP;
+ case ID_GS_DONE_PreGFX11:
+ return OP_GS_FIRST_ <= OpId && OpId < OP_GS_LAST_;
+ }
}
+ return OpId == OP_NONE_;
}
-StringRef getMsgOpName(int64_t MsgId, int64_t OpId) {
- assert(msgRequiresOp(MsgId));
+StringRef getMsgOpName(int64_t MsgId, int64_t OpId,
+ const MCSubtargetInfo &STI) {
+ assert(msgRequiresOp(MsgId, STI));
return (MsgId == ID_SYSMSG)? OpSysSymbolic[OpId] : OpGsSymbolic[OpId];
}
@@ -1319,42 +1562,48 @@ bool isValidMsgStream(int64_t MsgId, int64_t OpId, int64_t StreamId,
if (!Strict)
return 0 <= StreamId && isUInt<STREAM_ID_WIDTH_>(StreamId);
- switch(MsgId)
- {
- case ID_GS:
- return STREAM_ID_FIRST_ <= StreamId && StreamId < STREAM_ID_LAST_;
- case ID_GS_DONE:
- return (OpId == OP_GS_NOP)?
- (StreamId == STREAM_ID_NONE_) :
- (STREAM_ID_FIRST_ <= StreamId && StreamId < STREAM_ID_LAST_);
- default:
- return StreamId == STREAM_ID_NONE_;
+ if (!isGFX11Plus(STI)) {
+ switch (MsgId) {
+ case ID_GS_PreGFX11:
+ return STREAM_ID_FIRST_ <= StreamId && StreamId < STREAM_ID_LAST_;
+ case ID_GS_DONE_PreGFX11:
+ return (OpId == OP_GS_NOP) ?
+ (StreamId == STREAM_ID_NONE_) :
+ (STREAM_ID_FIRST_ <= StreamId && StreamId < STREAM_ID_LAST_);
+ }
}
+ return StreamId == STREAM_ID_NONE_;
}
-bool msgRequiresOp(int64_t MsgId) {
- return MsgId == ID_GS || MsgId == ID_GS_DONE || MsgId == ID_SYSMSG;
+bool msgRequiresOp(int64_t MsgId, const MCSubtargetInfo &STI) {
+ return MsgId == ID_SYSMSG ||
+ (!isGFX11Plus(STI) &&
+ (MsgId == ID_GS_PreGFX11 || MsgId == ID_GS_DONE_PreGFX11));
}
-bool msgSupportsStream(int64_t MsgId, int64_t OpId) {
- return (MsgId == ID_GS || MsgId == ID_GS_DONE) && OpId != OP_GS_NOP;
+bool msgSupportsStream(int64_t MsgId, int64_t OpId,
+ const MCSubtargetInfo &STI) {
+ return !isGFX11Plus(STI) &&
+ (MsgId == ID_GS_PreGFX11 || MsgId == ID_GS_DONE_PreGFX11) &&
+ OpId != OP_GS_NOP;
}
-void decodeMsg(unsigned Val,
- uint16_t &MsgId,
- uint16_t &OpId,
- uint16_t &StreamId) {
- MsgId = Val & ID_MASK_;
- OpId = (Val & OP_MASK_) >> OP_SHIFT_;
- StreamId = (Val & STREAM_ID_MASK_) >> STREAM_ID_SHIFT_;
+void decodeMsg(unsigned Val, uint16_t &MsgId, uint16_t &OpId,
+ uint16_t &StreamId, const MCSubtargetInfo &STI) {
+ MsgId = Val & getMsgIdMask(STI);
+ if (isGFX11Plus(STI)) {
+ OpId = 0;
+ StreamId = 0;
+ } else {
+ OpId = (Val & OP_MASK_) >> OP_SHIFT_;
+ StreamId = (Val & STREAM_ID_MASK_) >> STREAM_ID_SHIFT_;
+ }
}
uint64_t encodeMsg(uint64_t MsgId,
uint64_t OpId,
uint64_t StreamId) {
- return (MsgId << ID_SHIFT_) |
- (OpId << OP_SHIFT_) |
- (StreamId << STREAM_ID_SHIFT_);
+ return MsgId | (OpId << OP_SHIFT_) | (StreamId << STREAM_ID_SHIFT_);
}
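The simplified encodeMsg relies on the message id already occupying the low bits, so no explicit ID_SHIFT_ is needed. A hedged sketch of the packing, assuming the pre-GFX11 layout of id in bits [3:0], op in [6:4], and stream id in [9:8] (the real shift constants live in SIDefines.h):

    #include <cassert>
    #include <cstdint>

    // Assumed field positions; the in-tree constants may differ.
    static uint64_t encodeMsgSketch(uint64_t MsgId, uint64_t OpId,
                                    uint64_t StreamId) {
      const unsigned OpShift = 4, StreamShift = 8; // assumption
      return MsgId | (OpId << OpShift) | (StreamId << StreamShift);
    }

    int main() {
      // msg id 2, op 1, stream 3 -> 0b11_001_0010 = 0x312.
      assert(encodeMsgSketch(2, 1, 3) == 0x312);
      return 0;
    }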
} // namespace SendMsg
@@ -1427,6 +1676,10 @@ bool isModuleEntryFunctionCC(CallingConv::ID CC) {
}
}
+bool isKernelCC(const Function *Func) {
+ return AMDGPU::isModuleEntryFunctionCC(Func->getCallingConv());
+}
+
bool hasXNACK(const MCSubtargetInfo &STI) {
return STI.getFeatureBits()[AMDGPU::FeatureXNACK];
}
@@ -1448,7 +1701,8 @@ bool hasG16(const MCSubtargetInfo &STI) {
}
bool hasPackedD16(const MCSubtargetInfo &STI) {
- return !STI.getFeatureBits()[AMDGPU::FeatureUnpackedD16VMem];
+ return !STI.getFeatureBits()[AMDGPU::FeatureUnpackedD16VMem] && !isCI(STI) &&
+ !isSI(STI);
}
bool isSI(const MCSubtargetInfo &STI) {
@@ -1467,6 +1721,18 @@ bool isGFX9(const MCSubtargetInfo &STI) {
return STI.getFeatureBits()[AMDGPU::FeatureGFX9];
}
+bool isGFX9_GFX10(const MCSubtargetInfo &STI) {
+ return isGFX9(STI) || isGFX10(STI);
+}
+
+bool isGFX8_GFX9_GFX10(const MCSubtargetInfo &STI) {
+ return isVI(STI) || isGFX9(STI) || isGFX10(STI);
+}
+
+bool isGFX8Plus(const MCSubtargetInfo &STI) {
+ return isVI(STI) || isGFX9Plus(STI);
+}
+
bool isGFX9Plus(const MCSubtargetInfo &STI) {
return isGFX9(STI) || isGFX10Plus(STI);
}
@@ -1475,7 +1741,29 @@ bool isGFX10(const MCSubtargetInfo &STI) {
return STI.getFeatureBits()[AMDGPU::FeatureGFX10];
}
-bool isGFX10Plus(const MCSubtargetInfo &STI) { return isGFX10(STI); }
+bool isGFX10Plus(const MCSubtargetInfo &STI) {
+ return isGFX10(STI) || isGFX11Plus(STI);
+}
+
+bool isGFX11(const MCSubtargetInfo &STI) {
+ return STI.getFeatureBits()[AMDGPU::FeatureGFX11];
+}
+
+bool isGFX11Plus(const MCSubtargetInfo &STI) {
+ return isGFX11(STI);
+}
+
+bool isNotGFX11Plus(const MCSubtargetInfo &STI) {
+ return !isGFX11Plus(STI);
+}
+
+bool isNotGFX10Plus(const MCSubtargetInfo &STI) {
+ return isSI(STI) || isCI(STI) || isVI(STI) || isGFX9(STI);
+}
+
+bool isGFX10Before1030(const MCSubtargetInfo &STI) {
+ return isGFX10(STI) && !AMDGPU::isGFX10_BEncoding(STI);
+}
bool isGCN3Encoding(const MCSubtargetInfo &STI) {
return STI.getFeatureBits()[AMDGPU::FeatureGCN3Encoding];
@@ -1497,10 +1785,29 @@ bool isGFX90A(const MCSubtargetInfo &STI) {
return STI.getFeatureBits()[AMDGPU::FeatureGFX90AInsts];
}
+bool isGFX940(const MCSubtargetInfo &STI) {
+ return STI.getFeatureBits()[AMDGPU::FeatureGFX940Insts];
+}
+
bool hasArchitectedFlatScratch(const MCSubtargetInfo &STI) {
return STI.getFeatureBits()[AMDGPU::FeatureArchitectedFlatScratch];
}
+bool hasMAIInsts(const MCSubtargetInfo &STI) {
+ return STI.getFeatureBits()[AMDGPU::FeatureMAIInsts];
+}
+
+bool hasVOPD(const MCSubtargetInfo &STI) {
+ return STI.getFeatureBits()[AMDGPU::FeatureVOPD];
+}
+
+int32_t getTotalNumVGPRs(bool has90AInsts, int32_t ArgNumAGPR,
+ int32_t ArgNumVGPR) {
+ if (has90AInsts && ArgNumAGPR)
+ return alignTo(ArgNumVGPR, 4) + ArgNumAGPR;
+ return std::max(ArgNumVGPR, ArgNumAGPR);
+}
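The accounting rule above reflects that on gfx90a-style targets the AGPR block starts on a 4-register boundary after the VGPRs, while older targets effectively share one count. A standalone sketch of the same arithmetic (illustrative only):

    #include <algorithm>
    #include <cassert>
    #include <cstdint>

    static int32_t totalNumVGPRs(bool Has90AInsts, int32_t NumAGPR,
                                 int32_t NumVGPR) {
      auto AlignTo4 = [](int32_t X) { return (X + 3) / 4 * 4; };
      if (Has90AInsts && NumAGPR)
        return AlignTo4(NumVGPR) + NumAGPR;
      return std::max(NumVGPR, NumAGPR);
    }

    int main() {
      assert(totalNumVGPRs(true, 3, 5) == 11); // 5 rounded up to 8, plus 3 AGPRs
      assert(totalNumVGPRs(false, 3, 5) == 5); // shared file: max(5, 3)
      return 0;
    }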
+
bool isSGPR(unsigned Reg, const MCRegisterInfo* TRI) {
const MCRegisterClass SGPRClass = TRI->getRegClass(AMDGPU::SReg_32RegClassID);
const unsigned FirstSubReg = TRI->getSubReg(Reg, AMDGPU::sub0);
@@ -1508,13 +1815,6 @@ bool isSGPR(unsigned Reg, const MCRegisterInfo* TRI) {
Reg == AMDGPU::SCC;
}
-bool isRegIntersect(unsigned Reg0, unsigned Reg1, const MCRegisterInfo* TRI) {
- for (MCRegAliasIterator R(Reg0, TRI, true); R.isValid(); ++R) {
- if (*R == Reg1) return true;
- }
- return false;
-}
-
#define MAP_REG2REG \
using namespace AMDGPU; \
switch(Reg) { \
@@ -1554,6 +1854,9 @@ bool isRegIntersect(unsigned Reg0, unsigned Reg1, const MCRegisterInfo* TRI) {
CASE_VI_GFX9PLUS(TTMP4_TTMP5_TTMP6_TTMP7_TTMP8_TTMP9_TTMP10_TTMP11) \
CASE_VI_GFX9PLUS(TTMP8_TTMP9_TTMP10_TTMP11_TTMP12_TTMP13_TTMP14_TTMP15) \
CASE_VI_GFX9PLUS(TTMP0_TTMP1_TTMP2_TTMP3_TTMP4_TTMP5_TTMP6_TTMP7_TTMP8_TTMP9_TTMP10_TTMP11_TTMP12_TTMP13_TTMP14_TTMP15) \
+ CASE_GFXPRE11_GFX11PLUS(M0) \
+ CASE_GFXPRE11_GFX11PLUS(SGPR_NULL) \
+ CASE_GFXPRE11_GFX11PLUS_TO(SGPR_NULL64, SGPR_NULL) \
}
#define CASE_CI_VI(node) \
@@ -1563,6 +1866,12 @@ bool isRegIntersect(unsigned Reg0, unsigned Reg1, const MCRegisterInfo* TRI) {
#define CASE_VI_GFX9PLUS(node) \
case node: return isGFX9Plus(STI) ? node##_gfx9plus : node##_vi;
+#define CASE_GFXPRE11_GFX11PLUS(node) \
+ case node: return isGFX11Plus(STI) ? node##_gfx11plus : node##_gfxpre11;
+
+#define CASE_GFXPRE11_GFX11PLUS_TO(node, result) \
+ case node: return isGFX11Plus(STI) ? result##_gfx11plus : result##_gfxpre11;
+
unsigned getMCReg(unsigned Reg, const MCSubtargetInfo &STI) {
if (STI.getTargetTriple().getArch() == Triple::r600)
return Reg;
@@ -1571,9 +1880,13 @@ unsigned getMCReg(unsigned Reg, const MCSubtargetInfo &STI) {
#undef CASE_CI_VI
#undef CASE_VI_GFX9PLUS
+#undef CASE_GFXPRE11_GFX11PLUS
+#undef CASE_GFXPRE11_GFX11PLUS_TO
#define CASE_CI_VI(node) case node##_ci: case node##_vi: return node;
#define CASE_VI_GFX9PLUS(node) case node##_vi: case node##_gfx9plus: return node;
+#define CASE_GFXPRE11_GFX11PLUS(node) case node##_gfx11plus: case node##_gfxpre11: return node;
+#define CASE_GFXPRE11_GFX11PLUS_TO(node, result)
unsigned mc2PseudoReg(unsigned Reg) {
MAP_REG2REG
@@ -1581,6 +1894,8 @@ unsigned mc2PseudoReg(unsigned Reg) {
#undef CASE_CI_VI
#undef CASE_VI_GFX9PLUS
+#undef CASE_GFXPRE11_GFX11PLUS
+#undef CASE_GFXPRE11_GFX11PLUS_TO
#undef MAP_REG2REG
bool isSISrcOperand(const MCInstrDesc &Desc, unsigned OpNo) {
@@ -1934,7 +2249,7 @@ Optional<int64_t> getSMRDEncodedLiteralOffset32(const MCSubtargetInfo &ST,
}
unsigned getNumFlatOffsetBits(const MCSubtargetInfo &ST, bool Signed) {
- // Address offset is 12-bit signed for GFX10, 13-bit for GFX9.
+ // Address offset is 12-bit signed for GFX10, 13-bit for GFX9 and GFX11+.
if (AMDGPU::isGFX10(ST))
return Signed ? 12 : 11;
@@ -2029,7 +2344,8 @@ const SourceOfDivergence *lookupSourceOfDivergence(unsigned Intr);
#define GET_SourcesOfDivergence_IMPL
#define GET_Gfx9BufferFormat_IMPL
-#define GET_Gfx10PlusBufferFormat_IMPL
+#define GET_Gfx10BufferFormat_IMPL
+#define GET_Gfx11PlusBufferFormat_IMPL
#include "AMDGPUGenSearchableTables.inc"
} // end anonymous namespace
@@ -2042,16 +2358,20 @@ const GcnBufferFormatInfo *getGcnBufferFormatInfo(uint8_t BitsPerComp,
uint8_t NumComponents,
uint8_t NumFormat,
const MCSubtargetInfo &STI) {
- return isGFX10Plus(STI)
- ? getGfx10PlusBufferFormatInfo(BitsPerComp, NumComponents,
+ return isGFX11Plus(STI)
+ ? getGfx11PlusBufferFormatInfo(BitsPerComp, NumComponents,
NumFormat)
- : getGfx9BufferFormatInfo(BitsPerComp, NumComponents, NumFormat);
+ : isGFX10(STI) ? getGfx10BufferFormatInfo(BitsPerComp,
+ NumComponents, NumFormat)
+ : getGfx9BufferFormatInfo(BitsPerComp,
+ NumComponents, NumFormat);
}
const GcnBufferFormatInfo *getGcnBufferFormatInfo(uint8_t Format,
const MCSubtargetInfo &STI) {
- return isGFX10Plus(STI) ? getGfx10PlusBufferFormatInfo(Format)
- : getGfx9BufferFormatInfo(Format);
+ return isGFX11Plus(STI) ? getGfx11PlusBufferFormatInfo(Format)
+ : isGFX10(STI) ? getGfx10BufferFormatInfo(Format)
+ : getGfx9BufferFormatInfo(Format);
}
} // namespace AMDGPU
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index 4516b511f3c8..dffeec10a14a 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -50,10 +50,19 @@ bool isHsaAbiVersion4(const MCSubtargetInfo *STI);
/// \returns True if HSA OS ABI Version identification is 5,
/// false otherwise.
bool isHsaAbiVersion5(const MCSubtargetInfo *STI);
-/// \returns True if HSA OS ABI Version identification is 3 or 4,
+/// \returns True if HSA OS ABI Version identification is 3 and above,
/// false otherwise.
bool isHsaAbiVersion3AndAbove(const MCSubtargetInfo *STI);
+/// \returns The offset of the multigrid_sync_arg argument from implicitarg_ptr
+unsigned getMultigridSyncArgImplicitArgPosition();
+
+/// \returns The offset of the hostcall pointer argument from implicitarg_ptr
+unsigned getHostcallImplicitArgPosition();
+
+/// \returns Code object version.
+unsigned getAmdhsaCodeObjectVersion();
+
struct GcnBufferFormatInfo {
unsigned Format;
unsigned BitsPerComp;
@@ -62,12 +71,19 @@ struct GcnBufferFormatInfo {
unsigned DataFormat;
};
+struct MAIInstInfo {
+ uint16_t Opcode;
+ bool is_dgemm;
+ bool is_gfx940_xdl;
+};
+
#define GET_MIMGBaseOpcode_DECL
#define GET_MIMGDim_DECL
#define GET_MIMGEncoding_DECL
#define GET_MIMGLZMapping_DECL
#define GET_MIMGMIPMapping_DECL
#define GET_MIMGBiASMapping_DECL
+#define GET_MAIInstInfoTable_DECL
#include "AMDGPUGenSearchableTables.inc"
namespace IsaInfo {
@@ -352,6 +368,11 @@ struct MIMGG16MappingInfo {
LLVM_READONLY
const MIMGLZMappingInfo *getMIMGLZMappingInfo(unsigned L);
+struct WMMAOpcodeMappingInfo {
+ unsigned Opcode2Addr;
+ unsigned Opcode3Addr;
+};
+
LLVM_READONLY
const MIMGMIPMappingInfo *getMIMGMIPMappingInfo(unsigned MIP);
@@ -382,6 +403,7 @@ struct MIMGInfo {
uint8_t MIMGEncoding;
uint8_t VDataDwords;
uint8_t VAddrDwords;
+ uint8_t VAddrOperands;
};
LLVM_READONLY
@@ -439,6 +461,16 @@ LLVM_READONLY
bool getVOP3IsSingle(unsigned Opc);
LLVM_READONLY
+bool isVOPC64DPP(unsigned Opc);
+
+/// Returns true if the MAI operation is a double-precision GEMM.
+LLVM_READONLY
+bool getMAIIsDGEMM(unsigned Opc);
+
+LLVM_READONLY
+bool getMAIIsGFX940XDL(unsigned Opc);
+
+LLVM_READONLY
const GcnBufferFormatInfo *getGcnBufferFormatInfo(uint8_t BitsPerComp,
uint8_t NumComponents,
uint8_t NumFormat,
@@ -450,6 +482,12 @@ const GcnBufferFormatInfo *getGcnBufferFormatInfo(uint8_t Format,
LLVM_READONLY
int getMCOpcode(uint16_t Opcode, unsigned Gen);
+LLVM_READONLY
+unsigned mapWMMA2AddrTo3AddrOpcode(unsigned Opc);
+
+LLVM_READONLY
+unsigned mapWMMA3AddrTo2AddrOpcode(unsigned Opc);
+
void initDefaultAMDKernelCodeT(amd_kernel_code_t &Header,
const MCSubtargetInfo *STI);
@@ -496,7 +534,7 @@ struct Waitcnt {
unsigned LgkmCnt = ~0u;
unsigned VsCnt = ~0u;
- Waitcnt() {}
+ Waitcnt() = default;
Waitcnt(unsigned VmCnt, unsigned ExpCnt, unsigned LgkmCnt, unsigned VsCnt)
: VmCnt(VmCnt), ExpCnt(ExpCnt), LgkmCnt(LgkmCnt), VsCnt(VsCnt) {}
@@ -555,11 +593,14 @@ unsigned decodeLgkmcnt(const IsaVersion &Version, unsigned Waitcnt);
/// \p Lgkmcnt respectively.
///
/// \details \p Vmcnt, \p Expcnt and \p Lgkmcnt are decoded as follows:
-/// \p Vmcnt = \p Waitcnt[3:0] (pre-gfx9 only)
-/// \p Vmcnt = \p Waitcnt[3:0] | \p Waitcnt[15:14] (gfx9+ only)
-/// \p Expcnt = \p Waitcnt[6:4]
-/// \p Lgkmcnt = \p Waitcnt[11:8] (pre-gfx10 only)
-/// \p Lgkmcnt = \p Waitcnt[13:8] (gfx10+ only)
+/// \p Vmcnt = \p Waitcnt[3:0] (pre-gfx9)
+/// \p Vmcnt = \p Waitcnt[15:14,3:0] (gfx9,10)
+/// \p Vmcnt = \p Waitcnt[15:10] (gfx11+)
+/// \p Expcnt = \p Waitcnt[6:4] (pre-gfx11)
+/// \p Expcnt = \p Waitcnt[2:0] (gfx11+)
+/// \p Lgkmcnt = \p Waitcnt[11:8] (pre-gfx10)
+/// \p Lgkmcnt = \p Waitcnt[13:8] (gfx10)
+/// \p Lgkmcnt = \p Waitcnt[9:4] (gfx11+)
void decodeWaitcnt(const IsaVersion &Version, unsigned Waitcnt,
unsigned &Vmcnt, unsigned &Expcnt, unsigned &Lgkmcnt);
@@ -581,12 +622,15 @@ unsigned encodeLgkmcnt(const IsaVersion &Version, unsigned Waitcnt,
/// \p Version.
///
/// \details \p Vmcnt, \p Expcnt and \p Lgkmcnt are encoded as follows:
-/// Waitcnt[3:0] = \p Vmcnt (pre-gfx9 only)
-/// Waitcnt[3:0] = \p Vmcnt[3:0] (gfx9+ only)
-/// Waitcnt[6:4] = \p Expcnt
-/// Waitcnt[11:8] = \p Lgkmcnt (pre-gfx10 only)
-/// Waitcnt[13:8] = \p Lgkmcnt (gfx10+ only)
-/// Waitcnt[15:14] = \p Vmcnt[5:4] (gfx9+ only)
+/// Waitcnt[2:0] = \p Expcnt (gfx11+)
+/// Waitcnt[3:0] = \p Vmcnt (pre-gfx9)
+/// Waitcnt[3:0] = \p Vmcnt[3:0] (gfx9,10)
+/// Waitcnt[6:4] = \p Expcnt (pre-gfx11)
+/// Waitcnt[9:4] = \p Lgkmcnt (gfx11+)
+/// Waitcnt[11:8] = \p Lgkmcnt (pre-gfx10)
+/// Waitcnt[13:8] = \p Lgkmcnt (gfx10)
+/// Waitcnt[15:10] = \p Vmcnt (gfx11+)
+/// Waitcnt[15:14] = \p Vmcnt[5:4] (gfx9,10)
///
/// \returns Waitcnt with encoded \p Vmcnt, \p Expcnt and \p Lgkmcnt for given
/// isa \p Version.
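As a worked example of the gfx11+ layout documented above, a hedged standalone packing of the three counters (constants written out by hand; the in-tree encodeWaitcnt derives them from the IsaVersion):

    #include <cassert>

    // gfx11+ waitcnt layout: Expcnt in [2:0], Lgkmcnt in [9:4], Vmcnt in [15:10].
    static unsigned encodeWaitcntGfx11(unsigned Vmcnt, unsigned Expcnt,
                                       unsigned Lgkmcnt) {
      return (Expcnt & 0x7) | ((Lgkmcnt & 0x3f) << 4) | ((Vmcnt & 0x3f) << 10);
    }

    int main() {
      // vmcnt(3) expcnt(7) lgkmcnt(5) -> 0b000011_000101_0_111 = 0x0C57.
      assert(encodeWaitcntGfx11(3, 7, 5) == 0x0C57);
      return 0;
    }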
@@ -598,10 +642,7 @@ unsigned encodeWaitcnt(const IsaVersion &Version, const Waitcnt &Decoded);
namespace Hwreg {
LLVM_READONLY
-int64_t getHwregId(const StringRef Name);
-
-LLVM_READNONE
-bool isValidHwreg(int64_t Id, const MCSubtargetInfo &STI);
+int64_t getHwregId(const StringRef Name, const MCSubtargetInfo &STI);
LLVM_READNONE
bool isValidHwreg(int64_t Id);
@@ -622,6 +663,18 @@ void decodeHwreg(unsigned Val, unsigned &Id, unsigned &Offset, unsigned &Width);
} // namespace Hwreg
+namespace DepCtr {
+
+int getDefaultDepCtrEncoding(const MCSubtargetInfo &STI);
+int encodeDepCtr(const StringRef Name, int64_t Val, unsigned &UsedOprMask,
+ const MCSubtargetInfo &STI);
+bool isSymbolicDepCtrEncoding(unsigned Code, bool &HasNonDefaultVal,
+ const MCSubtargetInfo &STI);
+bool decodeDepCtr(unsigned Code, int &Id, StringRef &Name, unsigned &Val,
+ bool &IsDefault, const MCSubtargetInfo &STI);
+
+} // namespace DepCtr
+
namespace Exp {
bool getTgtName(unsigned Id, StringRef &Name, int &Index);
@@ -653,13 +706,14 @@ bool isValidDfmtNfmt(unsigned Val, const MCSubtargetInfo &STI);
bool isValidNfmt(unsigned Val, const MCSubtargetInfo &STI);
-int64_t getUnifiedFormat(const StringRef Name);
+int64_t getUnifiedFormat(const StringRef Name, const MCSubtargetInfo &STI);
-StringRef getUnifiedFormatName(unsigned Id);
+StringRef getUnifiedFormatName(unsigned Id, const MCSubtargetInfo &STI);
-bool isValidUnifiedFormat(unsigned Val);
+bool isValidUnifiedFormat(unsigned Val, const MCSubtargetInfo &STI);
-int64_t convertDfmtNfmt2Ufmt(unsigned Dfmt, unsigned Nfmt);
+int64_t convertDfmtNfmt2Ufmt(unsigned Dfmt, unsigned Nfmt,
+ const MCSubtargetInfo &STI);
bool isValidFormatEncoding(unsigned Val, const MCSubtargetInfo &STI);
@@ -670,19 +724,19 @@ unsigned getDefaultFormatEncoding(const MCSubtargetInfo &STI);
namespace SendMsg {
LLVM_READONLY
-int64_t getMsgId(const StringRef Name);
+int64_t getMsgId(const StringRef Name, const MCSubtargetInfo &STI);
LLVM_READONLY
int64_t getMsgOpId(int64_t MsgId, const StringRef Name);
LLVM_READNONE
-StringRef getMsgName(int64_t MsgId);
+StringRef getMsgName(int64_t MsgId, const MCSubtargetInfo &STI);
LLVM_READNONE
-StringRef getMsgOpName(int64_t MsgId, int64_t OpId);
+StringRef getMsgOpName(int64_t MsgId, int64_t OpId, const MCSubtargetInfo &STI);
LLVM_READNONE
-bool isValidMsgId(int64_t MsgId, const MCSubtargetInfo &STI, bool Strict = true);
+bool isValidMsgId(int64_t MsgId, const MCSubtargetInfo &STI);
LLVM_READNONE
bool isValidMsgOp(int64_t MsgId, int64_t OpId, const MCSubtargetInfo &STI,
@@ -693,15 +747,13 @@ bool isValidMsgStream(int64_t MsgId, int64_t OpId, int64_t StreamId,
const MCSubtargetInfo &STI, bool Strict = true);
LLVM_READNONE
-bool msgRequiresOp(int64_t MsgId);
+bool msgRequiresOp(int64_t MsgId, const MCSubtargetInfo &STI);
LLVM_READNONE
-bool msgSupportsStream(int64_t MsgId, int64_t OpId);
+bool msgSupportsStream(int64_t MsgId, int64_t OpId, const MCSubtargetInfo &STI);
-void decodeMsg(unsigned Val,
- uint16_t &MsgId,
- uint16_t &OpId,
- uint16_t &StreamId);
+void decodeMsg(unsigned Val, uint16_t &MsgId, uint16_t &OpId,
+ uint16_t &StreamId, const MCSubtargetInfo &STI);
LLVM_READNONE
uint64_t encodeMsg(uint64_t MsgId,
@@ -738,6 +790,8 @@ bool isEntryFunctionCC(CallingConv::ID CC);
LLVM_READNONE
bool isModuleEntryFunctionCC(CallingConv::ID CC);
+bool isKernelCC(const Function *Func);
+
// FIXME: Remove this when calling conventions cleaned up
LLVM_READNONE
inline bool isKernel(CallingConv::ID CC) {
@@ -761,22 +815,31 @@ bool isSI(const MCSubtargetInfo &STI);
bool isCI(const MCSubtargetInfo &STI);
bool isVI(const MCSubtargetInfo &STI);
bool isGFX9(const MCSubtargetInfo &STI);
+bool isGFX9_GFX10(const MCSubtargetInfo &STI);
+bool isGFX8_GFX9_GFX10(const MCSubtargetInfo &STI);
+bool isGFX8Plus(const MCSubtargetInfo &STI);
bool isGFX9Plus(const MCSubtargetInfo &STI);
bool isGFX10(const MCSubtargetInfo &STI);
bool isGFX10Plus(const MCSubtargetInfo &STI);
+bool isNotGFX10Plus(const MCSubtargetInfo &STI);
+bool isGFX10Before1030(const MCSubtargetInfo &STI);
+bool isGFX11(const MCSubtargetInfo &STI);
+bool isGFX11Plus(const MCSubtargetInfo &STI);
+bool isNotGFX11Plus(const MCSubtargetInfo &STI);
bool isGCN3Encoding(const MCSubtargetInfo &STI);
bool isGFX10_AEncoding(const MCSubtargetInfo &STI);
bool isGFX10_BEncoding(const MCSubtargetInfo &STI);
bool hasGFX10_3Insts(const MCSubtargetInfo &STI);
bool isGFX90A(const MCSubtargetInfo &STI);
+bool isGFX940(const MCSubtargetInfo &STI);
bool hasArchitectedFlatScratch(const MCSubtargetInfo &STI);
+bool hasMAIInsts(const MCSubtargetInfo &STI);
+bool hasVOPD(const MCSubtargetInfo &STI);
+int getTotalNumVGPRs(bool has90AInsts, int32_t ArgNumAGPR, int32_t ArgNumVGPR);
/// Is Reg - scalar register
bool isSGPR(unsigned Reg, const MCRegisterInfo* TRI);
-/// Is there any intersection between registers
-bool isRegIntersect(unsigned Reg0, unsigned Reg1, const MCRegisterInfo* TRI);
-
/// If \p Reg is a pseudo reg, return the correct hardware register given
/// \p STI otherwise return \p Reg.
unsigned getMCReg(unsigned Reg, const MCSubtargetInfo &STI);
@@ -931,7 +994,7 @@ inline bool isLegal64BitDPPControl(unsigned DC) {
/// \returns true if the intrinsic is divergent
bool isIntrinsicSourceOfDivergence(unsigned IntrID);
-// Track defaults for fields in the MODE registser.
+// Track defaults for fields in the MODE register.
struct SIModeRegisterDefaults {
/// Floating point opcodes that support exception flag gathering quiet and
/// propagate signaling NaN inputs per IEEE 754-2008. Min_dx10 and max_dx10
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.h
deleted file mode 100644
index 83ef68cc3f60..000000000000
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.h
+++ /dev/null
@@ -1,38 +0,0 @@
-//===- AMDGPULDSUtils.h - LDS related helper functions -*- C++ -*----------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// AMDGPU LDS related helper utility functions.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPULDSUTILS_H
-#define LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPULDSUTILS_H
-
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/IR/Constants.h"
-
-namespace llvm {
-
-class ConstantExpr;
-
-namespace AMDGPU {
-
-bool isKernelCC(const Function *Func);
-
-Align getAlign(DataLayout const &DL, const GlobalVariable *GV);
-
-std::vector<GlobalVariable *> findVariablesToLower(Module &M,
- const Function *F = nullptr);
-
-/// Replace all uses of constant \p C with instructions in \p F.
-void replaceConstantUsesInFunction(ConstantExpr *C, const Function *F);
-} // end namespace AMDGPU
-
-} // end namespace llvm
-
-#endif // LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPULDSUTILS_H
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.cpp
index a83ff6667956..83d7cbdb183c 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.cpp
@@ -1,33 +1,32 @@
-//===- AMDGPULDSUtils.cpp -------------------------------------------------===//
+//===-- AMDGPUMemoryUtils.cpp - Memory related helper functions -----------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
-//
-// AMDGPU LDS related helper utility functions.
-//
-//===----------------------------------------------------------------------===//
-#include "AMDGPULDSUtils.h"
+#include "AMDGPUMemoryUtils.h"
#include "AMDGPU.h"
-#include "Utils/AMDGPUBaseInfo.h"
-#include "llvm/ADT/DepthFirstIterator.h"
+#include "AMDGPUBaseInfo.h"
#include "llvm/ADT/SetVector.h"
-#include "llvm/IR/Constants.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/MemorySSA.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/ReplaceConstant.h"
+#define DEBUG_TYPE "amdgpu-memory-utils"
+
using namespace llvm;
namespace llvm {
namespace AMDGPU {
-bool isKernelCC(const Function *Func) {
- return AMDGPU::isModuleEntryFunctionCC(Func->getCallingConv());
-}
-
Align getAlign(DataLayout const &DL, const GlobalVariable *GV) {
return DL.getValueOrABITypeAlignment(GV->getPointerAlignment(DL),
GV->getValueType());
@@ -139,6 +138,83 @@ std::vector<GlobalVariable *> findVariablesToLower(Module &M,
return LocalVars;
}
+bool isReallyAClobber(const Value *Ptr, MemoryDef *Def, AAResults *AA) {
+ Instruction *DefInst = Def->getMemoryInst();
+
+ if (isa<FenceInst>(DefInst))
+ return false;
+
+ if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(DefInst)) {
+ switch (II->getIntrinsicID()) {
+ case Intrinsic::amdgcn_s_barrier:
+ case Intrinsic::amdgcn_wave_barrier:
+ case Intrinsic::amdgcn_sched_barrier:
+ return false;
+ default:
+ break;
+ }
+ }
+
+ // Ignore atomics that do not alias the original load; any atomic is a
+ // universal MemoryDef from MSSA's point of view, just like a fence.
+ const auto checkNoAlias = [AA, Ptr](auto I) -> bool {
+ return I && AA->isNoAlias(I->getPointerOperand(), Ptr);
+ };
+
+ if (checkNoAlias(dyn_cast<AtomicCmpXchgInst>(DefInst)) ||
+ checkNoAlias(dyn_cast<AtomicRMWInst>(DefInst)))
+ return false;
+
+ return true;
+}
+
+bool isClobberedInFunction(const LoadInst *Load, MemorySSA *MSSA,
+ AAResults *AA) {
+ MemorySSAWalker *Walker = MSSA->getWalker();
+ SmallVector<MemoryAccess *> WorkList{Walker->getClobberingMemoryAccess(Load)};
+ SmallSet<MemoryAccess *, 8> Visited;
+ MemoryLocation Loc(MemoryLocation::get(Load));
+
+ LLVM_DEBUG(dbgs() << "Checking clobbering of: " << *Load << '\n');
+
+ // Start with the nearest dominating clobbering access. It will be either
+ // live on entry (nothing to do, the load is not clobbered), a MemoryDef, or
+ // a MemoryPhi if several MemoryDefs can define this memory state. In that
+ // case add all Defs to the WorkList and keep walking up, checking all the
+ // definitions of this memory location until the root. When all the defs are
+ // exhausted and we have reached the entry state, there is no clobber.
+ // Along the scan, ignore barriers and fences, which MemorySSA considers
+ // clobbers but which do not really write anything into memory.
+ while (!WorkList.empty()) {
+ MemoryAccess *MA = WorkList.pop_back_val();
+ if (!Visited.insert(MA).second)
+ continue;
+
+ if (MSSA->isLiveOnEntryDef(MA))
+ continue;
+
+ if (MemoryDef *Def = dyn_cast<MemoryDef>(MA)) {
+ LLVM_DEBUG(dbgs() << " Def: " << *Def->getMemoryInst() << '\n');
+
+ if (isReallyAClobber(Load->getPointerOperand(), Def, AA)) {
+ LLVM_DEBUG(dbgs() << " -> load is clobbered\n");
+ return true;
+ }
+
+ WorkList.push_back(
+ Walker->getClobberingMemoryAccess(Def->getDefiningAccess(), Loc));
+ continue;
+ }
+
+ const MemoryPhi *Phi = cast<MemoryPhi>(MA);
+ for (auto &Use : Phi->incoming_values())
+ WorkList.push_back(cast<MemoryAccess>(&Use));
+ }
+
+ LLVM_DEBUG(dbgs() << " -> no clobber\n");
+ return false;
+}
+
} // end namespace AMDGPU
} // end namespace llvm
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.h
new file mode 100644
index 000000000000..65ed02ca62de
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.h
@@ -0,0 +1,51 @@
+//===- AMDGPUMemoryUtils.h - Memory related helper functions -*- C++ -*----===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPUMEMORYUTILS_H
+#define LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPUMEMORYUTILS_H
+
+#include <vector>
+
+namespace llvm {
+
+struct Align;
+class AAResults;
+class ConstantExpr;
+class DataLayout;
+class Function;
+class GlobalVariable;
+class LoadInst;
+class MemoryDef;
+class MemorySSA;
+class Module;
+class Value;
+
+namespace AMDGPU {
+
+Align getAlign(DataLayout const &DL, const GlobalVariable *GV);
+
+std::vector<GlobalVariable *> findVariablesToLower(Module &M,
+ const Function *F = nullptr);
+
+/// Replace all uses of constant \p C with instructions in \p F.
+void replaceConstantUsesInFunction(ConstantExpr *C, const Function *F);
+
+/// Given a \p Def clobbering a load from \p Ptr according to MemorySSA, check
+/// whether this is actually a memory update or an artificial clobber that only
+/// enforces ordering constraints.
+bool isReallyAClobber(const Value *Ptr, MemoryDef *Def, AAResults *AA);
+
+/// Check if a \p Load is clobbered in its function.
+bool isClobberedInFunction(const LoadInst *Load, MemorySSA *MSSA,
+ AAResults *AA);
+
+} // end namespace AMDGPU
+
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPUMEMORYUTILS_H
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp
index f6b5975f1934..4ad93f7b0b68 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp
@@ -209,6 +209,11 @@ void AMDGPUPALMetadata::setNumUsedVgprs(CallingConv::ID CC, unsigned Val) {
getHwStage(CC)[".vgpr_count"] = MsgPackDoc.getNode(Val);
}
+// Set the number of used agprs in the metadata.
+void AMDGPUPALMetadata::setNumUsedAgprs(CallingConv::ID CC, unsigned Val) {
+ getHwStage(CC)[".agpr_count"] = Val;
+}
+
// Set the number of used sgprs in the metadata. This is an optional advisory
// record for logging etc; wave dispatch actually uses the rsrc1 register for
// the shader stage to determine the number of sgprs to allocate.
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.h
index 7fdd9a8429c1..a45a799e38a9 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.h
@@ -69,6 +69,10 @@ public:
// the shader stage to determine the number of vgprs to allocate.
void setNumUsedVgprs(unsigned CC, unsigned Val);
+ // Set the number of used agprs in the metadata. This is an optional advisory
+ // record for logging etc.
+ void setNumUsedAgprs(unsigned CC, unsigned Val);
+
// Set the number of used sgprs in the metadata. This is an optional advisory
// record for logging etc; wave dispatch actually uses the rsrc1 register for
// the shader stage to determine the number of sgprs to allocate.
diff --git a/llvm/lib/Target/AMDGPU/VIInstrFormats.td b/llvm/lib/Target/AMDGPU/VIInstrFormats.td
index bd65a495fa72..7393ef6c2a2d 100644
--- a/llvm/lib/Target/AMDGPU/VIInstrFormats.td
+++ b/llvm/lib/Target/AMDGPU/VIInstrFormats.td
@@ -10,7 +10,7 @@
//
//===----------------------------------------------------------------------===//
-class EXPe_vi : EXPe {
+class EXPe_vi : EXPe_ComprVM {
let Inst{31-26} = 0x31; //encoding
}
diff --git a/llvm/lib/Target/AMDGPU/VINTERPInstructions.td b/llvm/lib/Target/AMDGPU/VINTERPInstructions.td
new file mode 100644
index 000000000000..c63fbbc241d9
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/VINTERPInstructions.td
@@ -0,0 +1,180 @@
+//===-- VINTERPInstructions.td - VINTERP Instruction Definitions ----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// VINTERP encoding
+//===----------------------------------------------------------------------===//
+
+class VINTERPe_gfx11 <bits<7> op, VOPProfile P> : Enc64 {
+ bits<8> vdst;
+ bits<4> src0_modifiers;
+ bits<9> src0;
+ bits<3> src1_modifiers;
+ bits<9> src1;
+ bits<3> src2_modifiers;
+ bits<9> src2;
+ bits<1> clamp;
+ bits<3> waitexp;
+
+ let Inst{31-26} = 0x33; // VOP3P encoding
+ let Inst{25-24} = 0x1; // VINTERP sub-encoding
+ let Inst{23} = 0; // reserved
+
+ let Inst{7-0} = vdst;
+ let Inst{10-8} = waitexp;
+ let Inst{11} = !if(P.HasOpSel, src0_modifiers{2}, 0); // op_sel(0)
+ let Inst{12} = !if(P.HasOpSel, src1_modifiers{2}, 0); // op_sel(1)
+ let Inst{13} = !if(P.HasOpSel, src2_modifiers{2}, 0); // op_sel(2)
+ let Inst{14} = !if(P.HasOpSel, src0_modifiers{3}, 0); // op_sel(3)
+ let Inst{15} = clamp;
+ let Inst{22-16} = op;
+ let Inst{40-32} = src0;
+ let Inst{49-41} = src1;
+ let Inst{58-50} = src2;
+ let Inst{61} = src0_modifiers{0}; // neg(0)
+ let Inst{62} = src1_modifiers{0}; // neg(1)
+ let Inst{63} = src2_modifiers{0}; // neg(2)
+}
+
+//===----------------------------------------------------------------------===//
+// VOP3 VINTERP
+//===----------------------------------------------------------------------===//
+
+class VINTERP_Pseudo <string OpName, VOPProfile P, list<dag> pattern = []> :
+ VOP3_Pseudo<OpName, P, pattern, 0, 0> {
+ let AsmMatchConverter = "cvtVINTERP";
+ let mayRaiseFPException = 0;
+
+ let VOP3_OPSEL = 1;
+ let VINTERP = 1;
+}
+
+class VINTERP_Real <VOP_Pseudo ps, int EncodingFamily> :
+ VOP3_Real <ps, EncodingFamily> {
+ let VINTERP = 1;
+}
+
+def VOP3_VINTERP_F32 : VOPProfile<[f32, f32, f32, f32]> {
+ let HasOpSel = 0;
+ let HasModifiers = 1;
+
+ let Outs64 = (outs VGPR_32:$vdst);
+ let Ins64 = (ins Src0Mod:$src0_modifiers, VRegSrc_32:$src0,
+ Src1Mod:$src1_modifiers, VRegSrc_32:$src1,
+ Src2Mod:$src2_modifiers, VRegSrc_32:$src2,
+ clampmod:$clamp,
+ wait_exp:$waitexp);
+
+ let Asm64 = " $vdst, $src0_modifiers, $src1_modifiers, $src2_modifiers$clamp$waitexp";
+}
+
+class VOP3_VINTERP_F16 <list<ValueType> ArgVT> : VOPProfile<ArgVT> {
+ let HasOpSel = 1;
+ let HasModifiers = 1;
+
+ let Outs64 = (outs VGPR_32:$vdst);
+ let Ins64 = (ins Src0Mod:$src0_modifiers, VRegSrc_32:$src0,
+ Src1Mod:$src1_modifiers, VRegSrc_32:$src1,
+ Src2Mod:$src2_modifiers, VRegSrc_32:$src2,
+ clampmod:$clamp, op_sel0:$op_sel,
+ wait_exp:$waitexp);
+
+ let Asm64 = " $vdst, $src0_modifiers, $src1_modifiers, $src2_modifiers$clamp$op_sel$waitexp";
+}
+
+//===----------------------------------------------------------------------===//
+// VINTERP Pseudo Instructions
+//===----------------------------------------------------------------------===//
+
+let SubtargetPredicate = isGFX11Plus in {
+
+let Uses = [M0, EXEC, MODE] in {
+def V_INTERP_P10_F32_inreg : VINTERP_Pseudo <"v_interp_p10_f32", VOP3_VINTERP_F32>;
+def V_INTERP_P2_F32_inreg : VINTERP_Pseudo <"v_interp_p2_f32", VOP3_VINTERP_F32>;
+def V_INTERP_P10_F16_F32_inreg :
+ VINTERP_Pseudo <"v_interp_p10_f16_f32", VOP3_VINTERP_F16<[f32, f32, f32, f32]>>;
+def V_INTERP_P2_F16_F32_inreg :
+ VINTERP_Pseudo <"v_interp_p2_f16_f32", VOP3_VINTERP_F16<[f16, f32, f32, f32]>>;
+} // Uses = [M0, EXEC, MODE]
+
+let Uses = [M0, EXEC] in {
+def V_INTERP_P10_RTZ_F16_F32_inreg :
+ VINTERP_Pseudo <"v_interp_p10_rtz_f16_f32", VOP3_VINTERP_F16<[f32, f32, f32, f32]>>;
+def V_INTERP_P2_RTZ_F16_F32_inreg :
+ VINTERP_Pseudo <"v_interp_p2_rtz_f16_f32", VOP3_VINTERP_F16<[f16, f32, f32, f32]>>;
+} // Uses = [M0, EXEC]
+
+} // SubtargetPredicate = isGFX11Plus
+
+class VInterpF32Pat <SDPatternOperator op, Instruction inst> : GCNPat <
+ (f32 (op
+ (VINTERPMods f32:$src0, i32:$src0_modifiers),
+ (VINTERPMods f32:$src1, i32:$src1_modifiers),
+ (VINTERPMods f32:$src2, i32:$src2_modifiers))),
+ (inst $src0_modifiers, $src0,
+ $src1_modifiers, $src1,
+ $src2_modifiers, $src2,
+ 0, /* clamp */
+ 7) /* wait_exp */
+>;
+
+def VINTERP_OPSEL {
+ int LOW = 0;
+ int HIGH = 0xa;
+}
+
+class VInterpF16Pat <SDPatternOperator op, Instruction inst,
+ ValueType dst_type, bit high,
+ list<ComplexPattern> pat> : GCNPat <
+ (dst_type (op
+ (pat[0] f32:$src0, i32:$src0_modifiers),
+ (pat[1] f32:$src1, i32:$src1_modifiers),
+ (pat[2] f32:$src2, i32:$src2_modifiers),
+ !if(high, (i1 -1), (i1 0)))),
+ (inst $src0_modifiers, $src0,
+ $src1_modifiers, $src1,
+ $src2_modifiers, $src2,
+ 0, /* clamp */
+ /* op_sel = 0 */
+ 7) /* wait_exp */
+>;
+
+multiclass VInterpF16Pat <SDPatternOperator op, Instruction inst,
+ ValueType dst_type, list<ComplexPattern> high_pat> {
+ def : VInterpF16Pat<op, inst, dst_type, 0,
+ [VINTERPMods, VINTERPMods, VINTERPMods]>;
+ def : VInterpF16Pat<op, inst, dst_type, 1, high_pat>;
+}
+
+def : VInterpF32Pat<int_amdgcn_interp_inreg_p10, V_INTERP_P10_F32_inreg>;
+def : VInterpF32Pat<int_amdgcn_interp_inreg_p2, V_INTERP_P2_F32_inreg>;
+defm : VInterpF16Pat<int_amdgcn_interp_inreg_p10_f16,
+ V_INTERP_P10_F16_F32_inreg, f32,
+ [VINTERPModsHi, VINTERPMods, VINTERPModsHi]>;
+defm : VInterpF16Pat<int_amdgcn_interp_inreg_p2_f16,
+ V_INTERP_P2_F16_F32_inreg, f16,
+ [VINTERPModsHi, VINTERPMods, VINTERPMods]>;
+
+//===----------------------------------------------------------------------===//
+// VINTERP Real Instructions
+//===----------------------------------------------------------------------===//
+
+let AssemblerPredicate = isGFX11Plus, DecoderNamespace = "GFX11" in {
+ multiclass VINTERP_Real_gfx11 <bits<7> op> {
+ def _gfx11 :
+ VINTERP_Real<!cast<VOP3_Pseudo>(NAME), SIEncodingFamily.GFX11>,
+ VINTERPe_gfx11<op, !cast<VOP3_Pseudo>(NAME).Pfl>;
+ }
+}
+
+defm V_INTERP_P10_F32_inreg : VINTERP_Real_gfx11<0x000>;
+defm V_INTERP_P2_F32_inreg : VINTERP_Real_gfx11<0x001>;
+defm V_INTERP_P10_F16_F32_inreg : VINTERP_Real_gfx11<0x002>;
+defm V_INTERP_P2_F16_F32_inreg : VINTERP_Real_gfx11<0x003>;
+defm V_INTERP_P10_RTZ_F16_F32_inreg : VINTERP_Real_gfx11<0x004>;
+defm V_INTERP_P2_RTZ_F16_F32_inreg : VINTERP_Real_gfx11<0x005>;
diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
index 48548d8b6722..1d374a9f90ba 100644
--- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
@@ -59,9 +59,9 @@ class VOP1_Pseudo <string opName, VOPProfile P, list<dag> pattern=[], bit VOP1On
let AsmVariantName = AMDGPUAsmVariants.Default;
}
-class VOP1_Real <VOP1_Pseudo ps, int EncodingFamily> :
+class VOP1_Real <VOP1_Pseudo ps, int EncodingFamily, string real_name = ps.Mnemonic > :
VOP_Real <ps>,
- InstSI <ps.OutOperandList, ps.InOperandList, ps.Mnemonic # ps.AsmOperands, []>,
+ InstSI <ps.OutOperandList, ps.InOperandList, real_name # ps.AsmOperands, []>,
SIMCInstr <ps.PseudoInstr, EncodingFamily> {
let VALU = 1;
@@ -110,13 +110,18 @@ class getVOP1Pat64 <SDPatternOperator node, VOPProfile P> : LetDummies {
}
multiclass VOP1Inst <string opName, VOPProfile P,
- SDPatternOperator node = null_frag> {
+ SDPatternOperator node = null_frag, int VOPDOp = -1> {
// We only want to set this on the basic, non-SDWA or DPP forms.
- defvar should_mov_imm = !eq(opName, "v_mov_b32");
+ defvar should_mov_imm = !or(!eq(opName, "v_mov_b32"),
+ !eq(opName, "v_mov_b64"));
let isMoveImm = should_mov_imm in {
- def _e32 : VOP1_Pseudo <opName, P>;
- def _e64 : VOP3_Pseudo <opName, P, getVOP1Pat64<node, P>.ret>;
+ if !eq(VOPDOp, -1) then
+ def _e32 : VOP1_Pseudo <opName, P>;
+ else
+ // Only for V_MOV_B32
+ def _e32 : VOP1_Pseudo <opName, P>, VOPD_Component<VOPDOp, "v_mov_b32">;
+ def _e64 : VOP3InstBase <opName, P, node>;
}
foreach _ = BoolToList<P.HasExtSDWA>.ret in
@@ -125,6 +130,11 @@ multiclass VOP1Inst <string opName, VOPProfile P,
foreach _ = BoolToList<P.HasExtDPP>.ret in
def _dpp : VOP1_DPP_Pseudo <opName, P>;
+ let SubtargetPredicate = isGFX11Plus in {
+ foreach _ = BoolToList<P.HasExtVOP3DPP>.ret in
+ def _e64_dpp : VOP3_DPP_Pseudo <opName, P>;
+ } // End SubtargetPredicate = isGFX11Plus
+
def : MnemonicAlias<opName#"_e32", opName>, LetDummies;
def : MnemonicAlias<opName#"_e64", opName>, LetDummies;
@@ -141,7 +151,9 @@ class VOPProfileI2F<ValueType dstVt, ValueType srcVt> :
VOPProfile<[dstVt, srcVt, untyped, untyped]> {
let Ins64 = (ins Src0RC64:$src0, clampmod:$clamp, omod:$omod);
+ let InsVOP3Base = (ins Src0DPP:$src0, clampmod:$clamp, omod:$omod);
let Asm64 = "$vdst, $src0$clamp$omod";
+ let AsmVOP3DPPBase = Asm64;
let HasModifiers = 0;
let HasClamp = 1;
@@ -151,6 +163,12 @@ def VOP1_F64_I32 : VOPProfileI2F <f64, i32>;
def VOP1_F32_I32 : VOPProfileI2F <f32, i32>;
def VOP1_F16_I16 : VOPProfileI2F <f16, i16>;
+def VOP_NOP_PROFILE : VOPProfile <[untyped, untyped, untyped, untyped]>{
+ let HasExtVOP3DPP = 0;
+}
+
+// OMod clears exceptions when set. OMod was always an operand, but it is
+// now explicitly set.
class VOP_SPECIAL_OMOD_PROF<ValueType dstVt, ValueType srcVt> :
VOPProfile<[dstVt, srcVt, untyped, untyped]> {
@@ -165,11 +183,21 @@ def VOP_I16_F16_SPECIAL_OMOD : VOP_SPECIAL_OMOD_PROF<i16, f16>;
//===----------------------------------------------------------------------===//
let VOPAsmPrefer32Bit = 1 in {
-defm V_NOP : VOP1Inst <"v_nop", VOP_NONE>;
+defm V_NOP : VOP1Inst <"v_nop", VOP_NOP_PROFILE>;
+}
+
+def VOPProfile_MOV : VOPProfile <[i32, i32, untyped, untyped]> {
+ let InsVOPDX = (ins Src0RC32:$src0X);
+ let InsVOPDXDeferred = (ins VSrc_f32_Deferred:$src0X);
+ let InsVOPDY = (ins Src0RC32:$src0Y);
+ let InsVOPDYDeferred = (ins VSrc_f32_Deferred:$src0Y);
}
let isReMaterializable = 1, isAsCheapAsAMove = 1 in {
-defm V_MOV_B32 : VOP1Inst <"v_mov_b32", VOP_I32_I32>;
+defm V_MOV_B32 : VOP1Inst <"v_mov_b32", VOPProfile_MOV, null_frag, 0x8>;
+
+let SubtargetPredicate = isGFX940Plus in
+defm V_MOV_B64 : VOP1Inst <"v_mov_b64", VOP_I64_I64>;
} // End isMoveImm = 1
// FIXME: Specify SchedRW for READFIRSTLANE_B32
@@ -282,7 +310,7 @@ defm V_COS_F32 : VOP1Inst <"v_cos_f32", VOP_F32_F32, AMDGPUcos>;
} // End TRANS = 1, SchedRW = [WriteTrans32]
defm V_NOT_B32 : VOP1Inst <"v_not_b32", VOP_I32_I32>;
-defm V_BFREV_B32 : VOP1Inst <"v_bfrev_b32", VOP_I32_I32, bitreverse>;
+defm V_BFREV_B32 : VOP1Inst <"v_bfrev_b32", VOP_I32_I32, DivergentUnaryFrag<bitreverse>>;
defm V_FFBH_U32 : VOP1Inst <"v_ffbh_u32", VOP_I32_I32, AMDGPUffbh_u32>;
defm V_FFBL_B32 : VOP1Inst <"v_ffbl_b32", VOP_I32_I32, AMDGPUffbl_b32>;
defm V_FFBH_I32 : VOP1Inst <"v_ffbh_i32", VOP_I32_I32, AMDGPUffbh_i32>;
@@ -472,7 +500,7 @@ let SubtargetPredicate = isGFX9Only in {
} // End SubtargetPredicate = isGFX9Only
let SubtargetPredicate = isGFX10Plus in {
- defm V_PIPEFLUSH : VOP1Inst<"v_pipeflush", VOP_NONE>;
+ defm V_PIPEFLUSH : VOP1Inst<"v_pipeflush", VOP_NO_EXT<VOP_NONE>>;
let Uses = [M0] in {
defm V_MOVRELSD_2_B32 :
@@ -498,6 +526,17 @@ def V_ACCVGPR_MOV_B32 : VOP1_Pseudo<"v_accvgpr_mov_b32", VOPProfileAccMov, [], 1
let isAsCheapAsAMove = 1;
}
+let SubtargetPredicate = isGFX11Plus in {
+ // Restrict src0 to be VGPR
+ def V_PERMLANE64_B32 : VOP1_Pseudo<"v_permlane64_b32", VOP_MOVRELS,
+ getVOP1Pat64<int_amdgcn_permlane64,
+ VOP_MOVRELS>.ret,
+ /*VOP1Only=*/ 1>;
+ defm V_NOT_B16 : VOP1Inst<"v_not_b16", VOP_I16_I16>;
+ defm V_CVT_I32_I16 : VOP1Inst<"v_cvt_i32_i16", VOP_I32_I16>;
+ defm V_CVT_U32_U16 : VOP1Inst<"v_cvt_u32_u16", VOP_I16_I16>;
+} // End SubtargetPredicate = isGFX11Plus
+
//===----------------------------------------------------------------------===//
// Target-specific instruction encodings.
//===----------------------------------------------------------------------===//
@@ -517,9 +556,9 @@ class VOP1_DPP<bits<8> op, VOP1_DPP_Pseudo ps, VOPProfile p = ps.Pfl, bit isDPP1
let Inst{31-25} = 0x3f;
}
-class VOP1_DPP16<bits<8> op, VOP1_DPP_Pseudo ps, VOPProfile p = ps.Pfl> :
+class VOP1_DPP16<bits<8> op, VOP1_DPP_Pseudo ps, int subtarget, VOPProfile p = ps.Pfl> :
VOP1_DPP<op, ps, p, 1>,
- SIMCInstr <ps.PseudoInstr, SIEncodingFamily.GFX10> {
+ SIMCInstr <ps.PseudoInstr, subtarget> {
let AssemblerPredicate = HasDPP16;
let SubtargetPredicate = HasDPP16;
}
@@ -539,10 +578,112 @@ class VOP1_DPP8<bits<8> op, VOP1_Pseudo ps, VOPProfile p = ps.Pfl> :
}
//===----------------------------------------------------------------------===//
+// GFX11.
+//===----------------------------------------------------------------------===//
+
+let AssemblerPredicate = isGFX11Only, DecoderNamespace = "GFX11" in {
+ multiclass VOP1Only_Real_gfx11<bits<9> op> {
+ let IsSingle = 1 in
+ def _gfx11 :
+ VOP1_Real<!cast<VOP1_Pseudo>(NAME), SIEncodingFamily.GFX11>,
+ VOP1e<op{7-0}, !cast<VOP1_Pseudo>(NAME).Pfl>;
+ }
+ multiclass VOP1_Real_e32_gfx11<bits<9> op, string opName = NAME> {
+ defvar ps = !cast<VOP1_Pseudo>(opName#"_e32");
+ def _e32_gfx11 :
+ VOP1_Real<ps, SIEncodingFamily.GFX11>,
+ VOP1e<op{7-0}, ps.Pfl>;
+ }
+ multiclass VOP1_Real_e32_with_name_gfx11<bits<9> op, string opName,
+ string asmName> {
+ defvar ps = !cast<VOP1_Pseudo>(opName#"_e32");
+ let AsmString = asmName # ps.AsmOperands in {
+ defm NAME : VOP1_Real_e32_gfx11<op, opName>,
+ MnemonicAlias<ps.Mnemonic, asmName>, Requires<[isGFX11Plus]>;
+ }
+ }
+ multiclass VOP1_Real_e64_gfx11<bits<9> op> {
+ def _e64_gfx11 :
+ VOP3_Real<!cast<VOP3_Pseudo>(NAME#"_e64"), SIEncodingFamily.GFX11>,
+ VOP3e_gfx11<{0, 1, 1, op{6-0}}, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl>;
+ }
+ multiclass VOP1_Real_dpp_gfx11<bits<9> op, string opName = NAME> {
+ defvar ps = !cast<VOP1_Pseudo>(opName#"_e32");
+ def _dpp_gfx11 : VOP1_DPP16<op{7-0}, !cast<VOP1_DPP_Pseudo>(opName#"_dpp"), SIEncodingFamily.GFX11> {
+ let DecoderNamespace = "DPPGFX11";
+ }
+ }
+ multiclass VOP1_Real_dpp_with_name_gfx11<bits<9> op, string opName,
+ string asmName> {
+ defvar ps = !cast<VOP1_Pseudo>(opName#"_e32");
+ let AsmString = asmName # ps.Pfl.AsmDPP16, DecoderNamespace = "DPPGFX11" in {
+ defm NAME : VOP1_Real_dpp_gfx11<op, opName>,
+ MnemonicAlias<ps.Mnemonic, asmName>, Requires<[isGFX11Plus]>;
+ }
+ }
+ multiclass VOP1_Real_dpp8_gfx11<bits<9> op, string opName = NAME> {
+ defvar ps = !cast<VOP1_Pseudo>(opName#"_e32");
+ def _dpp8_gfx11 : VOP1_DPP8<op{7-0}, ps> {
+ let DecoderNamespace = "DPP8GFX11";
+ }
+ }
+ multiclass VOP1_Real_dpp8_with_name_gfx11<bits<9> op, string opName,
+ string asmName> {
+ defvar ps = !cast<VOP1_Pseudo>(opName#"_e32");
+ let AsmString = asmName # ps.Pfl.AsmDPP8, DecoderNamespace = "DPP8GFX11" in {
+ defm NAME : VOP1_Real_dpp8_gfx11<op, opName>,
+ MnemonicAlias<ps.Mnemonic, asmName>, Requires<[isGFX11Plus]>;
+ }
+ }
+} // End AssemblerPredicate = isGFX11Only, DecoderNamespace = "GFX11"
+
+multiclass VOP1_Realtriple_e64_gfx11<bits<9> op> {
+ defm NAME : VOP3_Realtriple_gfx11<{0, 1, 1, op{6-0}}, /*isSingle=*/ 0, NAME>;
+}
+multiclass VOP1_Realtriple_e64_with_name_gfx11<bits<9> op, string opName,
+ string asmName> {
+ defm NAME : VOP3_Realtriple_with_name_gfx11<{0, 1, 1, op{6-0}}, opName,
+ asmName>;
+}
+
+multiclass VOP1_Real_FULL_gfx11<bits<9> op> :
+ VOP1_Real_e32_gfx11<op>, VOP1_Realtriple_e64_gfx11<op>,
+ VOP1_Real_dpp_gfx11<op>, VOP1_Real_dpp8_gfx11<op>;
+
+multiclass VOP1_Real_NO_VOP3_with_name_gfx11<bits<9> op, string opName,
+ string asmName> :
+ VOP1_Real_e32_with_name_gfx11<op, opName, asmName>,
+ VOP1_Real_dpp_with_name_gfx11<op, opName, asmName>,
+ VOP1_Real_dpp8_with_name_gfx11<op, opName, asmName>;
+
+multiclass VOP1_Real_FULL_with_name_gfx11<bits<9> op, string opName,
+ string asmName> :
+ VOP1_Real_NO_VOP3_with_name_gfx11<op, opName, asmName>,
+ VOP1_Realtriple_e64_with_name_gfx11<op, opName, asmName>;
+
+multiclass VOP1_Real_NO_DPP_gfx11<bits<9> op> :
+ VOP1_Real_e32_gfx11<op>, VOP1_Real_e64_gfx11<op>;
+
+defm V_CVT_NEAREST_I32_F32 : VOP1_Real_FULL_with_name_gfx11<0x00c,
+ "V_CVT_RPI_I32_F32", "v_cvt_nearest_i32_f32">;
+defm V_CVT_FLOOR_I32_F32 : VOP1_Real_FULL_with_name_gfx11<0x00d,
+ "V_CVT_FLR_I32_F32", "v_cvt_floor_i32_f32">;
+defm V_CLZ_I32_U32 : VOP1_Real_FULL_with_name_gfx11<0x039,
+ "V_FFBH_U32", "v_clz_i32_u32">;
+defm V_CTZ_I32_B32 : VOP1_Real_FULL_with_name_gfx11<0x03a,
+ "V_FFBL_B32", "v_ctz_i32_b32">;
+defm V_CLS_I32 : VOP1_Real_FULL_with_name_gfx11<0x03b,
+ "V_FFBH_I32", "v_cls_i32">;
+defm V_PERMLANE64_B32 : VOP1Only_Real_gfx11<0x067>;
+defm V_NOT_B16 : VOP1_Real_FULL_gfx11<0x069>;
+defm V_CVT_I32_I16 : VOP1_Real_FULL_gfx11<0x06a>;
+defm V_CVT_U32_U16 : VOP1_Real_FULL_gfx11<0x06b>;
+
+//===----------------------------------------------------------------------===//
// GFX10.
//===----------------------------------------------------------------------===//
-let AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" in {
+let AssemblerPredicate = isGFX10Only, DecoderNamespace = "GFX10" in {
multiclass VOP1Only_Real_gfx10<bits<9> op> {
def _gfx10 :
VOP1_Real<!cast<VOP1_Pseudo>(NAME), SIEncodingFamily.GFX10>,
@@ -567,50 +708,59 @@ let AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" in {
}
}
multiclass VOP1_Real_dpp_gfx10<bits<9> op> {
- foreach _ = BoolToList<!cast<VOP1_Pseudo>(NAME#"_e32").Pfl.HasExtDPP>.ret in
- def _dpp_gfx10 : VOP1_DPP16<op{7-0}, !cast<VOP1_DPP_Pseudo>(NAME#"_dpp")> {
+ foreach _ = BoolToList<!cast<VOP1_Pseudo>(NAME#"_e32").Pfl.HasExt32BitDPP>.ret in
+ def _dpp_gfx10 : VOP1_DPP16<op{7-0}, !cast<VOP1_DPP_Pseudo>(NAME#"_dpp"), SIEncodingFamily.GFX10> {
let DecoderNamespace = "SDWA10";
}
}
multiclass VOP1_Real_dpp8_gfx10<bits<9> op> {
- foreach _ = BoolToList<!cast<VOP1_Pseudo>(NAME#"_e32").Pfl.HasExtDPP>.ret in
+ foreach _ = BoolToList<!cast<VOP1_Pseudo>(NAME#"_e32").Pfl.HasExt32BitDPP>.ret in
def _dpp8_gfx10 : VOP1_DPP8<op{7-0}, !cast<VOP1_Pseudo>(NAME#"_e32")> {
let DecoderNamespace = "DPP8";
}
}
-} // End AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10"
+} // End AssemblerPredicate = isGFX10Only, DecoderNamespace = "GFX10"
multiclass VOP1_Real_gfx10<bits<9> op> :
VOP1_Real_e32_gfx10<op>, VOP1_Real_e64_gfx10<op>,
VOP1_Real_sdwa_gfx10<op>, VOP1_Real_dpp_gfx10<op>,
VOP1_Real_dpp8_gfx10<op>;
-defm V_PIPEFLUSH : VOP1_Real_gfx10<0x01b>;
-defm V_MOVRELSD_2_B32 : VOP1_Real_gfx10<0x048>;
-defm V_CVT_F16_U16 : VOP1_Real_gfx10<0x050>;
-defm V_CVT_F16_I16 : VOP1_Real_gfx10<0x051>;
-defm V_CVT_U16_F16 : VOP1_Real_gfx10<0x052>;
-defm V_CVT_I16_F16 : VOP1_Real_gfx10<0x053>;
-defm V_RCP_F16 : VOP1_Real_gfx10<0x054>;
-defm V_SQRT_F16 : VOP1_Real_gfx10<0x055>;
-defm V_RSQ_F16 : VOP1_Real_gfx10<0x056>;
-defm V_LOG_F16 : VOP1_Real_gfx10<0x057>;
-defm V_EXP_F16 : VOP1_Real_gfx10<0x058>;
-defm V_FREXP_MANT_F16 : VOP1_Real_gfx10<0x059>;
-defm V_FREXP_EXP_I16_F16 : VOP1_Real_gfx10<0x05a>;
-defm V_FLOOR_F16 : VOP1_Real_gfx10<0x05b>;
-defm V_CEIL_F16 : VOP1_Real_gfx10<0x05c>;
-defm V_TRUNC_F16 : VOP1_Real_gfx10<0x05d>;
-defm V_RNDNE_F16 : VOP1_Real_gfx10<0x05e>;
-defm V_FRACT_F16 : VOP1_Real_gfx10<0x05f>;
-defm V_SIN_F16 : VOP1_Real_gfx10<0x060>;
-defm V_COS_F16 : VOP1_Real_gfx10<0x061>;
-defm V_SAT_PK_U8_I16 : VOP1_Real_gfx10<0x062>;
-defm V_CVT_NORM_I16_F16 : VOP1_Real_gfx10<0x063>;
-defm V_CVT_NORM_U16_F16 : VOP1_Real_gfx10<0x064>;
+multiclass VOP1_Real_gfx10_FULL_gfx11<bits<9> op> :
+ VOP1_Real_gfx10<op>, VOP1_Real_FULL_gfx11<op>;
+
+multiclass VOP1_Real_gfx10_NO_DPP_gfx11<bits<9> op> :
+ VOP1_Real_gfx10<op>, VOP1_Real_NO_DPP_gfx11<op>;
-defm V_SWAP_B32 : VOP1Only_Real_gfx10<0x065>;
-defm V_SWAPREL_B32 : VOP1Only_Real_gfx10<0x068>;
+multiclass VOP1Only_Real_gfx10_gfx11<bits<9> op> :
+ VOP1Only_Real_gfx10<op>, VOP1Only_Real_gfx11<op>;
+
+defm V_PIPEFLUSH : VOP1_Real_gfx10_NO_DPP_gfx11<0x01b>;
+defm V_MOVRELSD_2_B32 : VOP1_Real_gfx10_FULL_gfx11<0x048>;
+defm V_CVT_F16_U16 : VOP1_Real_gfx10_FULL_gfx11<0x050>;
+defm V_CVT_F16_I16 : VOP1_Real_gfx10_FULL_gfx11<0x051>;
+defm V_CVT_U16_F16 : VOP1_Real_gfx10_FULL_gfx11<0x052>;
+defm V_CVT_I16_F16 : VOP1_Real_gfx10_FULL_gfx11<0x053>;
+defm V_RCP_F16 : VOP1_Real_gfx10_FULL_gfx11<0x054>;
+defm V_SQRT_F16 : VOP1_Real_gfx10_FULL_gfx11<0x055>;
+defm V_RSQ_F16 : VOP1_Real_gfx10_FULL_gfx11<0x056>;
+defm V_LOG_F16 : VOP1_Real_gfx10_FULL_gfx11<0x057>;
+defm V_EXP_F16 : VOP1_Real_gfx10_FULL_gfx11<0x058>;
+defm V_FREXP_MANT_F16 : VOP1_Real_gfx10_FULL_gfx11<0x059>;
+defm V_FREXP_EXP_I16_F16 : VOP1_Real_gfx10_FULL_gfx11<0x05a>;
+defm V_FLOOR_F16 : VOP1_Real_gfx10_FULL_gfx11<0x05b>;
+defm V_CEIL_F16 : VOP1_Real_gfx10_FULL_gfx11<0x05c>;
+defm V_TRUNC_F16 : VOP1_Real_gfx10_FULL_gfx11<0x05d>;
+defm V_RNDNE_F16 : VOP1_Real_gfx10_FULL_gfx11<0x05e>;
+defm V_FRACT_F16 : VOP1_Real_gfx10_FULL_gfx11<0x05f>;
+defm V_SIN_F16 : VOP1_Real_gfx10_FULL_gfx11<0x060>;
+defm V_COS_F16 : VOP1_Real_gfx10_FULL_gfx11<0x061>;
+defm V_SAT_PK_U8_I16 : VOP1_Real_gfx10_FULL_gfx11<0x062>;
+defm V_CVT_NORM_I16_F16 : VOP1_Real_gfx10_FULL_gfx11<0x063>;
+defm V_CVT_NORM_U16_F16 : VOP1_Real_gfx10_FULL_gfx11<0x064>;
+
+defm V_SWAP_B32 : VOP1Only_Real_gfx10_gfx11<0x065>;
+defm V_SWAPREL_B32 : VOP1Only_Real_gfx10_gfx11<0x068>;
//===----------------------------------------------------------------------===//
// GFX7, GFX10.
@@ -635,16 +785,19 @@ multiclass VOP1_Real_gfx7<bits<9> op> :
multiclass VOP1_Real_gfx7_gfx10<bits<9> op> :
VOP1_Real_gfx7<op>, VOP1_Real_gfx10<op>;
+multiclass VOP1_Real_gfx7_gfx10_NO_DPP_gfx11<bits<9> op> :
+ VOP1_Real_gfx7_gfx10<op>, VOP1_Real_NO_DPP_gfx11<op>;
+
defm V_LOG_LEGACY_F32 : VOP1_Real_gfx7<0x045>;
defm V_EXP_LEGACY_F32 : VOP1_Real_gfx7<0x046>;
-defm V_TRUNC_F64 : VOP1_Real_gfx7_gfx10<0x017>;
-defm V_CEIL_F64 : VOP1_Real_gfx7_gfx10<0x018>;
-defm V_RNDNE_F64 : VOP1_Real_gfx7_gfx10<0x019>;
-defm V_FLOOR_F64 : VOP1_Real_gfx7_gfx10<0x01a>;
+defm V_TRUNC_F64 : VOP1_Real_gfx7_gfx10_NO_DPP_gfx11<0x017>;
+defm V_CEIL_F64 : VOP1_Real_gfx7_gfx10_NO_DPP_gfx11<0x018>;
+defm V_RNDNE_F64 : VOP1_Real_gfx7_gfx10_NO_DPP_gfx11<0x019>;
+defm V_FLOOR_F64 : VOP1_Real_gfx7_gfx10_NO_DPP_gfx11<0x01a>;
//===----------------------------------------------------------------------===//
-// GFX6, GFX7, GFX10.
+// GFX6, GFX7, GFX10, GFX11.
//===----------------------------------------------------------------------===//
let AssemblerPredicate = isGFX6GFX7, DecoderNamespace = "GFX6GFX7" in {
@@ -666,65 +819,71 @@ multiclass VOP1_Real_gfx6_gfx7<bits<9> op> :
multiclass VOP1_Real_gfx6_gfx7_gfx10<bits<9> op> :
VOP1_Real_gfx6_gfx7<op>, VOP1_Real_gfx10<op>;
-defm V_LOG_CLAMP_F32 : VOP1_Real_gfx6_gfx7<0x026>;
-defm V_RCP_CLAMP_F32 : VOP1_Real_gfx6_gfx7<0x028>;
-defm V_RCP_LEGACY_F32 : VOP1_Real_gfx6_gfx7<0x029>;
-defm V_RSQ_CLAMP_F32 : VOP1_Real_gfx6_gfx7<0x02c>;
-defm V_RSQ_LEGACY_F32 : VOP1_Real_gfx6_gfx7<0x02d>;
-defm V_RCP_CLAMP_F64 : VOP1_Real_gfx6_gfx7<0x030>;
-defm V_RSQ_CLAMP_F64 : VOP1_Real_gfx6_gfx7<0x032>;
+multiclass VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11<bits<9> op> :
+ VOP1_Real_gfx6_gfx7_gfx10<op>, VOP1_Real_FULL_gfx11<op>;
-defm V_NOP : VOP1_Real_gfx6_gfx7_gfx10<0x000>;
-defm V_MOV_B32 : VOP1_Real_gfx6_gfx7_gfx10<0x001>;
-defm V_CVT_I32_F64 : VOP1_Real_gfx6_gfx7_gfx10<0x003>;
-defm V_CVT_F64_I32 : VOP1_Real_gfx6_gfx7_gfx10<0x004>;
-defm V_CVT_F32_I32 : VOP1_Real_gfx6_gfx7_gfx10<0x005>;
-defm V_CVT_F32_U32 : VOP1_Real_gfx6_gfx7_gfx10<0x006>;
-defm V_CVT_U32_F32 : VOP1_Real_gfx6_gfx7_gfx10<0x007>;
-defm V_CVT_I32_F32 : VOP1_Real_gfx6_gfx7_gfx10<0x008>;
-defm V_CVT_F16_F32 : VOP1_Real_gfx6_gfx7_gfx10<0x00a>;
-defm V_CVT_F32_F16 : VOP1_Real_gfx6_gfx7_gfx10<0x00b>;
+multiclass VOP1_Real_gfx6_gfx7_gfx10_NO_DPP_gfx11<bits<9> op> :
+ VOP1_Real_gfx6_gfx7_gfx10<op>, VOP1_Real_NO_DPP_gfx11<op>;
+
+defm V_LOG_CLAMP_F32 : VOP1_Real_gfx6_gfx7<0x026>;
+defm V_RCP_CLAMP_F32 : VOP1_Real_gfx6_gfx7<0x028>;
+defm V_RCP_LEGACY_F32 : VOP1_Real_gfx6_gfx7<0x029>;
+defm V_RSQ_CLAMP_F32 : VOP1_Real_gfx6_gfx7<0x02c>;
+defm V_RSQ_LEGACY_F32 : VOP1_Real_gfx6_gfx7<0x02d>;
+defm V_RCP_CLAMP_F64 : VOP1_Real_gfx6_gfx7<0x030>;
+defm V_RSQ_CLAMP_F64 : VOP1_Real_gfx6_gfx7<0x032>;
+
+defm V_NOP : VOP1_Real_gfx6_gfx7_gfx10_NO_DPP_gfx11<0x000>;
+defm V_MOV_B32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11<0x001>;
+defm V_CVT_I32_F64 : VOP1_Real_gfx6_gfx7_gfx10_NO_DPP_gfx11<0x003>;
+defm V_CVT_F64_I32 : VOP1_Real_gfx6_gfx7_gfx10_NO_DPP_gfx11<0x004>;
+defm V_CVT_F32_I32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11<0x005>;
+defm V_CVT_F32_U32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11<0x006>;
+defm V_CVT_U32_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11<0x007>;
+defm V_CVT_I32_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11<0x008>;
+defm V_CVT_F16_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11<0x00a>;
+defm V_CVT_F32_F16 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11<0x00b>;
defm V_CVT_RPI_I32_F32 : VOP1_Real_gfx6_gfx7_gfx10<0x00c>;
defm V_CVT_FLR_I32_F32 : VOP1_Real_gfx6_gfx7_gfx10<0x00d>;
-defm V_CVT_OFF_F32_I4 : VOP1_Real_gfx6_gfx7_gfx10<0x00e>;
-defm V_CVT_F32_F64 : VOP1_Real_gfx6_gfx7_gfx10<0x00f>;
-defm V_CVT_F64_F32 : VOP1_Real_gfx6_gfx7_gfx10<0x010>;
-defm V_CVT_F32_UBYTE0 : VOP1_Real_gfx6_gfx7_gfx10<0x011>;
-defm V_CVT_F32_UBYTE1 : VOP1_Real_gfx6_gfx7_gfx10<0x012>;
-defm V_CVT_F32_UBYTE2 : VOP1_Real_gfx6_gfx7_gfx10<0x013>;
-defm V_CVT_F32_UBYTE3 : VOP1_Real_gfx6_gfx7_gfx10<0x014>;
-defm V_CVT_U32_F64 : VOP1_Real_gfx6_gfx7_gfx10<0x015>;
-defm V_CVT_F64_U32 : VOP1_Real_gfx6_gfx7_gfx10<0x016>;
-defm V_FRACT_F32 : VOP1_Real_gfx6_gfx7_gfx10<0x020>;
-defm V_TRUNC_F32 : VOP1_Real_gfx6_gfx7_gfx10<0x021>;
-defm V_CEIL_F32 : VOP1_Real_gfx6_gfx7_gfx10<0x022>;
-defm V_RNDNE_F32 : VOP1_Real_gfx6_gfx7_gfx10<0x023>;
-defm V_FLOOR_F32 : VOP1_Real_gfx6_gfx7_gfx10<0x024>;
-defm V_EXP_F32 : VOP1_Real_gfx6_gfx7_gfx10<0x025>;
-defm V_LOG_F32 : VOP1_Real_gfx6_gfx7_gfx10<0x027>;
-defm V_RCP_F32 : VOP1_Real_gfx6_gfx7_gfx10<0x02a>;
-defm V_RCP_IFLAG_F32 : VOP1_Real_gfx6_gfx7_gfx10<0x02b>;
-defm V_RSQ_F32 : VOP1_Real_gfx6_gfx7_gfx10<0x02e>;
-defm V_RCP_F64 : VOP1_Real_gfx6_gfx7_gfx10<0x02f>;
-defm V_RSQ_F64 : VOP1_Real_gfx6_gfx7_gfx10<0x031>;
-defm V_SQRT_F32 : VOP1_Real_gfx6_gfx7_gfx10<0x033>;
-defm V_SQRT_F64 : VOP1_Real_gfx6_gfx7_gfx10<0x034>;
-defm V_SIN_F32 : VOP1_Real_gfx6_gfx7_gfx10<0x035>;
-defm V_COS_F32 : VOP1_Real_gfx6_gfx7_gfx10<0x036>;
-defm V_NOT_B32 : VOP1_Real_gfx6_gfx7_gfx10<0x037>;
-defm V_BFREV_B32 : VOP1_Real_gfx6_gfx7_gfx10<0x038>;
+defm V_CVT_OFF_F32_I4 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11<0x00e>;
+defm V_CVT_F32_F64 : VOP1_Real_gfx6_gfx7_gfx10_NO_DPP_gfx11<0x00f>;
+defm V_CVT_F64_F32 : VOP1_Real_gfx6_gfx7_gfx10_NO_DPP_gfx11<0x010>;
+defm V_CVT_F32_UBYTE0 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11<0x011>;
+defm V_CVT_F32_UBYTE1 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11<0x012>;
+defm V_CVT_F32_UBYTE2 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11<0x013>;
+defm V_CVT_F32_UBYTE3 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11<0x014>;
+defm V_CVT_U32_F64 : VOP1_Real_gfx6_gfx7_gfx10_NO_DPP_gfx11<0x015>;
+defm V_CVT_F64_U32 : VOP1_Real_gfx6_gfx7_gfx10_NO_DPP_gfx11<0x016>;
+defm V_FRACT_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11<0x020>;
+defm V_TRUNC_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11<0x021>;
+defm V_CEIL_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11<0x022>;
+defm V_RNDNE_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11<0x023>;
+defm V_FLOOR_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11<0x024>;
+defm V_EXP_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11<0x025>;
+defm V_LOG_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11<0x027>;
+defm V_RCP_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11<0x02a>;
+defm V_RCP_IFLAG_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11<0x02b>;
+defm V_RSQ_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11<0x02e>;
+defm V_RCP_F64 : VOP1_Real_gfx6_gfx7_gfx10_NO_DPP_gfx11<0x02f>;
+defm V_RSQ_F64 : VOP1_Real_gfx6_gfx7_gfx10_NO_DPP_gfx11<0x031>;
+defm V_SQRT_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11<0x033>;
+defm V_SQRT_F64 : VOP1_Real_gfx6_gfx7_gfx10_NO_DPP_gfx11<0x034>;
+defm V_SIN_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11<0x035>;
+defm V_COS_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11<0x036>;
+defm V_NOT_B32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11<0x037>;
+defm V_BFREV_B32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11<0x038>;
defm V_FFBH_U32 : VOP1_Real_gfx6_gfx7_gfx10<0x039>;
defm V_FFBL_B32 : VOP1_Real_gfx6_gfx7_gfx10<0x03a>;
defm V_FFBH_I32 : VOP1_Real_gfx6_gfx7_gfx10<0x03b>;
-defm V_FREXP_EXP_I32_F64 : VOP1_Real_gfx6_gfx7_gfx10<0x03c>;
-defm V_FREXP_MANT_F64 : VOP1_Real_gfx6_gfx7_gfx10<0x03d>;
-defm V_FRACT_F64 : VOP1_Real_gfx6_gfx7_gfx10<0x03e>;
-defm V_FREXP_EXP_I32_F32 : VOP1_Real_gfx6_gfx7_gfx10<0x03f>;
-defm V_FREXP_MANT_F32 : VOP1_Real_gfx6_gfx7_gfx10<0x040>;
+defm V_FREXP_EXP_I32_F64 : VOP1_Real_gfx6_gfx7_gfx10_NO_DPP_gfx11<0x03c>;
+defm V_FREXP_MANT_F64 : VOP1_Real_gfx6_gfx7_gfx10_NO_DPP_gfx11<0x03d>;
+defm V_FRACT_F64 : VOP1_Real_gfx6_gfx7_gfx10_NO_DPP_gfx11<0x03e>;
+defm V_FREXP_EXP_I32_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11<0x03f>;
+defm V_FREXP_MANT_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11<0x040>;
defm V_CLREXCP : VOP1_Real_gfx6_gfx7_gfx10<0x041>;
-defm V_MOVRELD_B32 : VOP1_Real_gfx6_gfx7_gfx10<0x042>;
-defm V_MOVRELS_B32 : VOP1_Real_gfx6_gfx7_gfx10<0x043>;
-defm V_MOVRELSD_B32 : VOP1_Real_gfx6_gfx7_gfx10<0x044>;
+defm V_MOVRELD_B32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11<0x042>;
+defm V_MOVRELS_B32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11<0x043>;
+defm V_MOVRELSD_B32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11<0x044>;
//===----------------------------------------------------------------------===//
// GFX8, GFX9 (VI).
@@ -949,14 +1108,29 @@ multiclass VOP1_Real_gfx9 <bits<10> op> {
defm V_SCREEN_PARTITION_4SE_B32 : VOP1_Real_gfx9 <0x37>;
+let AssemblerPredicate = isGFX940Plus, DecoderNamespace = "GFX9" in
+defm V_MOV_B64 : VOP1_Real_gfx9 <0x38>;
+
//===----------------------------------------------------------------------===//
// GFX10
//===----------------------------------------------------------------------===//
-let OtherPredicates = [isGFX10Plus] in {
+let OtherPredicates = [isGFX10Only] in {
def : GCNPat <
(i32 (int_amdgcn_mov_dpp8 i32:$src, timm:$dpp8)),
(V_MOV_B32_dpp8_gfx10 VGPR_32:$src, VGPR_32:$src,
(as_i32timm $dpp8), (i32 DPP8Mode.FI_0))
>;
-} // End OtherPredicates = [isGFX10Plus]
+} // End OtherPredicates = [isGFX10Only]
+
+//===----------------------------------------------------------------------===//
+// GFX11
+//===----------------------------------------------------------------------===//
+
+let OtherPredicates = [isGFX11Only] in {
+def : GCNPat <
+ (i32 (int_amdgcn_mov_dpp8 i32:$src, timm:$dpp8)),
+ (V_MOV_B32_dpp8_gfx11 VGPR_32:$src, VGPR_32:$src,
+ (as_i32timm $dpp8), (i32 DPP8Mode.FI_0))
+>;
+} // End OtherPredicates = [isGFX11Only]
diff --git a/llvm/lib/Target/AMDGPU/VOP2Instructions.td b/llvm/lib/Target/AMDGPU/VOP2Instructions.td
index b9ff814a4dc5..1485a1e63129 100644
--- a/llvm/lib/Target/AMDGPU/VOP2Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP2Instructions.td
@@ -80,9 +80,9 @@ class VOP2_Pseudo <string opName, VOPProfile P, list<dag> pattern=[], string suf
let AsmVariantName = AMDGPUAsmVariants.Default;
}
-class VOP2_Real <VOP2_Pseudo ps, int EncodingFamily> :
+class VOP2_Real <VOP2_Pseudo ps, int EncodingFamily, string real_name = ps.Mnemonic> :
VOP_Real <ps>,
- InstSI <ps.OutOperandList, ps.InOperandList, ps.Mnemonic # ps.AsmOperands, []>,
+ InstSI <ps.OutOperandList, ps.InOperandList, real_name # ps.AsmOperands, []>,
SIMCInstr <ps.PseudoInstr, EncodingFamily> {
let VALU = 1;
@@ -140,15 +140,26 @@ multiclass VOP2Inst_e32<string opName,
Commutable_REV<revOp#"_e32", !eq(revOp, opName)>;
} // End renamedInGFX9 = GFX9Renamed
}
-
+multiclass
+ VOP2Inst_e32_VOPD<string opName, VOPProfile P, bits<5> VOPDOp,
+ string VOPDName, SDPatternOperator node = null_frag,
+ string revOp = opName, bit GFX9Renamed = 0> {
+ defm NAME : VOP2Inst_e32<opName, P, node, revOp, GFX9Renamed>,
+ VOPD_Component<VOPDOp, VOPDName>;
+}
multiclass VOP2Inst_e64<string opName,
VOPProfile P,
SDPatternOperator node = null_frag,
string revOp = opName,
bit GFX9Renamed = 0> {
let renamedInGFX9 = GFX9Renamed in {
- def _e64 : VOP3_Pseudo <opName, P, getVOP2Pat64<node, P>.ret>,
+ def _e64 : VOP3InstBase <opName, P, node, 1>,
Commutable_REV<revOp#"_e64", !eq(revOp, opName)>;
+
+ let SubtargetPredicate = isGFX11Plus in {
+ foreach _ = BoolToList<P.HasExtVOP3DPP>.ret in
+ def _e64_dpp : VOP3_DPP_Pseudo <opName, P>;
+ } // End SubtargetPredicate = isGFX11Plus
} // End renamedInGFX9 = GFX9Renamed
}
@@ -175,6 +186,22 @@ multiclass VOP2Inst<string opName,
}
}
+multiclass VOP2Inst_VOPD<string opName,
+ VOPProfile P,
+ bits<5> VOPDOp,
+ string VOPDName,
+ SDPatternOperator node = null_frag,
+ string revOp = opName,
+ bit GFX9Renamed = 0> :
+ VOP2Inst_e32_VOPD<opName, P, VOPDOp, VOPDName, node, revOp, GFX9Renamed>,
+ VOP2Inst_e64<opName, P, node, revOp, GFX9Renamed>,
+ VOP2Inst_sdwa<opName, P, GFX9Renamed> {
+ let renamedInGFX9 = GFX9Renamed in {
+ foreach _ = BoolToList<P.HasExtDPP>.ret in
+ def _dpp : VOP2_DPP_Pseudo <opName, P>;
+ }
+}
+
multiclass VOP2bInst <string opName,
VOPProfile P,
SDPatternOperator node = null_frag,
@@ -195,10 +222,15 @@ multiclass VOP2bInst <string opName,
}
foreach _ = BoolToList<P.HasExtDPP>.ret in
def _dpp : VOP2_DPP_Pseudo <opName, P>;
- }
+ } // End Uses = !if(useSGPRInput, [VCC, EXEC], [EXEC]), Defs = [VCC]
- def _e64 : VOP3_Pseudo <opName, P, getVOP2Pat64<node, P>.ret>,
+ def _e64 : VOP3InstBase <opName, P, node, 1>,
Commutable_REV<revOp#"_e64", !eq(revOp, opName)>;
+
+ let SubtargetPredicate = isGFX11Plus in {
+ foreach _ = BoolToList<P.HasExtVOP3DPP>.ret in
+ def _e64_dpp : VOP3_DPP_Pseudo <opName, P>;
+ } // End SubtargetPredicate = isGFX11Plus
}
}
}
@@ -220,16 +252,19 @@ multiclass VOP2bInstAliases<VOP2_Pseudo ps, VOP2_Real inst, string OpName> {
}
}
-multiclass VOP2eInst <string opName,
- VOPProfile P,
- SDPatternOperator node = null_frag,
- string revOp = opName,
- bit useSGPRInput = !eq(P.NumSrcArgs, 3)> {
+multiclass
+ VOP2eInst_Base<string opName, VOPProfile P, bits<5> VOPDOp, string VOPDName,
+ SDPatternOperator node, string revOp, bit useSGPRInput> {
let SchedRW = [Write32Bit] in {
let Uses = !if(useSGPRInput, [VCC, EXEC], [EXEC]) in {
- def _e32 : VOP2_Pseudo <opName, P>,
- Commutable_REV<revOp#"_e32", !eq(revOp, opName)>;
+ if !eq(VOPDOp, -1) then
+ def _e32 : VOP2_Pseudo <opName, P>,
+ Commutable_REV<revOp#"_e32", !eq(revOp, opName)>;
+ else
+ def _e32 : VOP2_Pseudo <opName, P>,
+ Commutable_REV<revOp#"_e32", !eq(revOp, opName)>,
+ VOPD_Component<VOPDOp, VOPDName>;
foreach _ = BoolToList<P.HasExtSDWA>.ret in
def _sdwa : VOP2_SDWA_Pseudo <opName, P> {
@@ -240,13 +275,29 @@ multiclass VOP2eInst <string opName,
def _dpp : VOP2_DPP_Pseudo <opName, P>;
}
- def _e64 : VOP3_Pseudo <opName, P, getVOP2Pat64<node, P>.ret>,
+ def _e64 : VOP3InstBase <opName, P, node, 1>,
Commutable_REV<revOp#"_e64", !eq(revOp, opName)> {
let isReMaterializable = 1;
}
+
+ let SubtargetPredicate = isGFX11Plus in {
+ foreach _ = BoolToList<P.HasExtVOP3DPP>.ret in
+ def _e64_dpp : VOP3_DPP_Pseudo <opName, P>;
+ } // End SubtargetPredicate = isGFX11Plus
}
}
+multiclass
+ VOP2eInst<string opName, VOPProfile P, SDPatternOperator node = null_frag,
+ string revOp = opName, bit useSGPRInput = !eq(P.NumSrcArgs, 3)>
+ : VOP2eInst_Base<opName, P, -1, "", node, revOp, useSGPRInput>;
+
+multiclass
+ VOP2eInst_VOPD<string opName, VOPProfile P, bits<5> VOPDOp, string VOPDName,
+ SDPatternOperator node = null_frag, string revOp = opName,
+ bit useSGPRInput = !eq(P.NumSrcArgs, 3)>
+ : VOP2eInst_Base<opName, P, VOPDOp, VOPDName, node, revOp, useSGPRInput>;
+
class VOP2eInstAlias <VOP2_Pseudo ps, Instruction inst, string opnd = ""> :
InstAlias <ps.OpName#" "#ps.Pfl.Asm32#", "#opnd,
(inst ps.Pfl.DstRC:$vdst, ps.Pfl.Src0RC32:$src0,
@@ -267,12 +318,24 @@ multiclass VOP2eInstAliases<VOP2_Pseudo ps, VOP2_Real inst> {
}
}
-class VOP_MADAK <ValueType vt> : VOPProfile <[vt, vt, vt, vt]> {
+class VOP_MADK_Base<ValueType vt> : VOPProfile <[vt, vt, vt, vt]> {
+ string AsmVOPDXDeferred = ?;
+}
+
+class VOP_MADAK <ValueType vt> : VOP_MADK_Base<vt> {
field Operand ImmOpType = !if(!eq(vt.Size, 32), f32kimm, f16kimm);
field dag Ins32 = !if(!eq(vt.Size, 32),
(ins VSrc_f32_Deferred:$src0, VGPR_32:$src1, ImmOpType:$imm),
(ins VSrc_f16_Deferred:$src0, VGPR_32:$src1, ImmOpType:$imm));
+ field dag InsVOPDX = (ins VSrc_f32_Deferred:$src0X, VGPR_32:$vsrc1X, ImmOpType:$imm);
+ // Note that both src0X and imm are deferred
+ let InsVOPDXDeferred = (ins VSrc_f32_Deferred:$src0X, VGPR_32:$vsrc1X, ImmOpType:$immDeferred);
+ field dag InsVOPDY = (ins VSrc_f32_Deferred:$src0Y, VGPR_32:$vsrc1Y, ImmOpType:$imm);
+
field string Asm32 = "$vdst, $src0, $src1, $imm";
+ field string AsmVOPDX = "$vdstX, $src0X, $vsrc1X, $imm";
+ let AsmVOPDXDeferred = "$vdstX, $src0X, $vsrc1X, $immDeferred";
+ field string AsmVOPDY = "$vdstY, $src0Y, $vsrc1Y, $imm";
field bit HasExt = 0;
let IsSingle = 1;
}
@@ -280,10 +343,17 @@ class VOP_MADAK <ValueType vt> : VOPProfile <[vt, vt, vt, vt]> {
def VOP_MADAK_F16 : VOP_MADAK <f16>;
def VOP_MADAK_F32 : VOP_MADAK <f32>;
-class VOP_MADMK <ValueType vt> : VOPProfile <[vt, vt, vt, vt]> {
+class VOP_MADMK <ValueType vt> : VOP_MADK_Base<vt> {
field Operand ImmOpType = !if(!eq(vt.Size, 32), f32kimm, f16kimm);
field dag Ins32 = (ins VSrc_f32_Deferred:$src0, ImmOpType:$imm, VGPR_32:$src1);
+ field dag InsVOPDX = (ins VSrc_f32_Deferred:$src0X, ImmOpType:$imm, VGPR_32:$vsrc1X);
+ let InsVOPDXDeferred = (ins VSrc_f32_Deferred:$src0X, ImmOpType:$immDeferred, VGPR_32:$vsrc1X);
+ field dag InsVOPDY = (ins VSrc_f32_Deferred:$src0Y, ImmOpType:$imm, VGPR_32:$vsrc1Y);
+
field string Asm32 = "$vdst, $src0, $imm, $src1";
+ field string AsmVOPDX = "$vdstX, $src0X, $imm, $vsrc1X";
+ let AsmVOPDXDeferred = "$vdstX, $src0X, $immDeferred, $vsrc1X";
+ field string AsmVOPDY = "$vdstY, $src0Y, $imm, $vsrc1Y";
field bit HasExt = 0;
let IsSingle = 1;
}
@@ -308,6 +378,10 @@ class VOP_MAC <ValueType vt0, ValueType vt1=vt0> : VOPProfile <[vt0, vt1, vt1, v
dpp_ctrl:$dpp_ctrl, row_mask:$row_mask,
bank_mask:$bank_mask, bound_ctrl:$bound_ctrl);
let InsDPP16 = !con(InsDPP, (ins FI:$fi));
+ let InsVOP3Base = getIns64<Src0VOP3DPP, Src1RC64, RegisterOperand<VGPR_32>, 3,
+ 0, HasModifiers, HasModifiers, HasOMod,
+ Src0Mod, Src1Mod, Src2Mod>.ret;
+
let InsDPP8 = (ins Src0ModDPP:$src0_modifiers, Src0DPP:$src0,
Src1ModDPP:$src1_modifiers, Src1DPP:$src1,
getVregSrcForVT<Src2VT>.ret:$src2, // stub argument
@@ -330,6 +404,7 @@ class VOP_MAC <ValueType vt0, ValueType vt1=vt0> : VOPProfile <[vt0, vt1, vt1, v
let HasExt = 1;
let HasExtDPP = 1;
+ let HasExt32BitDPP = 1;
let HasExtSDWA = 1;
let HasExtSDWA9 = 0;
let TieRegDPP = "$src2";
@@ -337,9 +412,9 @@ class VOP_MAC <ValueType vt0, ValueType vt1=vt0> : VOPProfile <[vt0, vt1, vt1, v
def VOP_MAC_F16 : VOP_MAC <f16>;
def VOP_MAC_F32 : VOP_MAC <f32>;
-let HasExtDPP = 0 in
+let HasExtDPP = 0, HasExt32BitDPP = 0 in
def VOP_MAC_LEGACY_F32 : VOP_MAC <f32>;
-let HasExtSDWA = 0, HasExt64BitDPP = 1 in
+let HasExtSDWA = 0, HasExt32BitDPP = 0, HasExt64BitDPP = 1 in
def VOP_MAC_F64 : VOP_MAC <f64>;
class VOP_DOT_ACC<ValueType vt0, ValueType vt1> : VOP_MAC<vt0, vt1> {
@@ -355,6 +430,7 @@ def VOP_DOT_ACC_F32_V2F16 : VOP_DOT_ACC<f32, v2f16> {
}
def VOP_DOT_ACC_I32_I32 : VOP_DOT_ACC<i32, i32> {
+ let HasExtVOP3DPP = 0;
let HasSrc0Mods = 1;
let HasSrc1Mods = 1;
}
@@ -368,13 +444,27 @@ def VOP2b_I32_I1_I32_I32 : VOPProfile<[i32, i32, i32, untyped], 0, /*EnableClamp
let AsmDPP = "$vdst, vcc, $src0, $src1 $dpp_ctrl$row_mask$bank_mask$bound_ctrl";
let AsmDPP8 = "$vdst, vcc, $src0, $src1 $dpp8$fi";
let AsmDPP16 = AsmDPP#"$fi";
+ let AsmVOP3DPPBase = Asm64;
+ let InsDPP = (ins DstRCDPP:$old,
+ Src0DPP:$src0,
+ Src1DPP:$src1,
+ dpp_ctrl:$dpp_ctrl, row_mask:$row_mask,
+ bank_mask:$bank_mask, bound_ctrl:$bound_ctrl);
+ let InsDPP16 = !con(InsDPP, (ins FI:$fi));
+ let InsDPP8 = (ins DstRCDPP:$old,
+ Src0DPP:$src0,
+ Src1DPP:$src1,
+ dpp8:$dpp8, FI:$fi);
let Outs32 = (outs DstRC:$vdst);
let Outs64 = (outs DstRC:$vdst, VOPDstS64orS32:$sdst);
+ let OutsVOP3DPP = Outs64;
+ let OutsVOP3DPP8 = Outs64;
}
// Write out to vcc or arbitrary SGPR and read in from vcc or
// arbitrary SGPR.
def VOP2b_I32_I1_I32_I32_I1 : VOPProfile<[i32, i32, i32, i1], 0, /*EnableClamp=*/1> {
+ let HasSrc2Mods = 0;
let Asm32 = "$vdst, vcc, $src0, $src1, vcc";
let Asm64 = "$vdst, $sdst, $src0, $src1, $src2$clamp";
let AsmSDWA = "$vdst, vcc, $src0_modifiers, $src1_modifiers, vcc$clamp $dst_sel $dst_unused $src0_sel $src1_sel";
@@ -384,6 +474,9 @@ def VOP2b_I32_I1_I32_I32_I1 : VOPProfile<[i32, i32, i32, i1], 0, /*EnableClamp=*
let AsmDPP16 = AsmDPP#"$fi";
let Outs32 = (outs DstRC:$vdst);
let Outs64 = (outs DstRC:$vdst, VOPDstS64orS32:$sdst);
+ let AsmVOP3DPPBase = Asm64;
+ let OutsVOP3DPP = Outs64;
+ let OutsVOP3DPP8 = Outs64;
// Suppress src2 implied by type since the 32-bit encoding uses an
// implicit VCC use.
@@ -401,15 +494,20 @@ def VOP2b_I32_I1_I32_I32_I1 : VOPProfile<[i32, i32, i32, i1], 0, /*EnableClamp=*
dpp_ctrl:$dpp_ctrl, row_mask:$row_mask,
bank_mask:$bank_mask, bound_ctrl:$bound_ctrl);
let InsDPP16 = !con(InsDPP, (ins FI:$fi));
+ let InsDPP8 = (ins DstRCDPP:$old,
+ Src0DPP:$src0,
+ Src1DPP:$src1,
+ dpp8:$dpp8, FI:$fi);
let HasExt = 1;
let HasExtDPP = 1;
+ let HasExt32BitDPP = 1;
let HasExtSDWA = 1;
let HasExtSDWA9 = 1;
}
// Read in from vcc or arbitrary SGPR.
-def VOP2e_I32_I32_I32_I1 : VOPProfile<[i32, i32, i32, i1], /*EnableF32SrcMods=*/1> {
+class VOP2e_SGPR<list<ValueType> ArgVT> : VOPProfile<ArgVT, /*EnableF32SrcMods=*/1> {
let Asm32 = "$vdst, $src0, $src1";
let Asm64 = "$vdst, $src0_modifiers, $src1_modifiers, $src2";
let AsmSDWA = "$vdst, $src0_modifiers, $src1_modifiers, vcc$clamp $dst_sel $dst_unused $src0_sel $src1_sel";
@@ -417,6 +515,7 @@ def VOP2e_I32_I32_I32_I1 : VOPProfile<[i32, i32, i32, i1], /*EnableF32SrcMods=*/
let AsmDPP = "$vdst, $src0, $src1, vcc $dpp_ctrl$row_mask$bank_mask$bound_ctrl";
let AsmDPP8 = "$vdst, $src0, $src1, vcc $dpp8$fi";
let AsmDPP16 = AsmDPP#"$fi";
+ let AsmVOP3DPPBase = Asm64;
let Outs32 = (outs DstRC:$vdst);
let Outs64 = (outs DstRC:$vdst);
@@ -437,14 +536,22 @@ def VOP2e_I32_I32_I32_I1 : VOPProfile<[i32, i32, i32, i1], /*EnableF32SrcMods=*/
dpp_ctrl:$dpp_ctrl, row_mask:$row_mask,
bank_mask:$bank_mask, bound_ctrl:$bound_ctrl);
let InsDPP16 = !con(InsDPP, (ins FI:$fi));
+ let InsDPP8 = (ins DstRCDPP:$old,
+ Src0ModDPP:$src0_modifiers, Src0DPP:$src0,
+ Src1ModDPP:$src1_modifiers, Src1DPP:$src1,
+ dpp8:$dpp8, FI:$fi);
let HasExt = 1;
let HasExtDPP = 1;
+ let HasExt32BitDPP = 1;
let HasExtSDWA = 1;
let HasExtSDWA9 = 1;
}
-def VOP_READLANE : VOPProfile<[i32, i32, i32]> {
+def VOP2e_I32_I32_I32_I1 : VOP2e_SGPR<[i32, i32, i32, i1]>;
+def VOP2e_I16_I16_I16_I1 : VOP2e_SGPR<[i16, i16, i16, i1]>;
+
+def VOP_READLANE : VOPProfile<[i32, i32, i32, untyped]> {
let Outs32 = (outs SReg_32:$vdst);
let Outs64 = Outs32;
let Ins32 = (ins VRegOrLds_32:$src0, SCSrc_b32:$src1);
@@ -454,6 +561,7 @@ def VOP_READLANE : VOPProfile<[i32, i32, i32]> {
let HasExt = 0;
let HasExtDPP = 0;
+ let HasExt32BitDPP = 0;
let HasExt64BitDPP = 0;
let HasExtSDWA = 0;
let HasExtSDWA9 = 0;
@@ -471,6 +579,7 @@ def VOP_WRITELANE : VOPProfile<[i32, i32, i32, i32]> {
let HasExt = 0;
let HasExtDPP = 0;
+ let HasExt32BitDPP = 0;
let HasExt64BitDPP = 0;
let HasExtSDWA = 0;
let HasExtSDWA9 = 0;
@@ -480,31 +589,33 @@ def VOP_WRITELANE : VOPProfile<[i32, i32, i32, i32]> {
// VOP2 Instructions
//===----------------------------------------------------------------------===//
-defm V_CNDMASK_B32 : VOP2eInst <"v_cndmask_b32", VOP2e_I32_I32_I32_I1>;
+let SubtargetPredicate = isGFX11Plus in
+defm V_CNDMASK_B16 : VOP2eInst <"v_cndmask_b16", VOP2e_I16_I16_I16_I1>;
+defm V_CNDMASK_B32 : VOP2eInst_VOPD <"v_cndmask_b32", VOP2e_I32_I32_I32_I1, 0x9, "v_cndmask_b32">;
let SubtargetPredicate = HasMadMacF32Insts, isReMaterializable = 1 in
def V_MADMK_F32 : VOP2_Pseudo <"v_madmk_f32", VOP_MADMK_F32, []>;
let isCommutable = 1 in {
let isReMaterializable = 1 in {
-defm V_ADD_F32 : VOP2Inst <"v_add_f32", VOP_F32_F32_F32, any_fadd>;
-defm V_SUB_F32 : VOP2Inst <"v_sub_f32", VOP_F32_F32_F32, any_fsub>;
-defm V_SUBREV_F32 : VOP2Inst <"v_subrev_f32", VOP_F32_F32_F32, null_frag, "v_sub_f32">;
-defm V_MUL_LEGACY_F32 : VOP2Inst <"v_mul_legacy_f32", VOP_F32_F32_F32, AMDGPUfmul_legacy>;
-defm V_MUL_F32 : VOP2Inst <"v_mul_f32", VOP_F32_F32_F32, any_fmul>;
+defm V_ADD_F32 : VOP2Inst_VOPD <"v_add_f32", VOP_F32_F32_F32, 0x4, "v_add_f32", any_fadd>;
+defm V_SUB_F32 : VOP2Inst_VOPD <"v_sub_f32", VOP_F32_F32_F32, 0x5, "v_sub_f32", any_fsub>;
+defm V_SUBREV_F32 : VOP2Inst_VOPD <"v_subrev_f32", VOP_F32_F32_F32, 0x6, "v_subrev_f32", null_frag, "v_sub_f32">;
+defm V_MUL_LEGACY_F32 : VOP2Inst_VOPD <"v_mul_legacy_f32", VOP_F32_F32_F32, 0x7, "v_mul_dx9_zero_f32", AMDGPUfmul_legacy>;
+defm V_MUL_F32 : VOP2Inst_VOPD <"v_mul_f32", VOP_F32_F32_F32, 0x3, "v_mul_f32", any_fmul>;
defm V_MUL_I32_I24 : VOP2Inst <"v_mul_i32_i24", VOP_I32_I32_I32_ARITH, AMDGPUmul_i24>;
defm V_MUL_HI_I32_I24 : VOP2Inst <"v_mul_hi_i32_i24", VOP_I32_I32_I32, AMDGPUmulhi_i24>;
defm V_MUL_U32_U24 : VOP2Inst <"v_mul_u32_u24", VOP_I32_I32_I32_ARITH, AMDGPUmul_u24>;
defm V_MUL_HI_U32_U24 : VOP2Inst <"v_mul_hi_u32_u24", VOP_I32_I32_I32, AMDGPUmulhi_u24>;
-defm V_MIN_F32 : VOP2Inst <"v_min_f32", VOP_F32_F32_F32, fminnum_like>;
-defm V_MAX_F32 : VOP2Inst <"v_max_f32", VOP_F32_F32_F32, fmaxnum_like>;
+defm V_MIN_F32 : VOP2Inst_VOPD <"v_min_f32", VOP_F32_F32_F32, 0xb, "v_min_f32", fminnum_like>;
+defm V_MAX_F32 : VOP2Inst_VOPD <"v_max_f32", VOP_F32_F32_F32, 0xa, "v_max_f32", fmaxnum_like>;
defm V_MIN_I32 : VOP2Inst <"v_min_i32", VOP_PAT_GEN<VOP_I32_I32_I32>, smin>;
defm V_MAX_I32 : VOP2Inst <"v_max_i32", VOP_PAT_GEN<VOP_I32_I32_I32>, smax>;
defm V_MIN_U32 : VOP2Inst <"v_min_u32", VOP_PAT_GEN<VOP_I32_I32_I32>, umin>;
defm V_MAX_U32 : VOP2Inst <"v_max_u32", VOP_PAT_GEN<VOP_I32_I32_I32>, umax>;
defm V_LSHRREV_B32 : VOP2Inst <"v_lshrrev_b32", VOP_I32_I32_I32, clshr_rev_32, "v_lshr_b32">;
defm V_ASHRREV_I32 : VOP2Inst <"v_ashrrev_i32", VOP_I32_I32_I32, cashr_rev_32, "v_ashr_i32">;
-defm V_LSHLREV_B32 : VOP2Inst <"v_lshlrev_b32", VOP_I32_I32_I32, clshl_rev_32, "v_lshl_b32">;
-defm V_AND_B32 : VOP2Inst <"v_and_b32", VOP_PAT_GEN<VOP_I32_I32_I32>, and>;
+defm V_LSHLREV_B32 : VOP2Inst_VOPD <"v_lshlrev_b32", VOP_I32_I32_I32, 0x11, "v_lshlrev_b32", clshl_rev_32, "v_lshl_b32">;
+defm V_AND_B32 : VOP2Inst_VOPD <"v_and_b32", VOP_PAT_GEN<VOP_I32_I32_I32>, 0x12, "v_and_b32", and>;
defm V_OR_B32 : VOP2Inst <"v_or_b32", VOP_PAT_GEN<VOP_I32_I32_I32>, or>;
defm V_XOR_B32 : VOP2Inst <"v_xor_b32", VOP_PAT_GEN<VOP_I32_I32_I32>, xor>;
} // End isReMaterializable = 1
@@ -536,7 +647,7 @@ defm V_SUBBREV_U32 : VOP2bInst <"v_subbrev_u32", VOP2b_I32_I1_I32_I32_I1, null_f
let SubtargetPredicate = HasAddNoCarryInsts, isReMaterializable = 1 in {
-defm V_ADD_U32 : VOP2Inst <"v_add_u32", VOP_I32_I32_I32_ARITH, null_frag, "v_add_u32", 1>;
+defm V_ADD_U32 : VOP2Inst_VOPD <"v_add_u32", VOP_I32_I32_I32_ARITH, 0x10, "v_add_nc_u32", null_frag, "v_add_u32", 1>;
defm V_SUB_U32 : VOP2Inst <"v_sub_u32", VOP_I32_I32_I32_ARITH, null_frag, "v_sub_u32", 1>;
defm V_SUBREV_U32 : VOP2Inst <"v_subrev_u32", VOP_I32_I32_I32_ARITH, null_frag, "v_sub_u32", 1>;
}
@@ -555,20 +666,20 @@ def V_WRITELANE_B32 : VOP2_Pseudo<"v_writelane_b32", VOP_WRITELANE,
} // End isConvergent = 1
let isReMaterializable = 1 in {
-defm V_BFM_B32 : VOP2Inst <"v_bfm_b32", VOP_NO_EXT<VOP_I32_I32_I32>>;
-defm V_BCNT_U32_B32 : VOP2Inst <"v_bcnt_u32_b32", VOP_NO_EXT<VOP_I32_I32_I32>, add_ctpop>;
-defm V_MBCNT_LO_U32_B32 : VOP2Inst <"v_mbcnt_lo_u32_b32", VOP_NO_EXT<VOP_I32_I32_I32>, int_amdgcn_mbcnt_lo>;
-defm V_MBCNT_HI_U32_B32 : VOP2Inst <"v_mbcnt_hi_u32_b32", VOP_NO_EXT<VOP_I32_I32_I32>, int_amdgcn_mbcnt_hi>;
-defm V_LDEXP_F32 : VOP2Inst <"v_ldexp_f32", VOP_NO_EXT<VOP_F32_F32_I32>, AMDGPUldexp>;
+defm V_BFM_B32 : VOP2Inst <"v_bfm_b32", VOP_I32_I32_I32>;
+defm V_BCNT_U32_B32 : VOP2Inst <"v_bcnt_u32_b32", VOP_I32_I32_I32, add_ctpop>;
+defm V_MBCNT_LO_U32_B32 : VOP2Inst <"v_mbcnt_lo_u32_b32", VOP_I32_I32_I32, int_amdgcn_mbcnt_lo>;
+defm V_MBCNT_HI_U32_B32 : VOP2Inst <"v_mbcnt_hi_u32_b32", VOP_I32_I32_I32, int_amdgcn_mbcnt_hi>;
+defm V_LDEXP_F32 : VOP2Inst <"v_ldexp_f32", VOP_F32_F32_I32, AMDGPUldexp>;
let ReadsModeReg = 0, mayRaiseFPException = 0 in {
-defm V_CVT_PKNORM_I16_F32 : VOP2Inst <"v_cvt_pknorm_i16_f32", VOP_NO_EXT<VOP_V2I16_F32_F32>, AMDGPUpknorm_i16_f32>;
-defm V_CVT_PKNORM_U16_F32 : VOP2Inst <"v_cvt_pknorm_u16_f32", VOP_NO_EXT<VOP_V2I16_F32_F32>, AMDGPUpknorm_u16_f32>;
+defm V_CVT_PKNORM_I16_F32 : VOP2Inst <"v_cvt_pknorm_i16_f32", VOP_V2I16_F32_F32, AMDGPUpknorm_i16_f32>;
+defm V_CVT_PKNORM_U16_F32 : VOP2Inst <"v_cvt_pknorm_u16_f32", VOP_V2I16_F32_F32, AMDGPUpknorm_u16_f32>;
}
-defm V_CVT_PKRTZ_F16_F32 : VOP2Inst <"v_cvt_pkrtz_f16_f32", VOP_NO_EXT<VOP_V2F16_F32_F32>, AMDGPUpkrtz_f16_f32>;
-defm V_CVT_PK_U16_U32 : VOP2Inst <"v_cvt_pk_u16_u32", VOP_NO_EXT<VOP_V2I16_I32_I32>, AMDGPUpk_u16_u32>;
-defm V_CVT_PK_I16_I32 : VOP2Inst <"v_cvt_pk_i16_i32", VOP_NO_EXT<VOP_V2I16_I32_I32>, AMDGPUpk_i16_i32>;
+defm V_CVT_PKRTZ_F16_F32 : VOP2Inst <"v_cvt_pkrtz_f16_f32", VOP_V2F16_F32_F32, AMDGPUpkrtz_f16_f32>;
+defm V_CVT_PK_U16_U32 : VOP2Inst <"v_cvt_pk_u16_u32", VOP_V2I16_I32_I32, AMDGPUpk_u16_u32>;
+defm V_CVT_PK_I16_I32 : VOP2Inst <"v_cvt_pk_i16_i32", VOP_V2I16_I32_I32, AMDGPUpk_i16_i32>;
let SubtargetPredicate = isGFX6GFX7 in {
@@ -641,8 +752,9 @@ def : divergent_i64_BinOp <and, V_AND_B32_e64>;
def : divergent_i64_BinOp <or, V_OR_B32_e64>;
def : divergent_i64_BinOp <xor, V_XOR_B32_e64>;
-let SubtargetPredicate = Has16BitInsts in {
+let SubtargetPredicate = Has16BitInsts in {
+let isReMaterializable = 1 in {
let FPDPRounding = 1 in {
def V_MADMK_F16 : VOP2_Pseudo <"v_madmk_f16", VOP_MADMK_F16, [], "">;
defm V_LDEXP_F16 : VOP2Inst <"v_ldexp_f16", VOP_F16_F16_I32, AMDGPUldexp>;
@@ -664,9 +776,7 @@ def V_MADAK_F16 : VOP2_Pseudo <"v_madak_f16", VOP_MADAK_F16, [], "">;
}
} // End FPDPRounding = 1
-defm V_ADD_U16 : VOP2Inst <"v_add_u16", VOP_I16_I16_I16_ARITH, add>;
-defm V_SUB_U16 : VOP2Inst <"v_sub_u16" , VOP_I16_I16_I16_ARITH, sub>;
-defm V_SUBREV_U16 : VOP2Inst <"v_subrev_u16", VOP_I16_I16_I16_ARITH, null_frag, "v_sub_u16">;
+
defm V_MUL_LO_U16 : VOP2Inst <"v_mul_lo_u16", VOP_I16_I16_I16, mul>;
defm V_MAX_F16 : VOP2Inst <"v_max_f16", VOP_F16_F16_F16, fmaxnum_like>;
defm V_MIN_F16 : VOP2Inst <"v_min_f16", VOP_F16_F16_F16, fminnum_like>;
@@ -675,12 +785,19 @@ defm V_MAX_I16 : VOP2Inst <"v_max_i16", VOP_I16_I16_I16, smax>;
defm V_MIN_U16 : VOP2Inst <"v_min_u16", VOP_I16_I16_I16, umin>;
defm V_MIN_I16 : VOP2Inst <"v_min_i16", VOP_I16_I16_I16, smin>;
-let Constraints = "$vdst = $src2", DisableEncoding="$src2",
- isConvertibleToThreeAddress = 1 in {
-defm V_MAC_F16 : VOP2Inst <"v_mac_f16", VOP_MAC_F16>;
+let SubtargetPredicate = isGFX8GFX9 in {
+ defm V_ADD_U16 : VOP2Inst <"v_add_u16", VOP_I16_I16_I16_ARITH, add>;
+ defm V_SUB_U16 : VOP2Inst <"v_sub_u16" , VOP_I16_I16_I16_ARITH, sub>;
+ defm V_SUBREV_U16 : VOP2Inst <"v_subrev_u16", VOP_I16_I16_I16_ARITH, null_frag, "v_sub_u16">;
}
} // End isCommutable = 1
+} // End isReMaterializable = 1
+// FIXME: Missing FPDPRounding
+let Constraints = "$vdst = $src2", DisableEncoding="$src2",
+ isConvertibleToThreeAddress = 1, isCommutable = 1 in {
+defm V_MAC_F16 : VOP2Inst <"v_mac_f16", VOP_MAC_F16>;
+}
} // End SubtargetPredicate = Has16BitInsts
let SubtargetPredicate = HasDLInsts in {
@@ -722,7 +839,7 @@ let Constraints = "$vdst = $src2",
DisableEncoding = "$src2",
isConvertibleToThreeAddress = 1,
isCommutable = 1 in
-defm V_FMAC_F32 : VOP2Inst <"v_fmac_f32", VOP_MAC_F32>;
+defm V_FMAC_F32 : VOP2Inst_VOPD <"v_fmac_f32", VOP_MAC_F32, 0x0, "v_fmac_f32">;
} // End SubtargetPredicate = HasDLInsts
@@ -750,7 +867,7 @@ let Constraints = "$vdst = $src2",
isCommutable = 1,
IsDOT = 1 in {
let SubtargetPredicate = HasDot5Insts in
- defm V_DOT2C_F32_F16 : VOP2Inst<"v_dot2c_f32_f16", VOP_DOT_ACC_F32_V2F16>;
+ defm V_DOT2C_F32_F16 : VOP2Inst_VOPD<"v_dot2c_f32_f16", VOP_DOT_ACC_F32_V2F16, 0xc, "v_dot2acc_f32_f16">;
let SubtargetPredicate = HasDot6Insts in
defm V_DOT4C_I32_I8 : VOP2Inst<"v_dot4c_i32_i8", VOP_DOT_ACC_I32_I32>;
@@ -788,20 +905,20 @@ let AddedComplexity = 30 in {
} // End AddedComplexity = 30
let SubtargetPredicate = HasFmaakFmamkF32Insts, isReMaterializable = 1 in {
-def V_FMAMK_F32 : VOP2_Pseudo<"v_fmamk_f32", VOP_MADMK_F32, [], "">;
+def V_FMAMK_F32 : VOP2_Pseudo<"v_fmamk_f32", VOP_MADMK_F32, [], "">, VOPD_Component<0x2, "v_fmamk_f32">;
let isCommutable = 1 in
-def V_FMAAK_F32 : VOP2_Pseudo<"v_fmaak_f32", VOP_MADAK_F32, [], "">;
+def V_FMAAK_F32 : VOP2_Pseudo<"v_fmaak_f32", VOP_MADAK_F32, [], "">, VOPD_Component<0x1, "v_fmaak_f32">;
}
let SubtargetPredicate = isGFX10Plus in {
-let FPDPRounding = 1 in {
+let FPDPRounding = 1, isReMaterializable = 1 in {
def V_FMAMK_F16 : VOP2_Pseudo <"v_fmamk_f16", VOP_MADMK_F16, [], "">;
let isCommutable = 1 in
def V_FMAAK_F16 : VOP2_Pseudo <"v_fmaak_f16", VOP_MADAK_F16, [], "">;
-} // End FPDPRounding = 1
+} // End FPDPRounding = 1, isReMaterializable = 1
let Constraints = "$vdst = $src2",
DisableEncoding="$src2",
@@ -857,7 +974,7 @@ def : GCNPat <
>;
}
-let Predicates = [Has16BitInsts] in {
+let Predicates = [Has16BitInsts, isGFX8GFX9] in {
// Undo sub x, c -> add x, -c canonicalization since c is more likely
// an inline immediate than -c.
@@ -867,9 +984,6 @@ def : GCNPat<
(V_SUB_U16_e64 VSrc_b16:$src0, NegSubInlineIntConst16:$src1)
>;
-
-let Predicates = [Has16BitInsts, isGFX7GFX8GFX9] in {
-
def : GCNPat<
(i32 (zext (add i16:$src0, (i16 NegSubInlineIntConst16:$src1)))),
(V_SUB_U16_e64 VSrc_b16:$src0, NegSubInlineIntConst16:$src1)
@@ -885,7 +999,10 @@ defm : Arithmetic_i16_0Hi_Pats<umax, V_MAX_U16_e64>;
defm : Arithmetic_i16_0Hi_Pats<clshl_rev_16, V_LSHLREV_B16_e64>;
defm : Arithmetic_i16_0Hi_Pats<clshr_rev_16, V_LSHRREV_B16_e64>;
defm : Arithmetic_i16_0Hi_Pats<cashr_rev_16, V_ASHRREV_I16_e64>;
-} // End Predicates = [Has16BitInsts, isGFX7GFX8GFX9]
+
+} // End Predicates = [Has16BitInsts, isGFX8GFX9]
+
+let Predicates = [Has16BitInsts] in {
def : ZExt_i16_i1_Pat<zext>;
def : ZExt_i16_i1_Pat<anyext>;
@@ -917,8 +1034,16 @@ def : VOPBinOpClampPat<uaddsat, V_ADD_U16_e64, i16>;
def : VOPBinOpClampPat<usubsat, V_SUB_U16_e64, i16>;
}
+let SubtargetPredicate = isGFX11Plus in {
+ let isCommutable = 1 in {
+ defm V_AND_B16 : VOP2Inst <"v_and_b16", VOP_I16_I16_I16, and>;
+ defm V_OR_B16 : VOP2Inst <"v_or_b16", VOP_I16_I16_I16, or>;
+ defm V_XOR_B16 : VOP2Inst <"v_xor_b16", VOP_I16_I16_I16, xor>;
+ } // End isCommutable = 1
+} // End SubtargetPredicate = isGFX11Plus
+
//===----------------------------------------------------------------------===//
-// Target-specific instruction encodings.
+// DPP Encodings
//===----------------------------------------------------------------------===//
class VOP2_DPP<bits<6> op, VOP2_DPP_Pseudo ps,
@@ -947,10 +1072,10 @@ class Base_VOP2_DPP16<bits<6> op, VOP2_DPP_Pseudo ps,
let OtherPredicates = ps.OtherPredicates;
}
-class VOP2_DPP16<bits<6> op, VOP2_DPP_Pseudo ps,
+class VOP2_DPP16<bits<6> op, VOP2_DPP_Pseudo ps, int subtarget,
string opName = ps.OpName, VOPProfile p = ps.Pfl> :
Base_VOP2_DPP16<op, ps, opName, p>,
- SIMCInstr <ps.PseudoInstr, SIEncodingFamily.GFX10>;
+ SIMCInstr <ps.PseudoInstr, subtarget>;
class VOP2_DPP8<bits<6> op, VOP2_Pseudo ps,
VOPProfile p = ps.Pfl> :
@@ -973,10 +1098,253 @@ class VOP2_DPP8<bits<6> op, VOP2_Pseudo ps,
}
//===----------------------------------------------------------------------===//
+// GFX11.
+//===----------------------------------------------------------------------===//
+
+let AssemblerPredicate = isGFX11Only, DecoderNamespace = "GFX11" in {
+ //===------------------------------- VOP2 -------------------------------===//
+ multiclass VOP2Only_Real_MADK_gfx11<bits<6> op> {
+ def _gfx11 :
+ VOP2_Real<!cast<VOP2_Pseudo>(NAME), SIEncodingFamily.GFX11>,
+ VOP2_MADKe<op{5-0}, !cast<VOP2_Pseudo>(NAME).Pfl>;
+ }
+ multiclass VOP2_Real_e32_gfx11<bits<6> op> {
+ def _e32_gfx11 :
+ VOP2_Real<!cast<VOP2_Pseudo>(NAME#"_e32"), SIEncodingFamily.GFX11>,
+ VOP2e<op{5-0}, !cast<VOP2_Pseudo>(NAME#"_e32").Pfl>;
+ }
+ multiclass VOP2Only_Real_e32_gfx11<bits<6> op> {
+ let IsSingle = 1 in
+ defm NAME: VOP2_Real_e32_gfx11<op>;
+ }
+ multiclass VOP2_Real_e64_gfx11<bits<6> op> {
+ def _e64_gfx11 :
+ VOP3_Real<!cast<VOP3_Pseudo>(NAME#"_e64"), SIEncodingFamily.GFX11>,
+ VOP3e_gfx11<{0, 1, 0, 0, op{5-0}}, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl>;
+ }
+ multiclass VOP2_Real_dpp_gfx11<bits<6> op> {
+ foreach _ = BoolToList<!cast<VOP2_Pseudo>(NAME#"_e32").Pfl.HasExtDPP>.ret in
+ def _dpp_gfx11 : VOP2_DPP16<op, !cast<VOP2_DPP_Pseudo>(NAME#"_dpp"), SIEncodingFamily.GFX11> {
+ let DecoderNamespace = "DPPGFX11";
+ }
+ }
+ multiclass VOP2_Real_dpp8_gfx11<bits<6> op> {
+ foreach _ = BoolToList<!cast<VOP2_Pseudo>(NAME#"_e32").Pfl.HasExtDPP>.ret in
+ def _dpp8_gfx11 : VOP2_DPP8<op, !cast<VOP2_Pseudo>(NAME#"_e32")> {
+ let DecoderNamespace = "DPP8GFX11";
+ }
+ }
+
+ //===------------------------- VOP2 (with name) -------------------------===//
+ multiclass VOP2_Real_e32_with_name_gfx11<bits<6> op, string opName,
+ string asmName, bit single = 0> {
+ defvar ps = !cast<VOP2_Pseudo>(opName#"_e32");
+ def _e32_gfx11 :
+ VOP2_Real<ps, SIEncodingFamily.GFX11, asmName>,
+ VOP2e<op{5-0}, ps.Pfl>,
+ MnemonicAlias<ps.Mnemonic, asmName>, Requires<[isGFX11Plus]> {
+ let AsmString = asmName # ps.AsmOperands;
+ let IsSingle = single;
+ }
+ }
+ multiclass VOP2_Real_e64_with_name_gfx11<bits<6> op, string opName,
+ string asmName> {
+ defvar ps = !cast<VOP3_Pseudo>(opName#"_e64");
+ def _e64_gfx11 :
+ VOP3_Real<ps, SIEncodingFamily.GFX11>,
+ VOP3e_gfx11<{0, 1, 0, 0, op{5-0}}, ps.Pfl>,
+ MnemonicAlias<ps.Mnemonic, asmName>, Requires<[isGFX11Plus]> {
+ let AsmString = asmName # ps.AsmOperands;
+ }
+ }
+
+ multiclass VOP2_Real_dpp_with_name_gfx11<bits<6> op, string opName,
+ string asmName> {
+ defvar ps = !cast<VOP2_Pseudo>(opName#"_e32");
+ foreach _ = BoolToList<ps.Pfl.HasExtDPP>.ret in
+ def _dpp_gfx11 : VOP2_DPP16<op, !cast<VOP2_DPP_Pseudo>(opName#"_dpp"),
+ SIEncodingFamily.GFX11> {
+ let AsmString = asmName # ps.Pfl.AsmDPP16;
+ let DecoderNamespace = "DPPGFX11";
+ }
+ }
+ multiclass VOP2_Real_dpp8_with_name_gfx11<bits<6> op, string opName,
+ string asmName> {
+ defvar ps = !cast<VOP2_Pseudo>(opName#"_e32");
+ foreach _ = BoolToList<ps.Pfl.HasExtDPP>.ret in
+ def _dpp8_gfx11 : VOP2_DPP8<op, ps> {
+ let AsmString = asmName # ps.Pfl.AsmDPP8;
+ let DecoderNamespace = "DPP8GFX11";
+ }
+ }
+
+ //===------------------------------ VOP2be ------------------------------===//
+ multiclass VOP2be_Real_e32_gfx11<bits<6> op, string opName, string asmName> {
+ defvar ps = !cast<VOP2_Pseudo>(opName#"_e32");
+ def _e32_gfx11 :
+ VOP2_Real<ps, SIEncodingFamily.GFX11>,
+ VOP2e<op{5-0}, ps.Pfl> {
+ let AsmString = asmName # !subst(", vcc", "", ps.AsmOperands);
+ }
+ }
+ multiclass VOP2be_Real_dpp_gfx11<bits<6> op, string opName, string asmName> {
+ foreach _ = BoolToList<!cast<VOP2_Pseudo>(opName#"_e32").Pfl.HasExtDPP>.ret in
+ def _dpp_gfx11 :
+ VOP2_DPP16<op, !cast<VOP2_DPP_Pseudo>(opName#"_dpp"), SIEncodingFamily.GFX11, asmName> {
+ string AsmDPP = !cast<VOP2_Pseudo>(opName#"_e32").Pfl.AsmDPP16;
+ let AsmString = asmName # !subst(", vcc", "", AsmDPP);
+ let DecoderNamespace = "DPPGFX11";
+ }
+ foreach _ = BoolToList<!cast<VOP2_Pseudo>(opName#"_e32").Pfl.HasExtDPP>.ret in
+ def _dpp_w32_gfx11 :
+ Base_VOP2_DPP16<op, !cast<VOP2_DPP_Pseudo>(opName#"_dpp"), asmName> {
+ string AsmDPP = !cast<VOP2_Pseudo>(opName#"_e32").Pfl.AsmDPP16;
+ let AsmString = asmName # !subst("vcc", "vcc_lo", AsmDPP);
+ let isAsmParserOnly = 1;
+ let WaveSizePredicate = isWave32;
+ }
+ foreach _ = BoolToList<!cast<VOP2_Pseudo>(opName#"_e32").Pfl.HasExtDPP>.ret in
+ def _dpp_w64_gfx11 :
+ Base_VOP2_DPP16<op, !cast<VOP2_DPP_Pseudo>(opName#"_dpp"), asmName> {
+ string AsmDPP = !cast<VOP2_Pseudo>(opName#"_e32").Pfl.AsmDPP16;
+ let AsmString = asmName # AsmDPP;
+ let isAsmParserOnly = 1;
+ let WaveSizePredicate = isWave64;
+ }
+ }
+ multiclass VOP2be_Real_dpp8_gfx11<bits<6> op, string opName, string asmName> {
+ foreach _ = BoolToList<!cast<VOP2_Pseudo>(opName#"_e32").Pfl.HasExtDPP>.ret in
+ def _dpp8_gfx11 :
+ VOP2_DPP8<op, !cast<VOP2_Pseudo>(opName#"_e32")> {
+ string AsmDPP8 = !cast<VOP2_Pseudo>(opName#"_e32").Pfl.AsmDPP8;
+ let AsmString = asmName # !subst(", vcc", "", AsmDPP8);
+ let DecoderNamespace = "DPP8GFX11";
+ }
+ foreach _ = BoolToList<!cast<VOP2_Pseudo>(opName#"_e32").Pfl.HasExtDPP>.ret in
+ def _dpp8_w32_gfx11 :
+ VOP2_DPP8<op, !cast<VOP2_Pseudo>(opName#"_e32")> {
+ string AsmDPP8 = !cast<VOP2_Pseudo>(opName#"_e32").Pfl.AsmDPP8;
+ let AsmString = asmName # !subst("vcc", "vcc_lo", AsmDPP8);
+ let isAsmParserOnly = 1;
+ let WaveSizePredicate = isWave32;
+ }
+ foreach _ = BoolToList<!cast<VOP2_Pseudo>(opName#"_e32").Pfl.HasExtDPP>.ret in
+ def _dpp8_w64_gfx11 :
+ VOP2_DPP8<op, !cast<VOP2_Pseudo>(opName#"_e32")> {
+ string AsmDPP8 = !cast<VOP2_Pseudo>(opName#"_e32").Pfl.AsmDPP8;
+ let AsmString = asmName # AsmDPP8;
+ let isAsmParserOnly = 1;
+ let WaveSizePredicate = isWave64;
+ }
+ }
+
+} // End AssemblerPredicate = isGFX11Only, DecoderNamespace = "GFX11"
+
+// We don't want to override separate DecoderNamespaces within these multiclasses.
+multiclass VOP2_Realtriple_e64_gfx11<bits<6> op> {
+ defm NAME : VOP3_Realtriple_gfx11<{0, 1, 0, 0, op{5-0}}, /*isSingle=*/ 0, NAME> ;
+}
+multiclass VOP2_Realtriple_e64_with_name_gfx11<bits<6> op, string opName,
+ string asmName> {
+ defm NAME : VOP3_Realtriple_with_name_gfx11<{0, 1, 0, 0, op{5-0}}, opName, asmName> ;
+}
+
+multiclass VOP2be_Real_gfx11<bits<6> op, string opName, string asmName> :
+ VOP2be_Real_e32_gfx11<op, opName, asmName>,
+ VOP3be_Realtriple_gfx11<{0, 1, 0, 0, op{5-0}}, /*isSingle=*/ 0, opName, asmName>,
+ VOP2be_Real_dpp_gfx11<op, opName, asmName>,
+ VOP2be_Real_dpp8_gfx11<op, opName, asmName>;
+
+// Only for CNDMASK
+multiclass VOP2e_Real_gfx11<bits<6> op, string opName, string asmName> :
+ VOP2_Real_e32_gfx11<op>,
+ VOP2_Realtriple_e64_gfx11<op>,
+ VOP2be_Real_dpp_gfx11<op, opName, asmName>,
+ VOP2be_Real_dpp8_gfx11<op, opName, asmName>;
+
+multiclass VOP2Only_Real_gfx11<bits<6> op> :
+ VOP2Only_Real_e32_gfx11<op>,
+ VOP2_Real_dpp_gfx11<op>,
+ VOP2_Real_dpp8_gfx11<op>;
+
+multiclass VOP2_Real_NO_VOP3_gfx11<bits<6> op> :
+ VOP2_Real_e32_gfx11<op>, VOP2_Real_dpp_gfx11<op>, VOP2_Real_dpp8_gfx11<op>;
+
+multiclass VOP2_Real_FULL_gfx11<bits<6> op> :
+ VOP2_Realtriple_e64_gfx11<op>, VOP2_Real_NO_VOP3_gfx11<op>;
+
+multiclass VOP2_Real_NO_VOP3_with_name_gfx11<bits<6> op, string opName,
+ string asmName, bit isSingle = 0> :
+ VOP2_Real_e32_with_name_gfx11<op, opName, asmName, isSingle>,
+ VOP2_Real_dpp_with_name_gfx11<op, opName, asmName>,
+ VOP2_Real_dpp8_with_name_gfx11<op, opName, asmName>;
+
+multiclass VOP2_Real_FULL_with_name_gfx11<bits<6> op, string opName,
+ string asmName> :
+ VOP2_Realtriple_e64_with_name_gfx11<op, opName, asmName>,
+ VOP2_Real_NO_VOP3_with_name_gfx11<op, opName, asmName>;
+
+multiclass VOP2_Real_NO_DPP_gfx11<bits<6> op> :
+ VOP2_Real_e32_gfx11<op>, VOP2_Real_e64_gfx11<op>;
+
+multiclass VOP2_Real_NO_DPP_with_name_gfx11<bits<6> op, string opName,
+ string asmName> :
+ VOP2_Real_e32_with_name_gfx11<op, opName, asmName>,
+ VOP2_Real_e64_with_name_gfx11<op, opName, asmName>;
+
+defm V_CNDMASK_B32 : VOP2e_Real_gfx11<0x001, "V_CNDMASK_B32",
+ "v_cndmask_b32">;
+defm V_DOT2ACC_F32_F16 : VOP2_Real_NO_VOP3_with_name_gfx11<0x002,
+ "V_DOT2C_F32_F16", "v_dot2acc_f32_f16", 1>;
+defm V_FMAC_DX9_ZERO_F32 : VOP2_Real_NO_DPP_with_name_gfx11<0x006,
+ "V_FMAC_LEGACY_F32", "v_fmac_dx9_zero_f32">;
+defm V_MUL_DX9_ZERO_F32 : VOP2_Real_FULL_with_name_gfx11<0x007,
+ "V_MUL_LEGACY_F32", "v_mul_dx9_zero_f32">;
+defm V_LSHLREV_B32 : VOP2_Real_FULL_gfx11<0x018>;
+defm V_LSHRREV_B32 : VOP2_Real_FULL_gfx11<0x019>;
+defm V_ASHRREV_I32 : VOP2_Real_FULL_gfx11<0x01a>;
+defm V_ADD_CO_CI_U32 :
+ VOP2be_Real_gfx11<0x020, "V_ADDC_U32", "v_add_co_ci_u32">;
+defm V_SUB_CO_CI_U32 :
+ VOP2be_Real_gfx11<0x021, "V_SUBB_U32", "v_sub_co_ci_u32">;
+defm V_SUBREV_CO_CI_U32 :
+ VOP2be_Real_gfx11<0x022, "V_SUBBREV_U32", "v_subrev_co_ci_u32">;
+
+defm V_CVT_PK_RTZ_F16_F32 : VOP2_Real_FULL_with_name_gfx11<0x02f,
+ "V_CVT_PKRTZ_F16_F32", "v_cvt_pk_rtz_f16_f32">;
+defm V_PK_FMAC_F16 : VOP2Only_Real_gfx11<0x03c>;
+
+// VOP3 only.
+defm V_CNDMASK_B16 : VOP3Only_Realtriple_gfx11<0x25d>;
+defm V_LDEXP_F32 : VOP3Only_Realtriple_gfx11<0x31c>;
+defm V_BFM_B32 : VOP3Only_Realtriple_gfx11<0x31d>;
+defm V_BCNT_U32_B32 : VOP3Only_Realtriple_gfx11<0x31e>;
+defm V_MBCNT_LO_U32_B32 : VOP3Only_Realtriple_gfx11<0x31f>;
+defm V_MBCNT_HI_U32_B32 : VOP3Only_Realtriple_gfx11<0x320>;
+defm V_CVT_PKNORM_I16_F32 : VOP3Only_Realtriple_gfx11<0x321>;
+defm V_CVT_PKNORM_U16_F32 : VOP3Only_Realtriple_gfx11<0x322>;
+defm V_CVT_PK_U16_U32 : VOP3Only_Realtriple_gfx11<0x323>;
+defm V_CVT_PK_I16_I32 : VOP3Only_Realtriple_gfx11<0x324>;
+defm V_ADD_CO_U32 : VOP3beOnly_Realtriple_gfx11<0x300>;
+defm V_SUB_CO_U32 : VOP3beOnly_Realtriple_gfx11<0x301>;
+defm V_SUBREV_CO_U32 : VOP3beOnly_Realtriple_gfx11<0x302>;
+
+let SubtargetPredicate = isGFX11Plus in {
+ defm : VOP2eInstAliases<V_CNDMASK_B32_e32, V_CNDMASK_B32_e32_gfx11>;
+
+ defm : VOP2bInstAliases<
+ V_ADDC_U32_e32, V_ADD_CO_CI_U32_e32_gfx11, "v_add_co_ci_u32">;
+ defm : VOP2bInstAliases<
+ V_SUBB_U32_e32, V_SUB_CO_CI_U32_e32_gfx11, "v_sub_co_ci_u32">;
+ defm : VOP2bInstAliases<
+ V_SUBBREV_U32_e32, V_SUBREV_CO_CI_U32_e32_gfx11, "v_subrev_co_ci_u32">;
+} // End SubtargetPredicate = isGFX11Plus
+
+//===----------------------------------------------------------------------===//
// GFX10.
//===----------------------------------------------------------------------===//
-let AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" in {
+let AssemblerPredicate = isGFX10Only, DecoderNamespace = "GFX10" in {
//===------------------------------- VOP2 -------------------------------===//
multiclass VOP2Only_Real_MADK_gfx10<bits<6> op> {
def _gfx10 :
@@ -1011,13 +1379,13 @@ let AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" in {
}
}
multiclass VOP2_Real_dpp_gfx10<bits<6> op> {
- foreach _ = BoolToList<!cast<VOP2_Pseudo>(NAME#"_e32").Pfl.HasExtDPP>.ret in
- def _dpp_gfx10 : VOP2_DPP16<op, !cast<VOP2_DPP_Pseudo>(NAME#"_dpp")> {
+ foreach _ = BoolToList<!cast<VOP2_Pseudo>(NAME#"_e32").Pfl.HasExt32BitDPP>.ret in
+ def _dpp_gfx10 : VOP2_DPP16<op, !cast<VOP2_DPP_Pseudo>(NAME#"_dpp"), SIEncodingFamily.GFX10> {
let DecoderNamespace = "SDWA10";
}
}
multiclass VOP2_Real_dpp8_gfx10<bits<6> op> {
- foreach _ = BoolToList<!cast<VOP2_Pseudo>(NAME#"_e32").Pfl.HasExtDPP>.ret in
+ foreach _ = BoolToList<!cast<VOP2_Pseudo>(NAME#"_e32").Pfl.HasExt32BitDPP>.ret in
def _dpp8_gfx10 : VOP2_DPP8<op, !cast<VOP2_Pseudo>(NAME#"_e32")> {
let DecoderNamespace = "DPP8";
}
@@ -1056,15 +1424,15 @@ let AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" in {
}
multiclass VOP2_Real_dpp_gfx10_with_name<bits<6> op, string opName,
string asmName> {
- foreach _ = BoolToList<!cast<VOP2_Pseudo>(opName#"_e32").Pfl.HasExtDPP>.ret in
- def _dpp_gfx10 : VOP2_DPP16<op, !cast<VOP2_DPP_Pseudo>(opName#"_dpp")> {
+ foreach _ = BoolToList<!cast<VOP2_Pseudo>(opName#"_e32").Pfl.HasExt32BitDPP>.ret in
+ def _dpp_gfx10 : VOP2_DPP16<op, !cast<VOP2_DPP_Pseudo>(opName#"_dpp"), SIEncodingFamily.GFX10> {
VOP2_Pseudo ps = !cast<VOP2_Pseudo>(opName#"_e32");
let AsmString = asmName # ps.Pfl.AsmDPP16;
}
}
multiclass VOP2_Real_dpp8_gfx10_with_name<bits<6> op, string opName,
string asmName> {
- foreach _ = BoolToList<!cast<VOP2_Pseudo>(opName#"_e32").Pfl.HasExtDPP>.ret in
+ foreach _ = BoolToList<!cast<VOP2_Pseudo>(opName#"_e32").Pfl.HasExt32BitDPP>.ret in
def _dpp8_gfx10 : VOP2_DPP8<op, !cast<VOP2_Pseudo>(opName#"_e32")> {
VOP2_Pseudo ps = !cast<VOP2_Pseudo>(opName#"_e32");
let AsmString = asmName # ps.Pfl.AsmDPP8;
@@ -1122,14 +1490,14 @@ let AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" in {
}
}
multiclass VOP2be_Real_dpp_gfx10<bits<6> op, string opName, string asmName> {
- foreach _ = BoolToList<!cast<VOP2_Pseudo>(opName#"_e32").Pfl.HasExtDPP>.ret in
+ foreach _ = BoolToList<!cast<VOP2_Pseudo>(opName#"_e32").Pfl.HasExt32BitDPP>.ret in
def _dpp_gfx10 :
- VOP2_DPP16<op, !cast<VOP2_DPP_Pseudo>(opName#"_dpp"), asmName> {
+ VOP2_DPP16<op, !cast<VOP2_DPP_Pseudo>(opName#"_dpp"), SIEncodingFamily.GFX10, asmName> {
string AsmDPP = !cast<VOP2_Pseudo>(opName#"_e32").Pfl.AsmDPP16;
let AsmString = asmName # !subst(", vcc", "", AsmDPP);
let DecoderNamespace = "SDWA10";
}
- foreach _ = BoolToList<!cast<VOP2_Pseudo>(opName#"_e32").Pfl.HasExtDPP>.ret in
+ foreach _ = BoolToList<!cast<VOP2_Pseudo>(opName#"_e32").Pfl.HasExt32BitDPP>.ret in
def _dpp_w32_gfx10 :
Base_VOP2_DPP16<op, !cast<VOP2_DPP_Pseudo>(opName#"_dpp"), asmName> {
string AsmDPP = !cast<VOP2_Pseudo>(opName#"_e32").Pfl.AsmDPP16;
@@ -1137,7 +1505,7 @@ let AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" in {
let isAsmParserOnly = 1;
let WaveSizePredicate = isWave32;
}
- foreach _ = BoolToList<!cast<VOP2_Pseudo>(opName#"_e32").Pfl.HasExtDPP>.ret in
+ foreach _ = BoolToList<!cast<VOP2_Pseudo>(opName#"_e32").Pfl.HasExt32BitDPP>.ret in
def _dpp_w64_gfx10 :
Base_VOP2_DPP16<op, !cast<VOP2_DPP_Pseudo>(opName#"_dpp"), asmName> {
string AsmDPP = !cast<VOP2_Pseudo>(opName#"_e32").Pfl.AsmDPP16;
@@ -1147,14 +1515,14 @@ let AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" in {
}
}
multiclass VOP2be_Real_dpp8_gfx10<bits<6> op, string opName, string asmName> {
- foreach _ = BoolToList<!cast<VOP2_Pseudo>(opName#"_e32").Pfl.HasExtDPP>.ret in
+ foreach _ = BoolToList<!cast<VOP2_Pseudo>(opName#"_e32").Pfl.HasExt32BitDPP>.ret in
def _dpp8_gfx10 :
VOP2_DPP8<op, !cast<VOP2_Pseudo>(opName#"_e32")> {
string AsmDPP8 = !cast<VOP2_Pseudo>(opName#"_e32").Pfl.AsmDPP8;
let AsmString = asmName # !subst(", vcc", "", AsmDPP8);
let DecoderNamespace = "DPP8";
}
- foreach _ = BoolToList<!cast<VOP2_Pseudo>(opName#"_e32").Pfl.HasExtDPP>.ret in
+ foreach _ = BoolToList<!cast<VOP2_Pseudo>(opName#"_e32").Pfl.HasExt32BitDPP>.ret in
def _dpp8_w32_gfx10 :
VOP2_DPP8<op, !cast<VOP2_Pseudo>(opName#"_e32")> {
string AsmDPP8 = !cast<VOP2_Pseudo>(opName#"_e32").Pfl.AsmDPP8;
@@ -1162,7 +1530,7 @@ let AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" in {
let isAsmParserOnly = 1;
let WaveSizePredicate = isWave32;
}
- foreach _ = BoolToList<!cast<VOP2_Pseudo>(opName#"_e32").Pfl.HasExtDPP>.ret in
+ foreach _ = BoolToList<!cast<VOP2_Pseudo>(opName#"_e32").Pfl.HasExt32BitDPP>.ret in
def _dpp8_w64_gfx10 :
VOP2_DPP8<op, !cast<VOP2_Pseudo>(opName#"_e32")> {
string AsmDPP8 = !cast<VOP2_Pseudo>(opName#"_e32").Pfl.AsmDPP8;
@@ -1189,7 +1557,10 @@ let AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" in {
let IsSingle = 1;
}
}
-} // End AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10"
+} // End AssemblerPredicate = isGFX10Only, DecoderNamespace = "GFX10"
+
+multiclass VOP2Only_Real_MADK_gfx10_gfx11<bits<6> op> :
+ VOP2Only_Real_MADK_gfx10<op>, VOP2Only_Real_MADK_gfx11<op>;
multiclass VOP2be_Real_gfx10<bits<6> op, string opName, string asmName> :
VOP2be_Real_e32_gfx10<op, opName, asmName>,
@@ -1209,7 +1580,10 @@ multiclass VOP2_Real_gfx10<bits<6> op> :
VOP2_Real_e32_gfx10<op>, VOP2_Real_e64_gfx10<op>,
VOP2_Real_sdwa_gfx10<op>, VOP2_Real_dpp_gfx10<op>, VOP2_Real_dpp8_gfx10<op>;
-multiclass VOP2_Real_gfx10_with_name<bits<6> op, string opName,
+multiclass VOP2_Real_gfx10_gfx11<bits<6> op> :
+ VOP2_Real_gfx10<op>, VOP2_Real_FULL_gfx11<op>;
+
+multiclass VOP2_Real_with_name_gfx10<bits<6> op, string opName,
string asmName> :
VOP2_Real_e32_gfx10_with_name<op, opName, asmName>,
VOP2_Real_e64_gfx10_with_name<op, opName, asmName>,
@@ -1217,36 +1591,41 @@ multiclass VOP2_Real_gfx10_with_name<bits<6> op, string opName,
VOP2_Real_dpp_gfx10_with_name<op, opName, asmName>,
VOP2_Real_dpp8_gfx10_with_name<op, opName, asmName>;
+multiclass VOP2_Real_with_name_gfx10_gfx11<bits<6> op, string opName,
+ string asmName> :
+ VOP2_Real_with_name_gfx10<op, opName, asmName>,
+ VOP2_Real_FULL_with_name_gfx11<op, opName, asmName>;
+
// NB: Same opcode as v_mac_legacy_f32
let DecoderNamespace = "GFX10_B" in
defm V_FMAC_LEGACY_F32 : VOP2_Real_gfx10<0x006>;
-defm V_XNOR_B32 : VOP2_Real_gfx10<0x01e>;
-defm V_FMAC_F32 : VOP2_Real_gfx10<0x02b>;
-defm V_FMAMK_F32 : VOP2Only_Real_MADK_gfx10<0x02c>;
-defm V_FMAAK_F32 : VOP2Only_Real_MADK_gfx10<0x02d>;
-defm V_ADD_F16 : VOP2_Real_gfx10<0x032>;
-defm V_SUB_F16 : VOP2_Real_gfx10<0x033>;
-defm V_SUBREV_F16 : VOP2_Real_gfx10<0x034>;
-defm V_MUL_F16 : VOP2_Real_gfx10<0x035>;
-defm V_FMAC_F16 : VOP2_Real_gfx10<0x036>;
-defm V_FMAMK_F16 : VOP2Only_Real_MADK_gfx10<0x037>;
-defm V_FMAAK_F16 : VOP2Only_Real_MADK_gfx10<0x038>;
-defm V_MAX_F16 : VOP2_Real_gfx10<0x039>;
-defm V_MIN_F16 : VOP2_Real_gfx10<0x03a>;
-defm V_LDEXP_F16 : VOP2_Real_gfx10<0x03b>;
+defm V_XNOR_B32 : VOP2_Real_gfx10_gfx11<0x01e>;
+defm V_FMAC_F32 : VOP2_Real_gfx10_gfx11<0x02b>;
+defm V_FMAMK_F32 : VOP2Only_Real_MADK_gfx10_gfx11<0x02c>;
+defm V_FMAAK_F32 : VOP2Only_Real_MADK_gfx10_gfx11<0x02d>;
+defm V_ADD_F16 : VOP2_Real_gfx10_gfx11<0x032>;
+defm V_SUB_F16 : VOP2_Real_gfx10_gfx11<0x033>;
+defm V_SUBREV_F16 : VOP2_Real_gfx10_gfx11<0x034>;
+defm V_MUL_F16 : VOP2_Real_gfx10_gfx11<0x035>;
+defm V_FMAC_F16 : VOP2_Real_gfx10_gfx11<0x036>;
+defm V_FMAMK_F16 : VOP2Only_Real_MADK_gfx10_gfx11<0x037>;
+defm V_FMAAK_F16 : VOP2Only_Real_MADK_gfx10_gfx11<0x038>;
+defm V_MAX_F16 : VOP2_Real_gfx10_gfx11<0x039>;
+defm V_MIN_F16 : VOP2_Real_gfx10_gfx11<0x03a>;
+defm V_LDEXP_F16 : VOP2_Real_gfx10_gfx11<0x03b>;
let IsSingle = 1 in {
-defm V_PK_FMAC_F16 : VOP2_Real_e32_gfx10<0x03c>;
+ defm V_PK_FMAC_F16 : VOP2_Real_e32_gfx10<0x03c>;
}
// VOP2 no carry-in, carry-out.
defm V_ADD_NC_U32 :
- VOP2_Real_gfx10_with_name<0x025, "V_ADD_U32", "v_add_nc_u32">;
+ VOP2_Real_with_name_gfx10_gfx11<0x025, "V_ADD_U32", "v_add_nc_u32">;
defm V_SUB_NC_U32 :
- VOP2_Real_gfx10_with_name<0x026, "V_SUB_U32", "v_sub_nc_u32">;
+ VOP2_Real_with_name_gfx10_gfx11<0x026, "V_SUB_U32", "v_sub_nc_u32">;
defm V_SUBREV_NC_U32 :
- VOP2_Real_gfx10_with_name<0x027, "V_SUBREV_U32", "v_subrev_nc_u32">;
+ VOP2_Real_with_name_gfx10_gfx11<0x027, "V_SUBREV_U32", "v_subrev_nc_u32">;
// VOP2 carry-in, carry-out.
defm V_ADD_CO_CI_U32 :
@@ -1275,7 +1654,7 @@ defm V_ADD_CO_U32 : VOP3beOnly_Real_gfx10<0x30f>;
defm V_SUB_CO_U32 : VOP3beOnly_Real_gfx10<0x310>;
defm V_SUBREV_CO_U32 : VOP3beOnly_Real_gfx10<0x319>;
-let SubtargetPredicate = isGFX10Plus in {
+let SubtargetPredicate = isGFX10Only in {
defm : VOP2eInstAliases<V_CNDMASK_B32_e32, V_CNDMASK_B32_e32_gfx10>;
defm : VOP2bInstAliases<
@@ -1284,10 +1663,10 @@ let SubtargetPredicate = isGFX10Plus in {
V_SUBB_U32_e32, V_SUB_CO_CI_U32_e32_gfx10, "v_sub_co_ci_u32">;
defm : VOP2bInstAliases<
V_SUBBREV_U32_e32, V_SUBREV_CO_CI_U32_e32_gfx10, "v_subrev_co_ci_u32">;
-} // End SubtargetPredicate = isGFX10Plus
+} // End SubtargetPredicate = isGFX10Only
//===----------------------------------------------------------------------===//
-// GFX6, GFX7, GFX10.
+// GFX6, GFX7, GFX10, GFX11.
//===----------------------------------------------------------------------===//
class VOP2_DPPe <bits<6> op, VOP2_DPP_Pseudo ps, VOPProfile P = ps.Pfl> :
@@ -1338,6 +1717,9 @@ multiclass VOP2_Real_gfx6_gfx7<bits<6> op> :
multiclass VOP2_Real_gfx6_gfx7_gfx10<bits<6> op> :
VOP2_Real_gfx6_gfx7<op>, VOP2_Real_gfx10<op>;
+multiclass VOP2_Real_gfx6_gfx7_gfx10_gfx11<bits<6> op> :
+ VOP2_Real_gfx6_gfx7_gfx10<op>, VOP2_Real_FULL_gfx11<op>;
+
multiclass VOP2be_Real_gfx6_gfx7<bits<6> op> :
VOP2_Real_e32_gfx6_gfx7<op>, VOP2be_Real_e64_gfx6_gfx7<op>;
@@ -1398,28 +1780,28 @@ let SubtargetPredicate = isGFX6GFX7 in {
def : VOP2e64InstAlias<V_SUBREV_CO_U32_e64, V_SUBREV_I32_e64_gfx6_gfx7>;
} // End SubtargetPredicate = isGFX6GFX7
-defm V_ADD_F32 : VOP2_Real_gfx6_gfx7_gfx10<0x003>;
-defm V_SUB_F32 : VOP2_Real_gfx6_gfx7_gfx10<0x004>;
-defm V_SUBREV_F32 : VOP2_Real_gfx6_gfx7_gfx10<0x005>;
+defm V_ADD_F32 : VOP2_Real_gfx6_gfx7_gfx10_gfx11<0x003>;
+defm V_SUB_F32 : VOP2_Real_gfx6_gfx7_gfx10_gfx11<0x004>;
+defm V_SUBREV_F32 : VOP2_Real_gfx6_gfx7_gfx10_gfx11<0x005>;
defm V_MAC_LEGACY_F32 : VOP2_Real_gfx6_gfx7_gfx10<0x006>;
defm V_MUL_LEGACY_F32 : VOP2_Real_gfx6_gfx7_gfx10<0x007>;
-defm V_MUL_F32 : VOP2_Real_gfx6_gfx7_gfx10<0x008>;
-defm V_MUL_I32_I24 : VOP2_Real_gfx6_gfx7_gfx10<0x009>;
-defm V_MUL_HI_I32_I24 : VOP2_Real_gfx6_gfx7_gfx10<0x00a>;
-defm V_MUL_U32_U24 : VOP2_Real_gfx6_gfx7_gfx10<0x00b>;
-defm V_MUL_HI_U32_U24 : VOP2_Real_gfx6_gfx7_gfx10<0x00c>;
-defm V_MIN_F32 : VOP2_Real_gfx6_gfx7_gfx10<0x00f>;
-defm V_MAX_F32 : VOP2_Real_gfx6_gfx7_gfx10<0x010>;
-defm V_MIN_I32 : VOP2_Real_gfx6_gfx7_gfx10<0x011>;
-defm V_MAX_I32 : VOP2_Real_gfx6_gfx7_gfx10<0x012>;
-defm V_MIN_U32 : VOP2_Real_gfx6_gfx7_gfx10<0x013>;
-defm V_MAX_U32 : VOP2_Real_gfx6_gfx7_gfx10<0x014>;
+defm V_MUL_F32 : VOP2_Real_gfx6_gfx7_gfx10_gfx11<0x008>;
+defm V_MUL_I32_I24 : VOP2_Real_gfx6_gfx7_gfx10_gfx11<0x009>;
+defm V_MUL_HI_I32_I24 : VOP2_Real_gfx6_gfx7_gfx10_gfx11<0x00a>;
+defm V_MUL_U32_U24 : VOP2_Real_gfx6_gfx7_gfx10_gfx11<0x00b>;
+defm V_MUL_HI_U32_U24 : VOP2_Real_gfx6_gfx7_gfx10_gfx11<0x00c>;
+defm V_MIN_F32 : VOP2_Real_gfx6_gfx7_gfx10_gfx11<0x00f>;
+defm V_MAX_F32 : VOP2_Real_gfx6_gfx7_gfx10_gfx11<0x010>;
+defm V_MIN_I32 : VOP2_Real_gfx6_gfx7_gfx10_gfx11<0x011>;
+defm V_MAX_I32 : VOP2_Real_gfx6_gfx7_gfx10_gfx11<0x012>;
+defm V_MIN_U32 : VOP2_Real_gfx6_gfx7_gfx10_gfx11<0x013>;
+defm V_MAX_U32 : VOP2_Real_gfx6_gfx7_gfx10_gfx11<0x014>;
defm V_LSHRREV_B32 : VOP2_Real_gfx6_gfx7_gfx10<0x016>;
defm V_ASHRREV_I32 : VOP2_Real_gfx6_gfx7_gfx10<0x018>;
defm V_LSHLREV_B32 : VOP2_Real_gfx6_gfx7_gfx10<0x01a>;
-defm V_AND_B32 : VOP2_Real_gfx6_gfx7_gfx10<0x01b>;
-defm V_OR_B32 : VOP2_Real_gfx6_gfx7_gfx10<0x01c>;
-defm V_XOR_B32 : VOP2_Real_gfx6_gfx7_gfx10<0x01d>;
+defm V_AND_B32 : VOP2_Real_gfx6_gfx7_gfx10_gfx11<0x01b>;
+defm V_OR_B32 : VOP2_Real_gfx6_gfx7_gfx10_gfx11<0x01c>;
+defm V_XOR_B32 : VOP2_Real_gfx6_gfx7_gfx10_gfx11<0x01d>;
defm V_MAC_F32 : VOP2_Real_gfx6_gfx7_gfx10<0x01f>;
defm V_CVT_PKRTZ_F16_F32 : VOP2_Real_gfx6_gfx7_gfx10<0x02f>;
defm V_MADMK_F32 : VOP2Only_Real_MADK_gfx6_gfx7_gfx10<0x020>;
@@ -1436,6 +1818,13 @@ multiclass VOP2_Real_MADK_vi <bits<6> op> {
VOP2_MADKe<op{5-0}, !cast<VOP2_Pseudo>(NAME).Pfl>;
}
+multiclass VOP2_Real_MADK_gfx940 <bits<6> op> {
+ def _gfx940 : VOP2_Real<!cast<VOP2_Pseudo>(NAME), SIEncodingFamily.GFX940>,
+ VOP2_MADKe<op{5-0}, !cast<VOP2_Pseudo>(NAME).Pfl> {
+ let DecoderNamespace = "GFX9";
+ }
+}
+
multiclass VOP2_Real_e32_vi <bits<6> op> {
def _e32_vi :
VOP2_Real<!cast<VOP2_Pseudo>(NAME#"_e32"), SIEncodingFamily.VI>,
@@ -1736,6 +2125,11 @@ let SubtargetPredicate = isGFX90APlus in {
}
} // End SubtargetPredicate = isGFX90APlus
+let SubtargetPredicate = HasFmaakFmamkF32Insts in {
+defm V_FMAMK_F32 : VOP2_Real_MADK_gfx940 <0x17>;
+defm V_FMAAK_F32 : VOP2_Real_MADK_gfx940 <0x18>;
+}
+
multiclass VOP2_Real_DOT_ACC_gfx9<bits<6> op> : VOP2_Real_e32_vi<op> {
def _dpp_vi : VOP2_DPP<op, !cast<VOP2_DPP_Pseudo>(NAME#"_dpp")>;
}
diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
index 494e3aeb6d55..dddd0aacc140 100644
--- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
@@ -6,191 +6,25 @@
//
//===----------------------------------------------------------------------===//
-//===----------------------------------------------------------------------===//
-// VOP3 Classes
-//===----------------------------------------------------------------------===//
-
-class getVOP3ModPat<VOPProfile P, SDPatternOperator node> {
- dag src0 = !if(P.HasOMod,
- (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod),
- (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp));
-
- list<dag> ret3 = [(set P.DstVT:$vdst,
- (DivergentFragOrOp<node, P>.ret (P.Src0VT src0),
- (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers)),
- (P.Src2VT (VOP3Mods P.Src2VT:$src2, i32:$src2_modifiers))))];
-
- list<dag> ret2 = [(set P.DstVT:$vdst,
- (DivergentFragOrOp<node, P>.ret (P.Src0VT src0),
- (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers))))];
-
- list<dag> ret1 = [(set P.DstVT:$vdst,
- (DivergentFragOrOp<node, P>.ret (P.Src0VT src0)))];
-
- list<dag> ret = !if(!eq(P.NumSrcArgs, 3), ret3,
- !if(!eq(P.NumSrcArgs, 2), ret2,
- ret1));
-}
-
-class getVOP3PModPat<VOPProfile P, SDPatternOperator node, bit HasExplicitClamp> {
- dag src0_dag = (P.Src0VT (VOP3PMods P.Src0VT:$src0, i32:$src0_modifiers));
- dag src1_dag = (P.Src1VT (VOP3PMods P.Src1VT:$src1, i32:$src1_modifiers));
- dag src2_dag = (P.Src2VT (VOP3PMods P.Src2VT:$src2, i32:$src2_modifiers));
- dag clamp_dag = (i1 timm:$clamp);
-
- list<dag> ret3 = [(set P.DstVT:$vdst,
- !if(HasExplicitClamp,
- (DivergentFragOrOp<node, P>.ret src0_dag, src1_dag, src2_dag, clamp_dag),
- (DivergentFragOrOp<node, P>.ret src0_dag, src1_dag, src2_dag)))];
-
- list<dag> ret2 = [(set P.DstVT:$vdst,
- !if(HasExplicitClamp,
- (DivergentFragOrOp<node, P>.ret src0_dag, src1_dag, clamp_dag),
- (DivergentFragOrOp<node, P>.ret src0_dag, src1_dag)))];
-
- list<dag> ret1 = [(set P.DstVT:$vdst,
- !if(HasExplicitClamp,
- (DivergentFragOrOp<node, P>.ret src0_dag, clamp_dag),
- (DivergentFragOrOp<node, P>.ret src0_dag)))];
-
- list<dag> ret = !if(!eq(P.NumSrcArgs, 3), ret3,
- !if(!eq(P.NumSrcArgs, 2), ret2,
- ret1));
-}
-
-class getVOP3OpSelPat<VOPProfile P, SDPatternOperator node> {
- list<dag> ret3 = [(set P.DstVT:$vdst,
- (DivergentFragOrOp<node, P>.ret (P.Src0VT (VOP3OpSel P.Src0VT:$src0, i32:$src0_modifiers)),
- (P.Src1VT (VOP3OpSel P.Src1VT:$src1, i32:$src1_modifiers)),
- (P.Src2VT (VOP3OpSel P.Src2VT:$src2, i32:$src2_modifiers))))];
-
- list<dag> ret2 = [(set P.DstVT:$vdst,
- (DivergentFragOrOp<node, P>.ret (P.Src0VT (VOP3OpSel P.Src0VT:$src0, i32:$src0_modifiers)),
- (P.Src1VT (VOP3OpSel P.Src1VT:$src1, i32:$src1_modifiers))))];
-
- list<dag> ret1 = [(set P.DstVT:$vdst,
- (DivergentFragOrOp<node, P>.ret (P.Src0VT (VOP3OpSel P.Src0VT:$src0, i32:$src0_modifiers))))];
-
- list<dag> ret = !if(!eq(P.NumSrcArgs, 3), ret3,
- !if(!eq(P.NumSrcArgs, 2), ret2,
- ret1));
-}
-
-class getVOP3OpSelModPat<VOPProfile P, SDPatternOperator node> {
- list<dag> ret3 = [(set P.DstVT:$vdst,
- (DivergentFragOrOp<node, P>.ret (P.Src0VT !if(P.HasClamp, (VOP3OpSelMods P.Src0VT:$src0, i32:$src0_modifiers),
- (VOP3OpSelMods P.Src0VT:$src0, i32:$src0_modifiers))),
- (P.Src1VT (VOP3OpSelMods P.Src1VT:$src1, i32:$src1_modifiers)),
- (P.Src2VT (VOP3OpSelMods P.Src2VT:$src2, i32:$src2_modifiers))))];
-
- list<dag> ret2 = [(set P.DstVT:$vdst,
- (DivergentFragOrOp<node, P>.ret !if(P.HasClamp, (P.Src0VT (VOP3OpSelMods P.Src0VT:$src0, i32:$src0_modifiers)),
- (P.Src0VT (VOP3OpSelMods P.Src0VT:$src0, i32:$src0_modifiers))),
- (P.Src1VT (VOP3OpSelMods P.Src1VT:$src1, i32:$src1_modifiers))))];
-
- list<dag> ret1 = [(set P.DstVT:$vdst,
- (DivergentFragOrOp<node, P>.ret (P.Src0VT (VOP3OpSelMods P.Src0VT:$src0, i32:$src0_modifiers))))];
-
- list<dag> ret = !if(!eq(P.NumSrcArgs, 3), ret3,
- !if(!eq(P.NumSrcArgs, 2), ret2,
- ret1));
-}
-
-class getVOP3Pat<VOPProfile P, SDPatternOperator node> {
- list<dag> ret3 = [(set P.DstVT:$vdst, (DivergentFragOrOp<node, P>.ret P.Src0VT:$src0, P.Src1VT:$src1, P.Src2VT:$src2))];
- list<dag> ret2 = [(set P.DstVT:$vdst, (DivergentFragOrOp<node, P>.ret P.Src0VT:$src0, P.Src1VT:$src1))];
- list<dag> ret1 = [(set P.DstVT:$vdst, (DivergentFragOrOp<node, P>.ret P.Src0VT:$src0))];
- list<dag> ret = !if(!eq(P.NumSrcArgs, 3), ret3,
- !if(!eq(P.NumSrcArgs, 2), ret2,
- ret1));
-}
-
-class getVOP3ClampPat<VOPProfile P, SDPatternOperator node> {
- list<dag> ret3 = [(set P.DstVT:$vdst, (node P.Src0VT:$src0, P.Src1VT:$src1, P.Src2VT:$src2, i1:$clamp))];
- list<dag> ret2 = [(set P.DstVT:$vdst, (node P.Src0VT:$src0, P.Src1VT:$src1, i1:$clamp))];
- list<dag> ret1 = [(set P.DstVT:$vdst, (node P.Src0VT:$src0, i1:$clamp))];
- list<dag> ret = !if(!eq(P.NumSrcArgs, 3), ret3,
- !if(!eq(P.NumSrcArgs, 2), ret2,
- ret1));
-}
-
-class getVOP3MAIPat<VOPProfile P, SDPatternOperator node> {
- list<dag> ret = [(set P.DstVT:$vdst, (node P.Src0VT:$src0, P.Src1VT:$src1, P.Src2VT:$src2,
- timm:$cbsz, timm:$abid, timm:$blgp))];
-}
-
-// Consistently gives instructions a _e64 suffix.
-multiclass VOP3Inst_Pseudo_Wrapper<string opName, VOPProfile P, list<dag> pattern = []> {
- def _e64 : VOP3_Pseudo<opName, P, pattern>;
-}
-
-class VOP3InstBase<string OpName, VOPProfile P, SDPatternOperator node = null_frag> :
- VOP3_Pseudo<OpName, P,
- !if(P.HasOpSel,
- !if(P.HasModifiers,
- getVOP3OpSelModPat<P, node>.ret,
- getVOP3OpSelPat<P, node>.ret),
- !if(P.HasModifiers,
- getVOP3ModPat<P, node>.ret,
- !if(P.HasIntClamp,
- getVOP3ClampPat<P, node>.ret,
- !if (P.IsMAI,
- getVOP3MAIPat<P, node>.ret,
- getVOP3Pat<P, node>.ret)))),
- 0, P.HasOpSel> {
-
- let IntClamp = P.HasIntClamp;
- let AsmMatchConverter =
- !if(P.HasOpSel,
- "cvtVOP3OpSel",
- !if(!or(P.HasModifiers, P.HasOMod, P.HasIntClamp),
- "cvtVOP3",
- ""));
-}
-
-multiclass VOP3Inst<string OpName, VOPProfile P, SDPatternOperator node = null_frag> {
- def _e64 : VOP3InstBase<OpName, P, node>;
-}
-
// Special case for v_div_fmas_{f32|f64}, since it seems to be the
// only VOP instruction that implicitly reads VCC.
let Asm64 = " $vdst, $src0_modifiers, $src1_modifiers, $src2_modifiers$clamp$omod" in {
def VOP_F32_F32_F32_F32_VCC : VOPProfile<[f32, f32, f32, f32]> {
let Outs64 = (outs DstRC.RegClass:$vdst);
+ let HasExtVOP3DPP = 0;
+ let HasExtDPP = 0;
}
def VOP_F64_F64_F64_F64_VCC : VOPProfile<[f64, f64, f64, f64]> {
let Outs64 = (outs DstRC.RegClass:$vdst);
}
}
-class VOP3Features<bit Clamp, bit OpSel, bit Packed, bit MAI> {
- bit HasClamp = Clamp;
- bit HasOpSel = OpSel;
- bit IsPacked = Packed;
- bit IsMAI = MAI;
-}
-
-def VOP3_REGULAR : VOP3Features<0, 0, 0, 0>;
-def VOP3_CLAMP : VOP3Features<1, 0, 0, 0>;
-def VOP3_OPSEL : VOP3Features<1, 1, 0, 0>;
-def VOP3_PACKED : VOP3Features<1, 1, 1, 0>;
-def VOP3_MAI : VOP3Features<0, 0, 0, 1>;
-
-class VOP3_Profile<VOPProfile P, VOP3Features Features = VOP3_REGULAR> : VOPProfile<P.ArgVT> {
-
- let HasClamp = !if(Features.HasClamp, 1, P.HasClamp);
- let HasOpSel = !if(Features.HasOpSel, 1, P.HasOpSel);
- let IsMAI = !if(Features.IsMAI, 1, P.IsMAI);
- let IsPacked = !if(Features.IsPacked, 1, P.IsPacked);
-
- let HasModifiers = !if(Features.IsMAI, 0, !or(Features.IsPacked, P.HasModifiers));
- let IsSingle = 1;
-}
-
class VOP3b_Profile<ValueType vt> : VOPProfile<[vt, vt, vt, vt]> {
let Outs64 = (outs DstRC:$vdst, VOPDstS64orS32:$sdst);
let Asm64 = "$vdst, $sdst, $src0_modifiers, $src1_modifiers, $src2_modifiers$clamp$omod";
let IsSingle = 1;
+ let HasExtVOP3DPP = 0;
+ let HasExtDPP = 0;
}
def VOP3b_F32_I1_F32_F32_F32 : VOP3b_Profile<f32>;
@@ -198,12 +32,22 @@ def VOP3b_F64_I1_F64_F64_F64 : VOP3b_Profile<f64>;
def VOP3b_I64_I1_I32_I32_I64 : VOPProfile<[i64, i32, i32, i64]> {
let HasClamp = 1;
- let IsSingle = 1;
+ let IsSingle = 1;
let Outs64 = (outs DstRC:$vdst, VOPDstS64orS32:$sdst);
let Asm64 = "$vdst, $sdst, $src0, $src1, $src2$clamp";
}
+class V_MUL_PROF<VOPProfile P> : VOP3_Profile<P> {
+ let HasExtVOP3DPP = 0;
+ let HasExtDPP = 0;
+}
+
+def DIV_FIXUP_F32_PROF : VOP3_Profile<VOP_F32_F32_F32_F32> {
+ let HasExtVOP3DPP = 0;
+ let HasExtDPP = 0;
+}
+
//===----------------------------------------------------------------------===//
// VOP3 INTERP
//===----------------------------------------------------------------------===//
@@ -304,10 +148,10 @@ defm V_MAX_F64 : VOP3Inst <"v_max_f64", VOP3_Profile<VOP_F64_F64_F64>, fmaxnum_l
} // End SchedRW = [WriteDoubleAdd]
let SchedRW = [WriteIntMul] in {
-defm V_MUL_LO_U32 : VOP3Inst <"v_mul_lo_u32", VOP3_Profile<VOP_I32_I32_I32>, DivergentBinFrag<mul>>;
-defm V_MUL_HI_U32 : VOP3Inst <"v_mul_hi_u32", VOP3_Profile<VOP_I32_I32_I32>, mulhu>;
-defm V_MUL_LO_I32 : VOP3Inst <"v_mul_lo_i32", VOP3_Profile<VOP_I32_I32_I32>>;
-defm V_MUL_HI_I32 : VOP3Inst <"v_mul_hi_i32", VOP3_Profile<VOP_I32_I32_I32>, mulhs>;
+defm V_MUL_LO_U32 : VOP3Inst <"v_mul_lo_u32", V_MUL_PROF<VOP_I32_I32_I32>, DivergentBinFrag<mul>>;
+defm V_MUL_HI_U32 : VOP3Inst <"v_mul_hi_u32", V_MUL_PROF<VOP_I32_I32_I32>, mulhu>;
+defm V_MUL_LO_I32 : VOP3Inst <"v_mul_lo_i32", V_MUL_PROF<VOP_I32_I32_I32>>;
+defm V_MUL_HI_I32 : VOP3Inst <"v_mul_hi_i32", V_MUL_PROF<VOP_I32_I32_I32>, mulhs>;
} // End SchedRW = [WriteIntMul]
} // End isReMaterializable = 1
@@ -367,7 +211,7 @@ let isCommutable = 1 in {
} // End isCommutable = 1
defm V_CVT_PK_U8_F32 : VOP3Inst<"v_cvt_pk_u8_f32", VOP3_Profile<VOP_I32_F32_I32_I32>, int_amdgcn_cvt_pk_u8_f32>;
-defm V_DIV_FIXUP_F32 : VOP3Inst <"v_div_fixup_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, AMDGPUdiv_fixup>;
+defm V_DIV_FIXUP_F32 : VOP3Inst <"v_div_fixup_f32", DIV_FIXUP_F32_PROF, AMDGPUdiv_fixup>;
let SchedRW = [WriteDoubleAdd], FPDPRounding = 1 in {
defm V_DIV_FIXUP_F64 : VOP3Inst <"v_div_fixup_f64", VOP3_Profile<VOP_F64_F64_F64_F64>, AMDGPUdiv_fixup>;
@@ -419,9 +263,9 @@ def : GCNPat<
>;
let isReMaterializable = 1 in {
-let SubtargetPredicate = isGFX6GFX7GFX10 in {
+let SubtargetPredicate = isGFX6GFX7GFX10Plus in {
defm V_MULLIT_F32 : VOP3Inst <"v_mullit_f32", VOP3_Profile<VOP_F32_F32_F32_F32>>;
-} // End SubtargetPredicate = isGFX6GFX7GFX10
+} // End SubtargetPredicate = isGFX6GFX7GFX10Plus
let SchedRW = [Write32Bit] in {
let SubtargetPredicate = isGFX8Plus in {
@@ -430,21 +274,30 @@ defm V_PERM_B32 : VOP3Inst <"v_perm_b32", VOP3_Profile<VOP_I32_I32_I32_I32>, AMD
} // End SchedRW = [Write32Bit]
} // End isReMaterializable = 1
-let SubtargetPredicate = isGFX7Plus in {
+def VOPProfileMQSAD : VOP3_Profile<VOP_V4I32_I64_I32_V4I32, VOP3_CLAMP> {
+ let HasModifiers = 0;
+}
+let SubtargetPredicate = isGFX7Plus in {
let Constraints = "@earlyclobber $vdst", SchedRW = [WriteQuarterRate32] in {
defm V_QSAD_PK_U16_U8 : VOP3Inst <"v_qsad_pk_u16_u8", VOP3_Profile<VOP_I64_I64_I32_I64, VOP3_CLAMP>>;
-defm V_MQSAD_U32_U8 : VOP3Inst <"v_mqsad_u32_u8", VOP3_Profile<VOP_V4I32_I64_I32_V4I32, VOP3_CLAMP>>;
+defm V_MQSAD_U32_U8 : VOP3Inst <"v_mqsad_u32_u8", VOPProfileMQSAD>;
} // End Constraints = "@earlyclobber $vdst", SchedRW = [WriteQuarterRate32]
+} // End SubtargetPredicate = isGFX7Plus
let isCommutable = 1 in {
let SchedRW = [WriteIntMul, WriteSALU] in {
+let SubtargetPredicate = isGFX7GFX8GFX9GFX10 in {
defm V_MAD_U64_U32 : VOP3Inst <"v_mad_u64_u32", VOP3b_I64_I1_I32_I32_I64>;
defm V_MAD_I64_I32 : VOP3Inst <"v_mad_i64_i32", VOP3b_I64_I1_I32_I32_I64>;
+}
+let SubtargetPredicate = isGFX11Only, Constraints = "@earlyclobber $vdst" in {
+defm V_MAD_U64_U32_gfx11 : VOP3Inst <"v_mad_u64_u32_gfx11", VOP3b_I64_I1_I32_I32_I64>;
+defm V_MAD_I64_I32_gfx11 : VOP3Inst <"v_mad_i64_i32_gfx11", VOP3b_I64_I1_I32_I32_I64>;
+} // End SubtargetPredicate = isGFX11Only, Constraints = "@earlyclobber $vdst"
} // End SchedRW = [WriteIntMul, WriteSALU]
} // End isCommutable = 1
-} // End SubtargetPredicate = isGFX7Plus
let FPDPRounding = 1 in {
let Predicates = [Has16BitInsts, isGFX8Only] in {
@@ -557,7 +410,7 @@ defm: Ternary_i16_Pats_gfx9<mul, add, V_MAD_I16_gfx9_e64>;
} // End Predicates = [Has16BitInsts, isGFX10Plus]
-class ThreeOpFrag<SDPatternOperator op1, SDPatternOperator op2> : PatFrag<
+class ThreeOpFragSDAG<SDPatternOperator op1, SDPatternOperator op2> : PatFrag<
(ops node:$x, node:$y, node:$z),
// When the inner operation is used multiple times, selecting 3-op
// instructions may still be beneficial -- if the other users can be
@@ -587,7 +440,9 @@ class ThreeOpFrag<SDPatternOperator op1, SDPatternOperator op2> : PatFrag<
return true;
}]> {
let PredicateCodeUsesOperands = 1;
+}
+class ThreeOpFrag<SDPatternOperator op1, SDPatternOperator op2> : ThreeOpFragSDAG<op1, op2> {
// The divergence predicate is irrelevant in GlobalISel, as we have
// proper register bank checks. We just need to verify the constant
// bus restriction when all the sources are considered.
@@ -609,6 +464,23 @@ class ThreeOpFrag<SDPatternOperator op1, SDPatternOperator op2> : PatFrag<
}];
}
+def shl_0_to_4 : PatFrag<
+ (ops node:$src0, node:$src1), (shl node:$src0, node:$src1),
+ [{
+ if (auto *C = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
+ return C->getZExtValue() <= 4;
+ }
+ return false;
+ }]> {
+ let GISelPredicateCode = [{
+ int64_t Imm = 0;
+ if (!mi_match(MI.getOperand(2).getReg(), MRI, m_ICst(Imm)) &&
+ !mi_match(MI.getOperand(2).getReg(), MRI, m_Copy(m_ICst(Imm))))
+ return false;
+ return (uint64_t)Imm <= 4;
+ }];
+}
+
let SubtargetPredicate = isGFX9Plus in {
let isCommutable = 1, isReMaterializable = 1 in {
defm V_ADD3_U32 : VOP3Inst <"v_add3_u32", VOP3_Profile<VOP_I32_I32_I32_I32>>;
@@ -649,6 +521,10 @@ defm V_LSHL_ADD_U32 : VOP3Inst <"v_lshl_add_u32", VOP3_Profile<VOP_I32_I32_I32_I
defm V_LSHL_OR_B32 : VOP3Inst <"v_lshl_or_b32", VOP3_Profile<VOP_I32_I32_I32_I32>>;
} // End isReMaterializable = 1
+// V_LSHL_ADD_U64: D0.u64 = (S0.u64 << S1.u[2:0]) + S2.u64
+// src0 is shifted left by 0-4 (use "0" to get ADD_U64).
+let SubtargetPredicate = isGFX940Plus in
+defm V_LSHL_ADD_U64 : VOP3Inst <"v_lshl_add_u64", VOP3_Profile<VOP_I64_I64_I32_I64>>;
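For reference, a minimal C++ model of the operation spelled out in the comment
above (illustrative only; the helper name is invented and is not part of the
backend):

#include <cstdint>

// D0.u64 = (S0.u64 << S1.u[2:0]) + S2.u64, with shift amounts 0-4 defined;
// a shift of 0 degenerates to a plain 64-bit add.
uint64_t lshl_add_u64(uint64_t s0, uint32_t s1, uint64_t s2) {
  uint32_t shift = s1 & 0x7;  // only S1.u[2:0] is consumed
  return (s0 << shift) + s2;  // 64-bit wrap-around add
}

The GCNPat added below then lets ordinary (a << k) + b expressions with k <= 4
select this instruction.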
class ThreeOp_i32_Pats <SDPatternOperator op1, SDPatternOperator op2, Instruction inst> : GCNPat <
// This matches (op2 (op1 i32:$src0, i32:$src1), i32:$src2) with conditions.
@@ -664,6 +540,12 @@ def : ThreeOp_i32_Pats<and, or, V_AND_OR_B32_e64>;
def : ThreeOp_i32_Pats<or, or, V_OR3_B32_e64>;
def : ThreeOp_i32_Pats<xor, add, V_XAD_U32_e64>;
+let SubtargetPredicate = isGFX940Plus in
+def : GCNPat<
+ (ThreeOpFrag<shl_0_to_4, add> i64:$src0, i32:$src1, i64:$src2),
+ (V_LSHL_ADD_U64_e64 VSrc_b64:$src0, VSrc_b32:$src1, VSrc_b64:$src2)
+>;
+
def : VOPBinOpClampPat<saddsat, V_ADD_I32_e64, i32>;
def : VOPBinOpClampPat<ssubsat, V_SUB_I32_e64, i32>;
@@ -688,6 +570,33 @@ def : OpSelBinOpClampPat<saddsat, V_ADD_I16_e64>;
def : OpSelBinOpClampPat<ssubsat, V_SUB_I16_e64>;
} // End SubtargetPredicate = isGFX9Plus
+// FIXME: GlobalISel in general does not handle instructions with 2 results,
+// so it cannot use these patterns.
+multiclass IMAD32_Pats <VOP3_Pseudo inst> {
+ def : GCNPat <
+ (ThreeOpFrag<mul, add> i32:$src0, i32:$src1, i32:$src2),
+ (EXTRACT_SUBREG (inst $src0, $src1,
+ (REG_SEQUENCE SReg_64, // Use scalar and let it be legalized
+ $src2, sub0,
+ (i32 (IMPLICIT_DEF)), sub1),
+ 0 /* clamp */),
+ sub0)
+ >;
+ // Immediate src2 in the pattern above will not fold because it would be partially
+ // undef. Hence define specialized pattern for this case.
+ // FIXME: The GlobalISel pattern exporter fails to export a pattern like this and
+ // asserts, so make it SDAG only.
+ def : GCNPat <
+ (ThreeOpFragSDAG<mul, add> i32:$src0, i32:$src1, (i32 imm:$src2)),
+ (EXTRACT_SUBREG (inst $src0, $src1, (i64 (as_i64imm $src2)), 0 /* clamp */), sub0)
+ >;
+}
+
+let SubtargetPredicate = isGFX9GFX10 in // exclude pre-GFX9 where it was slow
+defm : IMAD32_Pats<V_MAD_U64_U32_e64>;
+let SubtargetPredicate = isGFX11Only in
+defm : IMAD32_Pats<V_MAD_U64_U32_gfx11_e64>;
+
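The patterns above are sound because the low 32 bits of the 64-bit mad are
exactly the wrapped 32-bit mul+add, independent of whatever lands in the undef
high half of src2; a quick C++ sanity sketch (helper name is illustrative):

#include <cassert>
#include <cstdint>

// Low 32 bits of the 64-bit a*b+c equal the wrapped 32-bit expression,
// which is why taking sub0 of V_MAD_U64_U32's result is valid.
void check_mad_lo32(uint32_t a, uint32_t b, uint32_t c) {
  uint64_t wide = (uint64_t)a * b + c;  // what the 64-bit MAD computes
  uint32_t narrow = a * b + c;          // the original 32-bit mul+add
  assert((uint32_t)wide == narrow);
}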
def VOP3_PERMLANE_Profile : VOP3_Profile<VOPProfile <[i32, i32, i32, i32]>, VOP3_OPSEL> {
let Src0RC64 = VRegSrc_32;
let Src1RC64 = SCSrc_b32;
@@ -697,6 +606,8 @@ def VOP3_PERMLANE_Profile : VOP3_Profile<VOPProfile <[i32, i32, i32, i32]>, VOP3
IntOpSelMods:$src2_modifiers, SCSrc_b32:$src2,
VGPR_32:$vdst_in, op_sel0:$op_sel);
let HasClamp = 0;
+ let HasExtVOP3DPP = 0;
+ let HasExtDPP = 0;
}
class PermlanePat<SDPatternOperator permlane,
@@ -753,6 +664,20 @@ let SubtargetPredicate = isGFX10Plus in {
def : PermlaneDiscardVDstIn<
BoundControlOrFetchInvalidPermlane<int_amdgcn_permlanex16>,
V_PERMLANEX16_B32_e64>;
+
+ defm V_ADD_NC_U16 : VOP3Inst <"v_add_nc_u16", VOP3_Profile<VOP_I16_I16_I16, VOP3_OPSEL>, add>;
+ defm V_SUB_NC_U16 : VOP3Inst <"v_sub_nc_u16", VOP3_Profile<VOP_I16_I16_I16, VOP3_OPSEL>, sub>;
+
+ def : OpSelBinOpClampPat<uaddsat, V_ADD_NC_U16_e64>;
+ def : OpSelBinOpClampPat<usubsat, V_SUB_NC_U16_e64>;
+
+ // Undo sub x, c -> add x, -c canonicalization since c is more likely
+ // an inline immediate than -c.
+ def : GCNPat<
+ (add i16:$src0, (i16 NegSubInlineIntConst16:$src1)),
+ (V_SUB_NC_U16_e64 0, VSrc_b16:$src0, 0, NegSubInlineIntConst16:$src1, 0, 0)
+ >;
+
} // End SubtargetPredicate = isGFX10Plus
class DivFmasPat<ValueType vt, Instruction inst, Register CondReg> : GCNPat<
@@ -773,6 +698,36 @@ def : DivFmasPat<f32, V_DIV_FMAS_F32_e64, VCC_LO>;
def : DivFmasPat<f64, V_DIV_FMAS_F64_e64, VCC_LO>;
}
+class VOP3_DOT_Profile<VOPProfile P, VOP3Features Features = VOP3_REGULAR> : VOP3_Profile<P, Features> {
+ // FIXME VOP3 DPP versions are unsupported
+ let HasExtVOP3DPP = 0;
+ let HasClamp = 0;
+ let HasOMod = 0;
+ let InsVOP3OpSel = getInsVOP3OpSel<Src0RC64, Src1RC64, Src2RC64,
+ NumSrcArgs, HasClamp, HasOMod,
+ !if(isFloatType<Src0VT>.ret, FPVRegInputMods, IntOpSelMods),
+ !if(isFloatType<Src1VT>.ret, FPVRegInputMods, IntOpSelMods),
+ !if(isFloatType<Src2VT>.ret, FPVRegInputMods, IntOpSelMods)>.ret;
+}
+
+let SubtargetPredicate = isGFX11Plus in {
+ defm V_MAXMIN_F32 : VOP3Inst<"v_maxmin_f32", VOP3_Profile<VOP_F32_F32_F32_F32>>;
+ defm V_MINMAX_F32 : VOP3Inst<"v_minmax_f32", VOP3_Profile<VOP_F32_F32_F32_F32>>;
+ defm V_MAXMIN_F16 : VOP3Inst<"v_maxmin_f16", VOP3_Profile<VOP_F16_F16_F16_F16>>;
+ defm V_MINMAX_F16 : VOP3Inst<"v_minmax_f16", VOP3_Profile<VOP_F16_F16_F16_F16>>;
+ defm V_MAXMIN_U32 : VOP3Inst<"v_maxmin_u32", VOP3_Profile<VOP_I32_I32_I32_I32>>;
+ defm V_MINMAX_U32 : VOP3Inst<"v_minmax_u32", VOP3_Profile<VOP_I32_I32_I32_I32>>;
+ defm V_MAXMIN_I32 : VOP3Inst<"v_maxmin_i32", VOP3_Profile<VOP_I32_I32_I32_I32>>;
+ defm V_MINMAX_I32 : VOP3Inst<"v_minmax_i32", VOP3_Profile<VOP_I32_I32_I32_I32>>;
+ defm V_CVT_PK_I16_F32 : VOP3Inst<"v_cvt_pk_i16_f32", VOP3_Profile<VOP_V2I16_F32_F32>>;
+ defm V_CVT_PK_U16_F32 : VOP3Inst<"v_cvt_pk_u16_f32", VOP3_Profile<VOP_V2I16_F32_F32>>;
+} // End SubtargetPredicate = isGFX11Plus
+
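A rough scalar sketch of the fused min/max opcodes defined above, assuming the
common reading where v_maxmin computes the min of a max and v_minmax the max of
a min (NaN and denormal rules are not captured here; consult the GFX11 ISA
documentation for the exact behavior):

#include <algorithm>
#include <cstdint>

// Rough scalar models only.
float maxmin_f32(float a, float b, float c) { return std::min(std::max(a, b), c); }
float minmax_f32(float a, float b, float c) { return std::max(std::min(a, b), c); }
int32_t maxmin_i32(int32_t a, int32_t b, int32_t c) { return std::min(std::max(a, b), c); }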
+let SubtargetPredicate = HasDot8Insts in {
+ defm V_DOT2_F16_F16 : VOP3Inst<"v_dot2_f16_f16", VOP3_DOT_Profile<VOP_F16_V2F16_V2F16_F16>, int_amdgcn_fdot2_f16_f16>;
+ defm V_DOT2_BF16_BF16 : VOP3Inst<"v_dot2_bf16_bf16", VOP3_DOT_Profile<VOP_I16_V2I16_V2I16_I16>, int_amdgcn_fdot2_bf16_bf16>;
+}
+
//===----------------------------------------------------------------------===//
// Integer Clamp Patterns
//===----------------------------------------------------------------------===//
@@ -813,16 +768,137 @@ def : IntClampPat<V_MQSAD_PK_U16_U8_e64, int_amdgcn_mqsad_pk_u16_u8>;
def : IntClampPat<V_QSAD_PK_U16_U8_e64, int_amdgcn_qsad_pk_u16_u8>;
def : IntClampPat<V_MQSAD_U32_U8_e64, int_amdgcn_mqsad_u32_u8>;
-
//===----------------------------------------------------------------------===//
// Target-specific instruction encodings.
//===----------------------------------------------------------------------===//
//===----------------------------------------------------------------------===//
+// GFX11.
+//===----------------------------------------------------------------------===//
+
+defm V_FMA_DX9_ZERO_F32 : VOP3_Real_with_name_gfx11<0x209, "V_FMA_LEGACY_F32", "v_fma_dx9_zero_f32">;
+defm V_MAD_I32_I24 : VOP3_Realtriple_gfx11<0x20a>;
+defm V_MAD_U32_U24 : VOP3_Realtriple_gfx11<0x20b>;
+defm V_CUBEID_F32 : VOP3_Realtriple_gfx11<0x20c>;
+defm V_CUBESC_F32 : VOP3_Realtriple_gfx11<0x20d>;
+defm V_CUBETC_F32 : VOP3_Realtriple_gfx11<0x20e>;
+defm V_CUBEMA_F32 : VOP3_Realtriple_gfx11<0x20f>;
+defm V_BFE_U32 : VOP3_Realtriple_gfx11<0x210>;
+defm V_BFE_I32 : VOP3_Realtriple_gfx11<0x211>;
+defm V_BFI_B32 : VOP3_Realtriple_gfx11<0x212>;
+defm V_FMA_F32 : VOP3_Realtriple_gfx11<0x213>;
+defm V_FMA_F64 : VOP3_Real_Base_gfx11<0x214>;
+defm V_LERP_U8 : VOP3_Realtriple_gfx11<0x215>;
+defm V_ALIGNBIT_B32 : VOP3_Realtriple_gfx11<0x216>;
+defm V_ALIGNBYTE_B32 : VOP3_Realtriple_gfx11<0x217>;
+defm V_MULLIT_F32 : VOP3_Realtriple_gfx11<0x218>;
+defm V_MIN3_F32 : VOP3_Realtriple_gfx11<0x219>;
+defm V_MIN3_I32 : VOP3_Realtriple_gfx11<0x21a>;
+defm V_MIN3_U32 : VOP3_Realtriple_gfx11<0x21b>;
+defm V_MAX3_F32 : VOP3_Realtriple_gfx11<0x21c>;
+defm V_MAX3_I32 : VOP3_Realtriple_gfx11<0x21d>;
+defm V_MAX3_U32 : VOP3_Realtriple_gfx11<0x21e>;
+defm V_MED3_F32 : VOP3_Realtriple_gfx11<0x21f>;
+defm V_MED3_I32 : VOP3_Realtriple_gfx11<0x220>;
+defm V_MED3_U32 : VOP3_Realtriple_gfx11<0x221>;
+defm V_SAD_U8 : VOP3_Realtriple_gfx11<0x222>;
+defm V_SAD_HI_U8 : VOP3_Realtriple_gfx11<0x223>;
+defm V_SAD_U16 : VOP3_Realtriple_gfx11<0x224>;
+defm V_SAD_U32 : VOP3_Realtriple_gfx11<0x225>;
+defm V_CVT_PK_U8_F32 : VOP3_Realtriple_gfx11<0x226>;
+defm V_DIV_FIXUP_F32 : VOP3_Real_Base_gfx11<0x227>;
+defm V_DIV_FIXUP_F64 : VOP3_Real_Base_gfx11<0x228>;
+defm V_DIV_FMAS_F32 : VOP3_Real_Base_gfx11<0x237>;
+defm V_DIV_FMAS_F64 : VOP3_Real_Base_gfx11<0x238>;
+defm V_MSAD_U8 : VOP3_Realtriple_gfx11<0x239>;
+defm V_QSAD_PK_U16_U8 : VOP3_Real_Base_gfx11<0x23a>;
+defm V_MQSAD_PK_U16_U8 : VOP3_Real_Base_gfx11<0x23b>;
+defm V_MQSAD_U32_U8 : VOP3_Real_Base_gfx11<0x23d>;
+defm V_XOR3_B32 : VOP3_Realtriple_gfx11<0x240>;
+defm V_MAD_U16 : VOP3_Realtriple_with_name_gfx11<0x241, "V_MAD_U16_gfx9", "v_mad_u16">;
+defm V_PERM_B32 : VOP3_Realtriple_gfx11<0x244>;
+defm V_XAD_U32 : VOP3_Realtriple_gfx11<0x245>;
+defm V_LSHL_ADD_U32 : VOP3_Realtriple_gfx11<0x246>;
+defm V_ADD_LSHL_U32 : VOP3_Realtriple_gfx11<0x247>;
+defm V_FMA_F16 : VOP3_Realtriple_with_name_gfx11<0x248, "V_FMA_F16_gfx9", "v_fma_f16">;
+defm V_MIN3_F16 : VOP3_Realtriple_gfx11<0x249>;
+defm V_MIN3_I16 : VOP3_Realtriple_gfx11<0x24a>;
+defm V_MIN3_U16 : VOP3_Realtriple_gfx11<0x24b>;
+defm V_MAX3_F16 : VOP3_Realtriple_gfx11<0x24c>;
+defm V_MAX3_I16 : VOP3_Realtriple_gfx11<0x24d>;
+defm V_MAX3_U16 : VOP3_Realtriple_gfx11<0x24e>;
+defm V_MED3_F16 : VOP3_Realtriple_gfx11<0x24f>;
+defm V_MED3_I16 : VOP3_Realtriple_gfx11<0x250>;
+defm V_MED3_U16 : VOP3_Realtriple_gfx11<0x251>;
+defm V_MAD_I16 : VOP3_Realtriple_with_name_gfx11<0x253, "V_MAD_I16_gfx9", "v_mad_i16">;
+defm V_DIV_FIXUP_F16 : VOP3_Realtriple_with_name_gfx11<0x254, "V_DIV_FIXUP_F16_gfx9", "v_div_fixup_f16">;
+defm V_ADD3_U32 : VOP3_Realtriple_gfx11<0x255>;
+defm V_LSHL_OR_B32 : VOP3_Realtriple_gfx11<0x256>;
+defm V_AND_OR_B32 : VOP3_Realtriple_gfx11<0x257>;
+defm V_OR3_B32 : VOP3_Realtriple_gfx11<0x258>;
+defm V_MAD_U32_U16 : VOP3_Realtriple_gfx11<0x259>;
+defm V_MAD_I32_I16 : VOP3_Realtriple_gfx11<0x25a>;
+defm V_PERMLANE16_B32 : VOP3_Real_Base_gfx11<0x25b>;
+defm V_PERMLANEX16_B32 : VOP3_Real_Base_gfx11<0x25c>;
+defm V_MAXMIN_F32 : VOP3_Realtriple_gfx11<0x25e>;
+defm V_MINMAX_F32 : VOP3_Realtriple_gfx11<0x25f>;
+defm V_MAXMIN_F16 : VOP3_Realtriple_gfx11<0x260>;
+defm V_MINMAX_F16 : VOP3_Realtriple_gfx11<0x261>;
+defm V_MAXMIN_U32 : VOP3_Realtriple_gfx11<0x262>;
+defm V_MINMAX_U32 : VOP3_Realtriple_gfx11<0x263>;
+defm V_MAXMIN_I32 : VOP3_Realtriple_gfx11<0x264>;
+defm V_MINMAX_I32 : VOP3_Realtriple_gfx11<0x265>;
+// FIXME VOP3 DPP Dot instructions are unsupported
+defm V_DOT2_F16_F16 : VOP3_Real_Base_gfx11<0x266>;
+defm V_DOT2_BF16_BF16 : VOP3_Real_Base_gfx11<0x267>;
+defm V_DIV_SCALE_F32 : VOP3be_Real_gfx11<0x2fc, "V_DIV_SCALE_F32", "v_div_scale_f32">;
+defm V_DIV_SCALE_F64 : VOP3be_Real_gfx11<0x2fd, "V_DIV_SCALE_F64", "v_div_scale_f64">;
+defm V_MAD_U64_U32_gfx11 : VOP3be_Real_gfx11<0x2fe, "V_MAD_U64_U32_gfx11", "v_mad_u64_u32">;
+defm V_MAD_I64_I32_gfx11 : VOP3be_Real_gfx11<0x2ff, "V_MAD_I64_I32_gfx11", "v_mad_i64_i32">;
+defm V_ADD_NC_U16 : VOP3Only_Realtriple_gfx11<0x303>;
+defm V_SUB_NC_U16 : VOP3Only_Realtriple_gfx11<0x304>;
+defm V_MUL_LO_U16 : VOP3Only_Realtriple_gfx11<0x305>;
+defm V_CVT_PK_I16_F32 : VOP3_Realtriple_gfx11<0x306>;
+defm V_CVT_PK_U16_F32 : VOP3_Realtriple_gfx11<0x307>;
+defm V_MAX_U16 : VOP3Only_Realtriple_gfx11<0x309>;
+defm V_MAX_I16 : VOP3Only_Realtriple_gfx11<0x30a>;
+defm V_MIN_U16 : VOP3Only_Realtriple_gfx11<0x30b>;
+defm V_MIN_I16 : VOP3Only_Realtriple_gfx11<0x30c>;
+defm V_ADD_NC_I16 : VOP3_Realtriple_with_name_gfx11<0x30d, "V_ADD_I16", "v_add_nc_i16">;
+defm V_SUB_NC_I16 : VOP3_Realtriple_with_name_gfx11<0x30e, "V_SUB_I16", "v_sub_nc_i16">;
+defm V_PACK_B32_F16 : VOP3_Realtriple_gfx11<0x311>;
+defm V_CVT_PK_NORM_I16_F16 : VOP3_Realtriple_with_name_gfx11<0x312, "V_CVT_PKNORM_I16_F16" , "v_cvt_pk_norm_i16_f16" >;
+defm V_CVT_PK_NORM_U16_F16 : VOP3_Realtriple_with_name_gfx11<0x313, "V_CVT_PKNORM_U16_F16" , "v_cvt_pk_norm_u16_f16" >;
+defm V_SUB_NC_I32 : VOP3_Realtriple_with_name_gfx11<0x325, "V_SUB_I32", "v_sub_nc_i32">;
+defm V_ADD_NC_I32 : VOP3_Realtriple_with_name_gfx11<0x326, "V_ADD_I32", "v_add_nc_i32">;
+defm V_ADD_F64 : VOP3_Real_Base_gfx11<0x327>;
+defm V_MUL_F64 : VOP3_Real_Base_gfx11<0x328>;
+defm V_MIN_F64 : VOP3_Real_Base_gfx11<0x329>;
+defm V_MAX_F64 : VOP3_Real_Base_gfx11<0x32a>;
+defm V_LDEXP_F64 : VOP3_Real_Base_gfx11<0x32b>;
+defm V_MUL_LO_U32 : VOP3_Real_Base_gfx11<0x32c>;
+defm V_MUL_HI_U32 : VOP3_Real_Base_gfx11<0x32d>;
+defm V_MUL_HI_I32 : VOP3_Real_Base_gfx11<0x32e>;
+defm V_TRIG_PREOP_F64 : VOP3_Real_Base_gfx11<0x32f>;
+defm V_LSHLREV_B16 : VOP3Only_Realtriple_gfx11<0x338>;
+defm V_LSHRREV_B16 : VOP3Only_Realtriple_gfx11<0x339>;
+defm V_ASHRREV_I16 : VOP3Only_Realtriple_gfx11<0x33a>;
+defm V_LSHLREV_B64 : VOP3_Real_Base_gfx11<0x33c>;
+defm V_LSHRREV_B64 : VOP3_Real_Base_gfx11<0x33d>;
+defm V_ASHRREV_I64 : VOP3_Real_Base_gfx11<0x33e>;
+defm V_READLANE_B32 : VOP3_Real_No_Suffix_gfx11<0x360>; // Pseudo in VOP2
+let InOperandList = (ins SSrcOrLds_b32:$src0, SCSrc_b32:$src1, VGPR_32:$vdst_in) in {
+ defm V_WRITELANE_B32 : VOP3_Real_No_Suffix_gfx11<0x361>; // Pseudo in VOP2
+} // End InOperandList = (ins SSrcOrLds_b32:$src0, SCSrc_b32:$src1, VGPR_32:$vdst_in)
+defm V_AND_B16 : VOP3Only_Realtriple_gfx11<0x362>;
+defm V_OR_B16 : VOP3Only_Realtriple_gfx11<0x363>;
+defm V_XOR_B16 : VOP3Only_Realtriple_gfx11<0x364>;
+
+//===----------------------------------------------------------------------===//
// GFX10.
//===----------------------------------------------------------------------===//
-let AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" in {
+let AssemblerPredicate = isGFX10Only, DecoderNamespace = "GFX10" in {
multiclass VOP3_Real_gfx10<bits<10> op> {
def _gfx10 :
VOP3_Real<!cast<VOP_Pseudo>(NAME#"_e64"), SIEncodingFamily.GFX10>,
@@ -867,7 +943,7 @@ let AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" in {
let AsmString = asmName # ps.AsmOperands;
}
}
-} // End AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10"
+} // End AssemblerPredicate = isGFX10Only, DecoderNamespace = "GFX10"
defm V_READLANE_B32 : VOP3_Real_No_Suffix_gfx10<0x360>;
@@ -935,10 +1011,11 @@ defm V_MAD_I16 :
defm V_DIV_FIXUP_F16 :
VOP3OpSel_Real_gfx10_with_name<0x35f, "V_DIV_FIXUP_F16_gfx9", "v_div_fixup_f16">;
+defm V_ADD_NC_U16 : VOP3OpSel_Real_gfx10<0x303>;
+defm V_SUB_NC_U16 : VOP3OpSel_Real_gfx10<0x304>;
+
// FIXME-GFX10-OPSEL: Need to add "selective" opsel support to some of these
// (they do not support SDWA or DPP).
-defm V_ADD_NC_U16 : VOP3_Real_gfx10_with_name<0x303, "V_ADD_U16", "v_add_nc_u16">;
-defm V_SUB_NC_U16 : VOP3_Real_gfx10_with_name<0x304, "V_SUB_U16", "v_sub_nc_u16">;
defm V_MUL_LO_U16 : VOP3_Real_gfx10_with_name<0x305, "V_MUL_LO_U16", "v_mul_lo_u16">;
defm V_LSHRREV_B16 : VOP3_Real_gfx10_with_name<0x307, "V_LSHRREV_B16", "v_lshrrev_b16">;
defm V_ASHRREV_I16 : VOP3_Real_gfx10_with_name<0x308, "V_ASHRREV_I16", "v_ashrrev_i16">;
@@ -1273,3 +1350,5 @@ defm V_MAD_I32_I16 : VOP3OpSel_Real_gfx9 <0x1f2>;
defm V_CVT_PKNORM_I16_F16 : VOP3OpSel_Real_gfx9 <0x299>;
defm V_CVT_PKNORM_U16_F16 : VOP3OpSel_Real_gfx9 <0x29a>;
+
+defm V_LSHL_ADD_U64 : VOP3_Real_vi <0x208>;
diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
index 707475ceccee..59ce532af59b 100644
--- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
@@ -10,19 +10,33 @@
// VOP3P Classes
//===----------------------------------------------------------------------===//
+class VOP3P_Profile<VOPProfile P, VOP3Features Features = VOP3_REGULAR,
+ bit HasDPP = 0> : VOP3_Profile<P, Features> {
+ let IsVOP3P = 1;
+ let HasExtVOP3DPP = HasDPP;
+ // We do not want to print src modifiers for vop3p because the bits are
+ // overloaded in meaning and the logic in printOperandAndFPInputMods is
+ // wrong for vop3p
+ let AsmVOP3DPPBase = AsmVOP3P;
+}
+
// Used for FMA_MIX* and MAD_MIX* insts
// Their operands are only sort of f16 operands. Depending on
// op_sel_hi, these may be interpreted as f32. The inline immediate
// values are really f16 converted to f32, so we treat these as f16
// operands.
class VOP3P_Mix_Profile<VOPProfile P, VOP3Features Features = VOP3_REGULAR,
- bit useTiedOutput = 0> : VOP3_Profile<P, Features> {
+ bit useTiedOutput = 0> : VOP3P_Profile<P, Features, 1> {
bit UseTiedOutput = useTiedOutput;
dag srcs =
(ins FP16InputMods:$src0_modifiers, VCSrc_f16:$src0,
FP16InputMods:$src1_modifiers, VCSrc_f16:$src1,
FP16InputMods:$src2_modifiers, VCSrc_f16:$src2);
+ dag dpp_srcs =
+ (ins FPVRegInputMods:$src0_modifiers, VGPRSrc_32:$src0,
+ FP16InputMods:$src1_modifiers, VCSrc_f16:$src1,
+ FP16InputMods:$src2_modifiers, VCSrc_f16:$src2);
// FIXME: clampmod0 misbehaves with the non-default vdst_in
// following it. For now workaround this by requiring clamp
@@ -35,19 +49,27 @@ class VOP3P_Mix_Profile<VOPProfile P, VOP3Features Features = VOP3_REGULAR,
// We use Ins64 because that is the one which populates InOperandList
// due to the logic in class VOP3_Pseudo
let Ins64 = !con(srcs, mods);
+ let InsVOP3Base = !con(dpp_srcs, mods);
let Asm64 =
"$vdst, $src0_modifiers, $src1_modifiers, $src2_modifiers$op_sel$op_sel_hi$clamp";
+ let AsmVOP3DPPBase = Asm64;
}
multiclass VOP3PInst<string OpName, VOPProfile P,
- SDPatternOperator node = null_frag, bit HasExplicitClamp = 0> {
+ SDPatternOperator node = null_frag, bit IsDOT = 0> {
def NAME : VOP3P_Pseudo<OpName, P,
!if (P.HasModifiers,
- getVOP3PModPat<P, node, HasExplicitClamp>.ret,
+ getVOP3PModPat<P, node, IsDOT, IsDOT>.ret,
getVOP3Pat<P, node>.ret)>;
+ let SubtargetPredicate = isGFX11Plus in {
+ if P.HasExtVOP3DPP then
+ def _dpp : VOP3_DPP_Pseudo<OpName, P> {
+ let VOP3P = 1;
+ let PseudoInstr = OpName #"_dpp";
+ }
+ } // end SubtargetPredicate = isGFX11Plus
}
-
// Non-packed instructions that use the VOP3P encoding.
// VOP3 neg/abs and VOP3P opsel/opsel_hi modifiers are allowed.
multiclass VOP3_VOP3PInst<string OpName, VOP3P_Mix_Profile P> {
@@ -55,37 +77,47 @@ multiclass VOP3_VOP3PInst<string OpName, VOP3P_Mix_Profile P> {
let Constraints = !if(P.UseTiedOutput, "$vdst = $vdst_in", "");
let DisableEncoding = !if(P.UseTiedOutput, "$vdst_in", "");
}
+ let SubtargetPredicate = isGFX11Plus in {
+ if P.HasExtVOP3DPP then
+ def _dpp : VOP3_DPP_Pseudo<OpName, P> {
+ let VOP3P = 1;
+ let PseudoInstr = OpName#"_dpp";
+ let Constraints = !if(P.UseTiedOutput, "$vdst = $vdst_in", "");
+ let DisableEncoding = !if(P.UseTiedOutput, "$vdst_in", "");
+ }
+ } // end SubtargetPredicate = isGFX11Plus
}
+let isReMaterializable = 1 in {
let isCommutable = 1 in {
-defm V_PK_MAD_I16 : VOP3PInst<"v_pk_mad_i16", VOP3_Profile<VOP_V2I16_V2I16_V2I16_V2I16>>;
-defm V_PK_MAD_U16 : VOP3PInst<"v_pk_mad_u16", VOP3_Profile<VOP_V2I16_V2I16_V2I16_V2I16>>;
+defm V_PK_MAD_I16 : VOP3PInst<"v_pk_mad_i16", VOP3P_Profile<VOP_V2I16_V2I16_V2I16_V2I16>>;
+defm V_PK_MAD_U16 : VOP3PInst<"v_pk_mad_u16", VOP3P_Profile<VOP_V2I16_V2I16_V2I16_V2I16>>;
let FPDPRounding = 1 in {
-defm V_PK_FMA_F16 : VOP3PInst<"v_pk_fma_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16_V2F16>, any_fma>;
-defm V_PK_ADD_F16 : VOP3PInst<"v_pk_add_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16>, any_fadd>;
-defm V_PK_MUL_F16 : VOP3PInst<"v_pk_mul_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16>, any_fmul>;
+defm V_PK_FMA_F16 : VOP3PInst<"v_pk_fma_f16", VOP3P_Profile<VOP_V2F16_V2F16_V2F16_V2F16>, any_fma>;
+defm V_PK_ADD_F16 : VOP3PInst<"v_pk_add_f16", VOP3P_Profile<VOP_V2F16_V2F16_V2F16>, any_fadd>;
+defm V_PK_MUL_F16 : VOP3PInst<"v_pk_mul_f16", VOP3P_Profile<VOP_V2F16_V2F16_V2F16>, any_fmul>;
} // End FPDPRounding = 1
-defm V_PK_MAX_F16 : VOP3PInst<"v_pk_max_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16>, fmaxnum_like>;
-defm V_PK_MIN_F16 : VOP3PInst<"v_pk_min_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16>, fminnum_like>;
+defm V_PK_MAX_F16 : VOP3PInst<"v_pk_max_f16", VOP3P_Profile<VOP_V2F16_V2F16_V2F16>, fmaxnum_like>;
+defm V_PK_MIN_F16 : VOP3PInst<"v_pk_min_f16", VOP3P_Profile<VOP_V2F16_V2F16_V2F16>, fminnum_like>;
-defm V_PK_ADD_U16 : VOP3PInst<"v_pk_add_u16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, add>;
-defm V_PK_ADD_I16 : VOP3PInst<"v_pk_add_i16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>>;
-defm V_PK_MUL_LO_U16 : VOP3PInst<"v_pk_mul_lo_u16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, mul>;
+defm V_PK_ADD_U16 : VOP3PInst<"v_pk_add_u16", VOP3P_Profile<VOP_V2I16_V2I16_V2I16>, add>;
+defm V_PK_ADD_I16 : VOP3PInst<"v_pk_add_i16", VOP3P_Profile<VOP_V2I16_V2I16_V2I16>>;
+defm V_PK_MUL_LO_U16 : VOP3PInst<"v_pk_mul_lo_u16", VOP3P_Profile<VOP_V2I16_V2I16_V2I16>, mul>;
-defm V_PK_MIN_I16 : VOP3PInst<"v_pk_min_i16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, smin>;
-defm V_PK_MIN_U16 : VOP3PInst<"v_pk_min_u16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, umin>;
-defm V_PK_MAX_I16 : VOP3PInst<"v_pk_max_i16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, smax>;
-defm V_PK_MAX_U16 : VOP3PInst<"v_pk_max_u16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, umax>;
+defm V_PK_MIN_I16 : VOP3PInst<"v_pk_min_i16", VOP3P_Profile<VOP_V2I16_V2I16_V2I16>, smin>;
+defm V_PK_MIN_U16 : VOP3PInst<"v_pk_min_u16", VOP3P_Profile<VOP_V2I16_V2I16_V2I16>, umin>;
+defm V_PK_MAX_I16 : VOP3PInst<"v_pk_max_i16", VOP3P_Profile<VOP_V2I16_V2I16_V2I16>, smax>;
+defm V_PK_MAX_U16 : VOP3PInst<"v_pk_max_u16", VOP3P_Profile<VOP_V2I16_V2I16_V2I16>, umax>;
}
-defm V_PK_SUB_U16 : VOP3PInst<"v_pk_sub_u16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>>;
-defm V_PK_SUB_I16 : VOP3PInst<"v_pk_sub_i16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, sub>;
-
-defm V_PK_LSHLREV_B16 : VOP3PInst<"v_pk_lshlrev_b16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, clshl_rev_16>;
-defm V_PK_ASHRREV_I16 : VOP3PInst<"v_pk_ashrrev_i16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, cashr_rev_16>;
-defm V_PK_LSHRREV_B16 : VOP3PInst<"v_pk_lshrrev_b16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, clshr_rev_16>;
+defm V_PK_SUB_U16 : VOP3PInst<"v_pk_sub_u16", VOP3P_Profile<VOP_V2I16_V2I16_V2I16>>;
+defm V_PK_SUB_I16 : VOP3PInst<"v_pk_sub_i16", VOP3P_Profile<VOP_V2I16_V2I16_V2I16>, sub>;
+defm V_PK_LSHLREV_B16 : VOP3PInst<"v_pk_lshlrev_b16", VOP3P_Profile<VOP_V2I16_V2I16_V2I16>, clshl_rev_16>;
+defm V_PK_ASHRREV_I16 : VOP3PInst<"v_pk_ashrrev_i16", VOP3P_Profile<VOP_V2I16_V2I16_V2I16>, cashr_rev_16>;
+defm V_PK_LSHRREV_B16 : VOP3PInst<"v_pk_lshrrev_b16", VOP3P_Profile<VOP_V2I16_V2I16_V2I16>, clshr_rev_16>;
+} // End isReMaterializable = 1
let SubtargetPredicate = HasVOP3PInsts in {
@@ -178,6 +210,7 @@ let SubtargetPredicate = HasMadMixInsts in {
// Size of src arguments (16/32) is controlled by op_sel.
// For 16-bit src arguments their location (hi/lo) are controlled by op_sel_hi.
let isCommutable = 1, mayRaiseFPException = 0 in {
+let isReMaterializable = 1 in
defm V_MAD_MIX_F32 : VOP3_VOP3PInst<"v_mad_mix_f32", VOP3P_Mix_Profile<VOP_F32_F16_F16_F16, VOP3_OPSEL>>;
let FPDPRounding = 1 in {
@@ -197,6 +230,8 @@ defm : MadFmaMixPats<fmad, V_MAD_MIXLO_F16, V_MAD_MIXHI_F16>;
// Essentially the same as the mad_mix versions
let SubtargetPredicate = HasFmaMixInsts in {
let isCommutable = 1 in {
+
+let isReMaterializable = 1 in
defm V_FMA_MIX_F32 : VOP3_VOP3PInst<"v_fma_mix_f32", VOP3P_Mix_Profile<VOP_F32_F16_F16_F16, VOP3_OPSEL>>;
let FPDPRounding = 1 in {
@@ -297,34 +332,63 @@ let IsDOT = 1 in {
let SubtargetPredicate = HasDot2Insts in {
defm V_DOT2_I32_I16 : VOP3PInst<"v_dot2_i32_i16",
- VOP3_Profile<VOP_I32_V2I16_V2I16_I32>, int_amdgcn_sdot2, 1>;
+ VOP3P_Profile<VOP_I32_V2I16_V2I16_I32>, int_amdgcn_sdot2, 1>;
defm V_DOT2_U32_U16 : VOP3PInst<"v_dot2_u32_u16",
- VOP3_Profile<VOP_I32_V2I16_V2I16_I32>, int_amdgcn_udot2, 1>;
+ VOP3P_Profile<VOP_I32_V2I16_V2I16_I32>, int_amdgcn_udot2, 1>;
} // End SubtargetPredicate = HasDot2Insts
let SubtargetPredicate = HasDot7Insts in {
defm V_DOT2_F32_F16 : VOP3PInst<"v_dot2_f32_f16",
- VOP3_Profile<VOP_F32_V2F16_V2F16_F32>,
+ VOP3P_Profile<VOP_F32_V2F16_V2F16_F32, VOP3_REGULAR, /*HasDPP*/ 1>,
AMDGPUfdot2, 1/*ExplicitClamp*/>;
defm V_DOT4_U32_U8 : VOP3PInst<"v_dot4_u32_u8",
- VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>, int_amdgcn_udot4, 1>;
+ VOP3P_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>, int_amdgcn_udot4, 1>;
defm V_DOT8_U32_U4 : VOP3PInst<"v_dot8_u32_u4",
- VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>, int_amdgcn_udot8, 1>;
+ VOP3P_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>, int_amdgcn_udot8, 1>;
} // End SubtargetPredicate = HasDot7Insts
let SubtargetPredicate = HasDot1Insts in {
defm V_DOT4_I32_I8 : VOP3PInst<"v_dot4_i32_i8",
- VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>, int_amdgcn_sdot4, 1>;
+ VOP3P_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>, int_amdgcn_sdot4, 1>;
defm V_DOT8_I32_I4 : VOP3PInst<"v_dot8_i32_i4",
- VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>, int_amdgcn_sdot8, 1>;
+ VOP3P_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>, int_amdgcn_sdot8, 1>;
} // End SubtargetPredicate = HasDot1Insts
+
+let SubtargetPredicate = HasDot8Insts in {
+
+defm V_DOT2_F32_BF16 : VOP3PInst<"v_dot2_f32_bf16",
+ VOP3P_Profile<VOP_F32_V2I16_V2I16_F32, VOP3_REGULAR, /*HasDPP*/ 1>,
+ int_amdgcn_fdot2_f32_bf16, 1>;
+
+} // End SubtargetPredicate = HasDot8Insts
+
} // End let IsDOT = 1
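The packed dot-product instructions above fold element-wise products into a
scalar accumulator; a small C++ sketch of the v_dot2_f32_f16 / v_dot4_u32_u8
shape (illustrative only, clamping and f16 rounding omitted):

#include <cstdint>

// d = a[0]*b[0] + a[1]*b[1] + c  (f16 elements modeled as float here)
float dot2_f32_f16(const float a[2], const float b[2], float c) {
  return a[0] * b[0] + a[1] * b[1] + c;
}

// d = sum of four u8*u8 products plus the u32 accumulator
uint32_t dot4_u32_u8(const uint8_t a[4], const uint8_t b[4], uint32_t c) {
  uint32_t d = c;
  for (int i = 0; i < 4; ++i)
    d += (uint32_t)a[i] * b[i];
  return d;
}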
+multiclass VOP3PDOTIUInst <string OpName, SDPatternOperator intrinsic_node> {
+ let IsDOT = 1 in
+ defm NAME : VOP3PInst<OpName, VOP3P_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>,
+ null_frag, 1>;
+ // Dot-iu instructions consider an input as signed if its imod neg bits are set. Thus
+ // the dot-iu intrinsics have extra operands and require a separate codegen pattern.
+ def : GCNPat < (intrinsic_node (DotIUVOP3PMods i32:$src0_mods), i32:$src0,
+ (DotIUVOP3PMods i32:$src1_mods), i32:$src1,
+ i32:$src2, (i1 timm:$clamp)),
+ (!cast<Instruction>(NAME) $src0_mods, i32:$src0,
+ $src1_mods, i32:$src1,
+ (i32 8), i32:$src2, i1:$clamp)
+ >;
+}
+
+let SubtargetPredicate = HasDot8Insts in {
+defm V_DOT4_I32_IU8 : VOP3PDOTIUInst<"v_dot4_i32_iu8", int_amdgcn_sudot4>;
+defm V_DOT8_I32_IU4 : VOP3PDOTIUInst<"v_dot8_i32_iu4", int_amdgcn_sudot8>;
+} // End SubtargetPredicate = HasDot8Insts
+
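Following the comment above, the iu variants pick signedness per source through
the neg modifier bits; a hedged C++ sketch of the v_dot4_i32_iu8-style
accumulation (hypothetical helper, clamp/saturation omitted):

#include <cstdint>

// Each packed source is treated as signed or unsigned independently,
// as selected by its neg modifier bit.
int32_t dot4_i32_iu8(uint32_t a, bool a_signed, uint32_t b, bool b_signed,
                     int32_t c) {
  int64_t acc = c;
  for (int i = 0; i < 4; ++i) {
    uint8_t ae = (a >> (8 * i)) & 0xff;
    uint8_t be = (b >> (8 * i)) & 0xff;
    int64_t av = a_signed ? (int64_t)(int8_t)ae : (int64_t)ae;
    int64_t bv = b_signed ? (int64_t)(int8_t)be : (int64_t)be;
    acc += av * bv;
  }
  return (int32_t)acc;
}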
def : UDot2Pat<V_DOT2_U32_U16>;
def : SDot2Pat<V_DOT2_I32_I16>;
@@ -365,18 +429,18 @@ def VDst_256 : VOPDstOperand<VReg_256>;
def VDst_512 : VOPDstOperand<VReg_512>;
def VDst_1024 : VOPDstOperand<VReg_1024>;
-def VOPProfileAccRead : VOP3_Profile<VOP_I32_I32, VOP3_MAI> {
+def VOPProfileAccRead : VOP3P_Profile<VOP_I32_I32, VOP3_MAI> {
let Src0RC64 = ARegSrc_32;
}
-def VOPProfileAccWrite : VOP3_Profile<VOP_I32_I32, VOP3_MAI> {
+def VOPProfileAccWrite : VOP3P_Profile<VOP_I32_I32, VOP3_MAI> {
let DstRC = ADst_32;
- let Src0RC64 = VISrc_b32;
+ let Src0RC64 = VCSrc_b32;
}
class VOPProfileMAI<VOPProfile P, RegisterOperand _SrcRC, RegisterOperand _DstRC,
RegisterOperand SrcABRC = AVSrc_32>
- : VOP3_Profile<P, VOP3_MAI> {
+ : VOP3P_Profile<P, VOP3_MAI> {
let DstRC = _DstRC;
let Src0RC64 = SrcABRC;
let Src1RC64 = SrcABRC;
@@ -387,15 +451,27 @@ class VOPProfileMAI<VOPProfile P, RegisterOperand _SrcRC, RegisterOperand _DstRC
let HasOMod = 0;
let HasModifiers = 0;
let Asm64 = "$vdst, $src0, $src1, $src2$cbsz$abid$blgp";
+ let AsmVOP3DPPBase = Asm64;
let Ins64 = (ins Src0RC64:$src0, Src1RC64:$src1, Src2RC64:$src2, cbsz:$cbsz, abid:$abid, blgp:$blgp);
+ let InsVOP3Base = Ins64;
// Dst and SrcC cannot partially overlap if SrcC/Dst is bigger than 4 VGPRs.
// We then create two versions of the instruction: with tied dst and src2
- // and with the eralyclobber flag on the dst. This is strciter than the
+ // and with the earlyclobber flag on the dst. This is stricter than the
// actual HW restriction. In particular earlyclobber also affects src0 and
// src1 allocation which is not required.
bit NoDstOverlap = !gt(DstVT.Size, 128);
}
+class VOPProfileSMFMAC<VOPProfile P, RegisterOperand _DstRC,
+ RegisterOperand _SrcARC, RegisterOperand _SrcBRC>
+ : VOPProfileMAI<P, _DstRC, _DstRC, _SrcARC> {
+ let Src1RC64 = _SrcBRC;
+ let Src2VT = DstVT;
+ let Asm64 = " $vdst, $src0, $src1, $idx$cbsz$abid";
+ let Outs64 = (outs DstRC:$vdst);
+ let Ins64 = (ins Src0RC64:$src0, Src1RC64:$src1, VRegSrc_32:$idx, cbsz:$cbsz, abid:$abid, Src2RC64:$src2);
+}
+
def VOPProfileMAI_F32_F32_X4 : VOPProfileMAI<VOP_V4F32_F32_F32_V4F32, AISrc_128_f32, ADst_128>;
def VOPProfileMAI_F32_F32_X16 : VOPProfileMAI<VOP_V16F32_F32_F32_V16F32, AISrc_512_f32, ADst_512>;
def VOPProfileMAI_F32_F32_X32 : VOPProfileMAI<VOP_V32F32_F32_F32_V32F32, AISrc_1024_f32, ADst_1024>;
@@ -413,6 +489,10 @@ def VOPProfileMAI_F32_V4I16_X16 : VOPProfileMAI<VOP_V16F32_V4I16_V4I16_V16F32, A
def VOPProfileMAI_F32_V4I16_X32 : VOPProfileMAI<VOP_V32F32_V4I16_V4I16_V32F32, AISrc_1024_b32, ADst_1024, AVSrc_64>;
def VOPProfileMAI_F64_16X16X4F64 : VOPProfileMAI<VOP_V4F64_F64_F64_V4F64, AISrc_256_f64, ADst_256, AVSrc_64>;
def VOPProfileMAI_F64_4X4X4F64 : VOPProfileMAI<VOP_F64_F64_F64_F64, AISrc_64_f64, ADst_64, AVSrc_64>;
+def VOPProfileMAI_I32_I64_X16 : VOPProfileMAI<VOP_V4I32_I64_I64_V4I32, AISrc_128_b32, ADst_128, AVSrc_64>;
+def VOPProfileMAI_I32_I64_X32 : VOPProfileMAI<VOP_V16I32_I64_I64_V16I32, AISrc_512_b32, ADst_512, AVSrc_64>;
+def VOPProfileMAI_F32_V2F32_X16 : VOPProfileMAI<VOP_V4F32_V2F32_V2F32_V4F32, AISrc_128_b32, ADst_128, AVSrc_64>;
+def VOPProfileMAI_F32_V2F32_X32 : VOPProfileMAI<VOP_V16F32_V2F32_V2F32_V16F32, AISrc_512_b32, ADst_512, AVSrc_64>;
def VOPProfileMAI_F32_F32_X4_VCD : VOPProfileMAI<VOP_V4F32_F32_F32_V4F32, VISrc_128_f32, VDst_128>;
def VOPProfileMAI_F32_F32_X16_VCD : VOPProfileMAI<VOP_V16F32_F32_F32_V16F32, VISrc_512_f32, VDst_512>;
@@ -431,12 +511,37 @@ def VOPProfileMAI_F32_V4I16_X16_VCD : VOPProfileMAI<VOP_V16F32_V4I16_V4I16_V16F
def VOPProfileMAI_F32_V4I16_X32_VCD : VOPProfileMAI<VOP_V32F32_V4I16_V4I16_V32F32, VISrc_1024_b32, VDst_1024, AVSrc_64>;
def VOPProfileMAI_F64_16X16X4F64_VCD : VOPProfileMAI<VOP_V4F64_F64_F64_V4F64, VISrc_256_f64, VDst_256, AVSrc_64>;
def VOPProfileMAI_F64_4X4X4F64_VCD : VOPProfileMAI<VOP_F64_F64_F64_F64, VISrc_64_f64, VDst_64, AVSrc_64>;
+def VOPProfileMAI_I32_I64_X16_VCD : VOPProfileMAI<VOP_V4I32_I64_I64_V4I32, VISrc_128_b32, VDst_128, AVSrc_64>;
+def VOPProfileMAI_I32_I64_X32_VCD : VOPProfileMAI<VOP_V16I32_I64_I64_V16I32, VISrc_512_b32, VDst_512, AVSrc_64>;
+def VOPProfileMAI_F32_V2F32_X16_VCD : VOPProfileMAI<VOP_V4F32_V2F32_V2F32_V4F32, VISrc_128_b32, VDst_128, AVSrc_64>;
+def VOPProfileMAI_F32_V2F32_X32_VCD : VOPProfileMAI<VOP_V16F32_V2F32_V2F32_V16F32, VISrc_512_b32, VDst_512, AVSrc_64>;
+
+def VOPProfileSMFMAC_F32_16X16X32_F16 : VOPProfileSMFMAC<VOP_V4F32_V4F16_V8F16_I32, AVDst_128, AVSrc_64, AVSrc_128>;
+def VOPProfileSMFMAC_F32_32X32X16_F16 : VOPProfileSMFMAC<VOP_V16F32_V4F16_V8F16_I32, AVDst_512, AVSrc_64, AVSrc_128>;
+def VOPProfileSMFMAC_F32_16X16X32_I16 : VOPProfileSMFMAC<VOP_V4F32_V4I16_V8I16_I32, AVDst_128, AVSrc_64, AVSrc_128>;
+def VOPProfileSMFMAC_F32_32X32X16_I16 : VOPProfileSMFMAC<VOP_V16F32_V4I16_V8I16_I32, AVDst_512, AVSrc_64, AVSrc_128>;
+def VOPProfileSMFMAC_I32_16X16X64_I8 : VOPProfileSMFMAC<VOP_V4I32_V2I32_V4I32_I32, AVDst_128, AVSrc_64, AVSrc_128>;
+def VOPProfileSMFMAC_I32_32X32X32_I8 : VOPProfileSMFMAC<VOP_V16I32_V2I32_V4I32_I32, AVDst_512, AVSrc_64, AVSrc_128>;
class MFMATable <bit is_mac, string Name> {
bit IsMac = is_mac;
string FMAOp = Name;
}
+class MAIFrag<SDPatternOperator Op, code pred> : PatFrag <
+ (ops node:$src0, node:$src1, node:$src2, node:$cbsz, node:$abid, node:$blgp),
+ (Op $src0, $src1, $src2, $cbsz, $abid, $blgp),
+ pred
+>;
+
+let GISelPredicateCode = [{ return MF.getInfo<SIMachineFunctionInfo>()->mayNeedAGPRs(); }] in
+class AgprMAIFrag<SDPatternOperator Op> :
+ MAIFrag<Op, [{ return MF->getInfo<SIMachineFunctionInfo>()->mayNeedAGPRs(); }]>;
+
+let GISelPredicateCode = [{ return !MF.getInfo<SIMachineFunctionInfo>()->mayNeedAGPRs(); }] in
+class VgprMAIFrag<SDPatternOperator Op> :
+ MAIFrag<Op, [{ return !MF->getInfo<SIMachineFunctionInfo>()->mayNeedAGPRs(); }]>;
+
let Predicates = [HasMAIInsts] in {
let isAsCheapAsAMove = 1, isReMaterializable = 1 in {
@@ -446,47 +551,62 @@ let isAsCheapAsAMove = 1, isReMaterializable = 1 in {
} // End isMoveImm = 1
} // End isAsCheapAsAMove = 1, isReMaterializable = 1
+class MAIInst<string OpName, VOPProfile P, SDPatternOperator node>
+ : VOP3InstBase<OpName, P, node> {
+ Instruction Opcode = !cast<Instruction>(NAME);
+ bit is_dgemm = 0;
+ bit is_gfx940_xdl = 0;
+}
+
multiclass MAIInst<string OpName, string P, SDPatternOperator node,
bit NoDstOverlap = !cast<VOPProfileMAI>("VOPProfileMAI_" # P).NoDstOverlap> {
let isConvergent = 1, mayRaiseFPException = 0, ReadsModeReg = 1 in {
// FP32 denorm mode is respected, rounding mode is not. Exceptions are not supported.
let Constraints = !if(NoDstOverlap, "@earlyclobber $vdst", "") in {
- defm "" : VOP3Inst<OpName, !cast<VOPProfileMAI>("VOPProfileMAI_" # P), !if(NoDstOverlap, null_frag, node)>,
- MFMATable<0, NAME # "_e64">;
+ def _e64 : MAIInst<OpName, !cast<VOPProfileMAI>("VOPProfileMAI_" # P),
+ !if(NoDstOverlap, null_frag, AgprMAIFrag<node>)>,
+ MFMATable<0, NAME # "_e64">;
let SubtargetPredicate = isGFX90APlus, Mnemonic = OpName in
- defm _vgprcd : VOP3Inst<OpName # "_vgprcd", !cast<VOPProfileMAI>("VOPProfileMAI_" # P # "_VCD")>,
- MFMATable<0, NAME # "_vgprcd_e64">;
+ def _vgprcd_e64 : MAIInst<OpName # "_vgprcd", !cast<VOPProfileMAI>("VOPProfileMAI_" # P # "_VCD"),
+ !if(NoDstOverlap, null_frag, VgprMAIFrag<node>)>,
+ MFMATable<0, NAME # "_vgprcd_e64">;
}
foreach _ = BoolToList<NoDstOverlap>.ret in {
let Constraints = !if(NoDstOverlap, "$vdst = $src2", ""),
isConvertibleToThreeAddress = NoDstOverlap,
Mnemonic = OpName in {
- defm "_mac" : VOP3Inst<OpName # "_mac", !cast<VOPProfileMAI>("VOPProfileMAI_" # P), node>,
- MFMATable<1, NAME # "_e64">;
+ def "_mac_e64" : MAIInst<OpName # "_mac", !cast<VOPProfileMAI>("VOPProfileMAI_" # P), AgprMAIFrag<node>>,
+ MFMATable<1, NAME # "_e64">;
let SubtargetPredicate = isGFX90APlus in
- defm _mac_vgprcd : VOP3Inst<OpName # "_mac_vgprcd", !cast<VOPProfileMAI>("VOPProfileMAI_" # P # "_VCD")>,
- MFMATable<1, NAME # "_vgprcd_e64">;
+ def _mac_vgprcd_e64 : MAIInst<OpName # "_mac_vgprcd", !cast<VOPProfileMAI>("VOPProfileMAI_" # P # "_VCD"),
+ VgprMAIFrag<node>>,
+ MFMATable<1, NAME # "_vgprcd_e64">;
}
}
} // End isConvergent = 1, mayRaiseFPException = 0, ReadsModeReg = 1
}
defm V_MFMA_F32_4X4X1F32 : MAIInst<"v_mfma_f32_4x4x1f32", "F32_F32_X4", int_amdgcn_mfma_f32_4x4x1f32>;
-defm V_MFMA_F32_4X4X4F16 : MAIInst<"v_mfma_f32_4x4x4f16", "F32_V4F16_X4", int_amdgcn_mfma_f32_4x4x4f16>;
-defm V_MFMA_I32_4X4X4I8 : MAIInst<"v_mfma_i32_4x4x4i8", "I32_I32_X4", int_amdgcn_mfma_i32_4x4x4i8>;
defm V_MFMA_F32_16X16X1F32 : MAIInst<"v_mfma_f32_16x16x1f32", "F32_F32_X16", int_amdgcn_mfma_f32_16x16x1f32>;
defm V_MFMA_F32_16X16X4F32 : MAIInst<"v_mfma_f32_16x16x4f32", "F32_F32_X4", int_amdgcn_mfma_f32_16x16x4f32>;
+defm V_MFMA_F32_32X32X1F32 : MAIInst<"v_mfma_f32_32x32x1f32", "F32_F32_X32", int_amdgcn_mfma_f32_32x32x1f32>;
+defm V_MFMA_F32_32X32X2F32 : MAIInst<"v_mfma_f32_32x32x2f32", "F32_F32_X16", int_amdgcn_mfma_f32_32x32x2f32>;
+
+let is_gfx940_xdl = 1 in {
+defm V_MFMA_F32_4X4X4F16 : MAIInst<"v_mfma_f32_4x4x4f16", "F32_V4F16_X4", int_amdgcn_mfma_f32_4x4x4f16>;
+defm V_MFMA_I32_4X4X4I8 : MAIInst<"v_mfma_i32_4x4x4i8", "I32_I32_X4", int_amdgcn_mfma_i32_4x4x4i8>;
defm V_MFMA_F32_16X16X4F16 : MAIInst<"v_mfma_f32_16x16x4f16", "F32_V4F16_X16", int_amdgcn_mfma_f32_16x16x4f16>;
defm V_MFMA_F32_16X16X16F16 : MAIInst<"v_mfma_f32_16x16x16f16", "F32_V4F16_X4", int_amdgcn_mfma_f32_16x16x16f16>;
defm V_MFMA_I32_16X16X4I8 : MAIInst<"v_mfma_i32_16x16x4i8", "I32_I32_X16", int_amdgcn_mfma_i32_16x16x4i8>;
-defm V_MFMA_F32_32X32X1F32 : MAIInst<"v_mfma_f32_32x32x1f32", "F32_F32_X32", int_amdgcn_mfma_f32_32x32x1f32>;
-defm V_MFMA_F32_32X32X2F32 : MAIInst<"v_mfma_f32_32x32x2f32", "F32_F32_X16", int_amdgcn_mfma_f32_32x32x2f32>;
defm V_MFMA_F32_32X32X4F16 : MAIInst<"v_mfma_f32_32x32x4f16", "F32_V4F16_X32", int_amdgcn_mfma_f32_32x32x4f16>;
defm V_MFMA_F32_32X32X8F16 : MAIInst<"v_mfma_f32_32x32x8f16", "F32_V4F16_X16", int_amdgcn_mfma_f32_32x32x8f16>;
defm V_MFMA_I32_32X32X4I8 : MAIInst<"v_mfma_i32_32x32x4i8", "I32_I32_X32", int_amdgcn_mfma_i32_32x32x4i8>;
+}
+
+let Predicates = [isGFX908orGFX90A] in {
defm V_MFMA_I32_16X16X16I8 : MAIInst<"v_mfma_i32_16x16x16i8", "I32_I32_X4", int_amdgcn_mfma_i32_16x16x16i8>;
defm V_MFMA_I32_32X32X8I8 : MAIInst<"v_mfma_i32_32x32x8i8", "I32_I32_X16", int_amdgcn_mfma_i32_32x32x8i8>;
defm V_MFMA_F32_4X4X2BF16 : MAIInst<"v_mfma_f32_4x4x2bf16", "F32_V2I16_X4", int_amdgcn_mfma_f32_4x4x2bf16>;
@@ -494,34 +614,314 @@ defm V_MFMA_F32_16X16X2BF16 : MAIInst<"v_mfma_f32_16x16x2bf16", "F32_V2I16_X16",
defm V_MFMA_F32_16X16X8BF16 : MAIInst<"v_mfma_f32_16x16x8bf16", "F32_V2I16_X4", int_amdgcn_mfma_f32_16x16x8bf16>;
defm V_MFMA_F32_32X32X2BF16 : MAIInst<"v_mfma_f32_32x32x2bf16", "F32_V2I16_X32", int_amdgcn_mfma_f32_32x32x2bf16>;
defm V_MFMA_F32_32X32X4BF16 : MAIInst<"v_mfma_f32_32x32x4bf16", "F32_V2I16_X16", int_amdgcn_mfma_f32_32x32x4bf16>;
+}
} // End SubtargetPredicate = HasMAIInsts
let Predicates = [isGFX90APlus] in {
+ let is_gfx940_xdl = 1 in {
defm V_MFMA_F32_32X32X4BF16_1K : MAIInst<"v_mfma_f32_32x32x4bf16_1k", "F32_V4I16_X32", int_amdgcn_mfma_f32_32x32x4bf16_1k>;
defm V_MFMA_F32_16X16X4BF16_1K : MAIInst<"v_mfma_f32_16x16x4bf16_1k", "F32_V4I16_X16", int_amdgcn_mfma_f32_16x16x4bf16_1k>;
defm V_MFMA_F32_4X4X4BF16_1K : MAIInst<"v_mfma_f32_4x4x4bf16_1k", "F32_V4I16_X4", int_amdgcn_mfma_f32_4x4x4bf16_1k>;
defm V_MFMA_F32_32X32X8BF16_1K : MAIInst<"v_mfma_f32_32x32x8bf16_1k", "F32_V4I16_X16", int_amdgcn_mfma_f32_32x32x8bf16_1k>;
defm V_MFMA_F32_16X16X16BF16_1K : MAIInst<"v_mfma_f32_16x16x16bf16_1k", "F32_V4I16_X4", int_amdgcn_mfma_f32_16x16x16bf16_1k>;
+ }
+ let is_dgemm = 1 in {
defm V_MFMA_F64_16X16X4F64 : MAIInst<"v_mfma_f64_16x16x4f64", "F64_16X16X4F64", int_amdgcn_mfma_f64_16x16x4f64>;
defm V_MFMA_F64_4X4X4F64 : MAIInst<"v_mfma_f64_4x4x4f64", "F64_4X4X4F64", int_amdgcn_mfma_f64_4x4x4f64>;
+ }
} // End Predicates = [isGFX90APlus]
-let SubtargetPredicate = HasPackedFP32Ops, isCommutable = 1 in {
- defm V_PK_FMA_F32 : VOP3PInst<"v_pk_fma_f32", VOP3_Profile<VOP_V2F32_V2F32_V2F32_V2F32, VOP3_PACKED>, any_fma>;
- defm V_PK_MUL_F32 : VOP3PInst<"v_pk_mul_f32", VOP3_Profile<VOP_V2F32_V2F32_V2F32, VOP3_PACKED>, any_fmul>;
- defm V_PK_ADD_F32 : VOP3PInst<"v_pk_add_f32", VOP3_Profile<VOP_V2F32_V2F32_V2F32, VOP3_PACKED>, any_fadd>;
- defm V_PK_MOV_B32 : VOP3PInst<"v_pk_mov_b32", VOP3_Profile<VOP_V2I32_V2I32_V2I32, VOP3_PACKED>>;
+let Predicates = [isGFX940Plus], is_gfx940_xdl = 1 in {
+ defm V_MFMA_I32_32X32X16I8 : MAIInst<"v_mfma_i32_32x32x16i8", "I32_I64_X32", int_amdgcn_mfma_i32_32x32x16_i8>;
+ defm V_MFMA_I32_16X16X32I8 : MAIInst<"v_mfma_i32_16x16x32i8", "I32_I64_X16", int_amdgcn_mfma_i32_16x16x32_i8>;
+ defm V_MFMA_F32_16X16X8XF32 : MAIInst<"v_mfma_f32_16x16x8xf32", "F32_V2F32_X16", int_amdgcn_mfma_f32_16x16x8_xf32>;
+ defm V_MFMA_F32_32X32X4XF32 : MAIInst<"v_mfma_f32_32x32x4xf32", "F32_V2F32_X32", int_amdgcn_mfma_f32_32x32x4_xf32>;
+} // End Predicates = [isGFX940Plus], is_gfx940_xdl = 1
+
+multiclass SMFMACInst<string OpName, string P, SDPatternOperator node> {
+ let Constraints = "$vdst = $src2", DisableEncoding = "$src2",
+ isConvergent = 1, mayRaiseFPException = 0, ReadsModeReg = 1, is_gfx940_xdl = 1 in {
+ def _e64 : MAIInst<OpName, !cast<VOPProfileSMFMAC>("VOPProfileSMFMAC_" # P), node>;
+ }
+}
+
+let SubtargetPredicate = isGFX940Plus in {
+defm V_SMFMAC_F32_16X16X32_F16 : SMFMACInst<"v_smfmac_f32_16x16x32_f16", "F32_16X16X32_F16", int_amdgcn_smfmac_f32_16x16x32_f16>;
+defm V_SMFMAC_F32_32X32X16_F16 : SMFMACInst<"v_smfmac_f32_32x32x16_f16", "F32_32X32X16_F16", int_amdgcn_smfmac_f32_32x32x16_f16>;
+defm V_SMFMAC_F32_16X16X32_BF16 : SMFMACInst<"v_smfmac_f32_16x16x32_bf16", "F32_16X16X32_I16", int_amdgcn_smfmac_f32_16x16x32_bf16>;
+defm V_SMFMAC_F32_32X32X16_BF16 : SMFMACInst<"v_smfmac_f32_32x32x16_bf16", "F32_32X32X16_I16", int_amdgcn_smfmac_f32_32x32x16_bf16>;
+defm V_SMFMAC_I32_16X16X64_I8 : SMFMACInst<"v_smfmac_i32_16x16x64_i8", "I32_16X16X64_I8", int_amdgcn_smfmac_i32_16x16x64_i8>;
+defm V_SMFMAC_I32_32X32X32_I8 : SMFMACInst<"v_smfmac_i32_32x32x32_i8", "I32_32X32X32_I8", int_amdgcn_smfmac_i32_32x32x32_i8>;
+}
+
+def MAIInstInfoTable : GenericTable {
+ let FilterClass = "MAIInst";
+ let CppTypeName = "MAIInstInfo";
+ let Fields = [
+ "Opcode", "is_dgemm", "is_gfx940_xdl"
+ ];
+
+ let PrimaryKey = ["Opcode"];
+ let PrimaryKeyName = "getMAIInstInfoHelper";
+}
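MAIInstInfoTable is emitted by the TableGen SearchableTables backend, which produces the MAIInstInfo record type plus a lookup named by PrimaryKeyName. A hedged sketch of how such a table is typically queried from C++; the field types and the isDGEMMOpcode wrapper are assumptions for illustration, not part of this patch:

#include <cstdint>

// Approximate shape of the record generated for CppTypeName = "MAIInstInfo";
// the actual definition and the sorted table live in the generated
// searchable-tables include, not here.
struct MAIInstInfo {
  uint16_t Opcode;
  bool is_dgemm;
  bool is_gfx940_xdl;
};

// Generated primary-key lookup (declared here, defined by the generated code).
const MAIInstInfo *getMAIInstInfoHelper(uint16_t Opcode);

// Illustrative wrapper: classify an MFMA opcode using the table.
inline bool isDGEMMOpcode(uint16_t Opcode) {
  const MAIInstInfo *Info = getMAIInstInfoHelper(Opcode);
  return Info && Info->is_dgemm;
}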
+
+let SubtargetPredicate = HasPackedFP32Ops, isCommutable = 1, isReMaterializable = 1 in {
+ defm V_PK_FMA_F32 : VOP3PInst<"v_pk_fma_f32", VOP3P_Profile<VOP_V2F32_V2F32_V2F32_V2F32, VOP3_PACKED>, any_fma>;
+ defm V_PK_MUL_F32 : VOP3PInst<"v_pk_mul_f32", VOP3P_Profile<VOP_V2F32_V2F32_V2F32, VOP3_PACKED>, any_fmul>;
+ defm V_PK_ADD_F32 : VOP3PInst<"v_pk_add_f32", VOP3P_Profile<VOP_V2F32_V2F32_V2F32, VOP3_PACKED>, any_fadd>;
+ defm V_PK_MOV_B32 : VOP3PInst<"v_pk_mov_b32", VOP3P_Profile<VOP_V2I32_V2I32_V2I32, VOP3_PACKED>>;
} // End SubtargetPredicate = HasPackedFP32Ops, isCommutable = 1
def : MnemonicAlias<"v_accvgpr_read", "v_accvgpr_read_b32">;
def : MnemonicAlias<"v_accvgpr_write", "v_accvgpr_write_b32">;
+class VOPProfileWMMA<VOPProfile P, string Suffix, RegisterOperand _Src01RC64, bit _HasClamp, bit _HasOpSel> : VOP3P_Profile<P> {
+ let DstRC = !if(!eq(Suffix, "_w32"), VDst_256, VDst_128);
+ let Src0RC64 = _Src01RC64;
+ let Src1RC64 = _Src01RC64;
+ let Src2RC64 = !if(!eq(Suffix, "_w32"), VISrc_256_f64, VISrc_128_f32);
+ let HasClamp = _HasClamp;
+ let HasOpSel = _HasOpSel;
+ let IsPacked = 1;
+ let IsWMMA = 1;
+}
+
+def VOP_V8F32_V16F16_V16F16_V8F32 : VOPProfile <[v8f32, v16f16, v16f16, v8f32]>;
+def VOP_V8F32_V16I16_V16I16_V8F32 : VOPProfile <[v8f32, v16i16, v16i16, v8f32]>;
+def VOP_V16F16_V16F16_V16F16_V16F16 : VOPProfile <[v16f16, v16f16, v16f16, v16f16]>;
+def VOP_V16I16_V16I16_V16I16_V16I16 : VOPProfile <[v16i16, v16i16, v16i16, v16i16]>;
+def VOP_V8I32_V4I32_V4I32_V8I32 : VOPProfile <[v8i32, v4i32, v4i32, v8i32]>;
+def VOP_V8I32_V2I32_V2I32_V8I32 : VOPProfile <[v8i32, v2i32, v2i32, v8i32]>;
+
+def VOP_V4F32_V16F16_V16F16_V4F32 : VOPProfile <[v4f32, v16f16, v16f16, v4f32]>;
+def VOP_V4F32_V16I16_V16I16_V4F32 : VOPProfile <[v4f32, v16i16, v16i16, v4f32]>;
+def VOP_V8F16_V16F16_V16F16_V8F16 : VOPProfile <[v8f16, v16f16, v16f16, v8f16]>;
+def VOP_V8I16_V16I16_V16I16_V8I16 : VOPProfile <[v8i16, v16i16, v16i16, v8i16]>;
+def VOP_V4I32_V4I32_V4I32_V4I32 : VOPProfile <[v4i32, v4i32, v4i32, v4i32]>;
+def VOP_V4I32_V2I32_V2I32_V4I32 : VOPProfile <[v4i32, v2i32, v2i32, v4i32]>;
+
+
+class WMMAType <bits<2> val> {
+ bit hasClamp = val{0};
+ bit hasOpsel = val{1};
+}
+
+def WMMARegular : WMMAType<0b00>;
+def WMMAUIClamp : WMMAType<0b01>;
+def WMMAOpSel : WMMAType<0b10>;
+
+class WMMARegularPat<Instruction Inst, SDPatternOperator node, VOPProfile P> :
+ GCNPat < (P.DstVT (node
+ (P.Src0VT (VOP3PMods P.Src0VT:$src0, i32:$src0_modifiers)),
+ (P.Src1VT (VOP3PMods P.Src1VT:$src1, i32:$src1_modifiers)),
+ (P.Src2VT (VOP3PMods P.Src2VT:$src2, i32:$src2_modifiers))
+ )),
+ (P.DstVT (Inst i32:$src0_modifiers, P.Src0VT:$src0, i32:$src1_modifiers, P.Src1VT:$src1, $src2_modifiers, P.Src2VT:$src2))
+>;
+
+class WMMAOpSelPat<Instruction Inst, SDPatternOperator node, VOPProfile P> :
+ GCNPat < (P.DstVT (node
+ (P.Src0VT P.Src0VT:$src0),
+ (P.Src1VT P.Src1VT:$src1),
+ (P.Src2VT P.Src2VT:$src2), (WMMAOpSelVOP3PMods i32:$src2_modifiers)
+ )),
+ (P.DstVT (Inst (i32 8), P.Src0VT:$src0, (i32 8), P.Src1VT:$src1, i32:$src2_modifiers, P.Src2VT:$src2))
+>;
+
+class WMMAUIClampPat<Instruction Inst, SDPatternOperator node, VOPProfile P> :
+ GCNPat < (P.DstVT (node
+ (DotIUVOP3PMods i32:$src0_modifiers), (P.Src0VT P.Src0VT:$src0),
+ (DotIUVOP3PMods i32:$src1_modifiers), (P.Src1VT P.Src1VT:$src1),
+ (P.Src2VT P.Src2VT:$src2), (i1 timm:$clamp)
+ )),
+ (P.DstVT (Inst i32:$src0_modifiers, P.Src0VT:$src0, i32:$src1_modifiers, P.Src1VT:$src1, (i32 8), P.Src2VT:$src2, i1:$clamp))
+>;
+
+class WMMAOpcodeMapping<Instruction TwoAddr, Instruction ThreeAddr> {
+ Instruction Opcode2Addr = TwoAddr;
+ Instruction Opcode3Addr = ThreeAddr;
+ Predicate WaveSizePredicate;
+}
+
+def WMMAOpcode : GenericEnum {
+ let FilterClass = "VOP3P_Pseudo";
+}
+
+class WMMAMappingTable : GenericTable {
+ let FilterClass = "WMMAOpcodeMapping";
+ let CppTypeName = "WMMAOpcodeMappingInfo";
+ let Fields = ["Opcode2Addr", "Opcode3Addr"];
+ string TypeOf_Opcode2Addr = "WMMAOpcode";
+ string TypeOf_Opcode3Addr = "WMMAOpcode";
+}
+
+def WMMAOpcode2AddrMappingTable : WMMAMappingTable {
+ let PrimaryKey = ["Opcode2Addr"];
+ let PrimaryKeyName = "getWMMAMappingInfoFrom2AddrOpcode";
+}
+
+def WMMAOpcode3AddrMappingTable : WMMAMappingTable {
+ let PrimaryKey = ["Opcode3Addr"];
+ let PrimaryKeyName = "getWMMAMappingInfoFrom3AddrOpcode";
+}
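The two keyed tables let target code translate between the tied (_twoaddr) and @earlyclobber (_threeaddr) WMMA forms in either direction. A hedged sketch of that translation, assuming the standard SearchableTables emission; plain unsigned keys stand in for the generated WMMAOpcode enum, and the mapWMMA2AddrTo3Addr helper name is illustrative, not part of this patch:

// Approximate shape of the record generated for
// CppTypeName = "WMMAOpcodeMappingInfo" (the generated code keys these fields
// on the WMMAOpcode enum; plain integers are used here for brevity).
struct WMMAOpcodeMappingInfo {
  unsigned Opcode2Addr;
  unsigned Opcode3Addr;
};

// Generated lookups, one per PrimaryKeyName above.
const WMMAOpcodeMappingInfo *getWMMAMappingInfoFrom2AddrOpcode(unsigned Opc);
const WMMAOpcodeMappingInfo *getWMMAMappingInfoFrom3AddrOpcode(unsigned Opc);

// Illustrative helper: map a tied two-address WMMA opcode to its
// three-address (@earlyclobber) counterpart, or return the input unchanged.
inline unsigned mapWMMA2AddrTo3Addr(unsigned Opc) {
  if (const WMMAOpcodeMappingInfo *Info = getWMMAMappingInfoFrom2AddrOpcode(Opc))
    return Info->Opcode3Addr;
  return Opc;
}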
+
+// The WMMA instruction has extra constraints:
+// Matrices A and B cannot overlap with D. C cannot partially overlap with D,
+// but it is OK for them to be the same (which is a typical case).
+//
+// We implement it as follows:
+// 1) Map the intrinsic to the pseudo where D is tied to C ($vdst = $src2).
+// 2) The twoaddressinstruction pass checks whether src2 is live; if it is, it
+// converts the default pseudo to the pseudo where src2 is not the same as vdst.
+// 3) @earlyclobber on the destination satisfies the constraint during RA.
+
+multiclass WMMAInst<string Suffix, string Instr, VOPProfile P, SDPatternOperator node = null_frag, RegisterOperand _Src01RC64 = VRegSrc_256, WMMAType Type> {
+
+ defvar WMMAConstraints2Addr = "@earlyclobber $vdst,$vdst = $src2";
+ defvar WMMAConstraints3Addr = "@earlyclobber $vdst";
+
+ defvar WMMAProfile = VOPProfileWMMA<P, Suffix, _Src01RC64, Type.hasClamp, Type.hasOpsel>;
+ if !eq(Suffix, "_w32") then {
+ let Mnemonic = Instr, mayRaiseFPException = 0, ReadsModeReg = 0 in {
+ let Constraints = WMMAConstraints2Addr, isConvertibleToThreeAddress = 1 in {
+ def _twoaddr_w32 : VOP3P_Pseudo<Instr # Suffix, WMMAProfile>;
+ }
+ let Constraints = WMMAConstraints3Addr, SchedRW = [Write32Bit, Write32Bit] in {
+ def _threeaddr_w32 : VOP3P_Pseudo<Instr # Suffix, WMMAProfile>;
+ }
+ }
+ def : WMMAOpcodeMapping<!cast<Instruction>(NAME # _twoaddr_w32),
+ !cast<Instruction>(NAME # _threeaddr_w32)>;
+ } else if !eq(Suffix, "_w64") then {
+ let Mnemonic = Instr, mayRaiseFPException = 0, ReadsModeReg = 0 in {
+ let Constraints = WMMAConstraints2Addr, isConvertibleToThreeAddress = 1 in {
+ def _twoaddr_w64 : VOP3P_Pseudo<Instr # Suffix, WMMAProfile>;
+ }
+ let Constraints = WMMAConstraints3Addr, SchedRW = [Write32Bit, Write32Bit] in {
+ def _threeaddr_w64 : VOP3P_Pseudo<Instr # Suffix, WMMAProfile>;
+ }
+ }
+ def : WMMAOpcodeMapping<!cast<Instruction>(NAME # _twoaddr_w64),
+ !cast<Instruction>(NAME # _threeaddr_w64)>;
+ }
+
+ if !eq(Type, WMMAOpSel) then {
+ def : WMMAOpSelPat<!cast<Instruction>(NAME # _twoaddr # Suffix), node, P>;
+ } else if !eq(Type, WMMAUIClamp) then {
+ def : WMMAUIClampPat<!cast<Instruction>(NAME # _twoaddr # Suffix), node, P>;
+ } else {
+ def : WMMARegularPat<!cast<Instruction>(NAME # _twoaddr # Suffix), node, P>;
+ }
+}
+
+
+let WaveSizePredicate = isWave32 in {
+ defm V_WMMA_F32_16X16X16_F16 : WMMAInst<"_w32", "v_wmma_f32_16x16x16_f16", VOP_V8F32_V16F16_V16F16_V8F32, int_amdgcn_wmma_f32_16x16x16_f16, VRegSrc_256, WMMARegular>;
+ defm V_WMMA_F32_16X16X16_BF16 : WMMAInst<"_w32", "v_wmma_f32_16x16x16_bf16", VOP_V8F32_V16I16_V16I16_V8F32, int_amdgcn_wmma_f32_16x16x16_bf16, VRegSrc_256, WMMARegular>;
+ defm V_WMMA_F16_16X16X16_F16 : WMMAInst<"_w32", "v_wmma_f16_16x16x16_f16", VOP_V16F16_V16F16_V16F16_V16F16, int_amdgcn_wmma_f16_16x16x16_f16, VRegSrc_256, WMMAOpSel>;
+ defm V_WMMA_BF16_16X16X16_BF16 : WMMAInst<"_w32", "v_wmma_bf16_16x16x16_bf16", VOP_V16I16_V16I16_V16I16_V16I16, int_amdgcn_wmma_bf16_16x16x16_bf16, VRegSrc_256, WMMAOpSel>;
+ defm V_WMMA_I32_16X16X16_IU8 : WMMAInst<"_w32", "v_wmma_i32_16x16x16_iu8", VOP_V8I32_V4I32_V4I32_V8I32, int_amdgcn_wmma_i32_16x16x16_iu8, VRegSrc_128, WMMAUIClamp>;
+ defm V_WMMA_I32_16X16X16_IU4 : WMMAInst<"_w32", "v_wmma_i32_16x16x16_iu4", VOP_V8I32_V2I32_V2I32_V8I32, int_amdgcn_wmma_i32_16x16x16_iu4, VRegSrc_64, WMMAUIClamp>;
+}
+
+let WaveSizePredicate = isWave64 in {
+ defm V_WMMA_F32_16X16X16_F16 : WMMAInst<"_w64", "v_wmma_f32_16x16x16_f16", VOP_V4F32_V16F16_V16F16_V4F32, int_amdgcn_wmma_f32_16x16x16_f16, VRegSrc_256, WMMARegular>;
+ defm V_WMMA_F32_16X16X16_BF16 : WMMAInst<"_w64", "v_wmma_f32_16x16x16_bf16", VOP_V4F32_V16I16_V16I16_V4F32, int_amdgcn_wmma_f32_16x16x16_bf16, VRegSrc_256, WMMARegular>;
+ defm V_WMMA_F16_16X16X16_F16 : WMMAInst<"_w64", "v_wmma_f16_16x16x16_f16", VOP_V8F16_V16F16_V16F16_V8F16, int_amdgcn_wmma_f16_16x16x16_f16, VRegSrc_256, WMMAOpSel>;
+ defm V_WMMA_BF16_16X16X16_BF16 : WMMAInst<"_w64", "v_wmma_bf16_16x16x16_bf16", VOP_V8I16_V16I16_V16I16_V8I16, int_amdgcn_wmma_bf16_16x16x16_bf16, VRegSrc_256, WMMAOpSel>;
+ defm V_WMMA_I32_16X16X16_IU8 : WMMAInst<"_w64", "v_wmma_i32_16x16x16_iu8", VOP_V4I32_V4I32_V4I32_V4I32, int_amdgcn_wmma_i32_16x16x16_iu8, VRegSrc_128, WMMAUIClamp>;
+ defm V_WMMA_I32_16X16X16_IU4 : WMMAInst<"_w64", "v_wmma_i32_16x16x16_iu4", VOP_V4I32_V2I32_V2I32_V4I32, int_amdgcn_wmma_i32_16x16x16_iu4, VRegSrc_64, WMMAUIClamp>;
+
+}
+
//===----------------------------------------------------------------------===//
// Begin Real Encodings
//===----------------------------------------------------------------------===//
+class VOP3P_DPP16<bits<7> op, VOP_DPP_Pseudo ps, int subtarget,
+ string opName = ps.OpName>
+ : VOP3P_DPP<op, opName, ps.Pfl, 1>, SIMCInstr<ps.PseudoInstr, subtarget> {
+ let hasSideEffects = ps.hasSideEffects;
+ let Defs = ps.Defs;
+ let SchedRW = ps.SchedRW;
+ let Uses = ps.Uses;
+ let AssemblerPredicate = HasDPP16;
+ let SubtargetPredicate = HasDPP16;
+ let OtherPredicates = ps.OtherPredicates;
+}
+
+class VOP3P_DPP8_Base<bits<7> op, VOP_Pseudo ps, string opName = ps.OpName>
+ : VOP3P_DPP8<op, opName, ps.Pfl> {
+ let hasSideEffects = ps.hasSideEffects;
+ let Defs = ps.Defs;
+ let SchedRW = ps.SchedRW;
+ let Uses = ps.Uses;
+ let OtherPredicates = ps.OtherPredicates;
+}
+
+//===----------------------------------------------------------------------===//
+// GFX11.
+//===----------------------------------------------------------------------===//
+
+let AssemblerPredicate = isGFX11Plus,
+ DecoderNamespace = "GFX11" in {
+
+ multiclass VOP3P_Real_gfx11<bits<7> op, string backing_ps_name = NAME,
+ string asmName = !cast<VOP3P_Pseudo>(NAME).Mnemonic> {
+ def _gfx11 : VOP3P_Real<!cast<VOP3P_Pseudo>(backing_ps_name),
+ SIEncodingFamily.GFX11, asmName>,
+ VOP3Pe_gfx11<op, !cast<VOP3P_Pseudo>(backing_ps_name).Pfl>;
+ }
+
+ multiclass VOP3P_Real_dpp_gfx11<bits<7> op, string backing_ps_name = NAME,
+ string asmName = !cast<VOP3P_Pseudo>(NAME).Mnemonic> {
+ defvar ps = !cast<VOP3P_Pseudo>(backing_ps_name);
+ def _dpp_gfx11
+ : VOP3P_DPP16<op, !cast<VOP_DPP_Pseudo>(backing_ps_name #"_dpp"),
+ SIEncodingFamily.GFX11> {
+ let AsmString = asmName #ps.Pfl.AsmVOP3DPP16;
+ let DecoderNamespace = "DPPGFX11";
+ }
+ }
+
+ multiclass VOP3P_Real_dpp8_gfx11<bits<7> op, string backing_ps_name = NAME,
+ string asmName = !cast<VOP3P_Pseudo>(NAME).Mnemonic> {
+ defvar ps = !cast<VOP3P_Pseudo>(backing_ps_name);
+ def _dpp8_gfx11 : VOP3P_DPP8_Base<op, ps> {
+ let AsmString = asmName #ps.Pfl.AsmVOP3DPP8;
+ let DecoderNamespace = "DPP8GFX11";
+ }
+ }
+
+ multiclass VOP3P_Realtriple_gfx11<bits<7> op, string backing_ps_name = NAME,
+ string asmName = !cast<VOP3P_Pseudo>(NAME).Mnemonic>
+ : VOP3P_Real_gfx11<op, backing_ps_name, asmName>,
+ VOP3P_Real_dpp_gfx11<op, backing_ps_name, asmName>,
+ VOP3P_Real_dpp8_gfx11<op, backing_ps_name, asmName>;
+} // End AssemblerPredicate = isGFX11Plus, DecoderNamespace = "GFX11"
+
+defm V_DOT4_I32_IU8 : VOP3P_Real_gfx11 <0x16>;
+defm V_DOT8_I32_IU4 : VOP3P_Real_gfx11 <0x18>;
+defm V_DOT2_F32_BF16 : VOP3P_Real_gfx11 <0x1a>;
+
+multiclass VOP3P_Real_WMMA <bits<7> op> {
+ let WaveSizePredicate = isWave32, DecoderNamespace = "GFX11" in {
+ defm _twoaddr_w32 : VOP3P_Real_gfx11 <op>;
+ }
+ let WaveSizePredicate = isWave64, DecoderNamespace = "WMMAGFX11" in {
+ defm _twoaddr_w64 : VOP3P_Real_gfx11 <op>;
+ }
+}
+
+defm V_WMMA_F32_16X16X16_F16 : VOP3P_Real_WMMA <0x040>;
+defm V_WMMA_F32_16X16X16_BF16 : VOP3P_Real_WMMA <0x041>;
+defm V_WMMA_F16_16X16X16_F16 : VOP3P_Real_WMMA <0x042>;
+defm V_WMMA_BF16_16X16X16_BF16 : VOP3P_Real_WMMA <0x043>;
+defm V_WMMA_I32_16X16X16_IU8 : VOP3P_Real_WMMA <0x044>;
+defm V_WMMA_I32_16X16X16_IU4 : VOP3P_Real_WMMA <0x045>;
+
//===----------------------------------------------------------------------===//
// GFX8 (VI)
//===----------------------------------------------------------------------===//
@@ -557,15 +957,64 @@ multiclass VOP3P_Real_MFMA_gfx90a<bits<7> op> {
VOP3Pe_MAI <op, !cast<VOP3_Pseudo>(NAME # "_vgprcd" # "_e64").Pfl, 0>;
} // End AssemblerPredicate = isGFX90AOnly, DecoderNamespace = "GFX90A"
}
+}
+
+multiclass VOP3P_Real_MFMA_gfx940_aliases<string NameFrom, string NameTo, string Op,
+ VOP3_Pseudo PS_ACD = !cast<VOP3_Pseudo>(Op # "_e64"),
+ VOP3_Pseudo PS_VCD = !cast<VOP3_Pseudo>(Op # "_vgprcd" # "_e64"),
+ VOPProfile Pfl_ACD = PS_ACD.Pfl,
+ VOPProfile Pfl_VCD = PS_VCD.Pfl> {
+ let Predicates = [isGFX940Plus] in {
+ foreach _ = BoolToList<!ne(NameFrom, NameTo)>.ret in {
+ def : InstAlias <NameTo # " " # PS_ACD.AsmOperands,
+ (!cast<VOP3P_Real>(Op # "_gfx940_acd") Pfl_ACD.DstRC:$vdst,
+ Pfl_ACD.Src0RC64:$src0, Pfl_ACD.Src1RC64:$src1, Pfl_ACD.Src2RC64:$src2,
+ cbsz:$cbsz, abid:$abid, blgp:$blgp)>, PredicateControl;
+ def : InstAlias <NameTo # " " # PS_VCD.AsmOperands,
+ (!cast<VOP3P_Real>(Op # "_gfx940_vcd") Pfl_VCD.DstRC:$vdst,
+ Pfl_VCD.Src0RC64:$src0, Pfl_VCD.Src1RC64:$src1, Pfl_VCD.Src2RC64:$src2,
+ cbsz:$cbsz, abid:$abid, blgp:$blgp)>, PredicateControl;
+ }
+ } // End Predicates = [isGFX940Plus]
+}
+
+multiclass VOP3P_Real_MFMA_gfx940<bits<7> op, string Name = !cast<VOP3_Pseudo>(NAME#"_e64").Mnemonic,
+ VOP3_Pseudo PS_ACD = !cast<VOP3_Pseudo>(NAME # "_e64"),
+ VOP3_Pseudo PS_VCD = !cast<VOP3_Pseudo>(NAME # "_vgprcd" # "_e64")> {
+ let SubtargetPredicate = isGFX940Plus,
+ AssemblerPredicate = isGFX940Plus, DecoderNamespace = "GFX9",
+ AsmString = Name # PS_ACD.AsmOperands, Constraints = "" in {
+ def _gfx940_acd : VOP3P_Real<PS_ACD, SIEncodingFamily.GFX940>,
+ VOP3Pe_MAI <op, PS_ACD.Pfl, 1>;
+
+ def _gfx940_vcd : VOP3P_Real<PS_VCD, SIEncodingFamily.GFX940>,
+ VOP3Pe_MAI <op, PS_VCD.Pfl, 0>;
+ } // End AssemblerPredicate = isGFX940Plus, DecoderNamespace = "GFX9"
-multiclass VOP3P_Real_MFMA<bits<7> op> :
- VOP3P_Real_MFMA_gfx90a <op> {
+ defm : VOP3P_Real_MFMA_gfx940_aliases<Name, PS_ACD.Mnemonic, NAME>;
+
+ foreach _ = BoolToList<!ne(!subst("_1k", "", PS_ACD.Mnemonic), PS_ACD.Mnemonic)>.ret in
+ defm : VOP3P_Real_MFMA_gfx940_aliases<Name, !subst("_1k", "", PS_ACD.Mnemonic), NAME>;
+}
+
+multiclass VOP3P_Real_MFMA<bits<7> op, string GFX940Name = !cast<VOP3_Pseudo>(NAME#"_e64").Mnemonic> :
+ VOP3P_Real_MFMA_gfx90a <op>,
+ VOP3P_Real_MFMA_gfx940 <op, GFX940Name> {
def _vi : VOP3P_Real<!cast<VOP3_Pseudo>(NAME#"_e64"), SIEncodingFamily.VI>,
VOP3Pe_MAI <op, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl, ?> {
let AssemblerPredicate = HasMAIInsts;
let DecoderNamespace = "GFX8";
+ let Constraints = "";
}
}
+
+multiclass VOP3P_Real_SMFMAC<bits<7> op, string alias> {
+ def _gfx940 : VOP3P_Real<!cast<VOP3_Pseudo>(NAME#"_e64"), SIEncodingFamily.VI>,
+ VOP3Pe_SMFMAC <op> {
+ let AssemblerPredicate = isGFX940Plus;
+ let DecoderNamespace = "GFX8";
+ }
+ def : MnemonicAlias<alias, !cast<VOP3_Pseudo>(NAME#"_e64").Mnemonic>;
}
defm V_PK_MAD_I16 : VOP3P_Real_vi <0x00>;
@@ -634,19 +1083,21 @@ let SubtargetPredicate = HasMAIInsts in {
defm V_ACCVGPR_READ_B32 : VOP3P_Real_MAI <0x58>;
defm V_ACCVGPR_WRITE_B32 : VOP3P_Real_MAI <0x59>;
-defm V_MFMA_F32_32X32X1F32 : VOP3P_Real_MFMA <0x40>;
-defm V_MFMA_F32_16X16X1F32 : VOP3P_Real_MFMA <0x41>;
-defm V_MFMA_F32_4X4X1F32 : VOP3P_Real_MFMA <0x42>;
-defm V_MFMA_F32_32X32X2F32 : VOP3P_Real_MFMA <0x44>;
-defm V_MFMA_F32_16X16X4F32 : VOP3P_Real_MFMA <0x45>;
-defm V_MFMA_F32_32X32X4F16 : VOP3P_Real_MFMA <0x48>;
-defm V_MFMA_F32_16X16X4F16 : VOP3P_Real_MFMA <0x49>;
-defm V_MFMA_F32_4X4X4F16 : VOP3P_Real_MFMA <0x4a>;
-defm V_MFMA_F32_32X32X8F16 : VOP3P_Real_MFMA <0x4c>;
-defm V_MFMA_F32_16X16X16F16 : VOP3P_Real_MFMA <0x4d>;
-defm V_MFMA_I32_32X32X4I8 : VOP3P_Real_MFMA <0x50>;
-defm V_MFMA_I32_16X16X4I8 : VOP3P_Real_MFMA <0x51>;
-defm V_MFMA_I32_4X4X4I8 : VOP3P_Real_MFMA <0x52>;
+defm V_MFMA_F32_32X32X1F32 : VOP3P_Real_MFMA <0x40, "v_mfma_f32_32x32x1_2b_f32">;
+defm V_MFMA_F32_16X16X1F32 : VOP3P_Real_MFMA <0x41, "v_mfma_f32_16x16x1_4b_f32">;
+defm V_MFMA_F32_4X4X1F32 : VOP3P_Real_MFMA <0x42, "v_mfma_f32_4x4x1_16b_f32">;
+defm V_MFMA_F32_32X32X2F32 : VOP3P_Real_MFMA <0x44, "v_mfma_f32_32x32x2_f32">;
+defm V_MFMA_F32_16X16X4F32 : VOP3P_Real_MFMA <0x45, "v_mfma_f32_16x16x4_f32">;
+defm V_MFMA_F32_32X32X4F16 : VOP3P_Real_MFMA <0x48, "v_mfma_f32_32x32x4_2b_f16">;
+defm V_MFMA_F32_16X16X4F16 : VOP3P_Real_MFMA <0x49, "v_mfma_f32_16x16x4_4b_f16">;
+defm V_MFMA_F32_4X4X4F16 : VOP3P_Real_MFMA <0x4a, "v_mfma_f32_4x4x4_16b_f16">;
+defm V_MFMA_F32_32X32X8F16 : VOP3P_Real_MFMA <0x4c, "v_mfma_f32_32x32x8_f16">;
+defm V_MFMA_F32_16X16X16F16 : VOP3P_Real_MFMA <0x4d, "v_mfma_f32_16x16x16_f16">;
+defm V_MFMA_I32_32X32X4I8 : VOP3P_Real_MFMA <0x50, "v_mfma_i32_32x32x4_2b_i8">;
+defm V_MFMA_I32_16X16X4I8 : VOP3P_Real_MFMA <0x51, "v_mfma_i32_16x16x4_4b_i8">;
+defm V_MFMA_I32_4X4X4I8 : VOP3P_Real_MFMA <0x52, "v_mfma_i32_4x4x4_16b_i8">;
+
+let SubtargetPredicate = isGFX908orGFX90A in {
defm V_MFMA_I32_16X16X16I8 : VOP3P_Real_MFMA <0x55>;
defm V_MFMA_I32_32X32X8I8 : VOP3P_Real_MFMA <0x54>;
defm V_MFMA_F32_32X32X2BF16 : VOP3P_Real_MFMA <0x68>;
@@ -654,6 +1105,7 @@ defm V_MFMA_F32_16X16X2BF16 : VOP3P_Real_MFMA <0x69>;
defm V_MFMA_F32_4X4X2BF16 : VOP3P_Real_MFMA <0x6b>;
defm V_MFMA_F32_32X32X4BF16 : VOP3P_Real_MFMA <0x6c>;
defm V_MFMA_F32_16X16X8BF16 : VOP3P_Real_MFMA <0x6d>;
+}
} // End SubtargetPredicate = HasMAIInsts
@@ -665,6 +1117,27 @@ defm V_MFMA_F32_16X16X16BF16_1K : VOP3P_Real_MFMA_gfx90a <0x67>;
defm V_MFMA_F64_16X16X4F64 : VOP3P_Real_MFMA_gfx90a <0x6e>;
defm V_MFMA_F64_4X4X4F64 : VOP3P_Real_MFMA_gfx90a <0x6f>;
+defm V_MFMA_I32_32X32X16I8 : VOP3P_Real_MFMA_gfx940 <0x56, "v_mfma_i32_32x32x16_i8">;
+defm V_MFMA_I32_16X16X32I8 : VOP3P_Real_MFMA_gfx940 <0x57, "v_mfma_i32_16x16x32_i8">;
+defm V_MFMA_F32_16X16X8XF32 : VOP3P_Real_MFMA_gfx940 <0x3e, "v_mfma_f32_16x16x8_xf32">;
+defm V_MFMA_F32_32X32X4XF32 : VOP3P_Real_MFMA_gfx940 <0x3f, "v_mfma_f32_32x32x4_xf32">;
+
+defm V_MFMA_F32_32X32X4BF16_1K : VOP3P_Real_MFMA_gfx940 <0x5d, "v_mfma_f32_32x32x4_2b_bf16">;
+defm V_MFMA_F32_16X16X4BF16_1K : VOP3P_Real_MFMA_gfx940 <0x5e, "v_mfma_f32_16x16x4_4b_bf16">;
+defm V_MFMA_F32_4X4X4BF16_1K : VOP3P_Real_MFMA_gfx940 <0x5f, "v_mfma_f32_4x4x4_16b_bf16">;
+defm V_MFMA_F32_32X32X8BF16_1K : VOP3P_Real_MFMA_gfx940 <0x60, "v_mfma_f32_32x32x8_bf16">;
+defm V_MFMA_F32_16X16X16BF16_1K : VOP3P_Real_MFMA_gfx940 <0x61, "v_mfma_f32_16x16x16_bf16">;
+
+defm V_MFMA_F64_16X16X4F64 : VOP3P_Real_MFMA_gfx940 <0x6e, "v_mfma_f64_16x16x4_f64">;
+defm V_MFMA_F64_4X4X4F64 : VOP3P_Real_MFMA_gfx940 <0x6f, "v_mfma_f64_4x4x4_4b_f64">;
+
+defm V_SMFMAC_F32_16X16X32_F16 : VOP3P_Real_SMFMAC <0x62, "v_smfmac_f32_16x16x32f16">;
+defm V_SMFMAC_F32_32X32X16_F16 : VOP3P_Real_SMFMAC <0x64, "v_smfmac_f32_32x32x16f16">;
+defm V_SMFMAC_F32_16X16X32_BF16 : VOP3P_Real_SMFMAC <0x66, "v_smfmac_f32_16x16x32bf16">;
+defm V_SMFMAC_F32_32X32X16_BF16 : VOP3P_Real_SMFMAC <0x68, "v_smfmac_f32_32x32x16bf16">;
+defm V_SMFMAC_I32_16X16X64_I8 : VOP3P_Real_SMFMAC <0x6a, "v_smfmac_i32_16x16x64i8">;
+defm V_SMFMAC_I32_32X32X32_I8 : VOP3P_Real_SMFMAC <0x6c, "v_smfmac_i32_32x32x32i8">;
+
let SubtargetPredicate = HasPackedFP32Ops in {
defm V_PK_FMA_F32 : VOP3P_Real_vi <0x30>;
defm V_PK_MUL_F32 : VOP3P_Real_vi <0x31>;
@@ -676,35 +1149,41 @@ let SubtargetPredicate = HasPackedFP32Ops in {
// GFX10.
//===----------------------------------------------------------------------===//
-let AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10", VOP3P = 1 in {
+let AssemblerPredicate = isGFX10Only, DecoderNamespace = "GFX10", VOP3P = 1 in {
multiclass VOP3P_Real_gfx10<bits<7> op> {
def _gfx10 : VOP3P_Real<!cast<VOP3P_Pseudo>(NAME), SIEncodingFamily.GFX10>,
VOP3Pe_gfx10 <op, !cast<VOP3P_Pseudo>(NAME).Pfl>;
}
-} // End AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10", VOP3P = 1
+} // End AssemblerPredicate = isGFX10Only, DecoderNamespace = "GFX10", VOP3P = 1
+
+multiclass VOP3P_Real_gfx10_gfx11<bits<7> op>
+ : VOP3P_Real_gfx10<op>, VOP3P_Real_gfx11<op>;
+
+multiclass VOP3P_Real_gfx10_gfx11_Triple<bits<7> op>
+ : VOP3P_Real_gfx10<op>, VOP3P_Realtriple_gfx11<op>;
-defm V_PK_MAD_I16 : VOP3P_Real_gfx10<0x00>;
-defm V_PK_MUL_LO_U16 : VOP3P_Real_gfx10<0x01>;
-defm V_PK_ADD_I16 : VOP3P_Real_gfx10<0x02>;
-defm V_PK_SUB_I16 : VOP3P_Real_gfx10<0x03>;
-defm V_PK_LSHLREV_B16 : VOP3P_Real_gfx10<0x04>;
-defm V_PK_LSHRREV_B16 : VOP3P_Real_gfx10<0x05>;
-defm V_PK_ASHRREV_I16 : VOP3P_Real_gfx10<0x06>;
-defm V_PK_MAX_I16 : VOP3P_Real_gfx10<0x07>;
-defm V_PK_MIN_I16 : VOP3P_Real_gfx10<0x08>;
-defm V_PK_MAD_U16 : VOP3P_Real_gfx10<0x09>;
-defm V_PK_ADD_U16 : VOP3P_Real_gfx10<0x0a>;
-defm V_PK_SUB_U16 : VOP3P_Real_gfx10<0x0b>;
-defm V_PK_MAX_U16 : VOP3P_Real_gfx10<0x0c>;
-defm V_PK_MIN_U16 : VOP3P_Real_gfx10<0x0d>;
-defm V_PK_FMA_F16 : VOP3P_Real_gfx10<0x0e>;
-defm V_PK_ADD_F16 : VOP3P_Real_gfx10<0x0f>;
-defm V_PK_MUL_F16 : VOP3P_Real_gfx10<0x10>;
-defm V_PK_MIN_F16 : VOP3P_Real_gfx10<0x11>;
-defm V_PK_MAX_F16 : VOP3P_Real_gfx10<0x12>;
-defm V_FMA_MIX_F32 : VOP3P_Real_gfx10<0x20>;
-defm V_FMA_MIXLO_F16 : VOP3P_Real_gfx10<0x21>;
-defm V_FMA_MIXHI_F16 : VOP3P_Real_gfx10<0x22>;
+defm V_PK_MAD_I16 : VOP3P_Real_gfx10_gfx11<0x00>;
+defm V_PK_MUL_LO_U16 : VOP3P_Real_gfx10_gfx11<0x01>;
+defm V_PK_ADD_I16 : VOP3P_Real_gfx10_gfx11<0x02>;
+defm V_PK_SUB_I16 : VOP3P_Real_gfx10_gfx11<0x03>;
+defm V_PK_LSHLREV_B16 : VOP3P_Real_gfx10_gfx11<0x04>;
+defm V_PK_LSHRREV_B16 : VOP3P_Real_gfx10_gfx11<0x05>;
+defm V_PK_ASHRREV_I16 : VOP3P_Real_gfx10_gfx11<0x06>;
+defm V_PK_MAX_I16 : VOP3P_Real_gfx10_gfx11<0x07>;
+defm V_PK_MIN_I16 : VOP3P_Real_gfx10_gfx11<0x08>;
+defm V_PK_MAD_U16 : VOP3P_Real_gfx10_gfx11<0x09>;
+defm V_PK_ADD_U16 : VOP3P_Real_gfx10_gfx11<0x0a>;
+defm V_PK_SUB_U16 : VOP3P_Real_gfx10_gfx11<0x0b>;
+defm V_PK_MAX_U16 : VOP3P_Real_gfx10_gfx11<0x0c>;
+defm V_PK_MIN_U16 : VOP3P_Real_gfx10_gfx11<0x0d>;
+defm V_PK_FMA_F16 : VOP3P_Real_gfx10_gfx11<0x0e>;
+defm V_PK_ADD_F16 : VOP3P_Real_gfx10_gfx11<0x0f>;
+defm V_PK_MUL_F16 : VOP3P_Real_gfx10_gfx11<0x10>;
+defm V_PK_MIN_F16 : VOP3P_Real_gfx10_gfx11<0x11>;
+defm V_PK_MAX_F16 : VOP3P_Real_gfx10_gfx11<0x12>;
+defm V_FMA_MIX_F32 : VOP3P_Real_gfx10_gfx11_Triple <0x20>;
+defm V_FMA_MIXLO_F16 : VOP3P_Real_gfx10_gfx11_Triple <0x21>;
+defm V_FMA_MIXHI_F16 : VOP3P_Real_gfx10_gfx11_Triple <0x22>;
let SubtargetPredicate = HasDot2Insts in {
@@ -715,9 +1194,9 @@ defm V_DOT2_U32_U16 : VOP3P_Real_gfx10 <0x15>;
let SubtargetPredicate = HasDot7Insts in {
-defm V_DOT2_F32_F16 : VOP3P_Real_gfx10 <0x13>;
-defm V_DOT4_U32_U8 : VOP3P_Real_gfx10 <0x17>;
-defm V_DOT8_U32_U4 : VOP3P_Real_gfx10 <0x19>;
+defm V_DOT2_F32_F16 : VOP3P_Real_gfx10_gfx11_Triple <0x13>;
+defm V_DOT4_U32_U8 : VOP3P_Real_gfx10_gfx11 <0x17>;
+defm V_DOT8_U32_U4 : VOP3P_Real_gfx10_gfx11 <0x19>;
} // End SubtargetPredicate = HasDot7Insts
diff --git a/llvm/lib/Target/AMDGPU/VOPCInstructions.td b/llvm/lib/Target/AMDGPU/VOPCInstructions.td
index c0cc91029d11..eb6c54a45263 100644
--- a/llvm/lib/Target/AMDGPU/VOPCInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOPCInstructions.td
@@ -49,12 +49,36 @@ class VOPC_SDWA9e <bits<8> op, VOPProfile P> : VOP_SDWA9Be <P> {
// an explicit $dst.
class VOPC_Profile<list<SchedReadWrite> sched, ValueType vt0, ValueType vt1 = vt0> :
VOPProfile <[i1, vt0, vt1, untyped]> {
+ // We want to exclude instructions with 64-bit operands
+ let HasExtDPP = getHasVOP3DPP<DstVT, Src0VT, Src1VT, Src2VT>.ret;
let Asm32 = "$src0, $src1";
+
+ let AsmDPP = !if (HasModifiers,
+ "$src0_modifiers, $src1_modifiers "
+ "$dpp_ctrl$row_mask$bank_mask$bound_ctrl",
+ "$src0, $src1 $dpp_ctrl$row_mask$bank_mask$bound_ctrl");
+ let AsmDPP8 = "$src0, $src1 $dpp8$fi";
+ let AsmDPP16 = AsmDPP#"$fi";
+ let InsDPP = getInsDPP<VOPDstOperand<Src0DPP>, Src0DPP, Src1DPP, Src2DPP,
+ NumSrcArgs, HasModifiers, Src0ModDPP, Src1ModDPP,
+ Src2ModDPP>.ret;
+ let InsDPP16 = getInsDPP16<VOPDstOperand<Src0DPP>, Src0DPP, Src1DPP, Src2DPP,
+ NumSrcArgs, HasModifiers, Src0ModDPP, Src1ModDPP,
+ Src2ModDPP>.ret;
+ let InsDPP8 = getInsDPP8<VOPDstOperand<Src0DPP>, Src0DPP, Src1DPP, Src2DPP,
+ NumSrcArgs, HasModifiers, Src0ModDPP, Src1ModDPP,
+ Src2ModDPP>.ret;
+
// The destination for 32-bit encoding is implicit.
let HasDst32 = 0;
// VOPC disallows dst_sel and dst_unused as they have no effect on destination
let EmitDstSel = 0;
let Outs64 = (outs VOPDstS64orS32:$sdst);
+ let OutsVOP3DPP = Outs64;
+ let OutsVOP3DPP8 = Outs64;
+ let InsVOP3DPP = getInsVOP3DPP<InsVOP3Base, Src0VOP3DPP, NumSrcArgs>.ret;
+ let InsVOP3DPP16 = getInsVOP3DPP16<InsVOP3Base, Src0VOP3DPP, NumSrcArgs>.ret;
+ let InsVOP3DPP8 = getInsVOP3DPP8<InsVOP3Base, Src0VOP3DPP, NumSrcArgs>.ret;
list<SchedReadWrite> Schedule = sched;
}
@@ -62,12 +86,15 @@ class VOPC_NoSdst_Profile<list<SchedReadWrite> sched, ValueType vt0,
ValueType vt1 = vt0> :
VOPC_Profile<sched, vt0, vt1> {
let Outs64 = (outs );
+ let OutsVOP3DPP = Outs64;
+ let OutsVOP3DPP8 = Outs64;
let OutsSDWA = (outs );
let InsSDWA = (ins Src0ModSDWA:$src0_modifiers, Src0SDWA:$src0,
Src1ModSDWA:$src1_modifiers, Src1SDWA:$src1,
src0_sel:$src0_sel, src1_sel:$src1_sel);
let Asm64 = !if(isFloatType<Src0VT>.ret, "$src0_modifiers, $src1_modifiers$clamp",
"$src0, $src1");
+ let AsmVOP3DPPBase = Asm64;
let AsmSDWA9 = "$src0_modifiers, $src1_modifiers $src0_sel $src1_sel";
let EmitDst = 0;
}
@@ -100,8 +127,8 @@ class VOPC_Pseudo <string opName, VOPC_Profile P, list<dag> pattern=[],
VOPProfile Pfl = P;
}
-class VOPC_Real <VOPC_Pseudo ps, int EncodingFamily> :
- InstSI <ps.OutOperandList, ps.InOperandList, ps.PseudoInstr # " " # ps.AsmOperands, []>,
+class VOPC_Real <VOPC_Pseudo ps, int EncodingFamily, string asm_name = ps.PseudoInstr> :
+ InstSI <ps.OutOperandList, ps.InOperandList, asm_name # " " # ps.AsmOperands, []>,
SIMCInstr <ps.PseudoInstr, EncodingFamily> {
let VALU = 1;
@@ -133,8 +160,9 @@ class VOPC_SDWA_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[]> :
// This class is used only with VOPC instructions. Use $sdst for out operand
class VOPCInstAlias <VOP3_Pseudo ps, Instruction inst,
- string Asm32 = ps.Pfl.Asm32, VOPProfile p = ps.Pfl> :
- InstAlias <ps.OpName#" "#Asm32, (inst)>, PredicateControl {
+ string Asm32 = ps.Pfl.Asm32, string real_name = ps.OpName,
+ VOPProfile p = ps.Pfl> :
+ InstAlias <real_name#" "#Asm32, (inst)>, PredicateControl {
field bit isCompare;
field bit isCommutable;
@@ -167,27 +195,32 @@ class VOPCInstAlias <VOP3_Pseudo ps, Instruction inst,
let SubtargetPredicate = AssemblerPredicate;
}
-multiclass VOPCInstAliases <string OpName, string Arch> {
- def : VOPCInstAlias <!cast<VOP3_Pseudo>(OpName#"_e64"),
- !cast<Instruction>(OpName#"_e32_"#Arch)>;
+multiclass VOPCInstAliases <string old_name, string Arch, string real_name = old_name> {
+ def : VOPCInstAlias <!cast<VOP3_Pseudo>(old_name#"_e64"),
+ !cast<Instruction>(real_name#"_e32_"#Arch),
+ !cast<VOP3_Pseudo>(old_name#"_e64").Pfl.Asm32,
+ real_name>;
let WaveSizePredicate = isWave32 in {
- def : VOPCInstAlias <!cast<VOP3_Pseudo>(OpName#"_e64"),
- !cast<Instruction>(OpName#"_e32_"#Arch),
- "vcc_lo, "#!cast<VOP3_Pseudo>(OpName#"_e64").Pfl.Asm32>;
+ def : VOPCInstAlias <!cast<VOP3_Pseudo>(old_name#"_e64"),
+ !cast<Instruction>(real_name#"_e32_"#Arch),
+ "vcc_lo, "#!cast<VOP3_Pseudo>(old_name#"_e64").Pfl.Asm32,
+ real_name>;
}
let WaveSizePredicate = isWave64 in {
- def : VOPCInstAlias <!cast<VOP3_Pseudo>(OpName#"_e64"),
- !cast<Instruction>(OpName#"_e32_"#Arch),
- "vcc, "#!cast<VOP3_Pseudo>(OpName#"_e64").Pfl.Asm32>;
+ def : VOPCInstAlias <!cast<VOP3_Pseudo>(old_name#"_e64"),
+ !cast<Instruction>(real_name#"_e32_"#Arch),
+ "vcc, "#!cast<VOP3_Pseudo>(old_name#"_e64").Pfl.Asm32,
+ real_name>;
}
}
-multiclass VOPCXInstAliases <string OpName, string Arch> {
- def : VOPCInstAlias <!cast<VOP3_Pseudo>(OpName#"_e64"),
- !cast<Instruction>(OpName#"_e32_"#Arch)>;
+multiclass VOPCXInstAliases <string old_name, string Arch, string real_name = old_name> {
+ def : VOPCInstAlias <!cast<VOP3_Pseudo>(old_name#"_e64"),
+ !cast<Instruction>(real_name#"_e32_"#Arch),
+ !cast<VOP3_Pseudo>(old_name#"_e64").Pfl.Asm32,
+ real_name>;
}
-
class getVOPCPat64 <SDPatternOperator cond, VOPProfile P> : LetDummies {
list<dag> ret = !if(P.HasModifiers,
[(set i1:$sdst,
@@ -205,6 +238,11 @@ class VCMPXNoSDstTable <bit has_sdst, string Name> {
string NoSDstOp = Name;
}
+class VCMPVCMPXTable <string Name> {
+ bit IsVCMPX = 0;
+ string VCMPOp = Name;
+}
+
multiclass VOPC_Pseudos <string opName,
VOPC_Profile P,
SDPatternOperator cond = COND_NULL,
@@ -213,7 +251,8 @@ multiclass VOPC_Pseudos <string opName,
def _e32 : VOPC_Pseudo <opName, P>,
Commutable_REV<revOp#"_e32", !eq(revOp, opName)>,
- VCMPXNoSDstTable<1, opName#"_e32"> {
+ VCMPXNoSDstTable<1, opName#"_e32">,
+ VCMPVCMPXTable<opName#"_e32"> {
let Defs = !if(DefExec, [VCC, EXEC], [VCC]);
let SchedRW = P.Schedule;
let isConvergent = DefExec;
@@ -223,7 +262,8 @@ multiclass VOPC_Pseudos <string opName,
def _e64 : VOP3_Pseudo<opName, P, getVOPCPat64<cond, P>.ret>,
Commutable_REV<revOp#"_e64", !eq(revOp, opName)>,
- VCMPXNoSDstTable<1, opName#"_e64"> {
+ VCMPXNoSDstTable<1, opName#"_e64">,
+ VCMPVCMPXTable<opName#"_e64"> {
let Defs = !if(DefExec, [EXEC], []);
let SchedRW = P.Schedule;
let isCompare = 1;
@@ -237,6 +277,26 @@ multiclass VOPC_Pseudos <string opName,
let isConvergent = DefExec;
let isCompare = 1;
}
+
+ let SubtargetPredicate = isGFX11Plus in {
+ if P.HasExtDPP then
+ def _e32_dpp : VOP_DPP_Pseudo<opName, P> {
+ let Defs = !if(DefExec, [VCC, EXEC], [VCC]);
+ let SchedRW = P.Schedule;
+ let isConvergent = DefExec;
+ let isCompare = 1;
+ let VOPC = 1;
+ let Constraints = "";
+ }
+ if P.HasExtVOP3DPP then
+ def _e64_dpp : VOP3_DPP_Pseudo<opName, P> {
+ let Defs = !if(DefExec, [EXEC], []);
+ let SchedRW = P.Schedule;
+ let isCompare = 1;
+ let Constraints = !if(P.NumSrcArgs, P.TieRegDPP # " = $sdst", "");
+ }
+ } // end SubtargetPredicate = isGFX11Plus
+
}
let SubtargetPredicate = HasSdstCMPX in {
@@ -248,23 +308,27 @@ multiclass VOPCX_Pseudos <string opName,
def _nosdst_e32 : VOPC_Pseudo <opName#"_nosdst", P_NoSDst, [], 0>,
Commutable_REV<revOp#"_nosdst_e32", !eq(revOp, opName)>,
- VCMPXNoSDstTable<0, opName#"_e32"> {
+ VCMPXNoSDstTable<0, opName#"_e32">,
+ VCMPVCMPXTable<!subst("v_cmpx", "v_cmp", opName#"_e32")> {
let Defs = [EXEC];
let SchedRW = P_NoSDst.Schedule;
let isConvergent = 1;
let isCompare = 1;
let isCommutable = 1;
let SubtargetPredicate = HasNoSdstCMPX;
+ let IsVCMPX = 1;
}
def _nosdst_e64 : VOP3_Pseudo<opName#"_nosdst", P_NoSDst>,
Commutable_REV<revOp#"_nosdst_e64", !eq(revOp, opName)>,
- VCMPXNoSDstTable<0, opName#"_e64"> {
+ VCMPXNoSDstTable<0, opName#"_e64">,
+ VCMPVCMPXTable<!subst("v_cmpx", "v_cmp", opName#"_e64")> {
let Defs = [EXEC];
let SchedRW = P_NoSDst.Schedule;
let isCompare = 1;
let isCommutable = 1;
let SubtargetPredicate = HasNoSdstCMPX;
+ let IsVCMPX = 1;
}
foreach _ = BoolToList<P_NoSDst.HasExtSDWA>.ret in
@@ -275,6 +339,25 @@ multiclass VOPCX_Pseudos <string opName,
let isCompare = 1;
let SubtargetPredicate = HasNoSdstCMPX;
}
+
+ let SubtargetPredicate = isGFX11Plus in {
+ if P.HasExtDPP then
+ def _nosdst_e32_dpp : VOP_DPP_Pseudo<opName#"_nosdst", P_NoSDst> {
+ let Defs = [EXEC];
+ let SchedRW = P_NoSDst.Schedule;
+ let isConvergent = 1;
+ let isCompare = 1;
+ let VOPC = 1;
+ let Constraints = "";
+ }
+ if P.HasExtVOP3DPP then
+ def _nosdst_e64_dpp : VOP3_DPP_Pseudo<opName#"_nosdst", P_NoSDst> {
+ let Defs = [EXEC];
+ let SchedRW = P_NoSDst.Schedule;
+ let isCompare = 1;
+ let Constraints = "";
+ }
+ } // end SubtargetPredicate = isGFX11Plus
}
} // End SubtargetPredicate = HasSdstCMPX
@@ -626,8 +709,18 @@ defm V_CMPX_T_U64 : VOPCX_I64 <"v_cmpx_t_u64">;
class VOPC_Class_Profile<list<SchedReadWrite> sched, ValueType vt> :
VOPC_Profile<sched, vt, i32> {
+ let AsmDPP = "$src0_modifiers, $src1 $dpp_ctrl$row_mask$bank_mask$bound_ctrl";
+ let AsmDPP16 = AsmDPP#"$fi";
+ let InsDPP = (ins VGPR_32:$old, FPVRegInputMods:$src0_modifiers, VGPR_32:$src0, VGPR_32:$src1, dpp_ctrl:$dpp_ctrl, row_mask:$row_mask, bank_mask:$bank_mask, bound_ctrl:$bound_ctrl);
+ let InsDPP16 = !con(InsDPP, (ins FI:$fi));
+ // DPP8 forbids modifiers and can inherit from VOPC_Profile
+
let Ins64 = (ins Src0Mod:$src0_modifiers, Src0RC64:$src0, Src1RC64:$src1);
+ dag InsPartVOP3DPP = (ins Src0Mod:$src0_modifiers, VGPRSrc_32:$src0, VGPRSrc_32:$src1);
+ let InsVOP3Base = !con(InsPartVOP3DPP, !if(HasOpSel, (ins op_sel0:$op_sel),
+ (ins)));
let Asm64 = "$sdst, $src0_modifiers, $src1";
+ let AsmVOP3DPPBase = Asm64;
let InsSDWA = (ins Src0ModSDWA:$src0_modifiers, Src0SDWA:$src0,
Src1ModSDWA:$src1_modifiers, Src1SDWA:$src1,
@@ -647,6 +740,7 @@ class VOPC_Class_NoSdst_Profile<list<SchedReadWrite> sched, ValueType vt> :
Src1ModSDWA:$src1_modifiers, Src1SDWA:$src1,
src0_sel:$src0_sel, src1_sel:$src1_sel);
let Asm64 = "$src0_modifiers, $src1";
+ let AsmVOP3DPPBase = Asm64;
let AsmSDWA9 = "$src0_modifiers, $src1_modifiers $src0_sel $src1_sel";
let EmitDst = 0;
}
@@ -684,6 +778,24 @@ multiclass VOPC_Class_Pseudos <string opName, VOPC_Profile p, bit DefExec,
let SchedRW = p.Schedule;
let isConvergent = DefExec;
}
+
+ let SubtargetPredicate = isGFX11Plus in {
+ if p.HasExtDPP then
+ def _e32_dpp : VOP_DPP_Pseudo<opName, p> {
+ let Defs = !if(DefExec, !if(DefVcc, [VCC, EXEC], [EXEC]),
+ !if(DefVcc, [VCC], []));
+ let SchedRW = p.Schedule;
+ let isConvergent = DefExec;
+ let VOPC = 1;
+ let Constraints = "";
+ }
+ if p.HasExtVOP3DPP then
+ def _e64_dpp : VOP3_DPP_Pseudo<opName, p> {
+ let Defs = !if(DefExec, [EXEC], []);
+ let SchedRW = p.Schedule;
+ let Constraints = !if(p.NumSrcArgs, p.TieRegDPP # " = $sdst", "");
+ }
+ } // end SubtargetPredicate = isGFX11Plus
}
let SubtargetPredicate = HasSdstCMPX in {
@@ -714,6 +826,23 @@ multiclass VOPCX_Class_Pseudos <string opName,
let isConvergent = 1;
let SubtargetPredicate = HasNoSdstCMPX;
}
+
+ let SubtargetPredicate = isGFX11Plus in {
+ if P.HasExtDPP then
+ def _nosdst_e32_dpp : VOP_DPP_Pseudo<opName#"_nosdst", P_NoSDst> {
+ let Defs = [EXEC];
+ let SchedRW = P_NoSDst.Schedule;
+ let isConvergent = 1;
+ let VOPC = 1;
+ let Constraints = "";
+ }
+ if P.HasExtVOP3DPP then
+ def _nosdst_e64_dpp : VOP3_DPP_Pseudo<opName#"_nosdst", P_NoSDst> {
+ let Defs = [EXEC];
+ let SchedRW = P_NoSDst.Schedule;
+ let Constraints = "";
+ }
+ } // end SubtargetPredicate = isGFX11Plus
}
} // End SubtargetPredicate = HasSdstCMPX
@@ -872,14 +1001,676 @@ defm : FCMP_Pattern <COND_ULT, V_CMP_NGE_F16_e64, f16>;
defm : FCMP_Pattern <COND_ULE, V_CMP_NGT_F16_e64, f16>;
//===----------------------------------------------------------------------===//
+// DPP Encodings
+//===----------------------------------------------------------------------===//
+
+// VOPC32
+
+class VOPC_DPPe_Common<bits<8> op> : Enc64 {
+ bits<8> src1;
+ let Inst{16-9} = src1;
+ let Inst{24-17} = op;
+ let Inst{31-25} = 0x3e;
+}
+
+class VOPC_DPP_Base<bits<8> op, string OpName, VOPProfile P>
+ : VOP_DPP_Base<OpName, P, P.InsDPP16, " " #P.AsmDPP16>,
+ VOPC_DPPe_Common<op> {
+ bits<2> src0_modifiers;
+ bits<8> src0;
+ bits<2> src1_modifiers;
+ bits<9> dpp_ctrl;
+ bits<1> bound_ctrl;
+ bits<4> bank_mask;
+ bits<4> row_mask;
+ bit fi;
+
+ let Inst{8-0} = 0xfa;
+
+ let Inst{39-32} = !if (P.HasSrc0, src0{7-0}, 0);
+ let Inst{48-40} = dpp_ctrl;
+ let Inst{50} = fi;
+ let Inst{51} = bound_ctrl;
+ let Inst{52} = !if (P.HasSrc0Mods, src0_modifiers{0}, 0); // src0_neg
+ let Inst{53} = !if (P.HasSrc0Mods, src0_modifiers{1}, 0); // src0_abs
+ let Inst{54} = !if (P.HasSrc1Mods, src1_modifiers{0}, 0); // src1_neg
+ let Inst{55} = !if (P.HasSrc1Mods, src1_modifiers{1}, 0); // src1_abs
+ let Inst{59-56} = bank_mask;
+ let Inst{63-60} = row_mask;
+
+ let AsmMatchConverter = "cvtDPP";
+ let VOPC = 1;
+}
+
+class VOPC_DPP8_Base<bits<8> op, string OpName, VOPProfile P>
+ : VOP_DPP8_Base<OpName, P, P.InsDPP8, " " #P.AsmDPP8>,
+ VOPC_DPPe_Common<op> {
+ bits<8> src0;
+ bits<24> dpp8;
+ bits<9> fi;
+
+ let Inst{8-0} = fi;
+
+ let Inst{39-32} = !if (P.HasSrc0, src0{7-0}, 0);
+ let Inst{63-40} = dpp8{23-0};
+
+ let AsmMatchConverter = "cvtDPP8";
+ let VOPC = 1;
+}
+
+class VOPC_DPP16<bits<8> op, VOP_DPP_Pseudo ps, string opName = ps.OpName>
+ : VOPC_DPP_Base<op, opName, ps.Pfl> {
+ let AssemblerPredicate = HasDPP16;
+ let SubtargetPredicate = HasDPP16;
+ let hasSideEffects = ps.hasSideEffects;
+ let Defs = ps.Defs;
+ let SchedRW = ps.SchedRW;
+ let Uses = ps.Uses;
+ let OtherPredicates = ps.OtherPredicates;
+ let Constraints = ps.Constraints;
+ let AsmMatchConverter = "cvtVOPCNoDstDPP";
+}
+
+class VOPC_DPP16_SIMC<bits<8> op, VOP_DPP_Pseudo ps, int subtarget,
+ string opName = ps.OpName>
+ : VOPC_DPP16<op, ps, opName>, SIMCInstr<ps.PseudoInstr, subtarget>;
+
+class VOPC_DPP8<bits<8> op, VOPC_Pseudo ps, string opName = ps.OpName>
+ : VOPC_DPP8_Base<op, opName, ps.Pfl> {
+ // Note ps is the non-dpp pseudo
+ let hasSideEffects = ps.hasSideEffects;
+ let Defs = ps.Defs;
+ let SchedRW = ps.SchedRW;
+ let Uses = ps.Uses;
+ let OtherPredicates = ps.OtherPredicates;
+ let Constraints = "";
+ let AsmMatchConverter = "cvtVOPCNoDstDPP8";
+}
+
+// VOPC64
+
+class VOPC64_DPP_Base<bits<10> op, string OpName, VOPProfile P>
+ : VOP3_DPP_Base<OpName, P, 1>, VOP3_DPPe_Common<op, P> {
+ Instruction Opcode = !cast<Instruction>(NAME);
+
+ bits<8> src0;
+ bits<9> dpp_ctrl;
+ bits<1> bound_ctrl;
+ bits<4> bank_mask;
+ bits<4> row_mask;
+ bit fi;
+
+ let Inst{40-32} = 0xfa;
+ let Inst{71-64} = !if(P.HasSrc0, src0{7-0}, 0);
+ let Inst{80-72} = dpp_ctrl;
+ let Inst{82} = fi;
+ let Inst{83} = bound_ctrl;
+ // Inst{87-84} ignored by hw
+ let Inst{91-88} = bank_mask;
+ let Inst{95-92} = row_mask;
+
+}
+
+class VOPC64_DPP16<bits<10> op, VOP_DPP_Pseudo ps, string opName = ps.OpName>
+ : VOPC64_DPP_Base<op, opName, ps.Pfl> {
+ let AssemblerPredicate = HasDPP16;
+ let SubtargetPredicate = HasDPP16;
+ let hasSideEffects = ps.hasSideEffects;
+ let Defs = ps.Defs;
+ let SchedRW = ps.SchedRW;
+ let Uses = ps.Uses;
+ let OtherPredicates = ps.OtherPredicates;
+ let Constraints = ps.Constraints;
+}
+
+class VOPC64_DPP16_Dst<bits<10> op, VOP_DPP_Pseudo ps,
+ string opName = ps.OpName>
+ : VOPC64_DPP16<op, ps, opName> {
+ bits<8> sdst;
+ let Inst{7-0} = sdst;
+}
+
+class VOPC64_DPP16_NoDst<bits<10> op, VOP_DPP_Pseudo ps,
+ string opName = ps.OpName>
+ : VOPC64_DPP16<op, ps, opName> {
+ let Inst{7-0} = ? ;
+ let AsmMatchConverter = "cvtVOPC64NoDstDPP";
+}
+
+class VOPC64_DPP8_Base<bits<10> op, string OpName, VOPProfile P>
+ : VOP3_DPP8_Base<OpName, P>, VOP3_DPPe_Common<op, P> {
+ Instruction Opcode = !cast<Instruction>(NAME);
+
+ bits<8> src0;
+ bits<24> dpp8;
+ bits<9> fi;
+
+ let Inst{40-32} = fi;
+ let Inst{71-64} = !if(P.HasSrc0, src0{7-0}, 0);
+ let Inst{95-72} = dpp8{23-0};
+
+}
+
+class VOPC64_DPP8<bits<10> op, VOP_Pseudo ps, string opName = ps.OpName>
+ : VOPC64_DPP8_Base<op, opName, ps.Pfl> {
+ // Note ps is the non-dpp pseudo
+ let hasSideEffects = ps.hasSideEffects;
+ let Defs = ps.Defs;
+ let SchedRW = ps.SchedRW;
+ let Uses = ps.Uses;
+ let OtherPredicates = ps.OtherPredicates;
+}
+
+class VOPC64_DPP8_Dst<bits<10> op, VOP_Pseudo ps, string opName = ps.OpName>
+ : VOPC64_DPP8<op, ps, opName> {
+ bits<8> sdst;
+ let Inst{7-0} = sdst;
+ let Constraints = "$old = $sdst";
+}
+
+class VOPC64_DPP8_NoDst<bits<10> op, VOP_Pseudo ps, string opName = ps.OpName>
+ : VOPC64_DPP8<op, ps, opName> {
+ let Inst{7-0} = ? ;
+ let AsmMatchConverter = "cvtVOPC64NoDstDPP8";
+ let Constraints = "";
+}
+
+//===----------------------------------------------------------------------===//
// Target-specific instruction encodings.
//===----------------------------------------------------------------------===//
//===----------------------------------------------------------------------===//
+// GFX11.
+//===----------------------------------------------------------------------===//
+
+let AssemblerPredicate = isGFX11Only in {
+ multiclass VOPC_Real_gfx11<bits<9> op> {
+ defvar ps32 = !cast<VOPC_Pseudo>(NAME#"_e32");
+ defvar ps64 = !cast<VOP3_Pseudo>(NAME#"_e64");
+ let DecoderNamespace = "GFX11" in {
+ def _e32_gfx11 : VOPC_Real<ps32, SIEncodingFamily.GFX11>,
+ VOPCe<op{7-0}>;
+ def _e64_gfx11 : VOP3_Real<ps64, SIEncodingFamily.GFX11>,
+ VOP3a_gfx11<{0, op}, ps64.Pfl> {
+ // The encoding used for VOPC instructions encoded as VOP3 differs from
+ // VOP3e only in the destination field name (sdst), as VOPC has no vector dst.
+ bits<8> sdst;
+ let Inst{7-0} = sdst;
+ }
+ } // End DecoderNamespace = "GFX11"
+
+ defm : VOPCInstAliases<NAME, "gfx11">;
+
+ foreach _ = BoolToList<ps32.Pfl.HasExtDPP>.ret in {
+ defvar psDPP = !cast<VOP_DPP_Pseudo>(NAME #"_e32" #"_dpp");
+ defvar AsmDPP = ps32.Pfl.AsmDPP16;
+ let DecoderNamespace = "DPPGFX11" in {
+ def _e32_dpp_gfx11 : VOPC_DPP16_SIMC<op{7-0}, psDPP,
+ SIEncodingFamily.GFX11>;
+ def _e32_dpp_w32_gfx11 : VOPC_DPP16<op{7-0}, psDPP> {
+ let AsmString = psDPP.OpName # " vcc_lo, " # AsmDPP;
+ let isAsmParserOnly = 1;
+ let WaveSizePredicate = isWave32;
+ }
+ def _e32_dpp_w64_gfx11 : VOPC_DPP16<op{7-0}, psDPP> {
+ let AsmString = psDPP.OpName # " vcc, " # AsmDPP;
+ let isAsmParserOnly = 1;
+ let WaveSizePredicate = isWave64;
+ }
+ }
+ defvar AsmDPP8 = ps32.Pfl.AsmDPP8;
+ let DecoderNamespace = "DPP8GFX11" in {
+ def _e32_dpp8_gfx11 : VOPC_DPP8<op{7-0}, ps32>;
+ def _e32_dpp8_w32_gfx11 : VOPC_DPP8<op{7-0}, ps32> {
+ let AsmString = ps32.OpName # " vcc_lo, " # AsmDPP8;
+ let isAsmParserOnly = 1;
+ let WaveSizePredicate = isWave32;
+ }
+ def _e32_dpp8_w64_gfx11 : VOPC_DPP8<op{7-0}, ps32> {
+ let AsmString = ps32.OpName # " vcc, " # AsmDPP8;
+ let isAsmParserOnly = 1;
+ let WaveSizePredicate = isWave64;
+ }
+ }
+ }
+ foreach _ = BoolToList<ps64.Pfl.HasExtVOP3DPP>.ret in {
+ defvar psDPP = !cast<VOP_DPP_Pseudo>(NAME #"_e64" #"_dpp");
+ defvar AsmDPP = ps64.Pfl.AsmVOP3DPP16;
+ let DecoderNamespace = "DPPGFX11" in {
+ def _e64_dpp_gfx11 : VOPC64_DPP16_Dst<{0, op}, psDPP>,
+ SIMCInstr<psDPP.PseudoInstr, SIEncodingFamily.GFX11>;
+ def _e64_dpp_w32_gfx11 : VOPC64_DPP16_Dst<{0, op}, psDPP> {
+ let AsmString = psDPP.OpName # " vcc_lo, " # AsmDPP;
+ let isAsmParserOnly = 1;
+ let WaveSizePredicate = isWave32;
+ }
+ def _e64_dpp_w64_gfx11 : VOPC64_DPP16_Dst<{0, op}, psDPP> {
+ let AsmString = psDPP.OpName # " vcc, " # AsmDPP;
+ let isAsmParserOnly = 1;
+ let WaveSizePredicate = isWave64;
+ }
+ }
+ defvar AsmDPP8 = ps64.Pfl.AsmVOP3DPP8;
+ let DecoderNamespace = "DPP8GFX11" in {
+ def _e64_dpp8_gfx11 : VOPC64_DPP8_Dst<{0, op}, ps64>;
+ def _e64_dpp8_w32_gfx11 : VOPC64_DPP8_Dst<{0, op}, ps64> {
+ let AsmString = ps32.OpName # " vcc_lo, " # AsmDPP8;
+ let isAsmParserOnly = 1;
+ let WaveSizePredicate = isWave32;
+ }
+ def _e64_dpp8_w64_gfx11 : VOPC64_DPP8_Dst<{0, op}, ps64> {
+ let AsmString = ps32.OpName # " vcc, " # AsmDPP8;
+ let isAsmParserOnly = 1;
+ let WaveSizePredicate = isWave64;
+ }
+ }
+ }
+
+ }
+
+ multiclass VOPC_Real_with_name_gfx11<bits<9> op, string OpName,
+ string asm_name> {
+ defvar ps32 = !cast<VOPC_Pseudo>(OpName#"_e32");
+ defvar ps64 = !cast<VOP3_Pseudo>(OpName#"_e64");
+ let DecoderNamespace = "GFX11" in {
+ def _e32_gfx11 :
+ // The 32- and 64-bit forms of the instruction have _e32 and _e64,
+ // respectively, appended to their assembly mnemonic.
+ // _e64 is printed as part of the VOPDstS64orS32 operand, whereas
+ // the destination-less 32-bit forms add it to the asmString here.
+ VOPC_Real<ps32, SIEncodingFamily.GFX11, asm_name#"_e32">,
+ VOPCe<op{7-0}>,
+ MnemonicAlias<ps32.Mnemonic, asm_name>, Requires<[isGFX11Plus]>;
+ def _e64_gfx11 :
+ VOP3_Real<ps64, SIEncodingFamily.GFX11, asm_name>,
+ VOP3a_gfx11<{0, op}, ps64.Pfl>,
+ MnemonicAlias<ps64.Mnemonic, asm_name>, Requires<[isGFX11Plus]> {
+        // VOPC instructions encoded as VOP3 differ from VOP3e only in the name
+        // of the destination field (sdst), since VOPC has no vector destination.
+ bits<8> sdst;
+ let Inst{7-0} = sdst;
+ }
+ } // End DecoderNamespace = "GFX11"
+
+ defm : VOPCInstAliases<OpName, "gfx11", NAME>;
+
+ foreach _ = BoolToList<ps32.Pfl.HasExtDPP>.ret in {
+ defvar psDPP = !cast<VOP_DPP_Pseudo>(OpName #"_e32" #"_dpp");
+ defvar AsmDPP = ps32.Pfl.AsmDPP16;
+ let DecoderNamespace = "DPPGFX11" in {
+ def _e32_dpp_gfx11 : VOPC_DPP16_SIMC<op{7-0}, psDPP,
+ SIEncodingFamily.GFX11, asm_name>;
+ def _e32_dpp_w32_gfx11
+ : VOPC_DPP16<op{7-0}, psDPP, asm_name> {
+ let AsmString = asm_name # " vcc_lo, " # AsmDPP;
+ let isAsmParserOnly = 1;
+ let WaveSizePredicate = isWave32;
+ }
+ def _e32_dpp_w64_gfx11
+ : VOPC_DPP16<op{7-0}, psDPP, asm_name> {
+ let AsmString = asm_name # " vcc, " # AsmDPP;
+ let isAsmParserOnly = 1;
+ let WaveSizePredicate = isWave64;
+ }
+ }
+ defvar AsmDPP8 = ps32.Pfl.AsmDPP8;
+ let DecoderNamespace = "DPP8GFX11" in {
+ def _e32_dpp8_gfx11 : VOPC_DPP8<op{7-0}, ps32, asm_name>;
+ def _e32_dpp8_w32_gfx11
+ : VOPC_DPP8<op{7-0}, ps32, asm_name> {
+ let AsmString = asm_name # " vcc_lo, " # AsmDPP8;
+ let isAsmParserOnly = 1;
+ let WaveSizePredicate = isWave32;
+ }
+ def _e32_dpp8_w64_gfx11
+ : VOPC_DPP8<op{7-0}, ps32, asm_name> {
+ let AsmString = asm_name # " vcc, " # AsmDPP8;
+ let isAsmParserOnly = 1;
+ let WaveSizePredicate = isWave64;
+ }
+ }
+ }
+
+ foreach _ = BoolToList<ps64.Pfl.HasExtVOP3DPP>.ret in {
+ defvar psDPP = !cast<VOP_DPP_Pseudo>(OpName #"_e64" #"_dpp");
+ defvar AsmDPP = ps64.Pfl.AsmVOP3DPP16;
+ let DecoderNamespace = "DPPGFX11" in {
+ def _e64_dpp_gfx11 : VOPC64_DPP16_Dst<{0, op}, psDPP, asm_name>,
+ SIMCInstr<psDPP.PseudoInstr, SIEncodingFamily.GFX11>;
+ def _e64_dpp_w32_gfx11
+ : VOPC64_DPP16_Dst<{0, op}, psDPP, asm_name> {
+ let AsmString = asm_name # " vcc_lo, " # AsmDPP;
+ let isAsmParserOnly = 1;
+ let WaveSizePredicate = isWave32;
+ }
+ def _e64_dpp_w64_gfx11
+ : VOPC64_DPP16_Dst<{0, op}, psDPP, asm_name> {
+ let AsmString = asm_name # " vcc, " # AsmDPP;
+ let isAsmParserOnly = 1;
+ let WaveSizePredicate = isWave64;
+ }
+ }
+ defvar AsmDPP8 = ps64.Pfl.AsmVOP3DPP8;
+ let DecoderNamespace = "DPP8GFX11" in {
+ def _e64_dpp8_gfx11 : VOPC64_DPP8_Dst<{0, op}, ps64, asm_name>;
+ def _e64_dpp8_w32_gfx11
+ : VOPC64_DPP8_Dst<{0, op}, ps64, asm_name> {
+ let AsmString = asm_name # " vcc_lo, " # AsmDPP8;
+ let isAsmParserOnly = 1;
+ let WaveSizePredicate = isWave32;
+ }
+ def _e64_dpp8_w64_gfx11
+ : VOPC64_DPP8_Dst<{0, op}, ps64, asm_name> {
+ let AsmString = asm_name # " vcc, " # AsmDPP8;
+ let isAsmParserOnly = 1;
+ let WaveSizePredicate = isWave64;
+ }
+ }
+ }
+
+ }
+
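+  // VOPCX (v_cmpx) opcodes have no explicit sdst; their pseudos carry a
+  // "_nosdst" suffix, which is stripped from the printed mnemonic here.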
+ multiclass VOPCX_Real_gfx11<bits<9> op> {
+ defvar ps32 = !cast<VOPC_Pseudo>(NAME#"_nosdst_e32");
+ defvar ps64 = !cast<VOP3_Pseudo>(NAME#"_nosdst_e64");
+ let DecoderNamespace = "GFX11" in {
+ def _e32_gfx11 :
+ VOPC_Real<ps32, SIEncodingFamily.GFX11>,
+ VOPCe<op{7-0}> {
+ let AsmString = !subst("_nosdst", "", ps32.PseudoInstr)
+ # " " # ps32.AsmOperands;
+ }
+ def _e64_gfx11 :
+ VOP3_Real<ps64, SIEncodingFamily.GFX11>,
+ VOP3a_gfx11<{0, op}, ps64.Pfl> {
+ let Inst{7-0} = ?; // sdst
+ let AsmString = !subst("_nosdst", "", ps64.Mnemonic)
+ # "{_e64} " # ps64.AsmOperands;
+ }
+ } // End DecoderNamespace = "GFX11"
+
+ defm : VOPCXInstAliases<NAME, "gfx11">;
+
+ foreach _ = BoolToList<ps32.Pfl.HasExtDPP>.ret in {
+ defvar psDPP = !cast<VOP_DPP_Pseudo>(NAME #"_nosdst_e32" #"_dpp");
+ defvar AsmDPP = ps32.Pfl.AsmDPP16;
+ let DecoderNamespace = "DPPGFX11" in {
+ def _e32_dpp_gfx11
+ : VOPC_DPP16_SIMC<op{7-0}, psDPP, SIEncodingFamily.GFX11> {
+ let AsmString = !subst("_nosdst", "", psDPP.OpName) # " " # AsmDPP;
+ }
+ }
+ defvar AsmDPP8 = ps32.Pfl.AsmDPP8;
+ let DecoderNamespace = "DPP8GFX11" in {
+ def _e32_dpp8_gfx11 : VOPC_DPP8<op{7-0}, ps32> {
+ let AsmString = !subst("_nosdst", "", ps32.OpName) # " " # AsmDPP8;
+ }
+ }
+ }
+
+ foreach _ = BoolToList<ps64.Pfl.HasExtVOP3DPP>.ret in {
+ defvar psDPP = !cast<VOP_DPP_Pseudo>(NAME #"_nosdst_e64" #"_dpp");
+ defvar AsmDPP = ps64.Pfl.AsmVOP3DPP16;
+ let DecoderNamespace = "DPPGFX11" in {
+ def _e64_dpp_gfx11
+ : VOPC64_DPP16_NoDst<{0, op}, psDPP>,
+ SIMCInstr<psDPP.PseudoInstr, SIEncodingFamily.GFX11> {
+ let AsmString = !subst("_nosdst", "", psDPP.OpName)
+ # "{_e64_dpp} " # AsmDPP;
+ }
+ }
+ defvar AsmDPP8 = ps64.Pfl.AsmVOP3DPP8;
+ let DecoderNamespace = "DPP8GFX11" in {
+ def _e64_dpp8_gfx11 : VOPC64_DPP8_NoDst<{0, op}, ps64> {
+ let AsmString = !subst("_nosdst", "", ps64.OpName)
+ # "{_e64_dpp} " # AsmDPP8;
+ }
+ }
+ }
+ }
+
+ multiclass VOPCX_Real_with_name_gfx11<bits<9> op, string OpName,
+ string asm_name> {
+ defvar ps32 = !cast<VOPC_Pseudo>(OpName#"_nosdst_e32");
+ defvar ps64 = !cast<VOP3_Pseudo>(OpName#"_nosdst_e64");
+ let DecoderNamespace = "GFX11" in {
+ def _e32_gfx11
+ : VOPC_Real<ps32, SIEncodingFamily.GFX11, asm_name>,
+ MnemonicAlias<!subst("_nosdst", "", ps32.Mnemonic), asm_name>,
+ Requires<[isGFX11Plus]>,
+ VOPCe<op{7-0}> {
+ let AsmString = asm_name # "{_e32} " # ps32.AsmOperands;
+ }
+ def _e64_gfx11
+ : VOP3_Real<ps64, SIEncodingFamily.GFX11, asm_name>,
+ MnemonicAlias<!subst("_nosdst", "", ps64.Mnemonic), asm_name>,
+ Requires<[isGFX11Plus]>,
+ VOP3a_gfx11<{0, op}, ps64.Pfl> {
+        let Inst{7-0} = ?; // sdst
+ let AsmString = asm_name # "{_e64} " # ps64.AsmOperands;
+ }
+ } // End DecoderNamespace = "GFX11"
+
+ defm : VOPCXInstAliases<OpName, "gfx11", NAME>;
+
+ foreach _ = BoolToList<ps32.Pfl.HasExtDPP>.ret in {
+ defvar psDPP = !cast<VOP_DPP_Pseudo>(OpName#"_nosdst_e32"#"_dpp");
+ let DecoderNamespace = "DPPGFX11" in {
+ def _e32_dpp_gfx11 : VOPC_DPP16_SIMC<op{7-0}, psDPP,
+ SIEncodingFamily.GFX11, asm_name>;
+ }
+ let DecoderNamespace = "DPP8GFX11" in {
+ def _e32_dpp8_gfx11 : VOPC_DPP8<op{7-0}, ps32, asm_name>;
+ }
+ }
+ foreach _ = BoolToList<ps64.Pfl.HasExtVOP3DPP>.ret in {
+ defvar psDPP = !cast<VOP_DPP_Pseudo>(OpName#"_nosdst_e64"#"_dpp");
+ defvar AsmDPP = ps64.Pfl.AsmVOP3DPP16;
+ let DecoderNamespace = "DPPGFX11" in {
+ def _e64_dpp_gfx11
+ : VOPC64_DPP16_NoDst<{0, op}, psDPP, asm_name>,
+ SIMCInstr<psDPP.PseudoInstr, SIEncodingFamily.GFX11> {
+ let AsmString = asm_name # "{_e64_dpp} " # AsmDPP;
+ }
+ }
+ defvar AsmDPP8 = ps64.Pfl.AsmVOP3DPP8;
+ let DecoderNamespace = "DPP8GFX11" in {
+ def _e64_dpp8_gfx11 : VOPC64_DPP8_NoDst<{0, op}, ps64, asm_name> {
+ let AsmString = asm_name # "{_e64_dpp} " # AsmDPP8;
+ }
+ }
+ }
+
+ }
+} // End AssemblerPredicate = isGFX11Only
+
+defm V_CMP_F_F16 : VOPC_Real_gfx11<0x000>;
+defm V_CMP_LT_F16 : VOPC_Real_gfx11<0x001>;
+defm V_CMP_EQ_F16 : VOPC_Real_gfx11<0x002>;
+defm V_CMP_LE_F16 : VOPC_Real_gfx11<0x003>;
+defm V_CMP_GT_F16 : VOPC_Real_gfx11<0x004>;
+defm V_CMP_LG_F16 : VOPC_Real_gfx11<0x005>;
+defm V_CMP_GE_F16 : VOPC_Real_gfx11<0x006>;
+defm V_CMP_O_F16 : VOPC_Real_gfx11<0x007>;
+defm V_CMP_U_F16 : VOPC_Real_gfx11<0x008>;
+defm V_CMP_NGE_F16 : VOPC_Real_gfx11<0x009>;
+defm V_CMP_NLG_F16 : VOPC_Real_gfx11<0x00a>;
+defm V_CMP_NGT_F16 : VOPC_Real_gfx11<0x00b>;
+defm V_CMP_NLE_F16 : VOPC_Real_gfx11<0x00c>;
+defm V_CMP_NEQ_F16 : VOPC_Real_gfx11<0x00d>;
+defm V_CMP_NLT_F16 : VOPC_Real_gfx11<0x00e>;
+defm V_CMP_T_F16 : VOPC_Real_with_name_gfx11<0x00f, "V_CMP_TRU_F16", "v_cmp_t_f16">;
+defm V_CMP_F_F32 : VOPC_Real_gfx11<0x010>;
+defm V_CMP_LT_F32 : VOPC_Real_gfx11<0x011>;
+defm V_CMP_EQ_F32 : VOPC_Real_gfx11<0x012>;
+defm V_CMP_LE_F32 : VOPC_Real_gfx11<0x013>;
+defm V_CMP_GT_F32 : VOPC_Real_gfx11<0x014>;
+defm V_CMP_LG_F32 : VOPC_Real_gfx11<0x015>;
+defm V_CMP_GE_F32 : VOPC_Real_gfx11<0x016>;
+defm V_CMP_O_F32 : VOPC_Real_gfx11<0x017>;
+defm V_CMP_U_F32 : VOPC_Real_gfx11<0x018>;
+defm V_CMP_NGE_F32 : VOPC_Real_gfx11<0x019>;
+defm V_CMP_NLG_F32 : VOPC_Real_gfx11<0x01a>;
+defm V_CMP_NGT_F32 : VOPC_Real_gfx11<0x01b>;
+defm V_CMP_NLE_F32 : VOPC_Real_gfx11<0x01c>;
+defm V_CMP_NEQ_F32 : VOPC_Real_gfx11<0x01d>;
+defm V_CMP_NLT_F32 : VOPC_Real_gfx11<0x01e>;
+defm V_CMP_T_F32 : VOPC_Real_with_name_gfx11<0x01f, "V_CMP_TRU_F32", "v_cmp_t_f32">;
+defm V_CMP_T_F64 : VOPC_Real_with_name_gfx11<0x02f, "V_CMP_TRU_F64", "v_cmp_t_f64">;
+defm V_CMP_LT_I16 : VOPC_Real_gfx11<0x031>;
+defm V_CMP_EQ_I16 : VOPC_Real_gfx11<0x032>;
+defm V_CMP_LE_I16 : VOPC_Real_gfx11<0x033>;
+defm V_CMP_GT_I16 : VOPC_Real_gfx11<0x034>;
+defm V_CMP_NE_I16 : VOPC_Real_gfx11<0x035>;
+defm V_CMP_GE_I16 : VOPC_Real_gfx11<0x036>;
+defm V_CMP_LT_U16 : VOPC_Real_gfx11<0x039>;
+defm V_CMP_EQ_U16 : VOPC_Real_gfx11<0x03a>;
+defm V_CMP_LE_U16 : VOPC_Real_gfx11<0x03b>;
+defm V_CMP_GT_U16 : VOPC_Real_gfx11<0x03c>;
+defm V_CMP_NE_U16 : VOPC_Real_gfx11<0x03d>;
+defm V_CMP_GE_U16 : VOPC_Real_gfx11<0x03e>;
+defm V_CMP_F_I32 : VOPC_Real_gfx11<0x040>;
+defm V_CMP_LT_I32 : VOPC_Real_gfx11<0x041>;
+defm V_CMP_EQ_I32 : VOPC_Real_gfx11<0x042>;
+defm V_CMP_LE_I32 : VOPC_Real_gfx11<0x043>;
+defm V_CMP_GT_I32 : VOPC_Real_gfx11<0x044>;
+defm V_CMP_NE_I32 : VOPC_Real_gfx11<0x045>;
+defm V_CMP_GE_I32 : VOPC_Real_gfx11<0x046>;
+defm V_CMP_T_I32 : VOPC_Real_gfx11<0x047>;
+defm V_CMP_F_U32 : VOPC_Real_gfx11<0x048>;
+defm V_CMP_LT_U32 : VOPC_Real_gfx11<0x049>;
+defm V_CMP_EQ_U32 : VOPC_Real_gfx11<0x04a>;
+defm V_CMP_LE_U32 : VOPC_Real_gfx11<0x04b>;
+defm V_CMP_GT_U32 : VOPC_Real_gfx11<0x04c>;
+defm V_CMP_NE_U32 : VOPC_Real_gfx11<0x04d>;
+defm V_CMP_GE_U32 : VOPC_Real_gfx11<0x04e>;
+defm V_CMP_T_U32 : VOPC_Real_gfx11<0x04f>;
+
+defm V_CMP_F_I64 : VOPC_Real_gfx11<0x050>;
+defm V_CMP_LT_I64 : VOPC_Real_gfx11<0x051>;
+defm V_CMP_EQ_I64 : VOPC_Real_gfx11<0x052>;
+defm V_CMP_LE_I64 : VOPC_Real_gfx11<0x053>;
+defm V_CMP_GT_I64 : VOPC_Real_gfx11<0x054>;
+defm V_CMP_NE_I64 : VOPC_Real_gfx11<0x055>;
+defm V_CMP_GE_I64 : VOPC_Real_gfx11<0x056>;
+defm V_CMP_T_I64 : VOPC_Real_gfx11<0x057>;
+defm V_CMP_F_U64 : VOPC_Real_gfx11<0x058>;
+defm V_CMP_LT_U64 : VOPC_Real_gfx11<0x059>;
+defm V_CMP_EQ_U64 : VOPC_Real_gfx11<0x05a>;
+defm V_CMP_LE_U64 : VOPC_Real_gfx11<0x05b>;
+defm V_CMP_GT_U64 : VOPC_Real_gfx11<0x05c>;
+defm V_CMP_NE_U64 : VOPC_Real_gfx11<0x05d>;
+defm V_CMP_GE_U64 : VOPC_Real_gfx11<0x05e>;
+defm V_CMP_T_U64 : VOPC_Real_gfx11<0x05f>;
+
+defm V_CMP_CLASS_F16 : VOPC_Real_gfx11<0x07d>;
+defm V_CMP_CLASS_F32 : VOPC_Real_gfx11<0x07e>;
+defm V_CMP_CLASS_F64 : VOPC_Real_gfx11<0x07f>;
+
+defm V_CMPX_F_F16 : VOPCX_Real_gfx11<0x080>;
+defm V_CMPX_LT_F16 : VOPCX_Real_gfx11<0x081>;
+defm V_CMPX_EQ_F16 : VOPCX_Real_gfx11<0x082>;
+defm V_CMPX_LE_F16 : VOPCX_Real_gfx11<0x083>;
+defm V_CMPX_GT_F16 : VOPCX_Real_gfx11<0x084>;
+defm V_CMPX_LG_F16 : VOPCX_Real_gfx11<0x085>;
+defm V_CMPX_GE_F16 : VOPCX_Real_gfx11<0x086>;
+defm V_CMPX_O_F16 : VOPCX_Real_gfx11<0x087>;
+defm V_CMPX_U_F16 : VOPCX_Real_gfx11<0x088>;
+defm V_CMPX_NGE_F16 : VOPCX_Real_gfx11<0x089>;
+defm V_CMPX_NLG_F16 : VOPCX_Real_gfx11<0x08a>;
+defm V_CMPX_NGT_F16 : VOPCX_Real_gfx11<0x08b>;
+defm V_CMPX_NLE_F16 : VOPCX_Real_gfx11<0x08c>;
+defm V_CMPX_NEQ_F16 : VOPCX_Real_gfx11<0x08d>;
+defm V_CMPX_NLT_F16 : VOPCX_Real_gfx11<0x08e>;
+defm V_CMPX_T_F16 : VOPCX_Real_with_name_gfx11<0x08f, "V_CMPX_TRU_F16", "v_cmpx_t_f16">;
+defm V_CMPX_F_F32 : VOPCX_Real_gfx11<0x090>;
+defm V_CMPX_LT_F32 : VOPCX_Real_gfx11<0x091>;
+defm V_CMPX_EQ_F32 : VOPCX_Real_gfx11<0x092>;
+defm V_CMPX_LE_F32 : VOPCX_Real_gfx11<0x093>;
+defm V_CMPX_GT_F32 : VOPCX_Real_gfx11<0x094>;
+defm V_CMPX_LG_F32 : VOPCX_Real_gfx11<0x095>;
+defm V_CMPX_GE_F32 : VOPCX_Real_gfx11<0x096>;
+defm V_CMPX_O_F32 : VOPCX_Real_gfx11<0x097>;
+defm V_CMPX_U_F32 : VOPCX_Real_gfx11<0x098>;
+defm V_CMPX_NGE_F32 : VOPCX_Real_gfx11<0x099>;
+defm V_CMPX_NLG_F32 : VOPCX_Real_gfx11<0x09a>;
+defm V_CMPX_NGT_F32 : VOPCX_Real_gfx11<0x09b>;
+defm V_CMPX_NLE_F32 : VOPCX_Real_gfx11<0x09c>;
+defm V_CMPX_NEQ_F32 : VOPCX_Real_gfx11<0x09d>;
+defm V_CMPX_NLT_F32 : VOPCX_Real_gfx11<0x09e>;
+defm V_CMPX_T_F32 : VOPCX_Real_with_name_gfx11<0x09f, "V_CMPX_TRU_F32", "v_cmpx_t_f32">;
+
+defm V_CMPX_F_F64 : VOPCX_Real_gfx11<0x0a0>;
+defm V_CMPX_LT_F64 : VOPCX_Real_gfx11<0x0a1>;
+defm V_CMPX_EQ_F64 : VOPCX_Real_gfx11<0x0a2>;
+defm V_CMPX_LE_F64 : VOPCX_Real_gfx11<0x0a3>;
+defm V_CMPX_GT_F64 : VOPCX_Real_gfx11<0x0a4>;
+defm V_CMPX_LG_F64 : VOPCX_Real_gfx11<0x0a5>;
+defm V_CMPX_GE_F64 : VOPCX_Real_gfx11<0x0a6>;
+defm V_CMPX_O_F64 : VOPCX_Real_gfx11<0x0a7>;
+defm V_CMPX_U_F64 : VOPCX_Real_gfx11<0x0a8>;
+defm V_CMPX_NGE_F64 : VOPCX_Real_gfx11<0x0a9>;
+defm V_CMPX_NLG_F64 : VOPCX_Real_gfx11<0x0aa>;
+defm V_CMPX_NGT_F64 : VOPCX_Real_gfx11<0x0ab>;
+defm V_CMPX_NLE_F64 : VOPCX_Real_gfx11<0x0ac>;
+defm V_CMPX_NEQ_F64 : VOPCX_Real_gfx11<0x0ad>;
+defm V_CMPX_NLT_F64 : VOPCX_Real_gfx11<0x0ae>;
+defm V_CMPX_T_F64 : VOPCX_Real_with_name_gfx11<0x0af, "V_CMPX_TRU_F64", "v_cmpx_t_f64">;
+
+defm V_CMPX_LT_I16 : VOPCX_Real_gfx11<0x0b1>;
+defm V_CMPX_EQ_I16 : VOPCX_Real_gfx11<0x0b2>;
+defm V_CMPX_LE_I16 : VOPCX_Real_gfx11<0x0b3>;
+defm V_CMPX_GT_I16 : VOPCX_Real_gfx11<0x0b4>;
+defm V_CMPX_NE_I16 : VOPCX_Real_gfx11<0x0b5>;
+defm V_CMPX_GE_I16 : VOPCX_Real_gfx11<0x0b6>;
+defm V_CMPX_LT_U16 : VOPCX_Real_gfx11<0x0b9>;
+defm V_CMPX_EQ_U16 : VOPCX_Real_gfx11<0x0ba>;
+defm V_CMPX_LE_U16 : VOPCX_Real_gfx11<0x0bb>;
+defm V_CMPX_GT_U16 : VOPCX_Real_gfx11<0x0bc>;
+defm V_CMPX_NE_U16 : VOPCX_Real_gfx11<0x0bd>;
+defm V_CMPX_GE_U16 : VOPCX_Real_gfx11<0x0be>;
+defm V_CMPX_F_I32 : VOPCX_Real_gfx11<0x0c0>;
+defm V_CMPX_LT_I32 : VOPCX_Real_gfx11<0x0c1>;
+defm V_CMPX_EQ_I32 : VOPCX_Real_gfx11<0x0c2>;
+defm V_CMPX_LE_I32 : VOPCX_Real_gfx11<0x0c3>;
+defm V_CMPX_GT_I32 : VOPCX_Real_gfx11<0x0c4>;
+defm V_CMPX_NE_I32 : VOPCX_Real_gfx11<0x0c5>;
+defm V_CMPX_GE_I32 : VOPCX_Real_gfx11<0x0c6>;
+defm V_CMPX_T_I32 : VOPCX_Real_gfx11<0x0c7>;
+defm V_CMPX_F_U32 : VOPCX_Real_gfx11<0x0c8>;
+defm V_CMPX_LT_U32 : VOPCX_Real_gfx11<0x0c9>;
+defm V_CMPX_EQ_U32 : VOPCX_Real_gfx11<0x0ca>;
+defm V_CMPX_LE_U32 : VOPCX_Real_gfx11<0x0cb>;
+defm V_CMPX_GT_U32 : VOPCX_Real_gfx11<0x0cc>;
+defm V_CMPX_NE_U32 : VOPCX_Real_gfx11<0x0cd>;
+defm V_CMPX_GE_U32 : VOPCX_Real_gfx11<0x0ce>;
+defm V_CMPX_T_U32 : VOPCX_Real_gfx11<0x0cf>;
+
+defm V_CMPX_F_I64 : VOPCX_Real_gfx11<0x0d0>;
+defm V_CMPX_LT_I64 : VOPCX_Real_gfx11<0x0d1>;
+defm V_CMPX_EQ_I64 : VOPCX_Real_gfx11<0x0d2>;
+defm V_CMPX_LE_I64 : VOPCX_Real_gfx11<0x0d3>;
+defm V_CMPX_GT_I64 : VOPCX_Real_gfx11<0x0d4>;
+defm V_CMPX_NE_I64 : VOPCX_Real_gfx11<0x0d5>;
+defm V_CMPX_GE_I64 : VOPCX_Real_gfx11<0x0d6>;
+defm V_CMPX_T_I64 : VOPCX_Real_gfx11<0x0d7>;
+defm V_CMPX_F_U64 : VOPCX_Real_gfx11<0x0d8>;
+defm V_CMPX_LT_U64 : VOPCX_Real_gfx11<0x0d9>;
+defm V_CMPX_EQ_U64 : VOPCX_Real_gfx11<0x0da>;
+defm V_CMPX_LE_U64 : VOPCX_Real_gfx11<0x0db>;
+defm V_CMPX_GT_U64 : VOPCX_Real_gfx11<0x0dc>;
+defm V_CMPX_NE_U64 : VOPCX_Real_gfx11<0x0dd>;
+defm V_CMPX_GE_U64 : VOPCX_Real_gfx11<0x0de>;
+defm V_CMPX_T_U64 : VOPCX_Real_gfx11<0x0df>;
+defm V_CMPX_CLASS_F16 : VOPCX_Real_gfx11<0x0fd>;
+defm V_CMPX_CLASS_F32 : VOPCX_Real_gfx11<0x0fe>;
+defm V_CMPX_CLASS_F64 : VOPCX_Real_gfx11<0x0ff>;
+
+//===----------------------------------------------------------------------===//
// GFX10.
//===----------------------------------------------------------------------===//
-let AssemblerPredicate = isGFX10Plus in {
+let AssemblerPredicate = isGFX10Only in {
multiclass VOPC_Real_gfx10<bits<9> op> {
let DecoderNamespace = "GFX10" in {
def _e32_gfx10 :
@@ -931,7 +1722,7 @@ let AssemblerPredicate = isGFX10Plus in {
defm : VOPCXInstAliases<NAME, "gfx10">;
}
-} // End AssemblerPredicate = isGFX10Plus
+} // End AssemblerPredicate = isGFX10Only
defm V_CMP_LT_I16 : VOPC_Real_gfx10<0x089>;
defm V_CMP_EQ_I16 : VOPC_Real_gfx10<0x08a>;
@@ -1025,6 +1816,12 @@ multiclass VOPCX_Real_gfx6_gfx7<bits<9> op> :
multiclass VOPCX_Real_gfx6_gfx7_gfx10 <bits<9> op> :
VOPC_Real_gfx6_gfx7<op>, VOPCX_Real_gfx10<op>;
+multiclass VOPC_Real_gfx6_gfx7_gfx10_gfx11<bits<9> op> :
+ VOPC_Real_gfx6_gfx7_gfx10<op>, VOPC_Real_gfx11<op>;
+
+multiclass VOPCX_Real_gfx6_gfx7_gfx10_gfx11<bits<9> op> :
+ VOPCX_Real_gfx6_gfx7_gfx10<op>, VOPCX_Real_gfx11<op>;
+
defm V_CMP_F_F32 : VOPC_Real_gfx6_gfx7_gfx10<0x000>;
defm V_CMP_LT_F32 : VOPC_Real_gfx6_gfx7_gfx10<0x001>;
defm V_CMP_EQ_F32 : VOPC_Real_gfx6_gfx7_gfx10<0x002>;
@@ -1057,21 +1854,21 @@ defm V_CMPX_NLE_F32 : VOPCX_Real_gfx6_gfx7_gfx10<0x01c>;
defm V_CMPX_NEQ_F32 : VOPCX_Real_gfx6_gfx7_gfx10<0x01d>;
defm V_CMPX_NLT_F32 : VOPCX_Real_gfx6_gfx7_gfx10<0x01e>;
defm V_CMPX_TRU_F32 : VOPCX_Real_gfx6_gfx7_gfx10<0x01f>;
-defm V_CMP_F_F64 : VOPC_Real_gfx6_gfx7_gfx10<0x020>;
-defm V_CMP_LT_F64 : VOPC_Real_gfx6_gfx7_gfx10<0x021>;
-defm V_CMP_EQ_F64 : VOPC_Real_gfx6_gfx7_gfx10<0x022>;
-defm V_CMP_LE_F64 : VOPC_Real_gfx6_gfx7_gfx10<0x023>;
-defm V_CMP_GT_F64 : VOPC_Real_gfx6_gfx7_gfx10<0x024>;
-defm V_CMP_LG_F64 : VOPC_Real_gfx6_gfx7_gfx10<0x025>;
-defm V_CMP_GE_F64 : VOPC_Real_gfx6_gfx7_gfx10<0x026>;
-defm V_CMP_O_F64 : VOPC_Real_gfx6_gfx7_gfx10<0x027>;
-defm V_CMP_U_F64 : VOPC_Real_gfx6_gfx7_gfx10<0x028>;
-defm V_CMP_NGE_F64 : VOPC_Real_gfx6_gfx7_gfx10<0x029>;
-defm V_CMP_NLG_F64 : VOPC_Real_gfx6_gfx7_gfx10<0x02a>;
-defm V_CMP_NGT_F64 : VOPC_Real_gfx6_gfx7_gfx10<0x02b>;
-defm V_CMP_NLE_F64 : VOPC_Real_gfx6_gfx7_gfx10<0x02c>;
-defm V_CMP_NEQ_F64 : VOPC_Real_gfx6_gfx7_gfx10<0x02d>;
-defm V_CMP_NLT_F64 : VOPC_Real_gfx6_gfx7_gfx10<0x02e>;
+defm V_CMP_F_F64 : VOPC_Real_gfx6_gfx7_gfx10_gfx11<0x020>;
+defm V_CMP_LT_F64 : VOPC_Real_gfx6_gfx7_gfx10_gfx11<0x021>;
+defm V_CMP_EQ_F64 : VOPC_Real_gfx6_gfx7_gfx10_gfx11<0x022>;
+defm V_CMP_LE_F64 : VOPC_Real_gfx6_gfx7_gfx10_gfx11<0x023>;
+defm V_CMP_GT_F64 : VOPC_Real_gfx6_gfx7_gfx10_gfx11<0x024>;
+defm V_CMP_LG_F64 : VOPC_Real_gfx6_gfx7_gfx10_gfx11<0x025>;
+defm V_CMP_GE_F64 : VOPC_Real_gfx6_gfx7_gfx10_gfx11<0x026>;
+defm V_CMP_O_F64 : VOPC_Real_gfx6_gfx7_gfx10_gfx11<0x027>;
+defm V_CMP_U_F64 : VOPC_Real_gfx6_gfx7_gfx10_gfx11<0x028>;
+defm V_CMP_NGE_F64 : VOPC_Real_gfx6_gfx7_gfx10_gfx11<0x029>;
+defm V_CMP_NLG_F64 : VOPC_Real_gfx6_gfx7_gfx10_gfx11<0x02a>;
+defm V_CMP_NGT_F64 : VOPC_Real_gfx6_gfx7_gfx10_gfx11<0x02b>;
+defm V_CMP_NLE_F64 : VOPC_Real_gfx6_gfx7_gfx10_gfx11<0x02c>;
+defm V_CMP_NEQ_F64 : VOPC_Real_gfx6_gfx7_gfx10_gfx11<0x02d>;
+defm V_CMP_NLT_F64 : VOPC_Real_gfx6_gfx7_gfx10_gfx11<0x02e>;
defm V_CMP_TRU_F64 : VOPC_Real_gfx6_gfx7_gfx10<0x02f>;
defm V_CMPX_F_F64 : VOPCX_Real_gfx6_gfx7_gfx10<0x030>;
defm V_CMPX_LT_F64 : VOPCX_Real_gfx6_gfx7_gfx10<0x031>;
diff --git a/llvm/lib/Target/AMDGPU/VOPDInstructions.td b/llvm/lib/Target/AMDGPU/VOPDInstructions.td
new file mode 100644
index 000000000000..420f18436095
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/VOPDInstructions.td
@@ -0,0 +1,159 @@
+//===-- VOPDInstructions.td - Vector Instruction Definitions --------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Encodings
+//===----------------------------------------------------------------------===//
+
+class VOPDe<bits<4> opX, bits<5> opY> : Enc64 {
+ bits<9> src0X;
+ bits<8> vsrc1X;
+ bits<8> vdstX;
+ bits<9> src0Y;
+ bits<8> vsrc1Y;
+ bits<8> vdstY;
+
+ let Inst{8-0} = src0X;
+ let Inst{16-9} = vsrc1X;
+ let Inst{21-17} = opY;
+ let Inst{25-22} = opX;
+ let Inst{31-26} = 0x32; // encoding
+ let Inst{40-32} = src0Y;
+ let Inst{48-41} = vsrc1Y;
+ let Inst{55-49} = vdstY{7-1};
+ let Inst{63-56} = vdstX;
+}
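+// Note that only vdstY{7-1} is encoded; the low bit of vdstY is implicit in
+// the VOPD encoding and is recovered by decodeOperandVOPDDstY when decoding.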
+
+class VOPD_MADKe<bits<4> opX, bits<5> opY> : Enc96 {
+ bits<9> src0X;
+ bits<8> vsrc1X;
+ bits<8> vdstX;
+ bits<9> src0Y;
+ bits<8> vsrc1Y;
+ bits<8> vdstY;
+ bits<32> imm;
+
+ let Inst{8-0} = src0X;
+ let Inst{16-9} = vsrc1X;
+ let Inst{21-17} = opY;
+ let Inst{25-22} = opX;
+ let Inst{31-26} = 0x32; // encoding
+ let Inst{40-32} = src0Y;
+ let Inst{48-41} = vsrc1Y;
+ let Inst{55-49} = vdstY{7-1};
+ let Inst{63-56} = vdstX;
+ let Inst{95-64} = imm;
+}
+
+//===----------------------------------------------------------------------===//
+// VOPD classes
+//===----------------------------------------------------------------------===//
+
+class VOPD_Base<dag outs, dag ins, string asm, VOP_Pseudo VDX, VOP_Pseudo VDY,
+ VOPD_Component XasVC, VOPD_Component YasVC>
+ : VOPAnyCommon<outs, ins, asm, []>,
+ VOP<NAME>,
+ SIMCInstr<NAME, SIEncodingFamily.GFX11> {
+ // Fields for table indexing
+ Instruction Opcode = !cast<Instruction>(NAME);
+ bits<5> OpX = XasVC.VOPDOp;
+ bits<5> OpY = YasVC.VOPDOp;
+
+ let VALU = 1;
+
+ let DecoderNamespace = "GFX11";
+ let AssemblerPredicate = isGFX11Plus;
+ let WaveSizePredicate = isWave32;
+ let isCodeGenOnly = 0;
+ let SubtargetPredicate = isGFX11Plus;
+ let AsmMatchConverter = "cvtVOPD";
+ let Size = 8;
+ let ReadsModeReg = !or(VDX.ReadsModeReg, VDY.ReadsModeReg);
+ let mayRaiseFPException = ReadsModeReg;
+
+ let Uses = RegListUnion<VDX.Uses, VDY.Uses>.ret;
+ let Defs = RegListUnion<VDX.Defs, VDY.Defs>.ret;
+ let SchedRW = !listconcat(VDX.SchedRW, VDY.SchedRW);
+}
+
+class VOPD<dag outs, dag ins, string asm, VOP_Pseudo VDX, VOP_Pseudo VDY,
+ VOPD_Component XasVC, VOPD_Component YasVC>
+ : VOPD_Base<outs, ins, asm, VDX, VDY, XasVC, YasVC>,
+ VOPDe<XasVC.VOPDOp{3-0}, YasVC.VOPDOp> {
+ let Inst{16-9} = !if (!eq(VDX.Mnemonic, "v_mov_b32"), 0x0, vsrc1X);
+ let Inst{48-41} = !if (!eq(VDY.Mnemonic, "v_mov_b32"), 0x0, vsrc1Y);
+}
+
+class VOPD_MADK<dag outs, dag ins, string asm, VOP_Pseudo VDX, VOP_Pseudo VDY,
+ VOPD_Component XasVC, VOPD_Component YasVC>
+ : VOPD_Base<outs, ins, asm, VDX, VDY, XasVC, YasVC>,
+ VOPD_MADKe<XasVC.VOPDOp{3-0}, YasVC.VOPDOp> {
+ let Inst{16-9} = !if (!eq(VDX.Mnemonic, "v_mov_b32"), 0x0, vsrc1X);
+ let Inst{48-41} = !if (!eq(VDY.Mnemonic, "v_mov_b32"), 0x0, vsrc1Y);
+ let Size = 12;
+}
+
+// V_DUAL_DOT2ACC_F32_BF16 is a legal instruction, but V_DOT2ACC_F32_BF16 is
+// not. Since the DUAL form is only created by converting from the normal
+// form, V_DUAL_DOT2ACC_F32_BF16 is never generated and is omitted here.
+defvar VOPDYPseudos = [
+ "V_FMAC_F32_e32", "V_FMAAK_F32", "V_FMAMK_F32", "V_MUL_F32_e32",
+ "V_ADD_F32_e32", "V_SUB_F32_e32", "V_SUBREV_F32_e32", "V_MUL_LEGACY_F32_e32",
+ "V_MOV_B32_e32", "V_CNDMASK_B32_e32", "V_MAX_F32_e32", "V_MIN_F32_e32",
+ "V_DOT2C_F32_F16_e32", "V_ADD_U32_e32", "V_LSHLREV_B32_e32", "V_AND_B32_e32"
+];
+defvar VOPDXPseudos = VOPDYPseudos[0...VOPDX_Max_Index];
+
+def VOPDDstYOperand : RegisterOperand<VGPR_32, "printRegularOperand"> {
+ let DecoderMethod = "decodeOperandVOPDDstY";
+}
+
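+// Instantiate a VOPD instruction for every (X, Y) pairing of the component
+// pseudos listed above.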
+foreach x = VOPDXPseudos in {
+ foreach y = VOPDYPseudos in {
+ defvar xInst = !cast<VOP_Pseudo>(x);
+ defvar yInst = !cast<VOP_Pseudo>(y);
+ defvar XasVC = !cast<VOPD_Component>(x);
+ defvar YasVC = !cast<VOPD_Component>(y);
+ defvar isMADK = !or(!eq(x, "V_FMAAK_F32"), !eq(x, "V_FMAMK_F32"),
+ !eq(y, "V_FMAAK_F32"), !eq(y, "V_FMAMK_F32"));
+    // If X or Y is MADK (i.e. has a mandatory literal immediate), all src
+    // operands that may contain an optional literal must use the
+    // VSrc_*_Deferred operand type. Optional literal operands in MADK VOPD
+    // components always use this operand form. If both X and Y are MADK, the
+    // mandatory literal of X must additionally use an alternate operand
+    // format which defers to the 'real' Y literal.
+ defvar isOpXMADK = !or(!eq(x, "V_FMAAK_F32"), !eq(x, "V_FMAMK_F32"));
+ defvar isOpYMADK = !or(!eq(y, "V_FMAAK_F32"), !eq(y, "V_FMAMK_F32"));
+ defvar OpName = "V_DUAL_" # !substr(x,2) # "_X_" # !substr(y,2);
+ defvar outs = (outs VGPRSrc_32:$vdstX, VOPDDstYOperand:$vdstY);
+ if !or(isOpXMADK, isOpYMADK) then {
+ if !and(isOpXMADK, isOpYMADK) then {
+ defvar X_MADK_Pfl = !cast<VOP_MADK_Base>(xInst.Pfl);
+ defvar ins = !con(xInst.Pfl.InsVOPDXDeferred, yInst.Pfl.InsVOPDY);
+ defvar asm = XasVC.VOPDName #" "# X_MADK_Pfl.AsmVOPDXDeferred #" :: "# YasVC.VOPDName #" "# yInst.Pfl.AsmVOPDY;
+ def OpName : VOPD_MADK<outs, ins, asm, xInst, yInst, XasVC, YasVC>;
+ } else {
+ defvar asm = XasVC.VOPDName #" "# xInst.Pfl.AsmVOPDX #" :: "# YasVC.VOPDName #" "# yInst.Pfl.AsmVOPDY;
+ if isOpXMADK then {
+ assert !not(isOpYMADK), "Expected only OpX as MADK";
+ defvar ins = !con(xInst.Pfl.InsVOPDX, yInst.Pfl.InsVOPDYDeferred);
+ def OpName : VOPD_MADK<outs, ins, asm, xInst, yInst, XasVC, YasVC>;
+ } else {
+ assert !not(isOpXMADK), "Expected only OpY as MADK";
+ defvar ins = !con(xInst.Pfl.InsVOPDXDeferred, yInst.Pfl.InsVOPDY);
+ def OpName : VOPD_MADK<outs, ins, asm, xInst, yInst, XasVC, YasVC>;
+ }
+ }
+ } else {
+ defvar ins = !con(xInst.Pfl.InsVOPDX, yInst.Pfl.InsVOPDY);
+ defvar asm = XasVC.VOPDName #" "# xInst.Pfl.AsmVOPDX #" :: "# YasVC.VOPDName #" "# yInst.Pfl.AsmVOPDY;
+ def OpName : VOPD<outs, ins, asm, xInst, yInst, XasVC, YasVC>;
+ }
+ }
+}
+
diff --git a/llvm/lib/Target/AMDGPU/VOPInstructions.td b/llvm/lib/Target/AMDGPU/VOPInstructions.td
index a8368892c565..8cd3d2fe2c47 100644
--- a/llvm/lib/Target/AMDGPU/VOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOPInstructions.td
@@ -30,6 +30,16 @@ class VOP <string opName> {
string OpName = opName;
}
+// The first 13 instructions from VOPDY can also be used as VOPDX.
+// DOT2ACC_F32_BF16 is omitted.
+defvar VOPDX_Max_Index = 12;
+
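+// Attached to a VOP instruction definition to record how it may be used as a
+// VOPD component: its VOPD opcode, the "v_dual_*" mnemonic derived from the
+// base mnemonic (e.g. v_add_f32 -> v_dual_add_f32), and whether the opcode
+// may occupy the X slot (CanBeVOPDX).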
+class VOPD_Component<bits<5> OpIn, string vOPDName> {
+ Instruction BaseVOP = !cast<Instruction>(NAME);
+ string VOPDName = "v_dual_" # !substr(vOPDName, 2);
+ bits<5> VOPDOp = OpIn;
+ bit CanBeVOPDX = !le(VOPDOp, VOPDX_Max_Index);
+}
+
class VOPAnyCommon <dag outs, dag ins, string asm, list<dag> pattern> :
InstSI <outs, ins, asm, pattern> {
@@ -92,6 +102,7 @@ class VOP3_Pseudo <string opName, VOPProfile P, list<dag> pattern = [],
let VOP3_OPSEL = isVop3OpSel;
let IsPacked = P.IsPacked;
let IsMAI = P.IsMAI;
+ let IsWMMA = P.IsWMMA;
let AsmOperands = !if(isVop3OpSel,
P.AsmVOP3OpSel,
@@ -144,9 +155,9 @@ class VOP_Real<VOP_Pseudo ps> {
bit IsSingle = ps.Pfl.IsSingle;
}
-class VOP3_Real <VOP_Pseudo ps, int EncodingFamily> :
+class VOP3_Real <VOP_Pseudo ps, int EncodingFamily, string asm_name = ps.Mnemonic> :
VOP_Real <ps>,
- InstSI <ps.OutOperandList, ps.InOperandList, ps.Mnemonic # ps.AsmOperands, []>,
+ InstSI <ps.OutOperandList, ps.InOperandList, asm_name # ps.AsmOperands, []>,
SIMCInstr <ps.PseudoInstr, EncodingFamily> {
let VALU = 1;
@@ -155,9 +166,6 @@ class VOP3_Real <VOP_Pseudo ps, int EncodingFamily> :
let isCodeGenOnly = 0;
let UseNamedOperandTable = 1;
- let Constraints = ps.Constraints;
- let DisableEncoding = ps.DisableEncoding;
-
// copy relevant pseudo op flags
let SubtargetPredicate = ps.SubtargetPredicate;
let OtherPredicates = ps.OtherPredicates;
@@ -179,8 +187,12 @@ class VOP3_Real <VOP_Pseudo ps, int EncodingFamily> :
// XXX - Is there any reason to distinguish this from regular VOP3
// here?
-class VOP3P_Real<VOP_Pseudo ps, int EncodingFamily> :
- VOP3_Real<ps, EncodingFamily>;
+class VOP3P_Real<VOP_Pseudo ps, int EncodingFamily, string asm_name = ps.Mnemonic> :
+ VOP3_Real<ps, EncodingFamily, asm_name> {
+
+ // The v_wmma pseudos have extra constraints that we do not want to impose on the real instruction.
+ let Constraints = !if(!eq(!substr(ps.Mnemonic,0,6), "v_wmma"), "", ps.Constraints);
+}
class VOP3a<VOPProfile P> : Enc64 {
bits<4> src0_modifiers;
@@ -217,6 +229,8 @@ class VOP3a_gfx10<bits<10> op, VOPProfile p> : VOP3a<p> {
let Inst{31-26} = 0x35;
}
+class VOP3a_gfx11<bits<10> op, VOPProfile p> : VOP3a_gfx10<op, p>;
+
class VOP3a_vi <bits<10> op, VOPProfile P> : VOP3a<P> {
let Inst{25-16} = op;
let Inst{15} = !if(P.HasClamp, clamp{0}, 0);
@@ -232,6 +246,8 @@ class VOP3e_gfx10<bits<10> op, VOPProfile p> : VOP3a_gfx10<op, p> {
let Inst{7-0} = !if(p.EmitDst, vdst{7-0}, 0);
}
+class VOP3e_gfx11<bits<10> op, VOPProfile p> : VOP3e_gfx10<op, p>;
+
class VOP3e_vi <bits<10> op, VOPProfile P> : VOP3a_vi <op, P> {
bits<8> vdst;
let Inst{7-0} = !if(P.EmitDst, vdst{7-0}, 0);
@@ -251,6 +267,9 @@ class VOP3OpSel_gfx10<bits<10> op, VOPProfile p> : VOP3e_gfx10<op, p> {
let Inst{14} = !if(p.HasDst, src0_modifiers{3}, 0);
}
+class VOP3OpSel_gfx11<bits<10> op, VOPProfile p> : VOP3OpSel_gfx10<op, p>;
+
+
// NB: For V_INTERP* opcodes, src0 is encoded as src1 and vice versa
class VOP3Interp_vi <bits<10> op, VOPProfile P> : VOP3e_vi <op, P> {
bits<2> attrchan;
@@ -285,6 +304,8 @@ class VOP3Interp_gfx10<bits<10> op, VOPProfile p> : VOP3e_gfx10<op, p> {
let Inst{62} = !if(p.HasSrc0Mods, src0_modifiers{0}, 0);
}
+class VOP3Interp_gfx11<bits<10> op, VOPProfile p> : VOP3Interp_gfx10<op, p>;
+
class VOP3be <VOPProfile P> : Enc64 {
bits<8> vdst;
bits<2> src0_modifiers;
@@ -310,7 +331,6 @@ class VOP3be <VOPProfile P> : Enc64 {
class VOP3Pe <bits<7> op, VOPProfile P> : Enc64 {
bits<8> vdst;
- // neg, neg_hi, op_sel put in srcN_modifiers
bits<4> src0_modifiers;
bits<9> src0;
bits<4> src1_modifiers;
@@ -372,11 +392,42 @@ class VOP3Pe_MAI <bits<7> op, VOPProfile P, bit acc_cd = 0> : Enc64 {
let Inst{63-61} = !if(P.HasSrc1, blgp, 0);
}
+class VOP3Pe_SMFMAC <bits<7> op> : Enc64 {
+ bits<10> vdst; // VGPR or AGPR, but not SGPR. vdst{8} is not encoded in the instruction.
+ bits<10> src0;
+ bits<10> src1;
+ bits<9> idx;
+ bits<3> blgp;
+ bits<3> cbsz;
+ bits<4> abid;
+
+ let blgp = 0;
+
+ let Inst{7-0} = vdst{7-0};
+
+ let Inst{10-8} = cbsz;
+ let Inst{14-11} = abid;
+
+ let Inst{15} = vdst{9}; // acc(vdst)
+
+ let Inst{22-16} = op;
+ let Inst{31-23} = 0x1a7; // encoding
+ let Inst{40-32} = src0{8-0};
+ let Inst{49-41} = src1{8-0};
+ let Inst{58-50} = idx;
+
+ let Inst{59} = src0{9}; // acc(0)
+ let Inst{60} = src1{9}; // acc(1)
+
+ let Inst{63-61} = blgp;
+}
class VOP3Pe_gfx10 <bits<7> op, VOPProfile P> : VOP3Pe<op, P> {
let Inst{31-23} = 0x198; //encoding
}
+class VOP3Pe_gfx11<bits<7> op, VOPProfile P> : VOP3Pe_gfx10<op, P>;
+
class VOP3be_gfx6_gfx7<bits<9> op, VOPProfile p> : VOP3be<p> {
let Inst{25-17} = op;
}
@@ -388,6 +439,8 @@ class VOP3be_gfx10<bits<10> op, VOPProfile p> : VOP3be<p> {
let Inst{31-26} = 0x35;
}
+class VOP3be_gfx11<bits<10> op, VOPProfile p> : VOP3be_gfx10<op, p>;
+
class VOP3be_vi <bits<10> op, VOPProfile P> : VOP3be<P> {
bits<1> clamp;
let Inst{25-16} = op;
@@ -621,8 +674,89 @@ class VOP_DPPe<VOPProfile P, bit IsDPP16=0> : Enc64 {
let Inst{63-60} = row_mask;
}
-class VOP_DPP_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[]> :
- InstSI <P.OutsDPP, P.InsDPP, OpName#P.AsmDPP, pattern>,
+class VOP3_DPPe_Fields_Base {
+ bits<9> dpp_ctrl;
+ bits<1> bound_ctrl;
+ bits<4> bank_mask;
+ bits<4> row_mask;
+ bit fi;
+}
+class VOP3_DPPe_Fields : VOP3_DPPe_Fields_Base {
+ bits<8> src0;
+}
+
+// "Common" refers to encoding fields that are common to DPP and DPP8.
+class VOP3_DPPe_Common_Base<bits<10> op, VOPProfile P> : Enc96 {
+ bits<4> src0_modifiers;
+ bits<3> src1_modifiers;
+ bits<3> src2_modifiers;
+ bits<1> clamp;
+ bits<2> omod;
+
+ let Inst{8} = !if(P.HasSrc0Mods, src0_modifiers{1}, 0);
+ let Inst{9} = !if(P.HasSrc1Mods, src1_modifiers{1}, 0);
+ let Inst{10} = !if(P.HasSrc2Mods, src2_modifiers{1}, 0);
+ // OPSEL must be set such that the low result only uses low inputs, and the high result only uses high inputs.
+ let Inst{11} = !if(P.HasOpSel,!if(P.HasSrc0Mods, src0_modifiers{2}, 0),?);
+ let Inst{12} = !if(P.HasOpSel,!if(P.HasSrc1Mods, src1_modifiers{2}, 0),?);
+ let Inst{13} = !if(P.HasOpSel,!if(P.HasSrc2Mods, src2_modifiers{2}, 0),?);
+ let Inst{14} = !if(P.HasOpSel,!if(P.HasSrc0Mods, src0_modifiers{3}, 0),?);
+ let Inst{15} = !if(P.HasClamp, clamp, 0);
+ let Inst{25-16} = op;
+ let Inst{31-26} = 0x35;
+
+ let Inst{60-59} = !if(P.HasOMod, omod, 0);
+ let Inst{61} = !if(P.HasSrc0Mods, src0_modifiers{0}, 0);
+ let Inst{62} = !if(P.HasSrc1Mods, src1_modifiers{0}, 0);
+ let Inst{63} = !if(P.HasSrc2Mods, src2_modifiers{0}, 0);
+}
+
+class VOP3_DPPe_Common<bits<10> op, VOPProfile P> : VOP3_DPPe_Common_Base<op, P> {
+ bits<8> vdst;
+ bits<9> src1;
+ bits<9> src2;
+
+ let Inst{7-0} = !if(P.EmitDst, vdst{7-0}, 0);
+ let Inst{49-41} = !if(P.HasSrc1, src1, 0);
+ let Inst{58-50} = !if(P.HasSrc2, src2, 0);
+}
+
+class VOP3P_DPPe_Common_Base<bits<7> op, VOPProfile P> : Enc96 {
+ bits<4> src0_modifiers;
+ bits<4> src1_modifiers;
+ bits<4> src2_modifiers;
+ bits<1> clamp;
+
+ let Inst{8} = !if(P.HasSrc0Mods, src0_modifiers{1}, 0); // neg_hi src0
+ let Inst{9} = !if(P.HasSrc1Mods, src1_modifiers{1}, 0); // neg_hi src1
+ let Inst{10} = !if(P.HasSrc2Mods, src2_modifiers{1}, 0); // neg_hi src2
+ let Inst{11} = !if(!and(P.HasSrc0, P.HasOpSel), src0_modifiers{2}, 0); // op_sel(0)
+ let Inst{12} = !if(!and(P.HasSrc1, P.HasOpSel), src1_modifiers{2}, 0); // op_sel(1)
+ let Inst{13} = !if(!and(P.HasSrc2, P.HasOpSel), src2_modifiers{2}, 0); // op_sel(2)
+ let Inst{14} = !if(!and(P.HasSrc2, P.HasOpSel), src2_modifiers{3}, ?); // op_sel_hi(2)
+ let Inst{15} = !if(P.HasClamp, clamp{0}, 0);
+ let Inst{22-16} = op;
+ let Inst{31-23} = 0x198; // encoding
+ let Inst{59} = !if(!and(P.HasSrc0, P.HasOpSel), src0_modifiers{3}, ?); // op_sel_hi(0)
+ let Inst{60} = !if(!and(P.HasSrc1, P.HasOpSel), src1_modifiers{3}, ?); // op_sel_hi(1)
+ let Inst{61} = !if(P.HasSrc0Mods, src0_modifiers{0}, 0); // neg (lo)
+ let Inst{62} = !if(P.HasSrc1Mods, src1_modifiers{0}, 0); // neg (lo)
+ let Inst{63} = !if(P.HasSrc2Mods, src2_modifiers{0}, 0); // neg (lo)
+}
+
+class VOP3P_DPPe_Common<bits<7> op, VOPProfile P> : VOP3P_DPPe_Common_Base<op, P> {
+ bits<8> vdst;
+ bits<9> src1;
+ bits<9> src2;
+
+ let Inst{7-0} = vdst;
+ let Inst{49-41} = !if(P.HasSrc1, src1, 0);
+ let Inst{58-50} = !if(P.HasSrc2, src2, 0);
+}
+
+class VOP_DPP_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[],
+ dag Ins = P.InsDPP, string asmOps = P.AsmDPP> :
+ InstSI <P.OutsDPP, Ins, OpName#asmOps, pattern>,
VOP <OpName>,
SIMCInstr <OpName#"_dpp", SIEncodingFamily.NONE> {
@@ -645,7 +779,7 @@ class VOP_DPP_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[]> :
let isConvergent = 1;
string Mnemonic = OpName;
- string AsmOperands = P.AsmDPP;
+ string AsmOperands = asmOps;
let AsmMatchConverter = !if(P.HasModifiers, "cvtDPP", "");
let SubtargetPredicate = !if(P.HasExt64BitDPP, Has64BitDPP, HasDPP);
@@ -659,6 +793,17 @@ class VOP_DPP_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[]> :
VOPProfile Pfl = P;
}
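+// Pseudo for the VOP3-encoded DPP form of an instruction; its PseudoInstr
+// name carries an _e64_dpp suffix.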
+class VOP3_DPP_Pseudo <string OpName, VOPProfile P> :
+ VOP_DPP_Pseudo <OpName, P, [], P.InsVOP3DPP, P.AsmVOP3DPP> {
+ let PseudoInstr = OpName#"_e64"#"_dpp";
+ let OutOperandList = P.OutsVOP3DPP;
+ let Size = 12;
+ let VOP3 = 1;
+ let AsmMatchConverter = "cvtVOP3DPP";
+ let AsmVariantName = !if(P.HasExtVOP3DPP, AMDGPUAsmVariants.VOP3_DPP,
+ AMDGPUAsmVariants.Disable);
+}
+
class VOP_DPP_Real <VOP_DPP_Pseudo ps, int EncodingFamily> :
InstSI <ps.OutOperandList, ps.InOperandList, ps.Mnemonic # ps.AsmOperands, []>,
SIMCInstr <ps.PseudoInstr, EncodingFamily> {
@@ -679,6 +824,7 @@ class VOP_DPP_Real <VOP_DPP_Pseudo ps, int EncodingFamily> :
let isConvergent = ps.isConvergent;
let SubtargetPredicate = ps.SubtargetPredicate;
let AssemblerPredicate = ps.AssemblerPredicate;
+ let OtherPredicates = ps.OtherPredicates;
let AsmMatchConverter = ps.AsmMatchConverter;
let AsmVariantName = ps.AsmVariantName;
let UseNamedOperandTable = ps.UseNamedOperandTable;
@@ -692,11 +838,10 @@ class VOP_DPP_Real <VOP_DPP_Pseudo ps, int EncodingFamily> :
let TRANS = ps.TRANS;
}
-class VOP_DPP <string OpName, VOPProfile P, bit IsDPP16,
- dag InsDPP = !if(IsDPP16, P.InsDPP16, P.InsDPP),
- string AsmDPP = !if(IsDPP16, P.AsmDPP16, P.AsmDPP)> :
- InstSI <P.OutsDPP, InsDPP, OpName#AsmDPP, []>,
- VOP_DPPe<P, IsDPP16> {
+class VOP_DPP_Base <string OpName, VOPProfile P,
+ dag InsDPP,
+ string AsmDPP > :
+ InstSI <P.OutsDPP, InsDPP, OpName#AsmDPP, []> {
let mayLoad = 0;
let mayStore = 0;
@@ -717,6 +862,59 @@ class VOP_DPP <string OpName, VOPProfile P, bit IsDPP16,
let DecoderNamespace = "DPP";
}
+class VOP_DPP <string OpName, VOPProfile P, bit IsDPP16,
+ dag InsDPP = !if(IsDPP16, P.InsDPP16, P.InsDPP),
+ string AsmDPP = !if(IsDPP16, P.AsmDPP16, P.AsmDPP)> :
+ VOP_DPP_Base<OpName, P, InsDPP, AsmDPP>, VOP_DPPe<P, IsDPP16>;
+
+class VOP3_DPP_Base <string OpName, VOPProfile P, bit IsDPP16,
+ dag InsDPP = !if(IsDPP16, P.InsVOP3DPP16, P.InsVOP3DPP),
+ string AsmDPP = !if(IsDPP16, P.AsmVOP3DPP16, P.AsmVOP3DPP)> :
+ VOP_DPP_Base<OpName, P, InsDPP, AsmDPP> {
+ let OutOperandList = P.OutsVOP3DPP;
+ let AsmMatchConverter = "cvtVOP3DPP";
+ let VOP3 = 1;
+ let AsmVariantName = !if(P.HasExtVOP3DPP, AMDGPUAsmVariants.VOP3_DPP,
+ AMDGPUAsmVariants.Disable);
+ let Size = 12;
+}
+
+class VOP3_DPP <bits<10> op, string OpName, VOPProfile P, bit IsDPP16,
+ dag InsDPP = !if(IsDPP16, P.InsVOP3DPP16, P.InsVOP3DPP),
+ string AsmDPP = !if(IsDPP16, P.AsmVOP3DPP16, P.AsmVOP3DPP)> :
+ VOP3_DPP_Base<OpName, P, IsDPP16, InsDPP, AsmDPP>, VOP3_DPPe_Common<op, P>,
+ VOP3_DPPe_Fields {
+
+ let Inst{40-32} = 0xfa;
+ let Inst{71-64} = !if(P.HasSrc0, src0{7-0}, 0);
+ let Inst{80-72} = dpp_ctrl;
+ let Inst{82} = !if(IsDPP16, fi, ?);
+ let Inst{83} = bound_ctrl;
+
+ // Inst{87-84} ignored by hw
+ let Inst{91-88} = bank_mask;
+ let Inst{95-92} = row_mask;
+}
+
+class VOP3P_DPP <bits<7> op, string OpName, VOPProfile P, bit IsDPP16,
+ dag InsDPP = !if(IsDPP16, P.InsVOP3DPP16, P.InsVOP3DPP),
+ string AsmDPP = !if(IsDPP16, P.AsmVOP3DPP16, P.AsmVOP3DPP)> :
+ VOP3_DPP_Base<OpName, P, IsDPP16, InsDPP, AsmDPP>, VOP3P_DPPe_Common<op, P>,
+ VOP3_DPPe_Fields {
+
+ let VOP3P = 1;
+
+ let Inst{40-32} = 0xfa;
+ let Inst{71-64} = !if(P.HasSrc0, src0{7-0}, 0);
+ let Inst{80-72} = dpp_ctrl;
+ let Inst{82} = !if(IsDPP16, fi, ?);
+ let Inst{83} = bound_ctrl;
+
+ // Inst{87-84} ignored by hw
+ let Inst{91-88} = bank_mask;
+ let Inst{95-92} = row_mask;
+}
+
class VOP_DPP8e<VOPProfile P> : Enc64 {
bits<8> src0;
bits<24> dpp8;
@@ -726,9 +924,14 @@ class VOP_DPP8e<VOPProfile P> : Enc64 {
let Inst{63-40} = dpp8{23-0};
}
-class VOP_DPP8<string OpName, VOPProfile P> :
- InstSI<P.OutsDPP8, P.InsDPP8, OpName#P.AsmDPP8, []>,
- VOP_DPP8e<P> {
+class VOP3_DPP8e_Fields {
+ bits<8> src0;
+ bits<24> dpp8;
+ bits<9> fi;
+}
+
+class VOP_DPP8_Base<string OpName, VOPProfile P, dag InsDPP8 = P.InsDPP8, string AsmDPP8 = P.AsmDPP8> :
+ InstSI<P.OutsDPP8, InsDPP8, OpName#AsmDPP8, []> {
let mayLoad = 0;
let mayStore = 0;
@@ -742,12 +945,44 @@ class VOP_DPP8<string OpName, VOPProfile P> :
let AsmMatchConverter = "cvtDPP8";
let SubtargetPredicate = HasDPP8;
let AssemblerPredicate = HasDPP8;
- let AsmVariantName = !if(P.HasExt, AMDGPUAsmVariants.DPP,
- AMDGPUAsmVariants.Disable);
+ let AsmVariantName = AMDGPUAsmVariants.DPP;
let Constraints = !if(P.NumSrcArgs, P.TieRegDPP # " = $vdst", "");
let DisableEncoding = !if(P.NumSrcArgs, P.TieRegDPP, "");
}
+class VOP_DPP8<string OpName, VOPProfile P> :
+ VOP_DPP8_Base<OpName, P>, VOP_DPP8e<P>;
+
+class VOP3_DPP8_Base<string OpName, VOPProfile P> :
+ VOP_DPP8_Base<OpName, P, P.InsVOP3DPP8, P.AsmVOP3DPP8> {
+ let OutOperandList = P.OutsVOP3DPP8;
+ let AsmMatchConverter = "cvtVOP3DPP8";
+ let AsmVariantName = !if(P.HasExtVOP3DPP, AMDGPUAsmVariants.VOP3_DPP,
+ AMDGPUAsmVariants.Disable);
+ let VOP3 = 1;
+ let Size = 12;
+}
+
+
+class VOP3_DPP8<bits<10> op, string OpName, VOPProfile P> :
+ VOP3_DPP8_Base<OpName, P>, VOP3_DPPe_Common<op, P>,
+ VOP3_DPP8e_Fields {
+
+ let Inst{40-32} = fi;
+ let Inst{71-64} = !if(P.HasSrc0, src0{7-0}, 0);
+ let Inst{95-72} = dpp8{23-0};
+}
+
+class VOP3P_DPP8<bits<7> op, string OpName, VOPProfile P> :
+ VOP3_DPP8_Base<OpName, P>, VOP3P_DPPe_Common<op, P>,
+ VOP3_DPP8e_Fields {
+
+ let VOP3P = 1;
+ let Inst{40-32} = fi;
+ let Inst{71-64} = !if(P.HasSrc0, src0{7-0}, 0);
+ let Inst{95-72} = dpp8{23-0};
+}
+
def DPP8Mode {
int FI_0 = 0xE9;
int FI_1 = 0xEA;
@@ -780,14 +1015,12 @@ class getDivergentFrag<SDPatternOperator Op> {
}
class VOPPatGen<SDPatternOperator Op, VOPProfile P> {
-
PatFrag Operator = getDivergentFrag < Op >.ret;
dag Ins = !foreach(tmp, P.Ins32, !subst(ins, Operator,
!subst(P.Src0RC32, P.Src0VT,
!subst(P.Src1RC32, P.Src1VT, tmp))));
-
dag Outs = !foreach(tmp, P.Outs32, !subst(outs, set,
!subst(P.DstRC, P.DstVT, tmp)));
@@ -827,12 +1060,379 @@ class VOPBinOpClampPat<SDPatternOperator node, Instruction inst, ValueType vt> :
DSTCLAMP.ENABLE)
>;
+//===----------------------------------------------------------------------===//
+// VOP3 Classes
+//===----------------------------------------------------------------------===//
+
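+// Pattern helpers: each of the get*Pat classes below builds the instruction
+// selection patterns for a VOP3 instruction from its profile, most of them
+// selecting the one-, two- or three-source form (ret1/ret2/ret3) based on
+// NumSrcArgs.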
+class getVOP3ModPat<VOPProfile P, SDPatternOperator node> {
+ dag src0 = !if(P.HasOMod,
+ (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod),
+ (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp));
+
+ list<dag> ret3 = [(set P.DstVT:$vdst,
+ (DivergentFragOrOp<node, P>.ret (P.Src0VT src0),
+ (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers)),
+ (P.Src2VT (VOP3Mods P.Src2VT:$src2, i32:$src2_modifiers))))];
+
+ list<dag> ret2 = [(set P.DstVT:$vdst,
+ (DivergentFragOrOp<node, P>.ret (P.Src0VT src0),
+ (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers))))];
+
+ list<dag> ret1 = [(set P.DstVT:$vdst,
+ (DivergentFragOrOp<node, P>.ret (P.Src0VT src0)))];
+
+ list<dag> ret = !if(!eq(P.NumSrcArgs, 3), ret3,
+ !if(!eq(P.NumSrcArgs, 2), ret2,
+ ret1));
+}
+
+class getVOP3PModPat<VOPProfile P, SDPatternOperator node, bit HasExplicitClamp,
+ bit IsDOT = 0,
+ ComplexPattern SrcPat = !if(IsDOT, VOP3PModsDOT, VOP3PMods)> {
+ dag src0_dag = (P.Src0VT (SrcPat P.Src0VT:$src0, i32:$src0_modifiers));
+ dag src1_dag = (P.Src1VT (SrcPat P.Src1VT:$src1, i32:$src1_modifiers));
+ dag src2_dag = (P.Src2VT (SrcPat P.Src2VT:$src2, i32:$src2_modifiers));
+ dag clamp_dag = (i1 timm:$clamp);
+
+ list<dag> ret3 = [(set P.DstVT:$vdst,
+ !if(HasExplicitClamp,
+ (DivergentFragOrOp<node, P>.ret src0_dag, src1_dag, src2_dag, clamp_dag),
+ (DivergentFragOrOp<node, P>.ret src0_dag, src1_dag, src2_dag)))];
+
+ list<dag> ret2 = [(set P.DstVT:$vdst,
+ !if(HasExplicitClamp,
+ (DivergentFragOrOp<node, P>.ret src0_dag, src1_dag, clamp_dag),
+ (DivergentFragOrOp<node, P>.ret src0_dag, src1_dag)))];
+
+ list<dag> ret1 = [(set P.DstVT:$vdst,
+ !if(HasExplicitClamp,
+ (DivergentFragOrOp<node, P>.ret src0_dag, clamp_dag),
+ (DivergentFragOrOp<node, P>.ret src0_dag)))];
+
+ list<dag> ret = !if(!eq(P.NumSrcArgs, 3), ret3,
+ !if(!eq(P.NumSrcArgs, 2), ret2,
+ ret1));
+}
+
+class getVOP3OpSelPat<VOPProfile P, SDPatternOperator node> {
+ list<dag> ret3 = [(set P.DstVT:$vdst,
+ (DivergentFragOrOp<node, P>.ret (P.Src0VT (VOP3OpSel P.Src0VT:$src0, i32:$src0_modifiers)),
+ (P.Src1VT (VOP3OpSel P.Src1VT:$src1, i32:$src1_modifiers)),
+ (P.Src2VT (VOP3OpSel P.Src2VT:$src2, i32:$src2_modifiers))))];
+
+ list<dag> ret2 = [(set P.DstVT:$vdst,
+ (DivergentFragOrOp<node, P>.ret (P.Src0VT (VOP3OpSel P.Src0VT:$src0, i32:$src0_modifiers)),
+ (P.Src1VT (VOP3OpSel P.Src1VT:$src1, i32:$src1_modifiers))))];
+
+ list<dag> ret1 = [(set P.DstVT:$vdst,
+ (DivergentFragOrOp<node, P>.ret (P.Src0VT (VOP3OpSel P.Src0VT:$src0, i32:$src0_modifiers))))];
+
+ list<dag> ret = !if(!eq(P.NumSrcArgs, 3), ret3,
+ !if(!eq(P.NumSrcArgs, 2), ret2,
+ ret1));
+}
+
+class getVOP3OpSelModPat<VOPProfile P, SDPatternOperator node> {
+ list<dag> ret3 = [(set P.DstVT:$vdst,
+ (DivergentFragOrOp<node, P>.ret (P.Src0VT !if(P.HasClamp, (VOP3OpSelMods P.Src0VT:$src0, i32:$src0_modifiers),
+ (VOP3OpSelMods P.Src0VT:$src0, i32:$src0_modifiers))),
+ (P.Src1VT (VOP3OpSelMods P.Src1VT:$src1, i32:$src1_modifiers)),
+ (P.Src2VT (VOP3OpSelMods P.Src2VT:$src2, i32:$src2_modifiers))))];
+
+ list<dag> ret2 = [(set P.DstVT:$vdst,
+ (DivergentFragOrOp<node, P>.ret !if(P.HasClamp, (P.Src0VT (VOP3OpSelMods P.Src0VT:$src0, i32:$src0_modifiers)),
+ (P.Src0VT (VOP3OpSelMods P.Src0VT:$src0, i32:$src0_modifiers))),
+ (P.Src1VT (VOP3OpSelMods P.Src1VT:$src1, i32:$src1_modifiers))))];
+
+ list<dag> ret1 = [(set P.DstVT:$vdst,
+ (DivergentFragOrOp<node, P>.ret (P.Src0VT (VOP3OpSelMods P.Src0VT:$src0, i32:$src0_modifiers))))];
+
+ list<dag> ret = !if(!eq(P.NumSrcArgs, 3), ret3,
+ !if(!eq(P.NumSrcArgs, 2), ret2,
+ ret1));
+}
+
+class getVOP3FromVOP2Pat<VOPProfile P, SDPatternOperator node> {
+ list<dag> ret = [(set P.DstVT:$vdst, (node P.Src0VT:$src0, P.Src1VT:$src1))];
+}
+// In VOP1, we can have clamp and omod even if !HasModifiers
+class getVOP3Pat<VOPProfile P, SDPatternOperator node> {
+ dag src0 =
+ !if(P.HasOMod,
+ !if(P.HasClamp,
+ (VOP3Mods0 P.Src0VT:$src0, i1:$clamp, i32:$omod),
+ (VOP3Mods0 P.Src0VT:$src0, i32:$omod)), // impossible?
+ !if(P.HasClamp,
+ (VOP3Mods0 P.Src0VT:$src0, i1:$clamp),
+ (VOP3Mods0 P.Src0VT:$src0))
+ );
+ list<dag> ret3 = [(set P.DstVT:$vdst, (DivergentFragOrOp<node, P>.ret (P.Src0VT src0), P.Src1VT:$src1, P.Src2VT:$src2))];
+
+ list<dag> ret2 = [(set P.DstVT:$vdst, (DivergentFragOrOp<node, P>.ret (P.Src0VT src0), P.Src1VT:$src1))];
+
+ list<dag> ret1 = [(set P.DstVT:$vdst, (DivergentFragOrOp<node, P>.ret (P.Src0VT src0)))];
+ list<dag> ret = !if(!eq(P.NumSrcArgs, 3), ret3,
+ !if(!eq(P.NumSrcArgs, 2), ret2,
+ ret1));
+}
+
+class getVOP3ClampPat<VOPProfile P, SDPatternOperator node> {
+ list<dag> ret3 = [(set P.DstVT:$vdst, (node P.Src0VT:$src0, P.Src1VT:$src1, P.Src2VT:$src2, i1:$clamp))];
+ list<dag> ret2 = [(set P.DstVT:$vdst, (node P.Src0VT:$src0, P.Src1VT:$src1, i1:$clamp))];
+ list<dag> ret1 = [(set P.DstVT:$vdst, (node P.Src0VT:$src0, i1:$clamp))];
+ list<dag> ret = !if(!eq(P.NumSrcArgs, 3), ret3,
+ !if(!eq(P.NumSrcArgs, 2), ret2,
+ ret1));
+}
+
+class getVOP3MAIPat<VOPProfile P, SDPatternOperator node> {
+ list<dag> ret = !if(!eq(P.Src0VT, P.Src1VT),
+ // mfma
+ [(set P.DstVT:$vdst, (node P.Src0VT:$src0, P.Src1VT:$src1, P.Src2VT:$src2,
+ timm:$cbsz, timm:$abid, timm:$blgp))],
+ // smfmac
+ [(set P.DstVT:$vdst, (node P.Src0VT:$src0, P.Src1VT:$src1, P.Src2VT:$src2, i32:$idx,
+ timm:$cbsz, timm:$abid))]);
+}
+
+class VOP3Features<bit Clamp, bit OpSel, bit Packed, bit MAI> {
+ bit HasClamp = Clamp;
+ bit HasOpSel = OpSel;
+ bit IsPacked = Packed;
+ bit IsMAI = MAI;
+}
+
+def VOP3_REGULAR : VOP3Features<0, 0, 0, 0>;
+def VOP3_CLAMP : VOP3Features<1, 0, 0, 0>;
+def VOP3_OPSEL : VOP3Features<1, 1, 0, 0>;
+def VOP3_PACKED : VOP3Features<1, 1, 1, 0>;
+def VOP3_MAI : VOP3Features<0, 0, 0, 1>;
+
+class VOP3_Profile_Base<VOPProfile P, VOP3Features Features = VOP3_REGULAR> : VOPProfile<P.ArgVT> {
+
+ let HasClamp = !if(Features.HasClamp, 1, P.HasClamp);
+ let HasOpSel = !if(Features.HasOpSel, 1, P.HasOpSel);
+ let IsMAI = !if(Features.IsMAI, 1, P.IsMAI);
+ let IsPacked = !if(Features.IsPacked, 1, P.IsPacked);
+
+ let HasModifiers = !if(Features.IsMAI, 0, !or(Features.IsPacked, P.HasModifiers));
+}
+
+class VOP3_Profile<VOPProfile P, VOP3Features Features = VOP3_REGULAR> : VOP3_Profile_Base<P, Features> {
+ let IsSingle = 1;
+
+}
+
+// Consistently gives instructions an _e64 suffix.
+multiclass VOP3Inst_Pseudo_Wrapper<string opName, VOPProfile P, list<dag> pattern = [], bit VOP3Only = 0> {
+ def _e64 : VOP3_Pseudo<opName, P, pattern, VOP3Only>;
+}
+
+class VOP3InstBase<string OpName, VOPProfile P, SDPatternOperator node = null_frag, bit IsVOP2 = 0> :
+ VOP3_Pseudo<OpName, P,
+ !if(P.HasOpSel,
+ !if(P.HasModifiers,
+ getVOP3OpSelModPat<P, node>.ret,
+ getVOP3OpSelPat<P, node>.ret),
+ !if(P.HasModifiers,
+ getVOP3ModPat<P, node>.ret,
+ !if(IsVOP2,
+ getVOP3FromVOP2Pat<P, node>.ret,
+ !if(P.HasIntClamp,
+ getVOP3ClampPat<P, node>.ret,
+ !if (P.IsMAI,
+ getVOP3MAIPat<P, node>.ret,
+ getVOP3Pat<P, node>.ret))))),
+ 0, P.HasOpSel> {
+
+ let IntClamp = P.HasIntClamp;
+ let AsmMatchConverter =
+ !if(P.HasOpSel,
+ "cvtVOP3OpSel",
+ !if(!or(P.HasModifiers, P.HasOMod, P.HasIntClamp),
+ "cvtVOP3",
+ ""));
+}
+
+multiclass VOP3Inst<string OpName, VOPProfile P, SDPatternOperator node = null_frag> {
+ def _e64 : VOP3InstBase<OpName, P, node>;
+ let SubtargetPredicate = isGFX11Plus in {
+ foreach _ = BoolToList<P.HasExtVOP3DPP>.ret in
+ def _e64_dpp : VOP3_DPP_Pseudo <OpName, P>;
+ } // end SubtargetPredicate = isGFX11Plus
+}
+
+//===----------------------------------------------------------------------===//
+// VOP3 DPP
+//===----------------------------------------------------------------------===//
+
+class Base_VOP3_DPP16<bits<10> op, VOP_DPP_Pseudo ps, string opName = ps.OpName>
+ : VOP3_DPP<op, opName, ps.Pfl, 1> {
+ let hasSideEffects = ps.hasSideEffects;
+ let Defs = ps.Defs;
+ let SchedRW = ps.SchedRW;
+ let Uses = ps.Uses;
+ let AssemblerPredicate = HasDPP16;
+ let SubtargetPredicate = HasDPP16;
+ let OtherPredicates = ps.OtherPredicates;
+}
+
+class VOP3_DPP16<bits<10> op, VOP_DPP_Pseudo ps, int subtarget,
+ string opName = ps.OpName>
+ : Base_VOP3_DPP16<op, ps, opName>, SIMCInstr<ps.PseudoInstr, subtarget>;
+
+class Base_VOP3_DPP8<bits<10> op, VOP_Pseudo ps, string opName = ps.OpName>
+ : VOP3_DPP8<op, opName, ps.Pfl> {
+ let hasSideEffects = ps.hasSideEffects;
+ let Defs = ps.Defs;
+ let SchedRW = ps.SchedRW;
+ let Uses = ps.Uses;
+
+ let OtherPredicates = ps.OtherPredicates;
+}
+
+class Base_VOP3b_DPP16<bits<10> op, VOP_DPP_Pseudo ps,
+ string opName = ps.OpName>
+ : Base_VOP3_DPP16<op, ps, opName> {
+ bits<7> sdst;
+ let Inst{14 - 8} = sdst;
+}
+
+class VOP3b_DPP8_Base<bits<10> op, VOP_Pseudo ps, string opName = ps.OpName>
+ : Base_VOP3_DPP8<op, ps, opName> {
+ bits<7> sdst;
+ let Inst{14 - 8} = sdst;
+}
+
+//===----------------------------------------------------------------------===//
+// VOP3 GFX11
+//===----------------------------------------------------------------------===//
+
+let AssemblerPredicate = isGFX11Only,
+ DecoderNamespace = "GFX11" in {
+ multiclass VOP3_Real_Base_gfx11<bits<10> op, string opName = NAME,
+ bit isSingle = 0> {
+ defvar ps = !cast<VOP_Pseudo>(opName#"_e64");
+ let IsSingle = !or(isSingle, ps.Pfl.IsSingle) in {
+ foreach _ = BoolToList<ps.Pfl.HasOpSel>.ret in
+ def _e64_gfx11 :
+ VOP3_Real<ps, SIEncodingFamily.GFX11>,
+ VOP3OpSel_gfx11<op, ps.Pfl>;
+ foreach _ = BoolToList<!not(ps.Pfl.HasOpSel)>.ret in
+ def _e64_gfx11 :
+ VOP3_Real<ps, SIEncodingFamily.GFX11>,
+ VOP3e_gfx11<op, ps.Pfl>;
+ }
+ }
+ multiclass VOP3_Real_with_name_gfx11<bits<10> op, string opName,
+ string asmName, bit isSingle = 0> {
+ defvar ps = !cast<VOP_Pseudo>(opName#"_e64");
+ let AsmString = asmName # ps.AsmOperands,
+ IsSingle = !or(isSingle, ps.Pfl.IsSingle) in {
+ foreach _ = BoolToList<ps.Pfl.HasOpSel>.ret in
+ def _e64_gfx11 :
+ VOP3_Real<ps, SIEncodingFamily.GFX11>,
+ VOP3OpSel_gfx11<op, ps.Pfl>,
+ MnemonicAlias<ps.Mnemonic, asmName>, Requires<[isGFX11Plus]>;
+ foreach _ = BoolToList<!not(ps.Pfl.HasOpSel)>.ret in
+ def _e64_gfx11 :
+ VOP3_Real<ps, SIEncodingFamily.GFX11>,
+ VOP3e_gfx11<op, ps.Pfl>,
+ MnemonicAlias<ps.Mnemonic, asmName>, Requires<[isGFX11Plus]>;
+ }
+ }
+  // For READLANE/WRITELANE, whose pseudo names carry no _e64 suffix.
+ multiclass VOP3_Real_No_Suffix_gfx11<bits<10> op, string opName = NAME> {
+ defvar ps = !cast<VOP_Pseudo>(opName);
+ def _e64_gfx11 :
+ VOP3_Real<ps, SIEncodingFamily.GFX11>,
+ VOP3e_gfx11<op, ps.Pfl>;
+ }
+ multiclass VOP3_Real_dpp_Base_gfx11<bits<10> op, string opName = NAME> {
+ def _e64_dpp_gfx11 : VOP3_DPP16<op, !cast<VOP_DPP_Pseudo>(opName#"_e64"#"_dpp"), SIEncodingFamily.GFX11> {
+ let DecoderNamespace = "DPPGFX11";
+ }
+ }
+ multiclass VOP3_Real_dpp_with_name_gfx11<bits<10> op, string opName,
+ string asmName> {
+ defvar ps = !cast<VOP3_Pseudo>(opName#"_e64");
+ let AsmString = asmName # ps.Pfl.AsmVOP3DPP16, DecoderNamespace = "DPPGFX11" in {
+ defm NAME : VOP3_Real_dpp_Base_gfx11<op, opName>;
+ }
+ }
+ multiclass VOP3_Real_dpp8_Base_gfx11<bits<10> op, string opName = NAME> {
+ defvar ps = !cast<VOP3_Pseudo>(opName#"_e64");
+ def _e64_dpp8_gfx11 : Base_VOP3_DPP8<op, ps> {
+ let DecoderNamespace = "DPP8GFX11";
+ }
+ }
+ multiclass VOP3_Real_dpp8_with_name_gfx11<bits<10> op, string opName,
+ string asmName> {
+ defvar ps = !cast<VOP3_Pseudo>(opName#"_e64");
+ let AsmString = asmName # ps.Pfl.AsmVOP3DPP8, DecoderNamespace = "DPP8GFX11" in {
+ defm NAME : VOP3_Real_dpp8_Base_gfx11<op, opName>;
+ }
+ }
+ multiclass VOP3be_Real_gfx11<bits<10> op, string opName, string asmName,
+ bit isSingle = 0> {
+ defvar ps = !cast<VOP3_Pseudo>(opName#"_e64");
+ let IsSingle = !or(isSingle, ps.Pfl.IsSingle) in
+ def _e64_gfx11 :
+ VOP3_Real<ps, SIEncodingFamily.GFX11, asmName>,
+ VOP3be_gfx11<op, ps.Pfl> ;
+ }
+ multiclass VOP3be_Real_dpp_gfx11<bits<10> op, string opName, string asmName> {
+ defvar ps = !cast<VOP3_Pseudo>(opName #"_e64");
+ defvar dpp_ps = !cast<VOP_DPP_Pseudo>(opName #"_e64" #"_dpp");
+ def _e64_dpp_gfx11 : Base_VOP3b_DPP16<op, dpp_ps, asmName>,
+ SIMCInstr<dpp_ps.PseudoInstr, SIEncodingFamily.GFX11> {
+ let DecoderNamespace = "DPPGFX11";
+ }
+ }
+ multiclass VOP3be_Real_dpp8_gfx11<bits<10> op, string opName, string asmName> {
+ defvar ps = !cast<VOP3_Pseudo>(opName #"_e64");
+ def _e64_dpp8_gfx11 : VOP3b_DPP8_Base<op, ps, asmName> {
+ let DecoderNamespace = "DPP8GFX11";
+ }
+ }
+} // End AssemblerPredicate = isGFX11Only, DecoderNamespace = "GFX11"
+
+// VOP1 and VOP2 instruction definitions depend on these "triple"
+// (e64 + DPP16 + DPP8) real multiclasses.
+multiclass VOP3_Realtriple_gfx11<bits<10> op,
+ bit isSingle = 0, string opName = NAME> :
+ VOP3_Real_Base_gfx11<op, opName, isSingle>,
+ VOP3_Real_dpp_Base_gfx11<op, opName>,
+ VOP3_Real_dpp8_Base_gfx11<op, opName>;
+
+multiclass VOP3Only_Realtriple_gfx11<bits<10> op> :
+ VOP3_Realtriple_gfx11<op, 1>;
+
+multiclass VOP3_Realtriple_with_name_gfx11<bits<10> op, string opName,
+ string asmName, bit isSingle = 0> :
+ VOP3_Real_with_name_gfx11<op, opName, asmName, isSingle>,
+ VOP3_Real_dpp_with_name_gfx11<op, opName, asmName>,
+ VOP3_Real_dpp8_with_name_gfx11<op, opName, asmName>;
+
+multiclass VOP3Only_Realtriple_with_name_gfx11<bits<10> op, string opName,
+ string asmName> :
+ VOP3_Realtriple_with_name_gfx11<op, opName, asmName, 1>;
+
+multiclass VOP3be_Realtriple_gfx11<
+ bits<10> op, bit isSingle = 0, string opName = NAME,
+ string asmName = !cast<VOP_Pseudo>(opName#"_e64").Mnemonic> :
+ VOP3be_Real_gfx11<op, opName, asmName, isSingle>,
+ VOP3be_Real_dpp_gfx11<op, opName, asmName>,
+ VOP3be_Real_dpp8_gfx11<op, opName, asmName>;
+
+multiclass VOP3beOnly_Realtriple_gfx11<bits<10> op> :
+ VOP3be_Realtriple_gfx11<op, 1>;
include "VOPCInstructions.td"
include "VOP1Instructions.td"
include "VOP2Instructions.td"
include "VOP3Instructions.td"
include "VOP3PInstructions.td"
+include "VOPDInstructions.td"
class VOPInfoTable <string Format> : GenericTable {
@@ -847,3 +1447,15 @@ class VOPInfoTable <string Format> : GenericTable {
def VOP1InfoTable : VOPInfoTable<"VOP1">;
def VOP2InfoTable : VOPInfoTable<"VOP2">;
def VOP3InfoTable : VOPInfoTable<"VOP3">;
+
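+// Generic tables keyed by opcode that identify VOPC64 DPP and DPP8 real
+// instructions, exposed through the generated isVOPC64DPPOpcodeHelper and
+// isVOPC64DPP8OpcodeHelper functions.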
+class VOPC64Table <string Format> : GenericTable {
+ let FilterClass = "VOPC64_" # Format # "_Base";
+ let CppTypeName = "VOPC64DPPInfo";
+ let Fields = ["Opcode"];
+
+ let PrimaryKey = ["Opcode"];
+ let PrimaryKeyName = "isVOPC64" # Format # "OpcodeHelper";
+}
+
+def VOPC64DPPTable : VOPC64Table<"DPP">;
+def VOPC64DPP8Table : VOPC64Table<"DPP8">;